In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from datetime import datetime

In [2]:
# Chart page to scrape: the lifetime-gross ranking, offset by 400 entries
# (the first row of the table shown below is rank 401).
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=400'

# Browser-like User-Agent header; reused by later requests in this notebook.
user_agent = {'User-agent': 'Mozilla/5.0'}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=user_agent, timeout=30)
response.raise_for_status()

page = response.text

soup = BeautifulSoup(page, "lxml")

In [3]:
# First <table> element of the parsed page (the ranked chart shown in the output).
table = soup.find(name='table')
table

<table class="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated"><tr><th class="a-text-right mojo-field-type-rank a-nowrap"><span title="Rank">Rank</span>
</th><th class="a-text-left mojo-field-type-title a-nowrap"><span title="Title">Title</span>
</th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Lifetime Gross">Lifetime Gross</span>
</th><th class="a-text-left mojo-field-type-year a-nowrap"><span title="Year">Year</span>
</th></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">401</td><td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt0232500/?ref_=bo_cso_table_1">The Fast and the Furious</a></td><td class="a-text-right mojo-field-type-money">$144,533,925</td><td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2001/?ref_=bo_cso_table_1">2001</a></td></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-fie

In [4]:
# Every <tr> of the chart table as a plain list (header row included).
rows = list(table.find_all('tr'))

In [5]:
# Map each movie title to [link stub, rank, title, lifetime gross, year].
# NOTE: the dict is keyed by title, so two chart rows sharing the same title
# would overwrite one another.
movies = {}

for row in rows[1:]:  # rows[0] is the header row (<th> cells only)
    items = row.find_all('td')
    link = row.find('a')  # first link in the row is the title link
    # Use a dedicated name for the stub so the notebook-level `url`
    # (the chart URL defined earlier) is not silently overwritten.
    title, link_stub = link.text, link['href']
    movies[title] = [link_stub] + [i.text for i in items]

movies

{'The Fast and the Furious': ['/title/tt0232500/?ref_=bo_cso_table_1',
  '401',
  'The Fast and the Furious',
  '$144,533,925',
  '2001'],
 'Mamma Mia!': ['/title/tt0795421/?ref_=bo_cso_table_2',
  '402',
  'Mamma Mia!',
  '$144,169,664',
  '2008'],
 'Doctor Dolittle': ['/title/tt0118998/?ref_=bo_cso_table_3',
  '403',
  'Doctor Dolittle',
  '$144,156,605',
  '1998'],
 'Pokémon Detective Pikachu': ['/title/tt5884052/?ref_=bo_cso_table_4',
  '404',
  'Pokémon Detective Pikachu',
  '$144,105,346',
  '2019'],
 'Rio': ['/title/tt1436562/?ref_=bo_cso_table_5',
  '405',
  'Rio',
  '$143,619,809',
  '2011'],
 'Kung Fu Panda 3': ['/title/tt2267968/?ref_=bo_cso_table_6',
  '406',
  'Kung Fu Panda 3',
  '$143,528,619',
  '2016'],
 'Juno': ['/title/tt0467406/?ref_=bo_cso_table_7',
  '407',
  'Juno',
  '$143,495,265',
  '2007'],
 'Marley & Me': ['/title/tt0822832/?ref_=bo_cso_table_8',
  '408',
  'Marley & Me',
  '$143,153,751',
  '2008'],
 'The Smurfs': ['/title/tt0472181/?ref_=bo_cso_table_9',
 

In [6]:
# One row per title: the dict's keys become the index after transposing,
# and the five list positions become the named columns below.
chart_columns = ['link_stub', 'rank_all_movies', 'title',
                 'lifetime_gross', 'year']

top_movies = pd.DataFrame(movies).transpose()
top_movies.columns = chart_columns

top_movies.shape

(200, 5)

In [7]:
def get_movie_value(soup, field_name):
    '''Grab a value from Box Office Mojo HTML.

    Searches the parsed page for the first text node matching ``field_name``
    and returns the text of the element that follows it in the document.

    Parameters
    ----------
    soup : parsed page (or any object exposing a compatible ``find``).
    field_name : str
        Label to look for. It is compiled as a regular expression, so regex
        metacharacters in it are interpreted as such.

    Returns
    -------
    str or None
        Text of the element after the label, or None when either the label
        or a following element cannot be found.
    '''
    # `string=` is the current name of the argument formerly spelled `text=`.
    label = soup.find(string=re.compile(field_name))
    if label is None:
        return None

    # This works for most of the values: the value sits in the element
    # immediately after the label text. `find_next` replaces the legacy
    # camelCase alias `findNext`.
    value_element = label.find_next()
    if value_element is None:
        return None

    return value_element.text

In [8]:
import dateutil.parser

def money_to_int(moneystring):
    '''Convert a money string such as '$144,533,925' to an int.

    Dollar signs and thousands separators are removed before conversion.
    Returns None when ``moneystring`` is empty or None.
    '''
    if not moneystring:
        return None

    digits = moneystring.replace('$', '').replace(',', '')
    return int(digits)

def runtime_to_minutes(runtimestring):
    '''Convert a runtime such as '1 hr 46 min' to total minutes.

    Hours and minutes are each optional, so '2 hr' gives 120 and '45 min'
    gives 45. Returns None when the input is not a string (e.g. None for a
    missing field) or contains neither an hour nor a minute component.
    '''
    if not isinstance(runtimestring, str):
        return None

    # Look for "<number> h..." and "<number> m..." independently rather than
    # relying on fixed token positions, which failed for hour-only or
    # minute-only runtimes.
    hours = re.search(r'(\d+)\s*h', runtimestring, re.IGNORECASE)
    minutes = re.search(r'(\d+)\s*m', runtimestring, re.IGNORECASE)

    if hours is None and minutes is None:
        return None

    total = 0
    if hours is not None:
        total += int(hours.group(1)) * 60
    if minutes is not None:
        total += int(minutes.group(1))
    return total

def to_date(datestring):
    '''Parse ``datestring`` with dateutil's parser and return the result.'''
    return dateutil.parser.parse(datestring)

def clean_genres(genre_list):
    '''Strip whitespace from each entry and drop entries that end up empty.'''
    stripped = (entry.strip() for entry in genre_list)
    return [genre for genre in stripped if genre]

In [9]:
def clean_sales(lst):
    '''Convert the money elements of the sales summary table to ints.

    Takes an iterable of elements exposing ``.text`` (the raw money strings)
    and returns a list with each string passed through ``money_to_int``.
    '''
    raw_amounts = [element.text for element in lst]
    return list(map(money_to_int, raw_amounts))


def clean_regions(lst):
    '''Extract clean region names from the sales summary table.

    Takes an iterable of elements exposing ``.text``, skips whitespace-only
    entries, and keeps the part of each label before any '(' with the
    surrounding whitespace removed.
    '''
    raw_labels = [element.text for element in lst]

    regions = []
    for label in raw_labels:
        if not label.split():
            # Nothing but whitespace: not a region label.
            continue
        name = label.strip().split('(')[0]
        regions.append(name.strip())

    return regions

In [10]:
def _clean_credits(cells):
    '''Return the names held in a cast/crew table's <td> cells.

    Keeps the first line of each cell's text, then every other cell starting
    with the first (presumably the cells in between hold the role/character;
    verify against the credits page markup).
    '''
    first_lines = [cell.text.split('\n')[0].strip() for cell in cells]
    return first_lines[::2]


def _credit_names(credits_soup, table_id):
    '''Names from the credits table with the given id, or [] if it is absent.'''
    table = credits_soup.find('table', id=table_id)
    if table is None:
        return []
    return _clean_credits(table.find_all('td'))


def get_movie_dict(link):
    '''
    From a BoxOfficeMojo link stub, request the movie page and its cast/crew
    page, parse both with BeautifulSoup, and collect

        - movie_title
        - domestic_distributor
        - budget
        - domestic_opening_sales
        - earliest_release (datetime)
        - rating (MPAA)
        - runtime_minutes
        - genres
        - summary_sales ([region, sales] pairs)
        - crew_list
        - cast_list

    Fields that cannot be found on the page come back as None (or an empty
    list for the list-valued fields) rather than raising.

    Return the information as a dictionary with the keys above, in that order.

    Raises requests.HTTPError when either page answers with an HTTP error
    status, and ValueError when the release date is present but not in
    'Month DD, YYYY' form.
    '''

    base_url = 'https://www.boxofficemojo.com'
    request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

    # Request and parse the title page, sending the same User-Agent header
    # that the credits request below uses.
    response = requests.get(base_url + link, headers=user_agent,
                            timeout=request_timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Title: the <title> text is "<movie title> - <site name>". Split on the
    # last " - " only, so hyphenated titles are not truncated at their first hyphen.
    title_tag = soup.find('title')
    title = title_tag.text.rsplit(' - ', 1)[0].strip() if title_tag else None

    # Domestic distributor (drop the trailing "See ..." link text)
    raw_distributor = get_movie_value(soup, 'Domestic Distributor')
    domestic_dist = raw_distributor.split('See')[0].strip() if raw_distributor else None

    # Budget
    budget = money_to_int(get_movie_value(soup, 'Budget'))

    # Domestic opening sales
    domestic_opening_sales = money_to_int(get_movie_value(soup, 'Domestic Opening'))

    # Earliest release date: only the first line of the field holds the date.
    raw_release = get_movie_value(soup, 'Earliest Release')
    if raw_release:
        earliest_release = datetime.strptime(raw_release.split('\n')[0].strip(),
                                             '%B %d, %Y')
    else:
        earliest_release = None

    # Rating
    rating = get_movie_value(soup, 'MPAA')

    # Runtime (skip the conversion entirely when the field is missing)
    raw_runtime = get_movie_value(soup, 'Running Time')
    runtime_minutes = runtime_to_minutes(raw_runtime) if raw_runtime else None

    # Genres
    raw_genres = get_movie_value(soup, 'Genres')
    genres = clean_genres(raw_genres.split('\n')) if raw_genres else []

    # Sales (domestic, international, worldwide) and their region labels
    summary_table = soup.find(class_='mojo-performance-summary-table')
    if summary_table is not None:
        sales_list = clean_sales(summary_table.find_all('span', class_='money'))
        regions_list = clean_regions(
            summary_table.find_all('span', class_='a-size-small'))
    else:
        sales_list, regions_list = [], []

    # Pair each region with its sales figure by position.
    # NOTE(review): if a region is listed without a money value, the two lists
    # differ in length and the positional pairing may misalign - confirm
    # against a page with a missing region.
    summary_sales = [list(pair) for pair in zip(regions_list, sales_list)]

    # Cast and crew live on a separate page reached through the first
    # navigation tab link.
    credits_link = soup.find('a', class_='a-size-base a-link-normal mojo-navigation-tab',
                             href=True)
    if credits_link is not None:
        credits_url = base_url + str(credits_link.get('href'))
        credits_response = requests.get(credits_url, headers=user_agent,
                                        timeout=request_timeout)
        credits_response.raise_for_status()
        credits_soup = BeautifulSoup(credits_response.text, 'lxml')

        crew_list = _credit_names(credits_soup, 'principalCrew')
        cast_list = _credit_names(credits_soup, 'principalCast')
    else:
        crew_list, cast_list = [], []

    # Key order matters: it determines the column order of the resulting frame.
    return {
        'movie_title': title,
        'domestic_distributor': domestic_dist,
        'budget': budget,
        'domestic_opening_sales': domestic_opening_sales,
        'earliest_release': earliest_release,
        'rating': rating,
        'runtime_minutes': runtime_minutes,
        'genres': genres,
        'summary_sales': summary_sales,
        'crew_list': crew_list,
        'cast_list': cast_list,
    }

In [11]:
# Scrape the detail page of every title in the chart; each call yields one
# record dict. This issues network requests per title, so it is slow.
top_movies_page_info_list = []
top_movies_page_info_list.extend(
    get_movie_dict(link) for link in top_movies['link_stub']
)

In [12]:
# Turn the list of record dicts into a frame indexed by movie title.
top_movies_page_info = (
    pd.DataFrame(top_movies_page_info_list)
    .set_index('movie_title')
)

In [13]:
# Save the scraped details to a CSV in the working directory. This runs before
# the per-region sales columns are derived further down, so the file lacks them.
top_movies_page_info.to_csv(path_or_buf='mojo_pg3.csv')

In [14]:
# Preview the first five scraped records.
top_movies_page_info.head(n=5)

Unnamed: 0_level_0,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,cast_list
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
The Fast and the Furious,Universal Pictures,38000000.0,40089015.0,2001-06-22,PG-13,106.0,"[Action, Crime, Thriller]","[[Domestic, 144533925], [International, 627677...","[Rob Cohen, Ken Li, Gary Scott Thompson, Gary ...","[Vin Diesel, Paul Walker, Michelle Rodriguez, ..."
Mamma Mia!,Universal Pictures,52000000.0,27751240.0,2008-06-27,PG-13,108.0,"[Comedy, Musical, Romance]","[[Domestic, 144169664], [International, 465745...","[Phyllida Lloyd, Catherine Johnson, Catherine ...","[Meryl Streep, Pierce Brosnan, Amanda Seyfried..."
Doctor Dolittle,Twentieth Century Fox,,29014324.0,1998-06-26,PG-13,85.0,"[Comedy, Family, Fantasy]","[[Domestic, 144156605], [International, 150300...","[Betty Thomas, Hugh Lofting, Nat Mauldin, Larr...","[Eddie Murphy, Peter Boyle, Ossie Davis, Olive..."
Pokémon Detective Pikachu,Warner Bros.,150000000.0,54365242.0,2019-05-03,PG,104.0,"[Action, Adventure, Comedy, Family, Mystery, S...","[[Domestic, 144105346], [International, 288900...","[Rob Letterman, Dan Hernandez, Benji Samit, Ro...","[Ryan Reynolds, Justice Smith, Kathryn Newton,..."
Rio,Twentieth Century Fox,90000000.0,39225962.0,2011-04-04,PG,96.0,"[Adventure, Animation, Comedy, Crime, Family, ...","[[Domestic, 143619809], [International, 340246...","[Carlos Saldanha, Carlos Saldanha, Earl Richey...","[Jesse Eisenberg, Anne Hathaway, George Lopez,..."


In [15]:
def _sales_for_region(region):
    '''Build a picker that returns the sales values labelled ``region``.

    The picker takes one summary_sales entry (a list of [region, sales] pairs)
    and returns a list of the matching sales values (a list, not a scalar).
    '''
    def pick(summary):
        return [pair[1] for pair in summary if pair[0] == region]
    return pick


# One list-valued column per region: domestic_sales, international_sales,
# worldwide_sales.
for region_name in ('Domestic', 'International', 'Worldwide'):
    column_name = region_name.lower() + '_sales'
    top_movies_page_info[column_name] = (
        top_movies_page_info['summary_sales'].apply(_sales_for_region(region_name))
    )

In [18]:
# Preview again, now including the three per-region sales columns.
top_movies_page_info.head(n=5)

Unnamed: 0_level_0,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,cast_list,domestic_sales,international_sales,worldwide_sales
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
The Fast and the Furious,Universal Pictures,38000000.0,40089015.0,2001-06-22,PG-13,106.0,"[Action, Crime, Thriller]","[[Domestic, 144533925], [International, 627677...","[Rob Cohen, Ken Li, Gary Scott Thompson, Gary ...","[Vin Diesel, Paul Walker, Michelle Rodriguez, ...",[144533925],[62767742],[207301667]
Mamma Mia!,Universal Pictures,52000000.0,27751240.0,2008-06-27,PG-13,108.0,"[Comedy, Musical, Romance]","[[Domestic, 144169664], [International, 465745...","[Phyllida Lloyd, Catherine Johnson, Catherine ...","[Meryl Streep, Pierce Brosnan, Amanda Seyfried...",[144169664],[465745071],[609914735]
Doctor Dolittle,Twentieth Century Fox,,29014324.0,1998-06-26,PG-13,85.0,"[Comedy, Family, Fantasy]","[[Domestic, 144156605], [International, 150300...","[Betty Thomas, Hugh Lofting, Nat Mauldin, Larr...","[Eddie Murphy, Peter Boyle, Ossie Davis, Olive...",[144156605],[150300000],[294456605]
Pokémon Detective Pikachu,Warner Bros.,150000000.0,54365242.0,2019-05-03,PG,104.0,"[Action, Adventure, Comedy, Family, Mystery, S...","[[Domestic, 144105346], [International, 288900...","[Rob Letterman, Dan Hernandez, Benji Samit, Ro...","[Ryan Reynolds, Justice Smith, Kathryn Newton,...",[144105346],[288900000],[433005346]
Rio,Twentieth Century Fox,90000000.0,39225962.0,2011-04-04,PG,96.0,"[Adventure, Animation, Comedy, Crime, Family, ...","[[Domestic, 143619809], [International, 340246...","[Carlos Saldanha, Carlos Saldanha, Earl Richey...","[Jesse Eisenberg, Anne Hathaway, George Lopez,...",[143619809],[340246709],[483866518]
