In [58]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from datetime import datetime

In [2]:
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=200'

user_agent = {'User-agent': 'Mozilla/5.0'}

response  = requests.get(url, headers = user_agent)

page = response.text

soup = BeautifulSoup(page,"lxml")

In [3]:
table = soup.find('table')
table

<table class="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated"><tr><th class="a-text-right mojo-field-type-rank a-nowrap"><span title="Rank">Rank</span>
</th><th class="a-text-left mojo-field-type-title a-nowrap"><span title="Title">Title</span>
</th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Lifetime Gross">Lifetime Gross</span>
</th><th class="a-text-left mojo-field-type-year a-nowrap"><span title="Year">Year</span>
</th></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">201</td><td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt0372784/?ref_=bo_cso_table_1">Batman Begins</a></td><td class="a-text-right mojo-field-type-money">$206,852,432</td><td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2005/?ref_=bo_cso_table_1">2005</a></td></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-ran

In [4]:
rows = [row for row in table.find_all('tr')]

In [5]:
movies = {}

for row in rows[1:]:
    items = row.find_all('td')
    link = row.find('a')
    title, url = link.text, link['href']
    movies[title] = [url] + [i.text for i in items]
    
movies

{'Batman Begins': ['/title/tt0372784/?ref_=bo_cso_table_1',
  '201',
  'Batman Begins',
  '$206,852,432',
  '2005'],
 'Charlie and the Chocolate Factory': ['/title/tt0367594/?ref_=bo_cso_table_2',
  '202',
  'Charlie and the Chocolate Factory',
  '$206,459,076',
  '2005'],
 'Ratatouille': ['/title/tt0382932/?ref_=bo_cso_table_3',
  '203',
  'Ratatouille',
  '$206,445,654',
  '2007'],
 'Thor: The Dark World': ['/title/tt1981115/?ref_=bo_cso_table_4',
  '204',
  'Thor: The Dark World',
  '$206,362,140',
  '2013'],
 'Bad Boys for Life': ['/title/tt1502397/?ref_=bo_cso_table_5',
  '205',
  'Bad Boys for Life',
  '$206,305,244',
  '2020'],
 'Austin Powers: The Spy Who Shagged Me': ['/title/tt0145660/?ref_=bo_cso_table_6',
  '206',
  'Austin Powers: The Spy Who Shagged Me',
  '$206,040,086',
  '1999'],
 'Terminator 2: Judgment Day': ['/title/tt0103064/?ref_=bo_cso_table_7',
  '207',
  'Terminator 2: Judgment Day',
  '$205,881,154',
  '1991'],
 'The Amazing Spider-Man 2': ['/title/tt1872181/?

In [29]:
top_movies = pd.DataFrame(movies).T  #transpose
top_movies.columns = ['link_stub', 'rank_all_movies','title', 
                    'lifetime_gross', 'year']

top_movies.head()

Unnamed: 0,link_stub,rank_all_movies,title,lifetime_gross,year
Batman Begins,/title/tt0372784/?ref_=bo_cso_table_1,201,Batman Begins,"$206,852,432",2005
Charlie and the Chocolate Factory,/title/tt0367594/?ref_=bo_cso_table_2,202,Charlie and the Chocolate Factory,"$206,459,076",2005
Ratatouille,/title/tt0382932/?ref_=bo_cso_table_3,203,Ratatouille,"$206,445,654",2007
Thor: The Dark World,/title/tt1981115/?ref_=bo_cso_table_4,204,Thor: The Dark World,"$206,362,140",2013
Bad Boys for Life,/title/tt1502397/?ref_=bo_cso_table_5,205,Bad Boys for Life,"$206,305,244",2020


In [13]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
        # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [14]:
import dateutil.parser

def money_to_int(moneystring):
    if moneystring:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    else:
        pass

def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def clean_genres(genre_list):
    genres = [item.strip() for item in genre_list if item.strip()]
    return genres 

In [15]:
def clean_sales(lst):
    
    '''Clean the summary domestic/international summary table
    
    Takes the raw list of money strings and converts each item to int
    '''
    
    sale_list = [item.text for item in lst]
    
    sale_list = [money_to_int(item) for item in sale_list]
   
    return sale_list


def clean_regions(lst):
    
    '''Clean the summary domestic/international summary table
    
    Takes raw list of region string, cleans, and puts back into list
    '''
    
    region_list = [item.text for item in lst]
    region_list = [item.strip() for item in region_list if item.split()]
    region_list = [item.split('(')[0].strip() for item in region_list]
    
    return region_list

In [59]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic distributor
        - budget
        - domestic gross
        - earliest release
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_distributor','budget', 'domestic_opening_sales',
               'earliest_release',
               'rating', 'runtime_minutes','genres','summary_sales','crew_list','cast_list']
    
    #Get title
    title_string = soup.find('title').text
    if title_string:
        movie_title = title_string.split('-')[0].strip()
    else:
        pass
    
    #Get domestic distributor
    domdist_regex = re.compile('Domestic Distributor')
    if domdist_regex:
        try:
            domdist_string = soup.find(text=domdist_regex).findNext().text
            try:
                domestic_dist = domdist_string.split('See')[0].strip()
            except:
                domestic_dist = None
            #return domestic_dist
        except:
            domestic_dist = None
    
    # Domestic distributor
    if domestic_dist != None:
        domestic_dist = get_movie_value(soup,'Domestic Distributor').split('See')[0].strip()
    else:
        pass

    # Budget
    raw_budget = get_movie_value(soup,'Budget')
    budget = money_to_int(raw_budget)

    # Domestic opening sales
    raw_domestic_opening = get_movie_value(soup,'Domestic Opening')
    domestic_opening_sales = money_to_int(raw_domestic_opening)

    #Earliest release date (raw)
    earliest_release_str = get_movie_value(soup,'Earliest Release').split('\n')[0].strip()
    earliest_release = datetime.strptime(earliest_release_str, '%B %d, %Y')
    earliest_release.strftime('%Y-%m-%d')

    #Rating
    rating = get_movie_value(soup,'MPAA')

    # Runtime
    raw_runtime = get_movie_value(soup,'Running Time')
    runtime_minutes = runtime_to_minutes(raw_runtime)

    #Genres
    raw_genres = get_movie_value(soup, 'Genres').split('\n')
    genres = clean_genres(raw_genres)
    
    #Sales list (domestic, international, worldwide)
    raw_sales_list = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[0:]
    sales_list = clean_sales(raw_sales_list)
    
    # Regions list
    raw_sales_regions = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='a-size-small')[0:]
    regions_list = clean_regions(raw_sales_regions)
    
    # Zip region and associated sales per region - believe should solve for any missing regions
    summary_sales = [list(i) for i in zip(regions_list,sales_list)]
    
    #Find link for cast and crew
    credits = soup.find('a', class_='a-size-base a-link-normal mojo-navigation-tab',href=True)
    credits_href = str(credits.get('href'))
    credits_url = base_url + credits_href
    
    #Refer to cast and crew page
    response1 = requests.get(credits_url,headers = user_agent)
    page1 = response1.text
    soup1 = BeautifulSoup(page1, 'lxml')
    
    #Find crew table, cast table
    table_crew = soup1.find('table',id='principalCrew').find_all('td')
    table_cast = soup1.find('table',id='principalCast').find_all('td')
    
    #clean the tables
    def clean_credits(table1):
        lst1 = [item.text for item in table1]
        lst2 = [item.split('\n')[0].strip() for item in lst1]
        lst2 = [item for item in lst2[::2]]

        return lst2

    crew_list = clean_credits(table_crew)
    cast_list = clean_credits(table_cast)
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [movie_title, domestic_dist, budget,domestic_opening_sales,
                                    earliest_release,
                                    rating, runtime_minutes,genres,summary_sales,crew_list,cast_list]))

    return movie_dict

In [60]:
top_movies_page_info_list = []

for link in top_movies.link_stub:
    top_movies_page_info_list.append(get_movie_dict(link))

ConnectionError: HTTPSConnectionPool(host='www.boxofficemojo.com', port=443): Max retries exceeded with url: /title/tt0372784/credits/?ref_=bo_tt_tab (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fe6a006e400>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [54]:
top_movies_page_info = pd.DataFrame(top_movies_page_info_list)  #convert list of dict to df
top_movies_page_info.set_index('movie_title', inplace=True)
top_movies_page_info

Unnamed: 0_level_0,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,cast_list
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Batman Begins,Warner Bros.,150000000.0,48745440.0,"June 15, 2005",PG-13,140.0,"[Action, Adventure]","[[Domestic, 206852432], [International, 166809...","[Christopher Nolan, Bob Kane, David S. Goyer, ...","[Christian Bale, Michael Caine, Ken Watanabe, ..."
Charlie and the Chocolate Factory,Warner Bros.,150000000.0,56178450.0,"July 14, 2005",PG,115.0,"[Adventure, Comedy, Family, Fantasy, Musical]","[[Domestic, 206459076], [International, 268509...","[Tim Burton, Roald Dahl, John August, Brad Gre...","[Johnny Depp, Freddie Highmore, David Kelly, H..."
Ratatouille,Walt Disney Studios Motion Pictures,150000000.0,47027395.0,"June 28, 2007",G,111.0,"[Adventure, Animation, Comedy, Family, Fantasy]","[[Domestic, 206445654], [International, 417280...","[Brad Bird, Jan Pinkava, Brad Bird, Jan Pinkav...","[Brad Garrett, Lou Romano, Patton Oswalt, Ian ..."
Thor: The Dark World,Walt Disney Studios Motion Pictures,170000000.0,85737841.0,"October 30, 2013",PG-13,112.0,"[Action, Adventure, Fantasy]","[[Domestic, 206362140], [International, 438421...","[Alan Taylor, Christopher L. Yost, Christopher...","[Chris Hemsworth, Natalie Portman, Tom Hiddles..."
Bad Boys for Life,Sony Pictures Entertainment (SPE),90000000.0,62504105.0,"January 15, 2020",R,124.0,"[Action, Comedy, Crime, Thriller]","[[Domestic, 206305244], [International, 220200...","[Adil El Arbi, Bilall Fallah, Peter Craig, Joe...","[Will Smith, Martin Lawrence, Vanessa Hudgens,..."
...,...,...,...,...,...,...,...,...,...,...
Taken,Twentieth Century Fox,25000000.0,24717037.0,"February 27, 2008",PG-13,90.0,"[Action, Thriller]","[[Domestic, 145000989], [International, 818367...","[Pierre Morel, Luc Besson, Robert Mark Kamen, ...","[Liam Neeson, Maggie Grace, Famke Janssen, Lel..."
One Hundred and One Dalmatians,Walt Disney Studios Motion Pictures,,,"January 25, 1961",,79.0,"[Adventure, Animation, Comedy, Family]","[[Domestic, 144880014], [International, 710000...","[Clyde Geronimi, Hamilton Luske, Wolfgang Reit...","[Rod Taylor, Betty Lou Gerson, J. Pat O'Malley..."
The Great Gatsby,Warner Bros.,105000000.0,50085185.0,"May 10, 2013",PG-13,143.0,"[Drama, Romance]","[[Domestic, 144840419], [International, 208801...","[Baz Luhrmann, Baz Luhrmann, Craig Pearce, F. ...","[Leonardo DiCaprio, Carey Mulligan, Joel Edger..."
"I, Robot",Twentieth Century Fox,120000000.0,52179887.0,"July 15, 2004",PG-13,115.0,"[Action, Drama, Sci-Fi, Thriller]","[[Domestic, 144801023], [International, 208332...","[Alex Proyas, Jeff Vintar, Akiva Goldsman, Jef...","[Will Smith, Bridget Moynahan, Bruce Greenwood..."


In [55]:
top_movies_page_info.to_csv('mojo_pg2.csv')

In [56]:
top_movies_page_info.head()

Unnamed: 0_level_0,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,cast_list
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Batman Begins,Warner Bros.,150000000.0,48745440.0,"June 15, 2005",PG-13,140.0,"[Action, Adventure]","[[Domestic, 206852432], [International, 166809...","[Christopher Nolan, Bob Kane, David S. Goyer, ...","[Christian Bale, Michael Caine, Ken Watanabe, ..."
Charlie and the Chocolate Factory,Warner Bros.,150000000.0,56178450.0,"July 14, 2005",PG,115.0,"[Adventure, Comedy, Family, Fantasy, Musical]","[[Domestic, 206459076], [International, 268509...","[Tim Burton, Roald Dahl, John August, Brad Gre...","[Johnny Depp, Freddie Highmore, David Kelly, H..."
Ratatouille,Walt Disney Studios Motion Pictures,150000000.0,47027395.0,"June 28, 2007",G,111.0,"[Adventure, Animation, Comedy, Family, Fantasy]","[[Domestic, 206445654], [International, 417280...","[Brad Bird, Jan Pinkava, Brad Bird, Jan Pinkav...","[Brad Garrett, Lou Romano, Patton Oswalt, Ian ..."
Thor: The Dark World,Walt Disney Studios Motion Pictures,170000000.0,85737841.0,"October 30, 2013",PG-13,112.0,"[Action, Adventure, Fantasy]","[[Domestic, 206362140], [International, 438421...","[Alan Taylor, Christopher L. Yost, Christopher...","[Chris Hemsworth, Natalie Portman, Tom Hiddles..."
Bad Boys for Life,Sony Pictures Entertainment (SPE),90000000.0,62504105.0,"January 15, 2020",R,124.0,"[Action, Comedy, Crime, Thriller]","[[Domestic, 206305244], [International, 220200...","[Adil El Arbi, Bilall Fallah, Peter Craig, Joe...","[Will Smith, Martin Lawrence, Vanessa Hudgens,..."
