In [70]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from datetime import datetime

In [16]:
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?ref_=bo_cso_ac'

user_agent = {'User-agent': 'Mozilla/5.0'}

response  = requests.get(url, headers = user_agent)

page = response.text

soup = BeautifulSoup(page,"lxml")

In [17]:
table = soup.find('table')
table

<table class="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated"><tr><th class="a-text-right mojo-field-type-rank a-nowrap"><span title="Rank">Rank</span>
</th><th class="a-text-left mojo-field-type-title a-nowrap"><span title="Title">Title</span>
</th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Lifetime Gross">Lifetime Gross</span>
</th><th class="a-text-left mojo-field-type-year a-nowrap"><span title="Year">Year</span>
</th></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">1</td><td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt2488496/?ref_=bo_cso_table_1">Star Wars: Episode VII - The Force Awakens</a></td><td class="a-text-right mojo-field-type-money">$936,662,225</td><td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2015/?ref_=bo_cso_table_1">2015</a></td></tr><tr><td class="a-text-right mojo-header-column mojo-t

In [18]:
rows = [row for row in table.find_all('tr')]

In [19]:
movies = {}

for row in rows[1:]:
    items = row.find_all('td')
    link = row.find('a')
    title, url = link.text, link['href']
    movies[title] = [url] + [i.text for i in items]
    
movies

{'Star Wars: Episode VII - The Force Awakens': ['/title/tt2488496/?ref_=bo_cso_table_1',
  '1',
  'Star Wars: Episode VII - The Force Awakens',
  '$936,662,225',
  '2015'],
 'Avengers: Endgame': ['/title/tt4154796/?ref_=bo_cso_table_2',
  '2',
  'Avengers: Endgame',
  '$858,373,000',
  '2019'],
 'Avatar': ['/title/tt0499549/?ref_=bo_cso_table_3',
  '3',
  'Avatar',
  '$760,507,625',
  '2009'],
 'Black Panther': ['/title/tt1825683/?ref_=bo_cso_table_4',
  '4',
  'Black Panther',
  '$700,426,566',
  '2018'],
 'Avengers: Infinity War': ['/title/tt4154756/?ref_=bo_cso_table_5',
  '5',
  'Avengers: Infinity War',
  '$678,815,482',
  '2018'],
 'Titanic': ['/title/tt0120338/?ref_=bo_cso_table_6',
  '6',
  'Titanic',
  '$659,363,944',
  '1997'],
 'Jurassic World': ['/title/tt0369610/?ref_=bo_cso_table_7',
  '7',
  'Jurassic World',
  '$652,295,625',
  '2015'],
 'The Avengers': ['/title/tt0848228/?ref_=bo_cso_table_8',
  '8',
  'The Avengers',
  '$623,357,910',
  '2012'],
 'Star Wars: Episode V

In [29]:
top_movies = pd.DataFrame(movies).T  #transpose
top_movies.columns = ['link_stub', 'rank_all_movies','title', 
                    'lifetime_gross', 'year']

top_movies.head()

Unnamed: 0,link_stub,rank_all_movies,title,lifetime_gross,year
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018


In [21]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
        # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [22]:
import dateutil.parser

def money_to_int(moneystring):
    if moneystring:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    else:
        pass

def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def clean_genres(genre_list):
    genres = [item.strip() for item in genre_list if item.strip()]
    return genres 

In [23]:
def clean_sales(lst):
    
    '''Clean the summary domestic/international summary table
    
    Takes the raw list of money strings and converts each item to int
    '''
    
    sale_list = [item.text for item in lst]
    
    sale_list = [money_to_int(item) for item in sale_list]
   
    return sale_list


def clean_regions(lst):
    
    '''Clean the summary domestic/international summary table
    
    Takes raw list of region string, cleans, and puts back into list
    '''
    
    region_list = [item.text for item in lst]
    region_list = [item.strip() for item in region_list if item.split()]
    region_list = [item.split('(')[0].strip() for item in region_list]
    
    return region_list

In [71]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic distributor
        - budget
        - domestic gross
        - earliest release
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_distributor','budget', 'domestic_opening_sales',
               'earliest_release',
               'rating', 'runtime_minutes','genres','summary_sales','crew_list','cast_list']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()
    
    #Get domestic distributor
    domdist_regex = re.compile('Domestic Distributor')
    domdist_string = soup.find(text=domdist_regex).findNext().text
    domestic_dist = domdist_string.split('See')[0].strip()
    
    # Domestic distributor
    domestic_dist = get_movie_value(soup,'Domestic Distributor').split('See')[0].strip()

    # Budget
    raw_budget = get_movie_value(soup,'Budget')
    budget = money_to_int(raw_budget)

    # Domestic opening sales
    raw_domestic_opening = get_movie_value(soup,'Domestic Opening')
    domestic_opening_sales = money_to_int(raw_domestic_opening)

    #Earliest release date (raw)
    earliest_release_str = get_movie_value(soup,'Earliest Release').split('\n')[0].strip()
    earliest_release = datetime.strptime(earliest_release_str, '%B %d, %Y')
    earliest_release.strftime('%Y-%m-%d')

    #Rating
    rating = get_movie_value(soup,'MPAA')

    # Runtime
    raw_runtime = get_movie_value(soup,'Running Time')
    runtime_minutes = runtime_to_minutes(raw_runtime)

    #Genres
    raw_genres = get_movie_value(soup, 'Genres').split('\n')
    genres = clean_genres(raw_genres)
        
    
    #Sales list (domestic, international, worldwide)
    raw_sales_list = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[0:]
    sales_list = clean_sales(raw_sales_list)
    
    # Regions list
    raw_sales_regions = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='a-size-small')[0:]
    regions_list = clean_regions(raw_sales_regions)
    
    # Zip region and associated sales per region - believe should solve for any missing regions
    summary_sales = [list(i) for i in zip(regions_list,sales_list)]
    
    #Find link for cast and crew
    credits = soup.find('a', class_='a-size-base a-link-normal mojo-navigation-tab',href=True)
    credits_href = str(credits.get('href'))
    credits_url = base_url + credits_href
    
    #Refer to cast and crew page
    response1 = requests.get(credits_url,headers = user_agent)
    page1 = response1.text
    soup1 = BeautifulSoup(page1, 'lxml')
    
    #Find crew table, cast table
    table_crew = soup1.find('table',id='principalCrew').find_all('td')
    table_cast = soup1.find('table',id='principalCast').find_all('td')
    
    #clean the tables
    def clean_credits(table1):
        lst1 = [item.text for item in table1]
        lst2 = [item.split('\n')[0].strip() for item in lst1]
        lst2 = [item for item in lst2[::2]]

        return lst2

    crew_list = clean_credits(table_crew)
    cast_list = clean_credits(table_cast)
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title, domestic_dist, budget,domestic_opening_sales,
                                    earliest_release,
                                    rating, runtime_minutes,genres,summary_sales,crew_list,cast_list]))
S
    return movie_dict

In [72]:
top_movies_page_info_list = []

for link in top_movies.link_stub:
    top_movies_page_info_list.append(get_movie_dict(link))

In [73]:
top_movies_page_info = pd.DataFrame(top_movies_page_info_list)  #convert list of dict to df
top_movies_page_info.set_index('movie_title', inplace=True)

In [74]:
top_movies_page_info.to_csv('mojo_pg1.csv')

In [75]:
top_movies_page_info.head()

Unnamed: 0_level_0,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,cast_list
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Star Wars: Episode VII,Walt Disney Studios Motion Pictures,245000000.0,247966675.0,2015-12-16,PG-13,138.0,"[Action, Adventure, Sci-Fi]","[[Domestic, 936662225], [International, 113179...","[J.J. Abrams, Lawrence Kasdan, J.J. Abrams, Mi...","[Daisy Ridley, John Boyega, Oscar Isaac, Domhn..."
Avengers: Endgame,Walt Disney Studios Motion Pictures,356000000.0,357115007.0,2019-04-24,PG-13,181.0,"[Action, Adventure, Drama, Sci-Fi]","[[Domestic, 858373000], [International, 193942...","[Anthony Russo, Joe Russo, Christopher Markus,...","[Robert Downey Jr., Chris Evans, Mark Ruffalo,..."
Avatar,Twentieth Century Fox,237000000.0,77025481.0,2009-12-16,PG-13,162.0,"[Action, Adventure, Fantasy, Sci-Fi]","[[Domestic, 760507625], [International, 202993...","[James Cameron, James Cameron, James Cameron, ...","[Sam Worthington, Zoe Saldana, Sigourney Weave..."
Black Panther,Walt Disney Studios Motion Pictures,,202003951.0,2018-02-13,PG-13,134.0,"[Action, Adventure, Sci-Fi]","[[Domestic, 700426566], [International, 647171...","[Ryan Coogler, Ryan Coogler, Joe Robert Cole, ...","[Chadwick Boseman, Michael B. Jordan, Lupita N..."
Avengers: Infinity War,Walt Disney Studios Motion Pictures,,257698183.0,2018-04-25,PG-13,149.0,"[Action, Adventure, Sci-Fi]","[[Domestic, 678815482], [International, 136954...","[Anthony Russo, Joe Russo, Christopher Markus,...","[Robert Downey Jr., Chris Hemsworth, Mark Ruff..."
