In [18]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from datetime import datetime

In [2]:
# Chart page to scrape: Box Office Mojo's lifetime-gross ranking, starting at
# offset 800 (the first row returned is rank 801).
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=800'

# Header sent with the request; get_movie_dict also reads this module-level name.
user_agent = {'User-agent': 'Mozilla/5.0'}

# requests has no default timeout, so without one a stalled connection would
# hang the cell indefinitely.
response = requests.get(url, headers=user_agent, timeout=30)

# Fail loudly on an HTTP error rather than parsing an error page as the chart.
response.raise_for_status()

page = response.text

soup = BeautifulSoup(page, "lxml")

In [3]:
# The first <table> on the page is the chart itself.
table = soup.find('table')

# soup.find returns None when nothing matches; stop here with a clear message
# instead of failing later with an AttributeError on table.find_all.
if table is None:
    raise ValueError('No <table> found on the chart page; the request may have '
                     'been blocked or the page layout may have changed.')

table

<table class="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated"><tr><th class="a-text-right mojo-field-type-rank a-nowrap"><span title="Rank">Rank</span>
</th><th class="a-text-left mojo-field-type-title a-nowrap"><span title="Title">Title</span>
</th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Lifetime Gross">Lifetime Gross</span>
</th><th class="a-text-left mojo-field-type-year a-nowrap"><span title="Year">Year</span>
</th></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">801</td><td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt0252076/?ref_=bo_cso_table_1">Maid in Manhattan</a></td><td class="a-text-right mojo-field-type-money">$94,011,225</td><td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2002/?ref_=bo_cso_table_1">2002</a></td></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-

In [4]:
# Every <tr> in the chart table (header row included) as a plain list.
rows = list(table.find_all('tr'))

In [5]:
# Maps movie title -> [link stub, rank, title, lifetime gross, year].
# NOTE(review): keyed by title, so two chart rows with the same title would
# overwrite each other -- confirm that cannot happen on this page.
movies = {}

# rows[0] is the header row (<th> cells only), so start from the second row.
for row in rows[1:]:
    cells = row.find_all('td')
    anchor = row.find('a')

    # A row without a link has no title page to scrape; skip it rather than
    # crash on anchor.text.
    if anchor is None:
        continue

    # Named link_stub (not url) so the chart `url` defined earlier in the
    # notebook is not silently overwritten by this loop.
    title, link_stub = anchor.text, anchor['href']
    movies[title] = [link_stub] + [cell.text for cell in cells]

movies

{'Maid in Manhattan': ['/title/tt0252076/?ref_=bo_cso_table_1',
  '801',
  'Maid in Manhattan',
  '$94,011,225',
  '2002'],
 "He's Just Not That Into You": ['/title/tt1001508/?ref_=bo_cso_table_2',
  '802',
  "He's Just Not That Into You",
  '$93,953,653',
  '2009'],
 'Master and Commander: The Far Side of the World': ['/title/tt0311113/?ref_=bo_cso_table_3',
  '803',
  'Master and Commander: The Far Side of the World',
  '$93,927,920',
  '2003'],
 'Flight': ['/title/tt1907668/?ref_=bo_cso_table_4',
  '804',
  'Flight',
  '$93,772,375',
  '2012'],
 'The Fighter': ['/title/tt0964517/?ref_=bo_cso_table_5',
  '805',
  'The Fighter',
  '$93,617,009',
  '2010'],
 "America's Sweethearts": ['/title/tt0265029/?ref_=bo_cso_table_6',
  '806',
  "America's Sweethearts",
  '$93,607,673',
  '2001'],
 'Lady and the Tramp': ['/title/tt0048280/?ref_=bo_cso_table_7',
  '807',
  'Lady and the Tramp',
  '$93,602,326',
  '1955'],
 'The Bucket List': ['/title/tt0825232/?ref_=bo_cso_table_8',
  '808',
  'Th

In [6]:
# The dict is title -> list of 5 values, so the raw frame has one column per
# title; transposing gives one row per title, after which the five positional
# columns are labelled.
column_names = ['link_stub', 'rank_all_movies', 'title',
                'lifetime_gross', 'year']

top_movies = (
    pd.DataFrame(movies)
    .transpose()
    .set_axis(column_names, axis='columns')
)

top_movies.shape

(200, 5)

In [7]:
def get_movie_value(soup, field_name):

    '''Grab a value from Box Office Mojo HTML

    Finds the first text node in ``soup`` that matches ``field_name`` (the name
    is compiled as a regular-expression pattern) and returns the ``.text`` of
    the element that comes right after it in the document, or None if the
    label or a following element is not found.
    '''

    # `string=` is the current name of the bs4 argument formerly called `text=`.
    label = soup.find(string=re.compile(field_name))

    if not label:
        return None

    # this works for most of the values
    # (find_next is the current spelling of the deprecated findNext)
    value_element = label.find_next()

    if value_element is None:
        return None

    return value_element.text

In [8]:
import dateutil.parser

def money_to_int(moneystring):
    '''Convert a money string such as '$94,011,225' to an int.

    Dollar signs and thousands separators are dropped before conversion.
    A falsy input (None or an empty string) yields None.
    '''
    if not moneystring:
        return None

    # Delete every '$' and ',' character in one pass.
    digits = moneystring.translate(str.maketrans('', '', '$,'))
    return int(digits)

def runtime_to_minutes(runtimestring):
    '''Convert a running-time string such as '1 hr 45 min' to total minutes.

    Either part may be absent: '2 hr' gives 120 and '58 min' gives 58.
    Returns None when the input is not a string (e.g. None because the field
    was missing from the page) or contains neither an hour nor a minute part.
    '''
    if not isinstance(runtimestring, str):
        return None

    hours_match = re.search(r'(\d+)\s*hr', runtimestring)
    minutes_match = re.search(r'(\d+)\s*min', runtimestring)

    if hours_match is None and minutes_match is None:
        return None

    total_minutes = 0
    if hours_match is not None:
        total_minutes += int(hours_match.group(1)) * 60
    if minutes_match is not None:
        total_minutes += int(minutes_match.group(1))

    return total_minutes

def to_date(datestring):
    '''Parse a date string with dateutil's parser and return the result.'''
    return dateutil.parser.parse(datestring)

def clean_genres(genre_list):
    '''Strip whitespace from each entry and drop entries that end up empty.'''
    stripped_entries = (entry.strip() for entry in genre_list)
    return [entry for entry in stripped_entries if entry]

In [9]:
def clean_sales(lst):

    '''Clean the money values of the domestic/international summary table

    Takes a list of elements exposing ``.text`` (raw money strings) and
    returns a list with each text converted by money_to_int.
    '''

    return [money_to_int(element.text) for element in lst]


def clean_regions(lst):

    '''Clean the region labels of the domestic/international summary table

    Takes a list of elements exposing ``.text``, skips those whose text is
    empty or only whitespace, and returns each remaining label with anything
    from the first '(' onward removed and surrounding whitespace stripped.
    '''

    regions = []

    for element in lst:
        label = element.text

        # Whitespace-only text splits into an empty list: nothing to keep.
        if not label.split():
            continue

        regions.append(label.strip().split('(')[0].strip())

    return regions

In [19]:
def _credit_names(credits_soup, table_id):
    '''Return the names listed in the credits table with the given id.

    Only every other <td> (even positions) is kept, and only the first line of
    each kept cell's text. Returns an empty list when the table is absent.
    '''
    credits_table = credits_soup.find('table', id=table_id)
    if credits_table is None:
        return []

    cells = credits_table.find_all('td')
    # NOTE(review): presumably the cells alternate name / role, which is why
    # only even positions are taken -- confirm against the credits page HTML.
    return [cell.text.split('\n')[0].strip() for cell in cells[::2]]


def get_movie_dict(link):
    '''
    From a BoxOfficeMojo link stub, request the movie page (and its cast and
    crew page), parse both with BeautifulSoup, and collect
        - movie_title
        - domestic_distributor
        - budget (int, or None when not listed)
        - domestic_opening_sales (int, or None when not listed)
        - earliest_release (datetime, or None when not listed)
        - rating (MPAA rating text, or None when not listed)
        - runtime_minutes
        - genres (list of strings)
        - summary_sales (list of [region, sales] pairs)
        - crew_list
        - cast_list
    Return the information as a dictionary with those keys, in that order.

    Both requests send the module-level ``user_agent`` header.
    '''

    base_url = 'https://www.boxofficemojo.com'
    request_timeout = 30  # seconds; requests never times out by default

    #Create full url to scrape
    url = base_url + link

    #Request HTML and parse (same header as the credits request below)
    response = requests.get(url, headers=user_agent, timeout=request_timeout)
    soup = BeautifulSoup(response.text, "lxml")

    #Get title: drop only the trailing ' - <site name>' part of the <title>
    #text, so hyphens inside the movie title itself are preserved
    title = soup.find('title').text.rsplit(' - ', 1)[0].strip()

    # Domestic distributor (text before the 'See' link, if the field exists)
    raw_distributor = get_movie_value(soup, 'Domestic Distributor')
    domestic_dist = raw_distributor.split('See')[0].strip() if raw_distributor else None

    # Budget
    budget = money_to_int(get_movie_value(soup, 'Budget'))

    # Domestic opening sales
    domestic_opening_sales = money_to_int(get_movie_value(soup, 'Domestic Opening'))

    #Earliest release date: first line of the value, parsed as 'Month day, year'
    raw_release = get_movie_value(soup, 'Earliest Release')
    earliest_release = None
    if raw_release:
        earliest_release_str = raw_release.split('\n')[0].strip()
        earliest_release = datetime.strptime(earliest_release_str, '%B %d, %Y')

    #Rating
    rating = get_movie_value(soup, 'MPAA')

    # Runtime (field may be missing from the page)
    raw_runtime = get_movie_value(soup, 'Running Time')
    runtime_minutes = runtime_to_minutes(raw_runtime) if raw_runtime else None

    #Genres
    raw_genres = get_movie_value(soup, 'Genres')
    genres = clean_genres(raw_genres.split('\n')) if raw_genres else []

    #Sales list and regions list from the performance summary table
    summary_table = soup.find(class_='mojo-performance-summary-table')
    if summary_table is None:
        sales_list, regions_list = [], []
    else:
        sales_list = clean_sales(summary_table.find_all('span', class_='money'))
        regions_list = clean_regions(summary_table.find_all('span', class_='a-size-small'))

    # Pair each region with a sales figure by position.
    # NOTE(review): zip pairs purely by position and stops at the shorter
    # list, so a region without a money value would shift later pairs --
    # confirm against a page that lacks one of the regions.
    summary_sales = [list(pair) for pair in zip(regions_list, sales_list)]

    #Find link for cast and crew
    credits_link = soup.find('a', class_='a-size-base a-link-normal mojo-navigation-tab', href=True)

    if credits_link is None:
        # No cast-and-crew tab on this page: nothing further to request.
        crew_list, cast_list = [], []
    else:
        credits_url = base_url + str(credits_link.get('href'))

        #Refer to cast and crew page
        credits_response = requests.get(credits_url, headers=user_agent, timeout=request_timeout)
        credits_soup = BeautifulSoup(credits_response.text, 'lxml')

        #Find and clean crew table, cast table
        crew_list = _credit_names(credits_soup, 'principalCrew')
        cast_list = _credit_names(credits_soup, 'principalCast')

    #Create movie dictionary and return
    movie_dict = {
        'movie_title': title,
        'domestic_distributor': domestic_dist,
        'budget': budget,
        'domestic_opening_sales': domestic_opening_sales,
        'earliest_release': earliest_release,
        'rating': rating,
        'runtime_minutes': runtime_minutes,
        'genres': genres,
        'summary_sales': summary_sales,
        'crew_list': crew_list,
        'cast_list': cast_list,
    }

    return movie_dict

In [20]:
# One dict per chart row, in chart order. Results are appended as the loop
# runs, so whatever was scraped so far stays in the list if a request fails.
top_movies_page_info_list = []

for link_stub in top_movies['link_stub']:
    movie_info = get_movie_dict(link_stub)
    top_movies_page_info_list.append(movie_info)

In [21]:
# Build the frame from the list of per-movie dicts and index it by title.
top_movies_page_info = (
    pd.DataFrame(top_movies_page_info_list)
    .set_index('movie_title')
)

In [22]:
# Persist the scraped page details; list-valued columns are written as their
# string representation.
top_movies_page_info.to_csv(path_or_buf='mojo_pg5.csv')

In [23]:
# Preview the first five scraped movies.
top_movies_page_info.head(n=5)

Unnamed: 0_level_0,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,cast_list
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Maid in Manhattan,Revolution Studios,55000000.0,18711407.0,2002-12-13,PG-13,105.0,"[Comedy, Drama, Romance]","[[Domestic, 94011225], [International, 6089546...","[Wayne Wang, John Hughes, Kevin Wade, Elaine G...","[Jennifer Lopez, Ralph Fiennes, Natasha Richar..."
He's Just Not That Into You,Warner Bros.,,27785487.0,2009-02-05,PG-13,129.0,"[Comedy, Drama, Romance]","[[Domestic, 93953653], [International, 8491250...","[Ken Kwapis, Abby Kohn, Marc Silverstein, Greg...","[Ginnifer Goodwin, Jennifer Aniston, Jennifer ..."
Master and Commander: The Far Side of the World,Twentieth Century Fox,150000000.0,25105990.0,2003-11-14,PG-13,138.0,"[Action, Adventure, Drama, History, War]","[[Domestic, 93927920], [International, 1176946...","[Peter Weir, Patrick O'Brian, Peter Weir, John...","[Russell Crowe, Paul Bettany, Billy Boyd, Jame..."
Flight,Paramount Pictures,31000000.0,24900566.0,2012-11-02,R,138.0,"[Drama, Thriller]","[[Domestic, 93772375], [International, 6800000...","[Robert Zemeckis, John Gatins, Laurie MacDonal...","[Denzel Washington, Nadine Velazquez, Don Chea..."
The Fighter,Paramount Pictures,25000000.0,300010.0,2009-11-20,R,116.0,"[Biography, Drama, Sport]","[[Domestic, 93617009], [International, 3557386...","[David O. Russell, Scott Silver, Paul Tamasy, ...","[Mark Wahlberg, Christian Bale, Amy Adams, Mel..."
