In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import time
import pickle

###  This code is adapted from METIS example scraper. 

In [57]:
def get_movie_value(soup, field_name):
    """Look up the value displayed next to a label in parsed page HTML.

    Searches ``soup`` for the first text node matching the regular
    expression ``field_name`` and returns the ``.text`` of the element that
    comes right after it in the document.

    Returns None when no text node matches, or when nothing follows the
    matching node.
    """
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None

    # For most fields the value sits in the element directly after the label.
    following = label.findNext()
    return following.text if following else None

def to_float(string_number):
    """Convert a comma-grouped numeric string to a float.

    The placeholder strings ``'-'`` and ``''`` yield NaN. Any other string
    has its commas removed and is passed to ``float``.
    """
    if string_number in ('-', ''):
        return float('nan')
    without_commas = string_number.replace(',', '')
    return float(without_commas)
    
def to_int(string_number):
    """Convert an all-digit string to a number, or NaN otherwise.

    Note that, despite the name, the result is always a float: the numeric
    value for strings made only of digits, NaN for everything else
    (including the empty string and strings with a decimal point or sign).
    """
    return float(string_number) if string_number.isdigit() else float('nan')

In [58]:
import dateutil.parser

def money_to_int(moneystring):
    """Convert a money-formatted string such as ``'$1,234,567'`` to an int.

    Dollar signs and thousands separators are removed before conversion.

    Returns NaN (a float) when ``moneystring`` is None or, after cleaning,
    is empty or the ``'-'`` placeholder; this mirrors how ``to_float``
    treats the same placeholders instead of raising ValueError on them.

    Raises ValueError for any other text that is not an integer.
    """
    if moneystring is None:
        return float('nan')

    cleaned = moneystring.replace('$', '').replace(',', '').strip()
    if cleaned in ('', '-'):
        return float('nan')
    return int(cleaned)

def runtime_to_minutes(runtimestring):
    """Convert a runtime string to a total number of minutes.

    Accepted layouts (whitespace separated):
        - ``'<hours> <unit> <minutes> <unit>'``, e.g. ``'1 hr 45 min'`` -> 105
        - ``'<n> hr'`` (unit starting with "h"), e.g. ``'2 hr'`` -> 120
        - ``'<n> min'`` (unit starting with "m"), e.g. ``'45 min'`` -> 45

    Returns None when the input is not a string or cannot be parsed.
    """
    if not isinstance(runtimestring, str):
        return None

    parts = runtimestring.split()

    # Full layout: the first token is hours and the third token is minutes.
    try:
        return int(parts[0]) * 60 + int(parts[2])
    except (IndexError, ValueError):
        pass

    # Short layouts with a single "<number> <unit>" pair. Previously these
    # fell through to None because the third token does not exist.
    if len(parts) == 2:
        try:
            amount = int(parts[0])
        except ValueError:
            return None
        unit = parts[1].lower()
        if unit.startswith('h'):
            return amount * 60
        if unit.startswith('m'):
            return amount

    return None

def to_date(datestring):
    """Parse ``datestring`` with ``dateutil.parser.parse`` and return the result."""
    return dateutil.parser.parse(datestring)

In [128]:
def _clean_cell_text(cell):
    """Return a table cell's text with newlines, spaces and non-breaking spaces removed."""
    return cell.text.replace('\n', '').replace(' ', '').replace('\xa0', '')


def _rating_and_count(cell):
    """Split a ratings-table cell into a two-item list of floats.

    The first three characters of the cleaned text are converted as one
    number and the remainder as another (presumably average rating and vote
    count -- TODO confirm against the page layout).
    """
    text = _clean_cell_text(cell)
    return [to_float(text[0:3]), to_float(text[3:])]


def get_movie_dict(link, timeout=30):
    '''
    From a BoxOfficeMojo link stub, request the movie HTML, parse it with
    BeautifulSoup, follow the page's IMDbPro link to the matching IMDb pages,
    and collect
        - title
        - domestic and international gross
        - runtime
        - MPAA rating
        - full release date
        - distributor, budget, opening, genres, widest release, opening rank
        - IMDb user score and Metascore
        - IMDb rating breakdowns ('rating_dict') and rating distribution
          ('rating_dist_dict')

    Parameters:
        link:    path portion of the release URL, e.g. '/release/rl123/'.
        timeout: seconds passed to each ``requests.get`` call so a stalled
                 connection cannot hang the scrape indefinitely.

    Returns the information as a dictionary. Fields whose label is missing
    are NaN where the code has a fallback (runtime, budget, opening, genres,
    opening rank, IMDb user score, Metascore).

    Raises whatever ``requests`` raises on network failure, and
    AttributeError / IndexError / TypeError when a page element without a
    fallback is absent.
    '''

    base_url = 'https://www.boxofficemojo.com'

    # Create full url to scrape
    url = base_url + link

    # Request HTML and parse
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")

    # Get title. Splitting on every '-' cut hyphenated titles short, so split
    # on the last ' - ' separator instead (assumes the page title ends with a
    # ' - <site name>' suffix -- TODO confirm). The old split is kept as a
    # fallback for titles without that separator.
    title_string = soup.find('title').text
    if ' - ' in title_string:
        title = title_string.rsplit(' - ', 1)[0].strip()
    else:
        title = title_string.split('-')[0].strip()

    # Get domestic and international gross: first and second money spans of
    # the performance summary table.
    money_spans = (soup.find(class_='mojo-performance-summary-table')
                       .find_all('span', class_='money'))
    domestic_total_gross = money_to_int(money_spans[0].text)
    international_total_gross = money_to_int(money_spans[1].text)

    # Get runtime
    raw_runtime = get_movie_value(soup, 'Running')
    if raw_runtime is None:
        runtime = float('nan')
    else:
        runtime = runtime_to_minutes(raw_runtime)

    # Get rating
    rating = get_movie_value(soup, 'MPAA')

    # Get release date: keep the first line, up to any '(' annotation
    raw_release_date = get_movie_value(soup, 'Release Date').split('\n')[0].split('(')[0]
    release_date = to_date(raw_release_date)

    # Get distributor; the value element may end with a link whose text is
    # stripped from the end of the distributor string.
    distributor_raw = get_movie_value(soup, 'Distributor')
    remover = soup.find(text=re.compile('Distributor')).find_next().find('a')
    if remover is None:
        distributor = distributor_raw
    else:
        distributor = distributor_raw.removesuffix(remover.text)

    # Get budget
    raw_budget = get_movie_value(soup, 'Budget')
    budget = money_to_int(raw_budget)

    # Get opening
    raw_opening = get_movie_value(soup, 'Opening')
    if raw_opening is None:
        opening = float('nan')
    else:
        opening = money_to_int(raw_opening.removesuffix('\n            theaters'))

    # Get genres
    genres_raw = get_movie_value(soup, 'Genres')
    if genres_raw is None:
        genres = [float('nan')]
    else:
        genres = genres_raw.replace(' ', '').split('\n\n')

    # Get widest release
    raw_widest_release = get_movie_value(soup, 'Widest Release').removesuffix(' theaters')
    widest_release = money_to_int(raw_widest_release)

    # Opening rank: only read when the element after the first 'Rank' text
    # holds more than 20 links; the value follows the 7th link. The lookup is
    # done once, and a missing label now falls back to NaN.
    rank_label = soup.find(text=re.compile('Rank'))
    rank_links = rank_label.find_next().find_all('a') if rank_label is not None else []
    if len(rank_links) > 20:
        opening_rank = int(rank_links[6].find_next().text)
    else:
        opening_rank = float('nan')

    # Request HTML and parse from IMDB. Only the first 'pro' (the host
    # prefix) is swapped, so a later 'pro' in the path or query is untouched.
    imdbpro_url = soup.find(text=re.compile('IMDbPro')).find_next().find('a')['href']
    imdb_url = imdbpro_url.replace('pro', 'www', 1)

    imdb_response = requests.get(imdb_url, timeout=timeout)
    imdb_soup = BeautifulSoup(imdb_response.text, "lxml")

    # IMDb user score. Check the label before dereferencing it, and always
    # bind imdb_user_score (the old fallback assigned a misspelled name).
    imdb_rating_label = imdb_soup.find(text=re.compile('IMDb RATING'))
    imdb_rating_node = imdb_rating_label.find_next() if imdb_rating_label is not None else None
    if imdb_rating_node is None:
        imdb_user_score = float('nan')
    else:
        imdb_user_score = float(imdb_rating_node.text.split('/')[0])

    # Metascore: read from two elements before the 'Metascore' text; NaN when
    # the page has no such label.
    metascore_label = imdb_soup.find(text=re.compile('Metascore'))
    if metascore_label is None:
        metascore = float('nan')
    else:
        metascore = to_int(metascore_label.find_previous().find_previous().text)

    # Request HTML and parse from IMDB User Ratings
    imdb_rating_url = imdb_url.split('?')[0] + '/ratings/'
    imdb_rating_response = requests.get(imdb_rating_url, timeout=timeout)
    imdb_rating_soup = BeautifulSoup(imdb_rating_response.text, "lxml")
    rating_tables = imdb_rating_soup.find_all('table')

    # Second table: three rows (all / men / women) of six cells each; cell 0
    # of every row is skipped and cells 1-5 are the age brackets.
    rating_dict = {}
    demographic_cells = rating_tables[1].find_all('td')
    for row, group in enumerate(('all', 'men', 'women')):
        for col, age in enumerate(('all', '<18', '18-29', '30-44', '45+'), start=1):
            rating_dict[group + '_' + age] = _rating_and_count(demographic_cells[6 * row + col])

    # Third table: first three cells.
    region_cells = rating_tables[2].find_all('td')
    for index, key in enumerate(('top_users', 'US_users', 'non_US_users')):
        rating_dict[key] = _rating_and_count(region_cells[index])

    # First table: every third cell starting at index 2, keyed 10, 9, 8, ...
    rating_dist_dict = {}
    for offset, cell in enumerate(rating_tables[0].find_all('td')[2::3]):
        rating_dist_dict[10 - offset] = money_to_int(_clean_cell_text(cell))

    # Create movie dictionary and return
    return {
        'movie_title': title,
        'domestic_total_gross': domestic_total_gross,
        'international_gross': international_total_gross,
        'runtime_minutes': runtime,
        'rating': rating,
        'release_date': release_date,
        'distributor': distributor,
        'budget': budget,
        'opening': opening,
        'genres': genres,
        'widest_release': widest_release,
        'opening_rank': opening_rank,
        'imdb_user_score': imdb_user_score,
        'metascore': metascore,
        'rating_dict': rating_dict,
        'rating_dist_dict': rating_dist_dict,
    }

In [101]:
def get_links(start_url, timeout=30):
    """Collect release link stubs from one calendar page.

    Fetches ``start_url``, takes the first ``<table>`` on the page and, for
    each row, returns the ``href`` of the link in the first cell with any
    query string removed. Rows whose third cell reads ``'Limited'`` are
    skipped, as are rows with fewer than three cells or no link.

    Parameters:
        start_url: URL of the calendar page to scrape.
        timeout:   seconds passed to ``requests.get`` so a stalled connection
                   cannot hang the scrape indefinitely.

    Returns a list of link strings; empty when the page has no table.
    """
    # Fixed one-second pause before every request (presumably to go easy on
    # the site).
    time.sleep(1)

    response_s = requests.get(start_url, timeout=timeout)
    soup_s = BeautifulSoup(response_s.text, "lxml")

    table = soup_s.find('table')
    if table is None:
        return []

    links = []
    # Walk the actual rows; iterating the table tag directly yields its raw
    # children, which need not be rows.
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            continue
        if cells[2].text == 'Limited':
            continue
        anchor = cells[0].find('a')
        if anchor is None or not anchor.get('href'):
            continue
        links.append(anchor['href'].split('?')[0])
    return links

In [102]:
### Build one calendar URL per month, from January 2010 through December 2019

url_list = [
    'https://www.boxofficemojo.com/calendar/' + str(year) + '-' + str(month) + '-01/'
    for year in range(2010, 2020)
    for month in range(1, 13)
]

In [7]:
# Gather the release links from every calendar page.
links = []
for calendar_url in url_list:
    links.extend(get_links(calendar_url))

# Drop duplicates while keeping first-seen order. A set would also
# de-duplicate, but its iteration order for strings can differ between
# interpreter runs, which makes the scrape order (and any resume-by-index)
# non-reproducible.
links = list(dict.fromkeys(links))

In [149]:
# Display how many release links are held in `links`.
len(links)

1672

In [147]:
### Scrape every release link.
### The list is reset here; to resume an interrupted run instead, skip the
### reset and slice `links` from the number of movies already collected.
movie_data = []

for link in links:
    # Newest result goes to the front, so the list ends up in reverse order of `links`.
    movie_data.insert(0, get_movie_dict(link))

/release/rl3330246145/
/release/rl1342408193/
/release/rl1618707969/
/release/rl1027966465/
/release/rl3410920961/
/release/rl4266296833/
/release/rl1566606849/
/release/rl2002617857/
/release/rl493389313/
/release/rl3260843521/
/release/rl2051507713/
/release/rl3540157953/
/release/rl939755009/
/release/rl21464577/
/release/rl1560839681/
/release/rl2214823425/
/release/rl805602817/
/release/rl1191806465/
/release/rl3011282433/
/release/rl1497662977/
/release/rl2186249729/
/release/rl275351041/
/release/rl324240897/
/release/rl122652161/
/release/rl369657345/
/release/rl1887340033/
/release/rl2773255681/
/release/rl1298826753/
/release/rl1265468929/
/release/rl1046382081/
/release/rl1216185857/
/release/rl2105378305/
/release/rl3862988289/
/release/rl2893252097/
/release/rl2373682689/
/release/rl486835713/
/release/rl3081142785/
/release/rl256542209/
/release/rl1985119745/
/release/rl3915417089/
/release/rl3044115969/
/release/rl67274241/
/release/rl4165764609/
/release/rl1828750849/
/

In [153]:
# Persist the scraped records to disk as a pickle file.
with open('movie_data.pkl', mode='wb') as pickle_file:
    pickle.dump(movie_data, pickle_file)

In [90]:
### Debugging scratch cell: fetch a single release page and inspect the "Rank" section

base_url = 'https://www.boxofficemojo.com'
link = '/release/rl2724955649/'

# Full URL of the release page under inspection
url = f'{base_url}{link}'

# Download the page and parse it
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

# Count the links inside the element that follows the first "Rank" text
rank_links = soup.find(text=re.compile('Rank')).find_next().find_all('a')
len(rank_links)

99