In [23]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [24]:
url = 'https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&ref_=bo_cso_ac'

user_agent = {'User-agent': 'Mozilla/5.0'}

response  = requests.get(url, headers = user_agent)

page = response.text

soup = BeautifulSoup(page,"lxml")

In [25]:
table = soup.find('table')

In [26]:
rows = [row for row in table.find_all('tr')]

In [27]:
movies = {}

for row in rows[1:]:
    items = row.find_all('td')
    link = row.find('a')
    title, url = link.text, link['href']
    movies[title] = [url] + [i.text for i in items]
    
movies

{'Star Wars: Episode VII - The Force Awakens': ['/title/tt2488496/?ref_=bo_cso_table_1',
  'Star Wars: Episode VII - The Force Awakens',
  '1',
  '$936,662,225',
  '1',
  '2015'],
 'Avengers: Endgame': ['/title/tt4154796/?ref_=bo_cso_table_2',
  'Avengers: Endgame',
  '2',
  '$858,373,000',
  '2',
  '2019'],
 'Avatar': ['/title/tt0499549/?ref_=bo_cso_table_3',
  'Avatar',
  '3',
  '$760,507,625',
  '3',
  '2009'],
 'Black Panther': ['/title/tt1825683/?ref_=bo_cso_table_4',
  'Black Panther',
  '4',
  '$700,426,566',
  '4',
  '2018'],
 'Avengers: Infinity War': ['/title/tt4154756/?ref_=bo_cso_table_5',
  'Avengers: Infinity War',
  '5',
  '$678,815,482',
  '5',
  '2018'],
 'Titanic': ['/title/tt0120338/?ref_=bo_cso_table_6',
  'Titanic',
  '6',
  '$659,363,944',
  '6',
  '1997'],
 'Jurassic World': ['/title/tt0369610/?ref_=bo_cso_table_7',
  'Jurassic World',
  '7',
  '$652,295,625',
  '7',
  '2015'],
 'The Avengers': ['/title/tt0848228/?ref_=bo_cso_table_8',
  'The Avengers',
  '8',
  

In [28]:
pg13_movies = pd.DataFrame(movies).T  #transpose
pg13_movies.columns = ['link_stub', 'title', 'rank_pg13_movies', 
                    'lifetime_gross', 'rank_overall', 'year']

pg13_movies.head()

Unnamed: 0,link_stub,title,rank_pg13_movies,lifetime_gross,rank_overall,year
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,Star Wars: Episode VII - The Force Awakens,1,"$936,662,225",1,2015
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,Avengers: Endgame,2,"$858,373,000",2,2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,Avatar,3,"$760,507,625",3,2009
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,Black Panther,4,"$700,426,566",4,2018
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,Avengers: Infinity War,5,"$678,815,482",5,2018


In [29]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
        # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [30]:
import dateutil.parser

def money_to_int(moneystring):
    if moneystring:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    else:
        pass

def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def clean_genres(genre_list):
    genres = [item.strip() for item in genre_list if item.strip()]
    return genres 

In [31]:
def clean_sales(lst):
    
    '''Clean the summary domestic/international summary table
    
    Takes the raw list of money strings and converts each item to int
    '''
    
    sale_list = [item.text for item in lst]
    
    sale_list = [money_to_int(item) for item in sale_list]
   
    return sale_list


def clean_regions(lst):
    
    '''Clean the summary domestic/international summary table
    
    Takes raw list of region string, cleans, and puts back into list
    '''
    
    region_list = [item.text for item in lst]
    region_list = [item.strip() for item in region_list if item.split()]
    region_list = [item.split('(')[0].strip() for item in region_list]
    
    return region_list

In [32]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic gross
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_distributor','budget', 'domestic_opening_sales',
               'earliest_release',
               'rating', 'runtime_minutes','genres','summary_sales']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()
    
    #Get domestic distributor
    domdist_regex = re.compile('Domestic Distributor')
    domdist_string = soup.find(text=domdist_regex).findNext().text
    domestic_dist = domdist_string.split('See')[0].strip()
    
    # Domestic distributor
    domestic_dist = get_movie_value(soup,'Domestic Distributor').split('See')[0].strip()

    # Budget
    raw_budget = get_movie_value(soup,'Budget')
    budget = money_to_int(raw_budget)

    # Domestic opening sales
    raw_domestic_opening = get_movie_value(soup,'Domestic Opening')
    domestic_opening_sales = money_to_int(raw_domestic_opening)

    #Earliest release date (raw)
    earliest_release = get_movie_value(soup,'Earliest Release').split('\n')[0].strip()

    #Rating
    rating = get_movie_value(soup,'MPAA')

    # Runtime
    raw_runtime = get_movie_value(soup,'Running Time')
    runtime_minutes = runtime_to_minutes(raw_runtime)

    #Genres
    raw_genres = get_movie_value(soup, 'Genres').split('\n')
    genres = clean_genres(raw_genres)
    
    #Sales list (domestic, international, worldwide)
    raw_sales_list = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[0:]
    sales_list = clean_sales(raw_sales_list)
    
    # Regions list
    raw_sales_regions = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='a-size-small')[0:]
    regions_list = clean_regions(raw_sales_regions)
    
    # Zip region and associated sales per region - believe should solve for any missing regions
    summary_sales = zip(regions_list, sales_list)
    list(summary_sales)
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title, domestic_dist, budget,domestic_opening_sales,
                                    earliest_release,
                                    rating, runtime_minutes,genres,summary_sales]))
    


    return movie_dict

In [33]:
pg13_movies_page_info_list = []

for link in pg13_movies.link_stub:
    pg13_movies_page_info_list.append(get_movie_dict(link))

ConnectionError: HTTPSConnectionPool(host='www.boxofficemojo.com', port=443): Max retries exceeded with url: /title/tt2283362/?ref_=bo_cso_table_24 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff84c00b5e0>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [None]:
pg13_movies_page_info = pd.DataFrame(pg13_movies_page_info_list)  #convert list of dict to df
pg13_movies_page_info.set_index('movie_title', inplace=True)

In [None]:
pg13_movies_page_info