In [1]:
from bs4 import BeautifulSoup 
import requests
import numpy as np
import pandas as pd
import time
import random
import re  #regular expressions

### Step 1: Get Top Lifetime Grosses by MPAA Rating - PG-13 (1000 entries), then put data into a dataframe

In [2]:
# Get information from the chart pages - 5 pages, 200 entries each
def get_tables_info(url):
    '''
    Scrape one chart page and record every movie row it contains.

    The first <table> of the page at ``url`` is parsed; its first row is
    skipped as the header, and every following row is stored in the
    module-level ``movies`` dict as

        movies[title] = [link, title, rank, lifetime_gross,
                         overall_rank, year_of_release]

    ``lifetime_gross`` is converted to ``int`` ('$' and ',' removed); the
    other cell values are kept as the strings found in the page.

    Parameters
    ----------
    url : str
        Address of the chart page to download.

    Returns
    -------
    None
        Results are written into the global ``movies`` dict.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the downloaded page contains no <table> element.
    '''
    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and raise_for_status() surfaces a blocked/failed request right away
    # instead of as a confusing parse error further down.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> found at ' + url)

    # rows[0] is the header row, so start from the second one
    for row in table.find_all('tr')[1:]:
        # look the cells up once per row instead of once per field
        cells = row.find_all('td')

        title = cells[0].text
        # relative link to the movie's own page
        link = cells[0].find('a').get('href')
        rank = cells[1].text
        lifetime_gross = int(cells[2].text.replace('$', '').replace(',', ''))
        # overall rank across all ratings -- different from rank
        overall_rank = cells[3].text
        year_of_release = cells[4].text

        # A few remakes share their title with the original movie in this
        # table. The dict is keyed by title, so a repeated title gets a
        # numeric suffix ('2' for the second occurrence, '3' for a third, ...)
        # to keep it from overwriting the earlier entry. The suffix only
        # reflects the order of appearance, not which movie is older.
        if title in movies:
            suffix = 2
            while title + str(suffix) in movies:
                suffix += 1
            title = title + str(suffix)

        movies[title] = [link, title, rank, lifetime_gross, overall_rank, year_of_release]

In [3]:
# The five chart pages of the PG-13 ranking, in the order they are scraped.
url_list = [
    "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&ref_=bo_cso_ac",
    "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?offset=200&by_mpaa=PG-13",
    "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&offset=400",
    "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?offset=600&by_mpaa=PG-13",
    "https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&offset=800",
]

# Shared accumulator: get_tables_info() adds one entry per movie, keyed by title.
movies = {}

for url in url_list:
    get_tables_info(url)
    # running total of movies collected so far
    print(len(movies))
    # wait a random 10-20 seconds between pages to look less like a bot
    time.sleep(random.uniform(10, 20))

200
400
600
800
1000


In [4]:
# Turn the scraped dict into a DataFrame: the dict keys (titles) start out as
# columns, so transpose to get one row per movie, then name the fields.
table_columns = ['link_stub', 'title', 'rank', 'lifetime_gross', 'rank_overall', 'year']
pg13_movies = pd.DataFrame(movies).transpose()
pg13_movies.columns = table_columns
pg13_movies

Unnamed: 0,link_stub,title,rank,lifetime_gross,rank_overall,year
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,Star Wars: Episode VII - The Force Awakens,1,936662225,1,2015
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,Avengers: Endgame,2,858373000,2,2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,Avatar,3,760507625,3,2009
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,Black Panther,4,700426566,4,2018
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,Avengers: Infinity War,5,678815482,5,2018
...,...,...,...,...,...,...
Arthur,/title/tt1334512/?ref_=bo_cso_table_196,Arthur,996,33035397,2560,2011
Daylight,/title/tt0116040/?ref_=bo_cso_table_197,Daylight,997,33023469,2561,1996
Balls of Fury,/title/tt0424823/?ref_=bo_cso_table_198,Balls of Fury,998,32886940,2569,2007
In Her Shoes,/title/tt0388125/?ref_=bo_cso_table_199,In Her Shoes,999,32880591,2570,2005


### Step 2: Follow each movie's individual link to get more details about the movie

In [5]:
def get_single_movie_info(df):
    '''
    Visit each movie's own page and extract more features about the movie.

    For every row of ``df`` the page at ``base_url + link_stub`` is
    downloaded and parsed, and the result is stored in the module-level
    ``single_movie`` dict as

        single_movie[title] = [budget, domestic_distributor, running_time,
                               earliest_release_date, genres, MPAA]

    Any field that cannot be extracted from the page is stored as ``None``;
    a failed field never affects the other fields or the movie's key.

    Parameters
    ----------
    df : pandas.DataFrame
        Read positionally: column 0 is the link stub, column 1 the title.

    Returns
    -------
    None
        Results are written into the global ``single_movie`` dict.
    '''
    base_url = 'https://www.boxofficemojo.com'

    # The label patterns do not change between movies, so compile them once.
    runtime_regex = re.compile('Run')
    earliest_release_regex = re.compile('Earliest Release')
    mpaa_regex = re.compile('MPAA')

    for i in range(df.shape[0]):

        # getting movie title & link_stub
        movie_title, link_stub = df.iloc[i, 1], df.iloc[i, 0]
        url = base_url + link_stub
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, "lxml")

        # Summary block of the page; may be None, in which case the two
        # lookups below fall through to their except branches.
        sec = soup.find('div', class_='mojo-summary-values')

        # budget
        # NOTE(review): this takes the first 'money' span in the summary
        # block regardless of its label -- confirm that span really is the
        # budget and not another figure (e.g. an opening-weekend gross).
        try:
            budget = int(sec.find('span', class_='money').text.replace('$', '').replace(',', ''))
        except Exception:
            budget = None

        # domestic_distributor: second span of the summary block, with the
        # trailing 'See ...' text cut off
        try:
            domestic_distributor = sec.find_all('span')[1].text.split('See')[0]
        except Exception:
            domestic_distributor = None

        # running_time: text of the element following the 'Run...' label
        try:
            rt_string = soup.find(text=runtime_regex)
            running_time = rt_string.findNext().text
        except Exception:
            running_time = None

        # Earliest release date: first line of the element after the label
        try:
            erd = soup.find(text=earliest_release_regex)
            earliest_release_string = erd.findNext().text
            earliest_release_date = earliest_release_string.split('\n')[0]
        except Exception:
            # Only this field is missing; the title must stay intact because
            # it is the key the row is stored under.
            earliest_release_date = None

        # Genre list: whitespace-separated words after the 'Genres' label
        try:
            gen = soup.find(text='Genres')
            genres_string = gen.findNext().text
            genres = genres_string.replace('\n', '').split()
        except Exception:
            genres = None

        # MPAA rating
        try:
            MPAA_string = soup.find(text=mpaa_regex)
            MPAA = MPAA_string.findNext().text
        except Exception:
            MPAA = None

        single_movie[movie_title] = [budget, domestic_distributor, running_time, earliest_release_date, genres, MPAA]

        time.sleep(1 + 1 * random.random())  # more human-like web scraping

        # extra 10 second pause on every 200th row (including row 0)
        if i % 200 == 0:
            time.sleep(10)

In [6]:
# Accumulator filled in by get_single_movie_info(), keyed by movie title.
single_movie = dict()
get_single_movie_info(df=pg13_movies)

In [7]:
# Build a second DataFrame from the per-movie details. The dict keys (movie
# titles) start out as columns, so transpose to make them the index.
detail_columns = ['budget', 'domestic_distributor', 'running_time', 'earliest_release_date', 'genres', 'MPAA']
new_df = pd.DataFrame(single_movie).transpose()
new_df.columns = detail_columns
new_df

Unnamed: 0,budget,domestic_distributor,running_time,earliest_release_date,genres,MPAA
Star Wars: Episode VII - The Force Awakens,247966675,Walt Disney Studios Motion Pictures,2 hr 18 min,"December 16, 2015","[Action, Adventure, Sci-Fi]",PG-13
Avengers: Endgame,357115007,Walt Disney Studios Motion Pictures,3 hr 1 min,"April 24, 2019","[Action, Adventure, Drama, Sci-Fi]",PG-13
Avatar,77025481,Twentieth Century Fox,2 hr 42 min,"December 16, 2009","[Action, Adventure, Fantasy, Sci-Fi]",PG-13
Black Panther,202003951,Walt Disney Studios Motion Pictures,2 hr 14 min,"February 13, 2018","[Action, Adventure, Sci-Fi]",PG-13
Avengers: Infinity War,257698183,Walt Disney Studios Motion Pictures,2 hr 29 min,"April 25, 2018","[Action, Adventure, Sci-Fi]",PG-13
...,...,...,...,...,...,...
Arthur,12222756,Warner Bros.,1 hr 50 min,"April 8, 2011","[Comedy, Romance]",PG-13
Daylight,10015875,Universal Pictures,1 hr 54 min,"December 6, 1996","[Action, Adventure, Drama, Thriller]",PG-13
Balls of Fury,11352123,Rogue Pictures,1 hr 30 min,"August 29, 2007","[Comedy, Crime, Sport]",PG-13
In Her Shoes,10017575,Twentieth Century Fox,2 hr 10 min,"September 28, 2005","[Comedy, Drama, Romance]",PG-13


### Step 3: Combine 2 dataframes

In [8]:
# Join the chart data and the per-movie details on their indexes:
# left_index=True uses the index of the left DataFrame as the join key,
# right_index=True uses the index of the right DataFrame as the join key.
pg13_movies_combined = pd.merge(
    pg13_movies,
    new_df,
    left_index=True,
    right_index=True,
)
pg13_movies_combined

Unnamed: 0,link_stub,title,rank,lifetime_gross,rank_overall,year,budget,domestic_distributor,running_time,earliest_release_date,genres,MPAA
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,Star Wars: Episode VII - The Force Awakens,1,936662225,1,2015,247966675,Walt Disney Studios Motion Pictures,2 hr 18 min,"December 16, 2015","[Action, Adventure, Sci-Fi]",PG-13
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,Avengers: Endgame,2,858373000,2,2019,357115007,Walt Disney Studios Motion Pictures,3 hr 1 min,"April 24, 2019","[Action, Adventure, Drama, Sci-Fi]",PG-13
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,Avatar,3,760507625,3,2009,77025481,Twentieth Century Fox,2 hr 42 min,"December 16, 2009","[Action, Adventure, Fantasy, Sci-Fi]",PG-13
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,Black Panther,4,700426566,4,2018,202003951,Walt Disney Studios Motion Pictures,2 hr 14 min,"February 13, 2018","[Action, Adventure, Sci-Fi]",PG-13
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,Avengers: Infinity War,5,678815482,5,2018,257698183,Walt Disney Studios Motion Pictures,2 hr 29 min,"April 25, 2018","[Action, Adventure, Sci-Fi]",PG-13
...,...,...,...,...,...,...,...,...,...,...,...,...
Arthur,/title/tt1334512/?ref_=bo_cso_table_196,Arthur,996,33035397,2560,2011,12222756,Warner Bros.,1 hr 50 min,"April 8, 2011","[Comedy, Romance]",PG-13
Daylight,/title/tt0116040/?ref_=bo_cso_table_197,Daylight,997,33023469,2561,1996,10015875,Universal Pictures,1 hr 54 min,"December 6, 1996","[Action, Adventure, Drama, Thriller]",PG-13
Balls of Fury,/title/tt0424823/?ref_=bo_cso_table_198,Balls of Fury,998,32886940,2569,2007,11352123,Rogue Pictures,1 hr 30 min,"August 29, 2007","[Comedy, Crime, Sport]",PG-13
In Her Shoes,/title/tt0388125/?ref_=bo_cso_table_199,In Her Shoes,999,32880591,2570,2005,10017575,Twentieth Century Fox,2 hr 10 min,"September 28, 2005","[Comedy, Drama, Romance]",PG-13


### Step 4: Save the combined data to a CSV file

In [9]:
# Write the combined table (index included) to a CSV file in the working directory.
pg13_movies_combined.to_csv(path_or_buf='Movie_data.csv')

Notes: A film distributor is responsible for the marketing of a film. The distribution company is usually different from the production company. Distribution deals are an important part of financing a film. 