In [80]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [330]:
url = 'https://www.boxofficemojo.com/year/2017/?grossesOption=totalGrosses&sort=rank&sortDir=asc'
#https://www.boxofficemojo.com/year/2018/?grossesOption=totalGrosses&sort=rank&sortDir=asc
#https://www.boxofficemojo.com/year/2019/?grossesOption=totalGrosses&sort=rank&sortDir=asc
#https://www.boxofficemojo.com/year/2020/?grossesOption=totalGrosses&sort=rank&sortDir=asc
#https://www.boxofficemojo.com/year/2021/?grossesOption=totalGrosses&sort=rank&sortDir=asc
#https://www.boxofficemojo.com/year/2022/?grossesOption=totalGrosses&sort=rank&sortDir=asc
response = requests.get(url)    
page = response.text
soup = BeautifulSoup(page,"lxml")
table = soup.find('table')
rows = [row for row in table.find_all('tr')]  # tr tag is for rows



In [332]:
movies = {}

for row in rows[1:]:
    items = row.find_all('td') #td finds the target cell
    link = items[1].find('a') #a finds the anchor
    title, url = link.text, link['href'] #href gives the url
    movies[title] = [url]

In [333]:
movie_links = pd.DataFrame(movies).T #transpose
movie_links.columns = ['Link_stub']
movie_links.reset_index().head()

Unnamed: 0,index,Link_stub
0,Star Wars: Episode VIII - The Last Jedi,/release/rl2708702721/?ref_=bo_yld_table_1
1,Beauty and the Beast,/release/rl222594561/?ref_=bo_yld_table_2
2,Wonder Woman,/release/rl578455041/?ref_=bo_yld_table_3
3,Jumanji: Welcome to the Jungle,/release/rl3095234049/?ref_=bo_yld_table_4
4,Guardians of the Galaxy Vol. 2,/release/rl2976089601/?ref_=bo_yld_table_5


In [334]:
movie_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199 entries, Star Wars: Episode VIII - The Last Jedi to Free Fire
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Link_stub  199 non-null    object
dtypes: object(1)
memory usage: 3.1+ KB


In [335]:
def get_movie_value(soup, field_name):
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [336]:
def runtime_to_min(runtimestring):
    try:    
        runtime = runtimestring.split()
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

In [337]:
def movie_revenue(cash):
    revenue = []
    try:
        money = cash.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[0].text
        revenue.append(money)
    except:
        revenue.append('None')
    
    try:
        money = cash.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[1].text
        revenue.append(money)
    except:
        revenue.append('None')
    
    try:
        money = cash.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[2].text
        revenue.append(money)
    except:
        revenue.append('None')
    return revenue



In [338]:
def money_to_int(moneystring):
    try:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    except:
        return None

In [339]:
def theatre_number(places_string):
    try:
        places = places_string.split()
        number = places[0].replace(',','')
        return number
    except:
        return None

In [340]:
def clean_genre(category):
    try:
        category = category.split()
        return category
    except:
        return None

In [341]:
def get_movie_dict(link):
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    headers = ['Title', 'Distributor','Opening','Release_date','In_Release',
               'Rating','Run_time','Genre','Theatres','Domestic_gross',
              'Internation_gross','Worldwide_gross']
    
    #Get title
    title_string = soup.find('title').text.split('-')[0].strip()

    #Get Distributor
    distributor = get_movie_value(soup,'Distributor')
    
    #Get opening day box office gross
    opening = soup.find(class_='money').text.replace('$','').replace(',','')
    
    #Get release date
    release_date = get_movie_value(soup,'Release Date')
    
    #Get days since release date
    Days_since_release = get_movie_value(soup,'In Release')
    
    #Get rating
    rating = get_movie_value(soup,'MPAA')
    
    #Get run time
    run_time = get_movie_value(soup,'Running') 
    minutes = runtime_to_min(run_time)
    
    #Get genre
    genre = get_movie_value(soup,'Genres')
    genre = clean_genre(genre)
    
    #Get release theatres
    theatre = get_movie_value(soup,'Widest Release')
    theatre = theatre_number(theatre) 
    
    #Get revenue gross
    domestic = movie_revenue(soup)[0]
    domestic = money_to_int(domestic)
    international = movie_revenue(soup)[1]
    international = money_to_int(international)
    worldwide = movie_revenue(soup)[2]
    worldwide = money_to_int(worldwide)
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title_string, distributor, opening, release_date,
                                    Days_since_release, rating, minutes, genre, theatre,
                                    domestic, international, worldwide]))

    return movie_dict



In [342]:
movies_pages = []

for link in movie_links.Link_stub:
    movies_pages.append(get_movie_dict(link))

In [516]:
movie_2022 = pd.DataFrame(movies_pages)

In [286]:
movie_2021 = pd.DataFrame(movies_pages)

In [300]:
movie_2020 = pd.DataFrame(movies_pages)

In [314]:
movie_2019 = pd.DataFrame(movies_pages)

In [328]:
movie_2018 = pd.DataFrame(movies_pages)

In [343]:
movie_2017 = pd.DataFrame(movies_pages)

In [494]:
combine = [movie_2022, movie_2021, movie_2020, movie_2019, movie_2018, movie_2017]
movie_df = pd.concat(combine)

In [495]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1063 entries, 0 to 198
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1063 non-null   object 
 1   Distributor        1063 non-null   object 
 2   Opening            1063 non-null   object 
 3   Release_date       1063 non-null   object 
 4   In_Release         1063 non-null   object 
 5   Rating             892 non-null    object 
 6   Run_time           1034 non-null   float64
 7   Genre              1059 non-null   object 
 8   Theatres           1050 non-null   object 
 9   Domestic_gross     1063 non-null   int64  
 10  Internation_gross  1063 non-null   int64  
 11  Worldwide_gross    944 non-null    float64
dtypes: float64(2), int64(2), object(8)
memory usage: 108.0+ KB


In [497]:
movie_df.Run_time.fillna('0',inplace=True)

In [498]:
movie_df.Genre.fillna('None',inplace=True)

In [499]:
movie_df.Theatres.fillna(0,inplace=True)

In [500]:
movie_df.Worldwide_gross.fillna(0,inplace=True)

In [501]:
movie_df.Rating.fillna('None',inplace=True)

In [502]:
movie_df['Theatres'] = pd.to_numeric(movie_df['Theatres'])
movie_df['Run_time'] = pd.to_numeric(movie_df['Run_time'])
movie_df['Opening'] = pd.to_numeric(movie_df['Opening'])

In [518]:
df = pd.DataFrame(movie_df)
df.to_csv('movie_df.csv')