# Overview

This scraper loads the TMDB data file with IMDB IDs, containing 10,000 movie entries, and scrapes the Box Office Mojo website for any/all of the following:
    * budget
    * domestic gross
    * foreign gross
    * studio
    * MPAA rating

# Library Imports

In [None]:
import pandas as pd
import requests
import re
import numpy as np
from bs4 import BeautifulSoup as bs
import time

# Scraping Functions

In [None]:
# function that converts our string to a search string
def format_title(title):
    """Turn a title into a ``+``-separated search string.

    The value is coerced to ``str``, every character matched by the
    punctuation pattern is removed, and each run of whitespace is then
    collapsed into a single ``+``.

    Args:
        title: The title to convert; any object accepted by ``str()``.

    Returns:
        The cleaned string with ``+`` in place of whitespace runs.
    """
    without_punctuation = re.sub(r"[,@\'?\.$%_:â()-]", "", str(title), flags=re.I)
    return re.sub(r"\s+", "+", without_punctuation, flags=re.I)

In [None]:
# function to convert the raw numbers into integers
def convert_number(x):
    """Convert a money string such as ``'$1,234,567'`` into an ``int``.

    Surrounding whitespace is ignored, a single leading non-digit character
    (the currency symbol) is dropped, and comma separators are removed
    before the remainder is parsed as a whole number.

    Args:
        x: String holding the amount, e.g. ``'$25,000,000'``.

    Returns:
        The amount as an ``int``, or ``None`` (after printing a notice) when
        ``x`` is not a string or does not contain a parseable whole number.
    """
    try:
        text = x.strip()
        # Drop the first character only when it is not already a digit, so
        # an unprefixed amount like '1,234' is not silently cut down to 234.
        if text and not text[0].isdigit():
            text = text[1:]
        return int(text.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # Non-string input or text that is not a whole number.
        print('No number to convert')
        return None

In [None]:
# function that builds the Box Office Mojo title URL for an IMDB id and fetches it
def follow_imdb(movie_id, timeout=30):
    """Fetch and parse the Box Office Mojo title page for an IMDB id.

    Args:
        movie_id: Title id string (e.g. ``'tt1825683'``) appended to the
            Box Office Mojo ``/title/`` URL.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        A ``BeautifulSoup`` tree of the response body, parsed with
        ``html.parser``. The HTTP status is not checked.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    followurl = 'https://www.boxofficemojo.com/title/' + movie_id
    # requests has no default timeout; without one a single stalled
    # connection would block a multi-thousand-item scrape indefinitely.
    page = requests.get(followurl, timeout=timeout)
    return bs(page.content, 'html.parser')

In [None]:
# Function that returns the domestic and foreign gross
def gross(movie_page):
    """Read the first two money figures from the performance summary table.

    Args:
        movie_page: Parsed title page (the object returned by
            ``follow_imdb``).

    Returns:
        A ``(dgross, fgross)`` tuple of the first and second ``money`` spans
        in the summary table, each passed through ``convert_number``.
        ``(None, None)`` is returned when the table is missing or holds
        fewer than two money spans.
        NOTE(review): treating the first span as domestic and the second as
        foreign relies on the page's row order -- confirm against the site.
    """
    try:
        summary = movie_page.find('div', class_='a-section a-spacing-none mojo-performance-summary-table')
        money_spans = summary.find_all('span', class_='money')
        dgross_text = money_spans[0].text
        fgross_text = money_spans[1].text
    except (AttributeError, IndexError, TypeError):
        # Table not found (find() returned None) or fewer than two figures.
        return None, None
    return convert_number(dgross_text), convert_number(fgross_text)

In [None]:
# function that takes in the product page and gets the studio name
def studio_name(soup):
    """Return the domestic distributor listed in the page's summary values.

    Every row of the summary-values section is checked for the
    ``'Domestic Distributor'`` label, so the row does not have to be first.

    Args:
        soup: Parsed title page (the object returned by ``follow_imdb``).

    Returns:
        The distributor text with the ``'See full company information'``
        link text and trailing whitespace removed, or ``None`` when the
        section or the labelled row is absent.
    """
    try:
        summary = soup.find('div', class_='a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile')
        rows = summary.find_all('div', class_='a-section a-spacing-none')
        for row in rows:
            spans = row.find_all("span")
            # A row needs both a label span and a value span to be usable.
            if len(spans) >= 2 and spans[0].text == 'Domestic Distributor':
                return spans[1].text.replace('See full company information', '').rstrip()
    except (AttributeError, IndexError, TypeError):
        # Summary section missing (find() returned None) or malformed markup.
        print('error in studio function')
        return None
    return None

In [None]:
# function that takes in the product page and gets the budget
def find_budget(soup):
    """Return the budget listed in the page's summary values as an integer.

    Every row of the summary-values section is checked for the ``'Budget'``
    label rather than assuming it is the third row, so the value is still
    found when other rows are missing or added.

    Args:
        soup: Parsed title page (the object returned by ``follow_imdb``).

    Returns:
        The budget converted by ``convert_number``, or ``None`` when the
        section or the labelled row is absent.
    """
    try:
        summary = soup.find('div', class_='a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile')
        rows = summary.find_all('div', class_='a-section a-spacing-none')
        for row in rows:
            spans = row.find_all("span")
            # A row needs both a label span and a value span to be usable.
            if len(spans) >= 2 and spans[0].text == 'Budget':
                return convert_number(spans[1].text)
    except (AttributeError, IndexError, TypeError):
        # Summary section missing (find() returned None) or malformed markup.
        print('error in budget function')
        return None
    return None

In [None]:
# function that takes in the product page and gives the MPAA rating
def find_mpaa(soup):
    """Return the MPAA rating listed in the page's summary values.

    Every row of the summary-values section is checked for the ``'MPAA'``
    label; rows without a label/value span pair are skipped instead of
    aborting the search.

    Args:
        soup: Parsed title page (the object returned by ``follow_imdb``).

    Returns:
        The rating text from the row labelled ``'MPAA'``, or ``None`` when
        the section or the labelled row is absent.
    """
    try:
        summary = soup.find('div', class_='a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile')
        rows = summary.find_all('div', class_='a-section a-spacing-none')
        for row in rows:
            spans = row.find_all("span")
            # Indexing spans[0] on a span-less row used to raise and end the
            # whole search before later rows were examined.
            if len(spans) >= 2 and spans[0].text == 'MPAA':
                return spans[1].text
    except (AttributeError, TypeError):
        # Summary section missing (find() returned None) or malformed markup.
        print('error in rating function')
        return None
    return None

In [None]:
# Master Scraping Function

def webscraper(scrapingset):
    """Scrape Box Office Mojo for every row of ``scrapingset`` and fill it in.

    For each row, the value in the ``id`` column is fetched with
    ``follow_imdb`` and the ``rating``, ``studio``, ``dom_gross``,
    ``for_gross`` and ``budget`` cells of that row are overwritten with the
    scraped values (``None`` where a value was not found). Progress is
    printed per item and a timing summary is printed at the end.

    Args:
        scrapingset: DataFrame with an ``id`` column plus the five result
            columns named above. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    length = scrapingset.shape[0]
    ts = time.time()
    for count, ind in enumerate(scrapingset.index, start=1):
        movie_id = str(scrapingset.at[ind, 'id'])
        print(('Item {} / {} - {}').format(count, length, movie_id))
        movie_page = follow_imdb(movie_id)

        # Write with a single .at[row, column] call: chained assignment
        # (frame[col][row] = value) may write to a temporary copy and does
        # not update the frame under pandas Copy-on-Write.
        scrapingset.at[ind, 'rating'] = find_mpaa(movie_page)
        scrapingset.at[ind, 'studio'] = studio_name(movie_page)

        dgross, fgross = gross(movie_page)
        scrapingset.at[ind, 'dom_gross'] = dgross
        scrapingset.at[ind, 'for_gross'] = fgross

        scrapingset.at[ind, 'budget'] = find_budget(movie_page)

    duration = round(time.time() - ts, 2)
    # An empty frame would otherwise divide by zero here.
    scrape_average = round(duration / length, 2) if length else 0.0
    print('{} minutes elapsed'.format(round(duration / 60, 2)))
    print('{} seconds per item'.format(scrape_average))
    return scrapingset

# TO DO BEFORE REDOING

Round the numbers so they don't have decimal places

Add a scraper for rating!

# Data Import

In [None]:
# Load the TMDB/IMDB id export from api_data/ into the working frame
df = pd.read_csv('api_data/tmdb_imdb_ids.csv')

In [None]:
# Show the loaded frame as this cell's output
df

In [None]:
# Scraping only needs the id, so drop the TMDB metadata columns listed here (in place)
df.drop(columns=['popularity', 'vote_count', 'genre_ids', 'title', 'vote_average', 'release_date'], inplace=True)

In [None]:
# Add an empty-string placeholder column for every field the scraper will fill in
for new_column in ('studio', 'rating', 'budget', 'dom_gross', 'for_gross'):
    df[new_column] = ''
df

# Testing Set

Before scraping all 10,000 entries, we run a small test scrape.

In [None]:
# Empty test frame with the id column plus the five columns the scraper fills in
test1 = pd.DataFrame(columns = ['id', 'rating', 'studio', 'budget', 'dom_gross', 'for_gross'])

In [None]:
# Three 'tt...' title ids that become the rows of the small test frame
ids = ['tt1825683', 'tt0349080', 'tt7286456']
test1['id'] = np.array(ids)

In [None]:
# Show the test frame before scraping
test1

In [None]:
# Run the scraper on the three test ids; webscraper returns the frame it filled in
test1 = webscraper(test1)

In [None]:
# Show the test frame after scraping
test1

In [None]:
# A bigger test set: the first ten rows of df.
# .copy() makes test2 an independent frame, so the scraper's in-place writes
# neither touch df nor trigger pandas' SettingWithCopy warning on a slice.
test2 = df[0:10].copy()
test2

In [None]:
# Run the scraper on the ten-row test frame
test2 = webscraper(test2)

In [None]:
# Show the ten-row test frame after scraping
test2

# The Big Scrape

We're ready to do the big scrape!
We'll split our frame of 10,000 entries into 5 smaller chunks and scrape each in its own cell, so an error partway through only costs us one chunk rather than the whole run.

In [None]:
# Split df into five chunks of 2,000 rows by position.
# Each chunk is an explicit copy so the scraper's in-place writes land in an
# independent frame instead of a slice of df (avoids SettingWithCopy issues).
df1 = df[0:2000].copy()
df2 = df[2000:4000].copy()
df3 = df[4000:6000].copy()
df4 = df[6000:8000].copy()
# Open-ended so every remaining row is included, whatever the exact row count.
df5 = df[8000:].copy()

In [None]:
# Scrape chunk 1 of 5
df1 = webscraper(df1)

In [None]:
# Scrape chunk 2 of 5
df2 = webscraper(df2)

In [None]:
# Scrape chunk 3 of 5
df3 = webscraper(df3)

In [None]:
# Scrape chunk 4 of 5
df4 = webscraper(df4)

In [None]:
# Scrape chunk 5 of 5
df5 = webscraper(df5)

In [None]:
# Stack the five scraped chunks back into a single frame, in chunk order
tmdb_bom = pd.concat([df1, df2, df3, df4, df5])

In [None]:
# Show the combined frame
tmdb_bom

In [None]:
# Setting the id column (not the title) as the index, in place
tmdb_bom.set_index('id', inplace=True)

# Data Export

In [None]:
# Export the combined, scraped frame to a CSV file under api_data/
tmdb_bom.to_csv('api_data/tmdb_bom_scraped.csv')

# TO DO

Write Docstrings for all functions