# Overview

This get_franchises loads the TMDB data file with IMDB IDs, containing 10,000 movie entries, and scrapes the Box Office Mojo website for any/all of the following:
    * budget
    * domestic gross
    * worldwide gross
    * studio
    * MPAA rating

# Library Imports

In [1]:
import pandas as pd
import requests
import re
import numpy as np
from bs4 import BeautifulSoup as bs
import time
from random import randint
from time import sleep
from fake_useragent import UserAgent

In [2]:
ua = UserAgent()
header = {'User-Agent':str(ua.random)}
print(header)

{'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'}


# Scraping Functions

In [3]:
#function that goes to the base page and grabs the content
def get_franchises():
    followurl = 'https://www.the-numbers.com/movies/franchises'
    page = requests.get(followurl, headers=header)
    soup = bs(page.content, 'html.parser')
    franchise_links(soup)

In [4]:
# function that collects the franchise links from the base page and puts them in our list of tags to follow

def franchise_links(soup):
    for link in soup.find_all('a', href=True):
        if '/movies/franchise/' in link.get('href'):
            franchise = link.text # saving the franchise name here in a variable. We need to figure out where to properly connect this
            franchiselinks[franchise] = link.get('href')
        else: continue

In [5]:
# follows a link from the base page
def grab_next(franchise_link):
    followurl = 'https://www.the-numbers.com' + franchise_link
    page = requests.get(followurl, headers=header)
    soup = bs(page.content, 'html.parser')
    return soup

In [6]:
def scraper(links, enddict):
    count = 1
    length = len(links)
    ts = time.time()
    
    for key, value in links.items():
        print(('Item {} / {} - {}').format(count, length, key))
        
        soup = grab_next(value)
        franchise_table = soup.find('table', id="franchise_movies_overview")
        entries = franchise_table.find_all('a', href=True)
        print(len(entries))
        if len(entries) < 4:
            continue
        else:
            for item in entries:
                enddict['franchise'].append(key)
                enddict['movie'].append(item.text)
        
        sleep(randint(0,2))
        count += 1
    tnow = time.time()
    duration = round((tnow - ts), 2)
    scrape_average = round(duration/length, 2)
    print('{} minutes elapsed'.format(duration/60))
    print('{} seconds per item'.format(scrape_average))
    return enddict

# Testing Set

Before we scrape for 10k returns, we will do a small test scrape

In [7]:
franchiselinks = {}

In [8]:
tempset = {'franchise' : [], 'movie' : []}

In [9]:
get_franchises()

In [10]:
franchiselinks

{'Marvel Cinematic Universe': '/movies/franchise/Marvel-Cinematic-Universe',
 'Star Wars': '/movies/franchise/Star-Wars',
 'James Bond': '/movies/franchise/James-Bond',
 'Batman': '/movies/franchise/Batman',
 'Harry Potter': '/movies/franchise/Harry-Potter',
 'Spider-Man': '/movies/franchise/Spider-Man',
 'X-Men': '/movies/franchise/X-Men',
 'Avengers': '/movies/franchise/Avengers',
 'Jurassic Park': '/movies/franchise/Jurassic-Park',
 'Star Trek': '/movies/franchise/Star-Trek',
 "Peter Jackson's Lord of the Rings": '/movies/franchise/Peter-Jacksons-Lord-of-the-Rings',
 'DC Extended Universe': '/movies/franchise/DC-Extended-Universe',
 'Indiana Jones': '/movies/franchise/Indiana-Jones',
 'Superman': '/movies/franchise/Superman',
 'Fast and the Furious': '/movies/franchise/Fast-and-the-Furious',
 'Shrek': '/movies/franchise/Shrek',
 'Rocky': '/movies/franchise/Rocky',
 'Pirates of the Caribbean': '/movies/franchise/Pirates-of-the-Caribbean',
 'Transformers': '/movies/franchise/Transform

In [11]:
#testing set

# Initialize limit  
N = 3
    
# Using items() + list slicing  
# Get first K items in dictionary  
out = dict(list(franchiselinks.items())[0: N]) 

In [12]:
tempstuff = scraper(out, tempset)

Item 1 / 3 - Marvel Cinematic Universe
58
Item 2 / 3 - Star Wars
29
Item 3 / 3 - James Bond
44
0.13816666666666666 minutes elapsed
2.76 seconds per item


In [13]:
tempstuff

{'franchise': ['Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Univer

In [14]:
franchises = pd.DataFrame(tempstuff)
franchises

Unnamed: 0,franchise,movie
0,Marvel Cinematic Universe,"May 6, 2022"
1,Marvel Cinematic Universe,Black Panther II
2,Marvel Cinematic Universe,"Nov 5, 2021"
3,Marvel Cinematic Universe,Eternals
4,Marvel Cinematic Universe,"Nov 5, 2021"
...,...,...
126,James Bond,Casino Royale
127,James Bond,Thunderball
128,James Bond,Goldfinger
129,James Bond,From Russia With Love


# The Big Scrape

We're ready to do the big scrape!
We'll break our frame of 10,000 entries into 10 smaller ones in case of any errors.

In [15]:
franchiselinks = {}

In [16]:
movietitles = {'franchise' : [], 'movie' : []}

In [17]:
get_franchises()

In [18]:
franchiselinks

{'Marvel Cinematic Universe': '/movies/franchise/Marvel-Cinematic-Universe',
 'Star Wars': '/movies/franchise/Star-Wars',
 'James Bond': '/movies/franchise/James-Bond',
 'Batman': '/movies/franchise/Batman',
 'Harry Potter': '/movies/franchise/Harry-Potter',
 'Spider-Man': '/movies/franchise/Spider-Man',
 'X-Men': '/movies/franchise/X-Men',
 'Avengers': '/movies/franchise/Avengers',
 'Jurassic Park': '/movies/franchise/Jurassic-Park',
 'Star Trek': '/movies/franchise/Star-Trek',
 "Peter Jackson's Lord of the Rings": '/movies/franchise/Peter-Jacksons-Lord-of-the-Rings',
 'DC Extended Universe': '/movies/franchise/DC-Extended-Universe',
 'Indiana Jones': '/movies/franchise/Indiana-Jones',
 'Superman': '/movies/franchise/Superman',
 'Fast and the Furious': '/movies/franchise/Fast-and-the-Furious',
 'Shrek': '/movies/franchise/Shrek',
 'Rocky': '/movies/franchise/Rocky',
 'Pirates of the Caribbean': '/movies/franchise/Pirates-of-the-Caribbean',
 'Transformers': '/movies/franchise/Transform

In [19]:
# Engaging in a manual process of popping off sub-franchises, i.e. Avengers which is part of the Marvel Cinematic Universe
# We're also dropping entries where it's not a franchise but a remake
franchiselinks.pop('Avengers')
franchiselinks.pop('Godzilla')
franchiselinks.pop('Deadpool')
franchiselinks.pop('Thor')
franchiselinks.pop('Hulk')
franchiselinks.pop('Ant-Man')
franchiselinks.pop('Wonder Woman')
franchiselinks.pop('Wolverine')
franchiselinks.pop('Guardians of the Galaxy')
franchiselinks.pop('Captain America')
franchiselinks.pop('Iron Man')
franchiselinks.pop('King Kong')
franchiselinks.pop('Justice League')
franchiselinks.pop('Dark Universe')
franchiselinks.pop('Scorpion King')
franchiselinks.pop('Air Bud')
franchiselinks.pop('Minions')
franchiselinks.pop('Dark Knight Trilogy')
franchiselinks.pop('The Conjuring')
franchiselinks.pop("Sonyâ\x80\x99s Marvel Universe")
franchiselinks.pop('Shaft')
franchiselinks.pop('Pet Sematary')
franchiselinks.pop('Bad Grandpa')
franchiselinks.pop('The Thing')
franchiselinks.pop('Why Did I Get Married')
franchiselinks.pop('Alice in Wonderland')
franchiselinks.pop('I Spit on Your Grave')
franchiselinks.pop('Flatliners')
franchiselinks.pop('Dumbo')

'/movies/franchise/Dumbo'

In [20]:
# Initialize limit  
N = 700
    
# Using items() + list slicing  
# Get first K items in dictionary  
listof700 = dict(list(franchiselinks.items())[0: N]) 

In [21]:
movietitles = scraper(listof700, movietitles)

Item 1 / 700 - Marvel Cinematic Universe
58
Item 2 / 700 - Star Wars
29
Item 3 / 700 - James Bond
44
Item 4 / 700 - Batman
42
Item 5 / 700 - Harry Potter
24
Item 6 / 700 - Spider-Man
22
Item 7 / 700 - X-Men
26
Item 8 / 700 - Jurassic Park
12
Item 9 / 700 - Star Trek
26
Item 10 / 700 - Peter Jackson's Lord of the Rings
14
Item 11 / 700 - DC Extended Universe
32
Item 12 / 700 - Indiana Jones
10
Item 13 / 700 - Superman
19
Item 14 / 700 - Fast and the Furious
20
Item 15 / 700 - Shrek
13
Item 16 / 700 - Rocky
15
Item 17 / 700 - Pirates of the Caribbean
10
Item 18 / 700 - Transformers
14
Item 19 / 700 - Toy Story
10
Item 20 / 700 - Jaws
9
Item 21 / 700 - Hunger Games
8
Item 22 / 700 - Twilight
12
Item 23 / 700 - Mission: Impossible
16
Item 24 / 700 - Peter Pan
11
Item 25 / 700 - Planet of the Apes
13
Item 26 / 700 - The Lion King
6
Item 27 / 700 - Despicable Me
12
Item 28 / 700 - Lady and the Tramp
2
Item 28 / 700 - Exorcist
9
Item 29 / 700 - Billy Jack
3
Item 29 / 700 - Mary Poppins
3
Item

6
Item 230 / 700 - Hellboy
6
Item 231 / 700 - Ernest
10
Item 232 / 700 - Kill Bill
4
Item 233 / 700 - Magic Mike
4
Item 234 / 700 - Cats and Dogs
6
Item 235 / 700 - Now You See Me
4
Item 236 / 700 - Bridget Jones
6
Item 237 / 700 - Riddick
6
Item 238 / 700 - Kindergarten Cop
2
Item 238 / 700 - Stakeout
4
Item 239 / 700 - Fletch
4
Item 240 / 700 - Horrible Bosses
4
Item 241 / 700 - Young Guns
4
Item 242 / 700 - Bad Moms
4
Item 243 / 700 - Purple Rain
4
Item 244 / 700 - X-Files
4
Item 245 / 700 - Blade Runner
4
Item 246 / 700 - Major League
6
Item 247 / 700 - A Quiet Place
4
Item 248 / 700 - Shanghai Noon
4
Item 249 / 700 - Space Jam
4
Item 250 / 700 - Are We There Yet?
4
Item 251 / 700 - The Shining
4
Item 252 / 700 - Percy Jackson
4
Item 253 / 700 - The Heat
2
Item 253 / 700 - The Boss Baby
6
Item 254 / 700 - Untouchables
3
Item 254 / 700 - Cloverfield
6
Item 255 / 700 - Think Like a Man
4
Item 256 / 700 - Bill & Ted
6
Item 257 / 700 - Anaconda
6
Item 258 / 700 - View Askewniverse
16
I

5
Item 448 / 700 - Alpha and Omega
8
Item 449 / 700 - Before Sunrise
6
Item 450 / 700 - Pulse
4
Item 451 / 700 - Escape Plan
6
Item 452 / 700 - Warlock
4
Item 453 / 700 - Bloodsport
4
Item 454 / 700 - Exterminator
3
Item 454 / 700 - Baahubali
4
Item 455 / 700 - The Marine
8
Item 456 / 700 - Dungeons & Dragons
4
Item 457 / 700 - Vacancy
3
Item 457 / 700 - Wild Orchid
4
Item 458 / 700 - Skyline
4
Item 459 / 700 - Class of 1984
5
Item 460 / 700 - Far From Home
4
Item 461 / 700 - Leprechaun
8
Item 462 / 700 - Pure Country
4
Item 463 / 700 - Wrong Turn
11
Item 464 / 700 - Waiting
3
Item 464 / 700 - Wolf Creek
4
Item 465 / 700 - American Girl
9
Item 466 / 700 - Swan Princess
14
Item 467 / 700 - Philadelphia Experiment
3
Item 467 / 700 - No Manches Frida
4
Item 468 / 700 - Les Boys
6
Item 469 / 700 - Happily N'Ever After
3
Item 469 / 700 - Shaun the Sheep
6
Item 470 / 700 - See No Evil
4
Item 471 / 700 - Arthur and the Minimoys
6
Item 472 / 700 - Smoke
4
Item 473 / 700 - Undisputed
7
Item 474

In [22]:
movietitles

{'franchise': ['Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Univer

In [23]:
franchises = pd.DataFrame(movietitles)
franchises

Unnamed: 0,franchise,movie
0,Marvel Cinematic Universe,"May 6, 2022"
1,Marvel Cinematic Universe,Black Panther II
2,Marvel Cinematic Universe,"Nov 5, 2021"
3,Marvel Cinematic Universe,Eternals
4,Marvel Cinematic Universe,"Nov 5, 2021"
...,...,...
3883,The Wedding Party,The Wedding Party
3884,Donald Strachey,"Oct 10, 2008"
3885,Donald Strachey,Ice Blues: A Donald Strache…
3886,Donald Strachey,"Jul 18, 2008"


In [24]:
# Setting the title as the index
franchises.set_index('movie', inplace=True)

# Data Export

In [25]:
#exporting the dataframe to a csv
franchises.to_csv('api_data/franchises_scraped.csv')