# Overview

This get_franchises loads the TMDB data file with IMDB IDs, containing 10,000 movie entries, and scrapes the Box Office Mojo website for any/all of the following:
    * budget
    * domestic gross
    * worldwide gross
    * studio
    * MPAA rating

# Library Imports

In [1]:
import pandas as pd
import requests
import re
import numpy as np
from bs4 import BeautifulSoup as bs
import time
from random import randint
from time import sleep
from fake_useragent import UserAgent

In [2]:
ua = UserAgent()
header = {'User-Agent':str(ua.random)}
print(header)

{'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F'}


# Scraping Functions

In [3]:
#function that goes to the base page and grabs the content
def get_franchises():
    followurl = 'https://www.the-numbers.com/movies/franchises'
    page = requests.get(followurl, headers=header)
    soup = bs(page.content, 'html.parser')
    franchise_links(soup)

In [4]:
# function that collects the franchise links from the base page and puts them in our list of tags to follow

def franchise_links(soup):
    for link in soup.find_all('a', href=True):
        if '/movies/franchise/' in link.get('href'):
            franchise = link.text # saving the franchise name here in a variable. We need to figure out where to properly connect this
            franchiselinks[franchise] = link.get('href')
        else: continue

In [5]:
# follows a link from the base page
def grab_next(franchise_link):
    followurl = 'https://www.the-numbers.com' + franchise_link
    page = requests.get(followurl, headers=header)
    soup = bs(page.content, 'html.parser')
    return soup

In [6]:
def scraper(links, enddict):
    count = 1
    length = len(links)
    ts = time.time()
    
    for key, value in links.items():
        print(('Item {} / {} - {}').format(count, length, key))
        
        soup = grab_next(value)
        franchise_table = soup.find('table', id="franchise_movies_overview")
        entries = franchise_table.find_all('a', href=True)
        print(len(entries))
        if len(entries) < 4:
            continue
        else:
            for item in entries:
                enddict['franchise'].append(key)
                enddict['movie'].append(item.text)
        
        sleep(randint(0,2))
        count += 1
    tnow = time.time()
    duration = round((tnow - ts), 2)
    scrape_average = round(duration/length, 2)
    print('{} minutes elapsed'.format(duration/60))
    print('{} seconds per item'.format(scrape_average))
    return enddict

# Testing Set

Before we scrape for 10k returns, we will do a small test scrape

In [7]:
franchiselinks = {}

In [8]:
tempset = {'franchise' : [], 'movie' : []}

In [9]:
get_franchises()

In [10]:
franchiselinks

{'Marvel Cinematic Universe': '/movies/franchise/Marvel-Cinematic-Universe',
 'Star Wars': '/movies/franchise/Star-Wars',
 'James Bond': '/movies/franchise/James-Bond',
 'Batman': '/movies/franchise/Batman',
 'Harry Potter': '/movies/franchise/Harry-Potter',
 'Spider-Man': '/movies/franchise/Spider-Man',
 'X-Men': '/movies/franchise/X-Men',
 'Avengers': '/movies/franchise/Avengers',
 'Jurassic Park': '/movies/franchise/Jurassic-Park',
 'Star Trek': '/movies/franchise/Star-Trek',
 "Peter Jackson's Lord of the Rings": '/movies/franchise/Peter-Jacksons-Lord-of-the-Rings',
 'DC Extended Universe': '/movies/franchise/DC-Extended-Universe',
 'Indiana Jones': '/movies/franchise/Indiana-Jones',
 'Superman': '/movies/franchise/Superman',
 'Fast and the Furious': '/movies/franchise/Fast-and-the-Furious',
 'Shrek': '/movies/franchise/Shrek',
 'Rocky': '/movies/franchise/Rocky',
 'Pirates of the Caribbean': '/movies/franchise/Pirates-of-the-Caribbean',
 'Transformers': '/movies/franchise/Transform

In [11]:
#testing set

# Initialize limit  
N = 3
    
# Using items() + list slicing  
# Get first K items in dictionary  
out = dict(list(franchiselinks.items())[0: N]) 

In [12]:
tempstuff = scraper(out, tempset)

Item 1 / 3 - Marvel Cinematic Universe
58
Item 2 / 3 - Star Wars
29
Item 3 / 3 - James Bond
44
0.16483333333333333 minutes elapsed
3.3 seconds per item


In [13]:
tempstuff

{'franchise': ['Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Univer

In [14]:
franchises = pd.DataFrame(tempstuff)
franchises

Unnamed: 0,franchise,movie
0,Marvel Cinematic Universe,"May 6, 2022"
1,Marvel Cinematic Universe,Black Panther II
2,Marvel Cinematic Universe,"Nov 5, 2021"
3,Marvel Cinematic Universe,Untitled Spider-Man: Far Fr…
4,Marvel Cinematic Universe,"Nov 5, 2021"
...,...,...
126,James Bond,Casino Royale
127,James Bond,Thunderball
128,James Bond,Goldfinger
129,James Bond,From Russia With Love


# The Big Scrape

We're ready to do the big scrape!
We'll break our frame of 10,000 entries into 10 smaller ones in case of any errors.

In [15]:
franchiselinks = {}

In [16]:
movietitles = {'franchise' : [], 'movie' : []}

In [17]:
get_franchises()

In [18]:
franchiselinks

{'Marvel Cinematic Universe': '/movies/franchise/Marvel-Cinematic-Universe',
 'Star Wars': '/movies/franchise/Star-Wars',
 'James Bond': '/movies/franchise/James-Bond',
 'Batman': '/movies/franchise/Batman',
 'Harry Potter': '/movies/franchise/Harry-Potter',
 'Spider-Man': '/movies/franchise/Spider-Man',
 'X-Men': '/movies/franchise/X-Men',
 'Avengers': '/movies/franchise/Avengers',
 'Jurassic Park': '/movies/franchise/Jurassic-Park',
 'Star Trek': '/movies/franchise/Star-Trek',
 "Peter Jackson's Lord of the Rings": '/movies/franchise/Peter-Jacksons-Lord-of-the-Rings',
 'DC Extended Universe': '/movies/franchise/DC-Extended-Universe',
 'Indiana Jones': '/movies/franchise/Indiana-Jones',
 'Superman': '/movies/franchise/Superman',
 'Fast and the Furious': '/movies/franchise/Fast-and-the-Furious',
 'Shrek': '/movies/franchise/Shrek',
 'Rocky': '/movies/franchise/Rocky',
 'Pirates of the Caribbean': '/movies/franchise/Pirates-of-the-Caribbean',
 'Transformers': '/movies/franchise/Transform

In [19]:
# Engaging in a manual process of popping off sub-franchises, i.e. Avengers which is part of the Marvel Cinematic Universe
# We're also dropping entries where it's not a franchise but a remake
'''franchiselinks.pop('Avengers')
franchiselinks.pop('Godzilla')
franchiselinks.pop('Deadpool')
franchiselinks.pop('Thor')
franchiselinks.pop('Hulk')
franchiselinks.pop('Ant-Man')
franchiselinks.pop('Wonder Woman')
franchiselinks.pop('Wolverine')
franchiselinks.pop('Guardians of the Galaxy')
franchiselinks.pop('Captain America')
franchiselinks.pop('Iron Man')
franchiselinks.pop('King Kong')
franchiselinks.pop('Justice League')
franchiselinks.pop('Dark Universe')
franchiselinks.pop('Scorpion King')
franchiselinks.pop('Air Bud')
franchiselinks.pop('Minions')
franchiselinks.pop('Dark Knight Trilogy')
franchiselinks.pop('The Conjuring')
franchiselinks.pop("Sonyâ\x80\x99s Marvel Universe")
franchiselinks.pop('Shaft')
franchiselinks.pop('Pet Sematary')
franchiselinks.pop('Bad Grandpa')
franchiselinks.pop('The Thing')
franchiselinks.pop('Why Did I Get Married')
franchiselinks.pop('Alice in Wonderland')
franchiselinks.pop('I Spit on Your Grave')
franchiselinks.pop('Flatliners')
franchiselinks.pop('Dumbo')'''

'franchiselinks.pop(\'Avengers\')\nfranchiselinks.pop(\'Godzilla\')\nfranchiselinks.pop(\'Deadpool\')\nfranchiselinks.pop(\'Thor\')\nfranchiselinks.pop(\'Hulk\')\nfranchiselinks.pop(\'Ant-Man\')\nfranchiselinks.pop(\'Wonder Woman\')\nfranchiselinks.pop(\'Wolverine\')\nfranchiselinks.pop(\'Guardians of the Galaxy\')\nfranchiselinks.pop(\'Captain America\')\nfranchiselinks.pop(\'Iron Man\')\nfranchiselinks.pop(\'King Kong\')\nfranchiselinks.pop(\'Justice League\')\nfranchiselinks.pop(\'Dark Universe\')\nfranchiselinks.pop(\'Scorpion King\')\nfranchiselinks.pop(\'Air Bud\')\nfranchiselinks.pop(\'Minions\')\nfranchiselinks.pop(\'Dark Knight Trilogy\')\nfranchiselinks.pop(\'The Conjuring\')\nfranchiselinks.pop("Sonyâ\x80\x99s Marvel Universe")\nfranchiselinks.pop(\'Shaft\')\nfranchiselinks.pop(\'Pet Sematary\')\nfranchiselinks.pop(\'Bad Grandpa\')\nfranchiselinks.pop(\'The Thing\')\nfranchiselinks.pop(\'Why Did I Get Married\')\nfranchiselinks.pop(\'Alice in Wonderland\')\nfranchiselinks.po

In [20]:
# Initialize limit  
N = 800
    
# Using items() + list slicing  
# Get first K items in dictionary  
listof800 = dict(list(franchiselinks.items())[0: N]) 

In [21]:
movietitles = scraper(listof800, movietitles)

Item 1 / 800 - Marvel Cinematic Universe
58
Item 2 / 800 - Star Wars
29
Item 3 / 800 - James Bond
44
Item 4 / 800 - Batman
42
Item 5 / 800 - Harry Potter
24
Item 6 / 800 - Spider-Man
22
Item 7 / 800 - X-Men
26
Item 8 / 800 - Avengers
8
Item 9 / 800 - Jurassic Park
12
Item 10 / 800 - Star Trek
26
Item 11 / 800 - Peter Jackson's Lord of the Rings
14
Item 12 / 800 - DC Extended Universe
32
Item 13 / 800 - Indiana Jones
10
Item 14 / 800 - Superman
19
Item 15 / 800 - Fast and the Furious
20
Item 16 / 800 - Shrek
13
Item 17 / 800 - Rocky
15
Item 18 / 800 - Pirates of the Caribbean
10
Item 19 / 800 - Transformers
14
Item 20 / 800 - Toy Story
10
Item 21 / 800 - Jaws
9
Item 22 / 800 - Hunger Games
8
Item 23 / 800 - Twilight
12
Item 24 / 800 - Mission: Impossible
16
Item 25 / 800 - Dark Knight Trilogy
6
Item 26 / 800 - Peter Pan
11
Item 27 / 800 - Planet of the Apes
13
Item 28 / 800 - The Lion King
6
Item 29 / 800 - Despicable Me
12
Item 30 / 800 - Lady and the Tramp
2
Item 30 / 800 - Exorcist
9

2
Item 234 / 800 - Pet Sematary
6
Item 235 / 800 - Thomas Crown Affair
4
Item 236 / 800 - Blues Brothers
4
Item 237 / 800 - I Know What You Did Last Summer
5
Item 238 / 800 - Mortal Kombat
6
Item 239 / 800 - Death Wish
9
Item 240 / 800 - The Equalizer
4
Item 241 / 800 - Conan
6
Item 242 / 800 - The Red Curtain Trilogy
6
Item 243 / 800 - Sonyâs Marvel Universe
4
Item 244 / 800 - Ghost Rider
4
Item 245 / 800 - Diary of a Wimpy Kid
8
Item 246 / 800 - The Croods
4
Item 247 / 800 - Crouching Tiger, Hidden Dragon
4
Item 248 / 800 - The Hunchback of Notre Dame
2
Item 248 / 800 - Friday
6
Item 249 / 800 - Hellboy
6
Item 250 / 800 - Ernest
10
Item 251 / 800 - Kill Bill
4
Item 252 / 800 - Magic Mike
4
Item 253 / 800 - Cats and Dogs
6
Item 254 / 800 - Now You See Me
4
Item 255 / 800 - Bridget Jones
6
Item 256 / 800 - Riddick
6
Item 257 / 800 - Kindergarten Cop
2
Item 257 / 800 - Stakeout
4
Item 258 / 800 - Fletch
4
Item 259 / 800 - Horrible Bosses
4
Item 260 / 800 - Young Guns
4
Item 261 / 800 

4
Item 448 / 800 - Gettysburg
4
Item 449 / 800 - Sniper
6
Item 450 / 800 - Clerks
4
Item 451 / 800 - Quarantine
2
Item 451 / 800 - Cirque du Soleil
6
Item 452 / 800 - Machete
4
Item 453 / 800 - Van Wilder
5
Item 454 / 800 - American Ninja
6
Item 455 / 800 - Dragon Ball Z
6
Item 456 / 800 - Critters
7
Item 457 / 800 - Space Chimps
4
Item 458 / 800 - An Inconvenient Truth
4
Item 459 / 800 - Trainspotting
4
Item 460 / 800 - Tremors
8
Item 461 / 800 - Kickboxer
8
Item 462 / 800 - Joy Ride
5
Item 463 / 800 - Beastmaster
4
Item 464 / 800 - Gate
4
Item 465 / 800 - Big Lebowski
4
Item 466 / 800 - Omega Code
4
Item 467 / 800 - Cabin Fever
6
Item 468 / 800 - Millenium
6
Item 469 / 800 - Never Back Down
6
Item 470 / 800 - Academy Award Short Film Nominations
30
Item 471 / 800 - Flicka
4
Item 472 / 800 - Center Stage
5
Item 473 / 800 - Alpha and Omega
8
Item 474 / 800 - Before Sunrise
6
Item 475 / 800 - Pulse
4
Item 476 / 800 - Escape Plan
6
Item 477 / 800 - Warlock
4
Item 478 / 800 - Bloodsport
4

5
Item 656 / 800 - Masked Rider
5
Item 657 / 800 - Shopkins
6
Item 658 / 800 - Ivan Tsarevich i Seryy Volk
6
Item 659 / 800 - Argentoâs Animal Trilogy
3
Item 659 / 800 - Detective Conan
4
Item 660 / 800 - Celtic Woman
2
Item 660 / 800 - Phineas and Ferb
4
Item 661 / 800 - I Am Vengeance
4
Item 662 / 800 - The Man From Macau
4
Item 663 / 800 - Always
3
Item 663 / 800 - Ryanverse
2
Item 663 / 800 - Wolfcop
4
Item 664 / 800 - Si da ming bu
4
Item 665 / 800 - When Calls the Heart
4
Item 666 / 800 - Rest Stop
2
Item 666 / 800 - Fireangels
4
Item 667 / 800 - Best F(r)iends
4
Item 668 / 800 - Tiny Times
4
Item 669 / 800 - The Woman
4
Item 670 / 800 - Fred Figglehorn
2
Item 670 / 800 - A Bread Factory
4
Item 671 / 800 - Rise of the Footsoldier
3
Item 671 / 800 - Debt Collector
4
Item 672 / 800 - Crossword Mysteries
4
Item 673 / 800 - Spanish Affair
4
Item 674 / 800 - Wilden Kerle
2
Item 674 / 800 - Tom and Jerry
3
Item 674 / 800 - When Love Happens
4
Item 675 / 800 - Paw Patrol
4
Item 676 / 

In [22]:
movietitles

{'franchise': ['Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Universe',
  'Marvel Cinematic Univer

In [23]:
franchises = pd.DataFrame(movietitles)
franchises

Unnamed: 0,franchise,movie
0,Marvel Cinematic Universe,"May 6, 2022"
1,Marvel Cinematic Universe,Black Panther II
2,Marvel Cinematic Universe,"Nov 5, 2021"
3,Marvel Cinematic Universe,Untitled Spider-Man: Far Fr…
4,Marvel Cinematic Universe,"Nov 5, 2021"
...,...,...
4448,Vacanze,Vacanze di Natale a Cortina
4449,Torrente,"Dec 31, 2014"
4450,Torrente,Torrente 5. OperaciÃ³n Eurov…
4451,Torrente,"Dec 31, 2010"


In [24]:
# Setting the title as the index
franchises.set_index('movie', inplace=True)

# Data Export

In [26]:
#exporting the dataframe to a csv
franchises.to_csv('api_data/franchises_scraped.csv')