# Disney Dataset Creation (Web Scraping) using BeautifulSoup

### Scrape and clean a list of Disney wikipidea pages to create a dataset to further analyse

In [2]:
from bs4 import BeautifulSoup as bs
import requests

In [3]:
req = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')
# req = requests.get('https://en.wikipedia.org/wiki/Angels_in_the_Outfield_(1994_film)')

In [4]:
webpage = bs(req.content)

### Locate the infobox

In [5]:
info_box = webpage.find(class_='infobox vevent')
info_rows = info_box.find_all('tr')
# for row in info_rows:
#     print(row.prettify())


### Get info box for Toy Story 3

In [6]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all('li')]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

In [11]:
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

In [12]:
movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['Title'] = row.find('th').get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        key = row.find('th').get_text(" ", strip=True)
        value = get_content_value(row.find('td'))
        movie_info[key] = value

        
# print(movie_info)
for key, val in movie_info.items():
    print(f"{key}: {val}")
    print("\n")

Title: Toy Story 3


Directed by: Lee Unkrich


Screenplay by: Michael Arndt


Story by: ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich']


Produced by: Darla K. Anderson


Starring: ['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Don Rickles', 'Wallace Shawn', 'John Ratzenberger', 'Estelle Harris', 'Ned Beatty', 'Michael Keaton', 'Jodi Benson', 'John Morris']


Cinematography: ['Jeremy Lasky', 'Kim White']


Edited by: Ken Schretzmann


Music by: Randy Newman


Production companies: ['Walt Disney Pictures', 'Pixar Animation Studios']


Distributed by: ['Walt Disney Studios', 'Motion Pictures']


Release dates: ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )', 'June 18, 2010 ( 2010-06-18 ) (United States)']


Running time: 103 minutes [1]


Country: United States


Language: English


Budget: $200 million [1]


Box office: $1.067 billion [1]




### Get info box for all Disney films

In [13]:
new_request = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

soup = bs(new_request.content)

In [19]:
def get_info_box(url):
    
    req = requests.get(url)
    webpage = bs(req.content)
    info_box = webpage.find(class_='infobox vevent')
    info_rows = info_box.find_all('tr')
    
    clean_tags(webpage)
    
    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['Title'] = row.find('th').get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                key = row.find('th').get_text(" ", strip=True)
                value = get_content_value(row.find('td'))
                movie_info[key] = value
            
    return movie_info

In [15]:
get_info_box('https://en.wikipedia.org/wiki/Toy_Story_3')

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['June 12, 2010 ( Taormina Film Fest )',
  'June 18, 2010 (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

In [21]:
movies = soup.select(".wikitable.sortable i a")
base_link = 'https://en.wikipedia.org/'
all_info_boxes = []

for movie in movies:
    try:
        url = base_link + movie['href']
        all_info_boxes.append(get_info_box(url))
    except Exception as e:
        print(movie.get_text())
        print(e)

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
Elemental
'NoneType' object has no attribute 'find_all'
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object has no attribute 'find_all'
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
Shrunk
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find'
The Graveyard Book
'NoneType' object has no attribute 'find_all'
The Thief
'NoneType' object has no attribute 'find_all'
Tom Sawyer
'NoneType' object has no attribute 'find_all'
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'
FC Barcelona
'NoneType' object

In [24]:
 len(movies)

537

In [25]:
len(all_info_boxes)

520

In [43]:
# Save and reload data

import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
        
def load_data(title):
    with open(title, 'r', encoding='utf-8') as file:
        return json.load(file)

In [23]:
save_data("disney_data_cleaned.json", all_info_boxes)

In [169]:
# Clean our data

In [2]:
movie_info_list = load_data('disney_data_cleaned.json')

### Subtasks
- ~~Clean up references~~
- ~~Split up the long strings with comma~~
- ~~Convert running time into a number~~
- ~~Convert date to datetime object~~
- ~~Convert Budget and Box offices into numbers (have to use Regular Expression on this one)~~
- Add IMDB/Rotten Tomatoes scores (have to look up at Keith's code, knowledge about API required)
- Save data to CSV and JSON files

In [27]:
# Remove [1], [2]...

# done by defining clean_tags


In [3]:
running_times = [movie.get('Running time', 'NaN') for movie in movie_info_list]
release_dates = [movie.get('Release dates', 'NaN') for movie in movie_info_list]
budgets = [movie.get('Budget', 'NaN') for movie in movie_info_list]
box_offices = [movie.get('Box office', 'NaN') for movie in movie_info_list]


In [7]:
running_times

['83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '70 min',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS and Wild Discovery version)', '71 minutes (original)'],
 '127 minutes',
 '93 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '73 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 minutes',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '100 minutes',
 '118 m

# Convert running time into a number

In [13]:
import numpy as np

def convert_runtime_to_int(runtime):
    if isinstance(runtime, list):
        runtime_str = runtime[0]
        runtime_int = int(runtime_str.split(" ")[0])
    elif runtime == 'NaN':
        runtime_int = np.nan
    else:
        runtime_int = int(runtime.split(" ")[0])
    
    return runtime_int

In [16]:

for movie in movie_info_list:
    runtime_str = movie.get('Running time', 'NaN')
    runtime_int = convert_runtime_to_int(runtime_str)
    movie['Running time (int)'] = runtime_int



# Convert date to datetime object

In [43]:
from datetime import datetime

print(datetime.strptime('December 21, 1937',  '%B %d, %Y'))

def convert_dates_to_datetime(release_date):

    formats = ['%B %d, %Y', '%d %B %Y']
    if isinstance(release_date, list):
        release_date_str = release_date[0].split("(")[0].strip(" ")
        for format in formats:
            try:
                return datetime.strptime(release_date_str, format)
            except:
                pass
    elif release_date == 'NaN':
        return np.nan

1937-12-21 00:00:00


In [22]:
for idx, movie in enumerate(movie_info_list):
    if movie['Title'] == 'Tinker Bell':
        out = idx

movie_info_list.pop(out)

{'Title': 'Tinker Bell',
 'Directed by': ['Bradley Raymond ( 1 , 3 & 4 )',
  'Klay Hall ( 2 )',
  'Bobs Gannaway ( 5 )',
  'Peggy Holmes ( 5 & 6 )'],
 'Starring': ['Mae Whitman',
  'Lucy Liu',
  'Raven-Symoné',
  'Megan Hilty',
  '( More )'],
 'Music by': 'Joel McNeely',
 'Production company': 'DisneyToon Studios',
 'Distributed by': ['Walt Disney Studios', 'Home Entertainment'],
 'Release dates': ['1',
  ': October\xa028,\xa02008',
  '2',
  ':',
  '3',
  ':',
  '4',
  ':',
  '5',
  ':',
  '6',
  ':'],
 'Running time': ['468 minutes'],
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 468}

In [46]:
for movie in movie_info_list:
    release_date = movie.get('Release dates', 'NaN')
    release_date_dt = convert_dates_to_datetime(release_date)
    movie['Release date (datetime)'] = release_date_dt



# Convert Budget and Box offices to numbers

In [56]:
from helper_functions import money_conversion

for movie in movie_info_list:
    budget, box_office = movie.get('Budget', 'NaN'), movie.get('Box office', 'NaN')
    movie['Budget (number)'] = money_conversion(budget)
    movie['Box office (number)'] = money_conversion(box_office)
    

In [57]:
movie_info_list[0]

{'Title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'William Cottrell',
  'Wilfred Jackson',
  'Larry Morey',
  'Perce Pearce',
  'Ben Sharpsteen'],
 'Written by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Lucille La Verne',
  'Harry Stockwell',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins',
  'Moroni Olsen',
  'Stuart Buchanan'],
 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
  'February 4, 1938 (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',

In [1]:
import pickle 

def save_data_pickle(name, data):
    with open(name, 'wb') as file:
        pickle.dump(data, file)

def load_data_pickle(name):
    with open(name, 'rb') as file:
        return pickle.load(file)


In [59]:
save_data_pickle('disney_data_further_cleaned.pickle', movie_info_list)

In [4]:
movie_info_list = load_data_pickle('disney_data_further_cleaned.pickle')

# Attach IMDB/Rotten Tomatoes/Metascores scores

In [5]:
movie_info_list[0]

{'Title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'William Cottrell',
  'Wilfred Jackson',
  'Larry Morey',
  'Perce Pearce',
  'Ben Sharpsteen'],
 'Written by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Lucille La Verne',
  'Harry Stockwell',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins',
  'Moroni Olsen',
  'Stuart Buchanan'],
 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
  'February 4, 1938 (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',

In [25]:
# http://www.omdbapi.com/?apikey=[yourkey]&
import requests
import urllib

def get_omdb_info(title):
    base_url = 'https://www.omdbapi.com/?'
    parameters = {'apikey': 'aba50b29', 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomatoes_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']


omdb_info = get_omdb_info('into the woods')
get_rotten_tomatoes_score(omdb_info)



'71%'

In [29]:
for movie in movie_info_list:
    title = movie['Title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomatoes_score(omdb_info)

# Save final data as JSON/CSV files

In [44]:
from copy import deepcopy
from datetime import datetime

movie_info_list_copy = deepcopy(movie_info_list)

for movie in movie_info_list_copy:
    date = movie['Release date (datetime)']
    if isinstance(date, datetime):
        movie['Release date (datetime)'] = date.strftime('%B %d, %Y')

save_data('final_cleaned_data.json', movie_info_list_copy)

In [46]:
import pandas as pd

df = pd.DataFrame(movie_info_list)
df.head()

df.to_csv('final_cleaned_data.csv')