# Dataset Creation

## 1. Scrape Info Box 

### Import Libraries

In [16]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

### Load WebPage

In [2]:
r = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')

#Convert to a beautiful soup object
soup  = bs(r.content)

### Get Info Box Data

In [3]:
info_box = soup.find(class_="infobox vevent")
rows = info_box.find_all("tr")

In [4]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip = True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip = True).replace("\xa0", " ")


### Store Movie in Dictionary

In [5]:
movie_info = {}
for index, row in enumerate(rows):
    if(index == 0):
        movie_info['title'] = row.find('th').get_text(" ", strip = True)
    elif index == 1:
        continue
    else:
        content_key = row.find('th').get_text(" ", strip = True)
        content_value = get_content_value(row.find('td'))
        movie_info[content_key] = content_value
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

## 2. Get Info for all Movies

### Get List of Movies

In [9]:
#clean up extra info and references
def clean_tags(soup):
    for tag in soup.find_all(['sup', 'span']):
        tag.decompose()

In [6]:
w = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

soup = bs(w.content)

movies = soup.select(".wikitable.sortable i a")
urls = [movie["href"] for movie in movies]

In [12]:
def get_info_box(url):
    r = requests.get('https://en.wikipedia.org' + url)

    soup  = bs(r.content)

    info_box = soup.find(class_="infobox vevent")
    rows = info_box.find_all("tr")

    clean_tags(soup)
        
    movie_info = {}
    for index, row in enumerate(rows):
        if(index == 0):
            movie_info['title'] = row.find('th').get_text(" ", strip = True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find('th').get_text(" ", strip = True)
                content_value = get_content_value(row.find('td'))
                movie_info[content_key] = content_value

    return movie_info

In [13]:
movieDB = []

for index, url in enumerate(urls):
    if index % 20 == 0:
        print(index)
    try:
        movieDB.append(get_info_box(url))
    
    except Exception as e:
        print(url)
        print(e)

0
20
40
/wiki/Zorro_(1957_TV_series)#Media
'NoneType' object has no attribute 'find'
/wiki/Zorro_(1957_TV_series)#Media
'NoneType' object has no attribute 'find'
60
80
100
120
/wiki/True-Life_Adventures
'NoneType' object has no attribute 'find_all'
140
/wiki/The_Apple_Dumpling_Gang_Rides_Again
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/The_Apple_Dumpling_Gang_Rides_Again (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001261163E860>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440


### Save Data

In [None]:
import json

with open('movieDB.json', 'w', encoding='utf-8') as f:
    json.dump(movieDB, f, ensure_ascii=False, indent=4)

## 3. Clean Data

### Clean references 

In [17]:
pd.DataFrame(movieDB)

Unnamed: 0,Adaptation by,Animation by,Based on,Box office,Budget,Cinematography,Country,Created by,Directed by,Distributed by,...,Production company,Release date,Running time,Screenplay by,Simplified,Starring,Story by,Traditional,Written by,title
0,,,,$45.472,,,United States,,,,...,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),,,,,,,Academy Award Review of
1,,,"[Snow White, by The, Brothers Grimm]",$418 million,$1.49 million,,United States,,"[David Hand (supervising), William Cottrell, W...",RKO Radio Pictures,...,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,,,"[Adriana Caselotti, Lucille La Verne, Harry St...",,,"[Ted Sears, Richard Creedon, Otto Englander, D...",Snow White and the Seven Dwarfs
2,,,"[The Adventures of Pinocchio, by, Carlo Collodi]",$164 million,$2.6 million,,United States,,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",RKO Radio Pictures,...,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,,,"[Cliff Edwards, Dickie Jones, Christian Rub, M...","[Ted Sears, Otto Englander, Webb Smith, Willia...",,,Pinocchio
3,,,,$76.4–$83.3 million,$2.28 million,James Wong Howe,United States,,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Walt Disney Productions, RKO Radio Pictures]",...,Walt Disney Productions,"[November 13, 1940]",126 minutes,,,"[Leopold Stokowski, Deems Taylor]","[Joe Grant, Dick Huemer]",,,Fantasia
4,,,,"$960,000 (worldwide rentals)","$600,000",Bert Giennon,United States,,"[Alfred Werker, (live action), Hamilton Luske,...",RKO Radio Pictures,...,Walt Disney Productions,"[June 20, 1941]",74 minutes,,,"[Robert Benchley, Frances Gifford, Buddy Peppe...",,,"[Live-action:, Ted Sears, Al Perkins, Larry Cl...",The Reluctant Dragon
5,,,"[Dumbo, the Flying Elephant, by, Helen Aberson...",$1.3 million (est. United States/Canada rental...,"$950,000",,United States,,"[Supervising director:, Ben Sharpsteen, Sequen...",RKO Radio Pictures,...,Walt Disney Productions,"[October 23, 1941 (New York City), October 31,...",64 minutes,,,"[Edward Brophy, Herman Bing, Margaret Wright, ...","[Otto Englander, Joe Grant, Dick Huemer]",,,Dumbo
6,,,"[Bambi, a Life in the Woods, by, Felix Salten]",$267.4 million,"$858,000",,United States,,"[Supervising director, David Hand, Sequence di...",RKO Radio Pictures,...,Walt Disney Productions,"[August 9, 1942 (World Premiere-London), Augus...",70 minutes,,,see below,"[Story direction, Perce Pearce, Story adaptati...",,,Bambi
7,,,,"$1,135,000 (worldwide rentals)",,,United States,,"[Norman Ferguson, Wilfred Jackson, Jack Kinney...",RKO Radio Pictures,...,Walt Disney Productions,"[August 24, 1942 (World Premiere-Rio de Janeir...",42 minutes,,,"[Lee Blair, Mary Blair, Pinto Colvig, Walt Dis...","[Homer Brightman, William Cottrell, Richard Hu...",,,Saludos Amigos
8,,,Victory Through Air Power by Maj. Alexander P....,"$799,000","$788,000",Ray Rennahan,United States,,"[Animated sequences:, James Algar, Clyde Geron...",United Artists,...,Walt Disney Productions,"[July 17, 1943]",65 min.,,,Alexander de Seversky,,,"[Story direction:, Perce Pearce, Story adaptat...",Victory Through Air Power
9,,,,"$3,355,000 (worldwide rentals)",,,United States,,"[Norman Ferguson (supervising director), Clyde...",RKO Radio Pictures,...,Walt Disney Productions,"[December 21, 1944 (Mexico City), February 3, ...",71 minutes,,,"[Clarence Nash, José Oliveira, Joaquin Garay]","[Homer Brightmen, Ernest Terrazas, Ted Sears, ...",,,The Three Caballeros


### Convert running times to integer

In [18]:
#convert running time to integer
def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    if isinstance(running_time, list):
        running_time = running_time[0]
    return int(running_time.split(" ")[0])

for movie in movieDB:
    movie["Running time (int)"] = minutes_to_integer(movie.get("Running time", "N/A"))

### Convert money to floats

In [19]:
import re

In [20]:
amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

value_re = rf"\${number}"
word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"

In [21]:
def parse_value(str):
     value_string = re.search(number, str).group()
     value = float(value_string.replace(",", ""))
     return value

In [22]:
def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word(string):
    value_string = (re.search(number, string).group())
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags= re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

In [23]:
#convert money to integer
def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
    
    value_syntax = re.search(value_re, money)
    word_syntax = re.search(word_re, money, flags = re.I)
   
    if word_syntax:
        return parse_word(word_syntax.group())
    elif value_syntax:
        return parse_value(value_syntax.group())

    else:
        return None

In [24]:
for movie in movieDB:
    movie["Budget (float)"] = money_conversion(movie.get("Budget", "N/A"))
    movie["Box office (float)"] = money_conversion(movie.get("Box office", "N/A"))

### Convert dates to date objects

In [25]:
from datetime import datetime

In [26]:
dates = [movie.get('Release date', 'N/A') for movie in movieDB]

In [27]:
def clean_date(date):
    return date.split("(")[0].strip()

In [28]:
def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    
    if date == "N/A":
        return None

    date_string = clean_date(date)
    
    fmts = ["%B %d, %Y", "%d %B %Y", "%Y"]

    for fmt in fmts:
        try:
            return datetime.strptime(date_string, fmt)
        except:
            pass
    
    return None

In [29]:
for movie in movieDB:
    movie["Release date (datetime)"] = date_conversion(movie.get('Release date', 'N/A'))

### Dropping extra columns

In [40]:
#based on columns having more than 85% empty entries
movieDB = pd.DataFrame(movieDB)
movieDB.drop(['Japanese','Hepburn','Adaptation by','Animation by','Traditional','Simplified','Created by','Original work','Owned by'],axis=1,inplace=True)

### Save with Pickle

In [41]:
import pickle
with open('movieDB.pickle', 'wb') as f:
    pickle.dump(movieDB, f)

### Load from Pickle

In [42]:
with open('movieDB.pickle', "rb") as f:
    movieDB = pickle.load(f)

## Save Data as CSV

In [44]:
df.to_csv('movieDB_new.csv')