# Dataset Creation

## 1. Scrape Info Box 

### Import Libraries

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

### Load WebPage

In [None]:
# Download the Toy Story 3 article; its info box is the prototype for the scraper.
r = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()

# Convert to a BeautifulSoup object. The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed
# (and so bs4 does not warn about having to guess one).
soup = bs(r.content, "html.parser")

### Get Info Box Data

In [None]:
# Locate the element carrying the class string "infobox vevent" on the page.
info_box = soup.find(class_="infobox vevent")
# Collect every table row (<tr>) inside it; the cells below iterate over `rows`.
rows = info_box.find_all("tr")

In [None]:
def get_content_value(row_data):
    """Extract the value of one info-box cell as clean text.

    Args:
        row_data: The data cell of an info-box row (the result of
            ``row.find('td')``), or ``None`` when the row has no such cell.

    Returns:
        ``None`` if ``row_data`` is ``None``. A list of strings if the cell
        contains ``li`` elements (one entry per item) or ``br`` elements (one
        entry per stripped text fragment). Otherwise a single string.
        Non-breaking spaces are replaced by regular spaces in every case.
    """
    # A row can have a header without a data cell; report "no value" rather
    # than failing with AttributeError on None.
    if row_data is None:
        return None

    if row_data.find("li"):
        return [li.get_text(" ", strip = True).replace("\xa0", " ") for li in row_data.find_all("li")]

    if row_data.find("br"):
        # Normalise "\xa0" here too, so all branches return the same kind of
        # text (later cells split these strings on a plain space).
        return [text.replace("\xa0", " ") for text in row_data.stripped_strings]

    return row_data.get_text(" ", strip = True).replace("\xa0", " ")


### Store Movie in Dictionary

In [None]:
# Prototype: turn the rows of the single page loaded above into a dictionary.
movie_info = {}
for index, row in enumerate(rows):
    # The second row (index 1) is skipped entirely.
    if index == 1:
        continue

    header_text = row.find('th').get_text(" ", strip = True)

    # The first row's header is stored under the fixed key 'title'.
    if index == 0:
        movie_info['title'] = header_text
        continue

    # Every other row maps its header text to the parsed cell content.
    movie_info[header_text] = get_content_value(row.find('td'))
movie_info

## 2. Get Info for all Movies

### Get List of Movies

In [None]:
#clean up extra info and references
def clean_tags(soup):
    """Remove every ``sup`` and ``span`` element from ``soup`` in place.

    Args:
        soup: Parsed document (or sub-tree) to clean; it is modified directly.

    Returns:
        None.
    """
    unwanted_nodes = soup.find_all(['sup', 'span'])
    for node in unwanted_nodes:
        node.decompose()

In [None]:
# Download the index page listing the Walt Disney Pictures films.
w = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# Fail loudly on an HTTP error instead of silently parsing an error page.
w.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parsers are installed (and bs4 does not warn about guessing one).
soup = bs(w.content, "html.parser")

# Links nested in <i> elements inside tables with both the "wikitable" and
# "sortable" classes.
movies = soup.select(".wikitable.sortable i a")
# Link targets; get_info_box prepends the Wikipedia host to each of them.
urls = [movie["href"] for movie in movies]

In [None]:
def get_info_box(url):
    """Scrape the info box of one Wikipedia article into a dictionary.

    Args:
        url: Site-relative article path; it is appended to
            ``https://en.wikipedia.org``.

    Returns:
        dict with a ``'title'`` entry taken from the header of the first
        info-box row, plus one entry per later row that has both a header
        (``th``) and a data cell (``td``). Keys are the header texts and
        values are produced by ``get_content_value``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``infobox vevent`` element.
    """
    r = requests.get('https://en.wikipedia.org' + url)
    # Do not parse error pages as if they were articles.
    r.raise_for_status()

    # Explicit parser: same result on every machine, no bs4 "guessed parser"
    # warning.
    soup = bs(r.content, "html.parser")

    info_box = soup.find(class_="infobox vevent")
    if info_box is None:
        # Clearer than the AttributeError a None info box would trigger below.
        raise ValueError(f"no info box found on page {url!r}")
    rows = info_box.find_all("tr")

    # Strip <sup>/<span> nodes in place; `rows` points into the same tree,
    # so the rows read below are already cleaned.
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(rows):
        header = row.find('th')
        if index == 0:
            movie_info['title'] = header.get_text(" ", strip = True)
            continue

        data = row.find('td')
        # Skip rows without a header, and header-only rows: the latter used
        # to raise inside get_content_value and abort the whole movie.
        if header is None or data is None:
            continue

        content_key = header.get_text(" ", strip = True)
        movie_info[content_key] = get_content_value(data)

    return movie_info

In [None]:
# Scrape every collected URL; pages that fail are reported and skipped.
movieDB = []

for index, url in enumerate(urls):
    # Progress indicator: print the position every 20 pages.
    if not index % 20:
        print(index)

    try:
        scraped_movie = get_info_box(url)
    except Exception as e:
        # Best effort: show which page failed and why, then carry on.
        print(url)
        print(e)
    else:
        movieDB.append(scraped_movie)

### Save Data

In [None]:
import json

# Keep a copy of the raw scrape as UTF-8 JSON (non-ASCII characters are
# written as-is, indented by four spaces).
with open('movieDB_original.json', 'w', encoding='utf-8') as json_file:
    serialized = json.dumps(movieDB, ensure_ascii=False, indent=4)
    json_file.write(serialized)

## 3. Clean Data

### Clean references (preview the scraped data)

In [None]:
# Display the scraped records as a table for inspection; the result is not
# assigned, so movieDB itself is left unchanged here.
pd.DataFrame(movieDB)

### Convert running times to integer

In [None]:
#convert running time to integer
def minutes_to_integer(running_time):
    """Convert a scraped running-time field to a whole number.

    Args:
        running_time: Text whose first whitespace-separated token is the
            number (e.g. ``"103 minutes"``), a list of such strings (only the
            first entry is used), or ``"N/A"``.

    Returns:
        The leading number as ``int``, or ``None`` when the field is
        ``"N/A"``, empty, not a string, or does not start with an integer.
    """
    if isinstance(running_time, list):
        # An empty list carries no value; treat it like a missing field.
        running_time = running_time[0] if running_time else "N/A"

    if not isinstance(running_time, str) or running_time == "N/A":
        return None

    # split() without arguments also splits on non-breaking spaces ("\xa0"),
    # which split(" ") would leave attached to the number.
    tokens = running_time.split()
    if not tokens:
        return None

    try:
        return int(tokens[0])
    except ValueError:
        # Unparseable text yields None, like the money and date converters.
        return None

# Store a numeric running time next to the scraped "Running time" field.
for movie in movieDB:
    raw_running_time = movie.get("Running time", "N/A")
    movie["Running time (int)"] = minutes_to_integer(raw_running_time)

### Convert money to floats

In [None]:
import re

In [None]:
# Magnitude words recognised after a number.
amounts = r"thousand|million|billion"
# A number: digits, optional ",ddd" groups, then any run of dots and optional
# trailing digits. Note that the pattern contains a capturing group.
number = r"\d+(,\d{3})*\.*\d*"

# A plain dollar amount: "$" immediately followed by a number.
value_re = rf"\${number}"
# A dollar amount with a magnitude word: "$" + number, an optional range
# separator ("-", " to " or an en dash) and optional second number, exactly
# one whitespace character, then one of the magnitude words.
word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"

In [None]:
def parse_value(str):
    """Return the first number found in ``str`` as a float.

    Thousands separators (commas) are removed before conversion. The
    parameter name shadows the ``str`` builtin; it is kept so existing
    keyword callers continue to work.
    """
    first_number = re.search(number, str)
    digits = first_number.group().replace(",", "")
    return float(digits)

In [None]:
def word_to_value(word):
    """Return the multiplier for a lowercase magnitude word.

    Raises:
        KeyError: If ``word`` is not "thousand", "million" or "billion".
    """
    multipliers = {
        "thousand": 10**3,
        "million": 10**6,
        "billion": 10**9,
    }
    return multipliers[word]

def parse_word(string):
    """Return the amount written as "<number> <magnitude word>" as a float.

    The first number in ``string`` is multiplied by the value of the first
    magnitude word found (matched case-insensitively).
    """
    base_amount = float(re.search(number, string).group().replace(",", ""))
    magnitude = re.search(amounts, string, flags= re.I).group().lower()
    return base_amount * word_to_value(magnitude)

In [None]:
#convert money to float
def money_conversion(money):
    """Convert a scraped money field to a float.

    Args:
        money: Text containing a dollar amount, a list of such strings (only
            the first entry is used), or ``"N/A"``.

    Returns:
        The amount as ``float``. If the text matches ``word_re`` (number plus
        magnitude word) it is handled by ``parse_word``; otherwise, if it
        matches ``value_re`` (plain dollar number), by ``parse_value``.
        ``None`` when the field is ``"N/A"``, empty, not a string, or
        matches neither pattern.
    """
    if isinstance(money, list):
        # An empty list carries no value; treat it like a missing field
        # instead of raising IndexError.
        money = money[0] if money else "N/A"

    if not isinstance(money, str) or money == "N/A":
        return None

    # The word form is tried first: a string such as "$12 million" also
    # matches the plain-value pattern, which would lose the multiplier.
    word_syntax = re.search(word_re, money, flags = re.I)
    if word_syntax:
        return parse_word(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value(value_syntax.group())

    return None

In [None]:
# Store numeric versions of the scraped "Budget" and "Box office" fields.
for movie in movieDB:
    budget_text = movie.get("Budget", "N/A")
    box_office_text = movie.get("Box office", "N/A")
    movie["Budget (float)"] = money_conversion(budget_text)
    movie["Box office (float)"] = money_conversion(box_office_text)

### Convert dates to date objects

In [None]:
from datetime import datetime

In [None]:
# Raw "Release date" field of every movie ("N/A" where the field is missing).
dates = [
    entry.get('Release date', 'N/A')
    for entry in movieDB
]

In [None]:
def clean_date(date):
    """Return the part of ``date`` before the first "(", without outer whitespace."""
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

In [None]:
def date_conversion(date):
    """Convert a scraped release-date field to a ``datetime``.

    Args:
        date: Date text, a list of such strings (only the first entry is
            used), or ``"N/A"``.

    Returns:
        A ``datetime`` parsed from the text before the first "(" using the
        first matching format among ``"%B %d, %Y"``, ``"%d %B %Y"`` and
        ``"%Y"``. ``None`` when the field is ``"N/A"``, empty, not a string,
        or matches none of the formats.
    """
    if isinstance(date, list):
        # An empty list carries no value; treat it like a missing field
        # instead of raising IndexError.
        date = date[0] if date else "N/A"

    if not isinstance(date, str) or date == "N/A":
        return None

    date_string = clean_date(date)

    for fmt in ("%B %d, %Y", "%d %B %Y", "%Y"):
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            # strptime signals a format mismatch with ValueError; anything
            # else is a real error and is allowed to propagate.
            continue

    return None

In [None]:
# Store a parsed datetime next to the scraped "Release date" field.
for movie in movieDB:
    raw_release_date = movie.get('Release date', 'N/A')
    movie["Release date (datetime)"] = date_conversion(raw_release_date)

### Dropping extra columns

In [None]:
# Columns to discard, chosen because more than 85% of their entries are empty.
# The list is hard-coded; the ratio is not recomputed here.
sparse_columns = [
    'Japanese',
    'Hepburn',
    'Adaptation by',
    'Animation by',
    'Traditional',
    'Simplified',
    'Created by',
    'Original work',
    'Owned by',
]

movieDB = pd.DataFrame(movieDB)
# errors='ignore': a column missing from a particular scrape must not abort
# the notebook with a KeyError.
movieDB = movieDB.drop(columns=sparse_columns, errors='ignore')

### Save with Pickle

In [None]:
import pickle
# Serialise the cleaned table to disk in binary mode.
with open('movieDB.pickle', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(movieDB)

### Load from Pickle

In [None]:
# Read the table back from the pickle file written by this notebook.
with open('movieDB.pickle', "rb") as pickle_file:
    movieDB = pickle.Unpickler(pickle_file).load()

## 4. Save Data as CSV

In [None]:
# Export movieDB (the object loaded from the pickle above) to a CSV file in
# the working directory, using to_csv's default options.
movieDB.to_csv('movieDB_new.csv')