### Disney Dataset Creation (w/ Python BeautifulSoup)
Scrape and clean a list of Disney Wikipedia pages to create a dataset for further analysis.

### Task #1: Get Info Box (store in Python dictionary)

#### Import Necessary Libraries




In [None]:
from bs4 import BeautifulSoup as bs
import requests

#### Load the webpage

In [None]:
# Download the Toy Story 3 article; r.content holds the raw response body.
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
# NOTE(review): no parser is named here, so bs4 chooses one itself — confirm
# the result is the same on every machine this notebook runs on.
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

In [None]:
# Locate the film info box by its class string and collect its table rows.
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")
# Dump each row's HTML to see how the fields are laid out.
for row in info_rows:
    print(row.prettify())

In [None]:
def get_content_value(row_data):
    """Extract the text of an info-box cell.

    Returns a list with one string per ``<li>`` when the cell contains list
    items; otherwise returns the whole cell's text as a single string. Text
    fragments are joined with a space, stripped, and have non-breaking
    spaces replaced with regular spaces.
    """
    def _text_of(node):
        return node.get_text(" ", strip=True).replace("\xa0", " ")

    if not row_data.find("li"):
        return _text_of(row_data)
    return [_text_of(item) for item in row_data.find_all("li")]

# Build a {field name: value} dictionary from the info-box rows.
movie_info = {}
for index, row in enumerate(info_rows):
    if index == 1:
        # The second row is deliberately skipped.
        continue
    header_text = row.find("th").get_text(" ", strip=True)
    if index == 0:
        # The first row's header text is stored as the title.
        movie_info['title'] = header_text
    else:
        movie_info[header_text] = get_content_value(row.find("td"))

movie_info

### Task #2: Get info box for all movies

In [None]:
# Download the list page that links to the individual film articles.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

In [None]:
# Select every <i> element inside tables that carry both the "wikitable" and
# "sortable" classes.
movies = soup.select(".wikitable.sortable i")
# Preview the first ten matches.
movies[0:10]

In [None]:
def get_content_value(row_data):
    """Extract the value(s) held in an info-box cell.

    Args:
        row_data: The ``<td>`` element of an info-box row.

    Returns:
        A list of strings when the cell contains ``<li>`` items (one string
        per item) or ``<br>`` separators (one string per text fragment);
        otherwise the whole cell's text as a single string. In every case
        non-breaking spaces are replaced with regular spaces.
    """
    def _normalize(text):
        return text.replace("\xa0", " ")

    if row_data.find("li"):
        return [_normalize(li.get_text(" ", strip=True)) for li in row_data.find_all("li")]
    if row_data.find("br"):
        # Normalize non-breaking spaces here as well, so values such as
        # "85\xa0minutes" can later be split on a plain space like the
        # values produced by the other two branches.
        return [_normalize(text) for text in row_data.stripped_strings]
    return _normalize(row_data.get_text(" ", strip=True))

def clean_tags(soup):
    """Remove every <sup> and <span> element found under *soup*, in place."""
    unwanted = soup.find_all(["sup", "span"])
    for element in unwanted:
        element.decompose()
        
def get_info_box(url):
    """Scrape the info box of a film page into a dictionary.

    Args:
        url: Address of the page to download.

    Returns:
        A dict whose 'title' entry is the header text of the first info-box
        row, plus one entry per later row that has both a ``<th>`` (used as
        the key) and a ``<td>`` (converted with ``get_content_value``).

    Raises:
        ValueError: If the page has no element with class "infobox vevent",
            or the first info-box row has no header to use as the title.
    """
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    if info_box is None:
        raise ValueError(f"No info box found at {url}")

    # Only the info box is read below, so only it needs cleaning.
    clean_tags(info_box)

    movie_info = {}
    for index, row in enumerate(info_box.find_all("tr")):
        header = row.find("th")
        if index == 0:
            if header is None:
                raise ValueError(f"Info box at {url} has no title row")
            movie_info['title'] = header.get_text(" ", strip=True)
            continue

        cell = row.find("td")
        # Rows without both a header and a value cell carry no key/value pair.
        if header is None or cell is None:
            continue
        movie_info[header.get_text(" ", strip=True)] = get_content_value(cell)

    return movie_info

In [None]:
# Try the scraper on a single film page and display the resulting dictionary.
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

In [None]:
from urllib.parse import urljoin

# Collect the links to every film listed in the sortable tables.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    # Progress indicator: print every tenth index.
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        # urljoin avoids the doubled slash that plain concatenation produced
        # ("https://en.wikipedia.org/" + "/wiki/...").
        full_path = urljoin(base_path, relative_path)

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best effort: report the film that failed and keep going.
        print(movie.get_text())
        print(e)
    

In [None]:
# Number of film records collected so far.
len(movie_info_list)

#### Save/Reload Movie Data

In [None]:
import json

def save_data(title, data):
    """Write *data* as UTF-8 JSON (2-space indent, non-ASCII kept as-is) to the file *title*."""
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)

In [None]:
import json

def load_data(title):
    """Read the UTF-8 JSON file *title* and return the decoded object."""
    with open(title, encoding="utf-8") as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)

In [None]:
# Persist the scraped records so later cells can reload them from disk.
save_data("disney_data_cleaned.json", movie_info_list)

### Task #3: Clean our data!

In [None]:
# Start the cleaning steps from the records saved to disk.
movie_info_list = load_data("disney_data_cleaned.json")

#### Subtasks
- ~~Clean up references [1]~~
- ~~Convert running time into an integer~~
- Convert dates into datetime object
- ~~Split up the long strings~~
- ~~Convert Budget & Box office to numbers~~

In [None]:
# Inspect one record (40th from the end) to see what needs cleaning.
movie_info_list[-40]

In [None]:
# List every raw 'Running time' value; 'N/A' marks records without one.
print([movie.get('Running time', 'N/A') for movie in movie_info_list])


In [None]:
# "85 minutes"
def minutes_to_integer(running_time):
    """Convert a running-time value such as "85 minutes" to an integer.

    Args:
        running_time: A string, a list of strings (only the first entry is
            used), or the placeholder "N/A".

    Returns:
        The whole number at the start of the (first) string as an int, or
        None when the value is "N/A", an empty list, not a string, or does
        not start with digits.
    """
    # Local import: this cell appears before the notebook's own ``import re``.
    import re

    if isinstance(running_time, list):
        running_time = running_time[0] if running_time else None
    if not isinstance(running_time, str) or running_time == "N/A":
        return None

    # Reading only the leading digits copes with "85\xa0minutes" and ranges
    # such as "85–90 minutes", which int() on the first space-separated
    # token could not parse.
    leading_number = re.match(r"\s*(\d+)", running_time)
    return int(leading_number.group(1)) if leading_number else None

# Attach the parsed running time to every record under a new key.
for movie in movie_info_list:
    raw_running_time = movie.get('Running time', "N/A")
    movie['Running time (int)'] = minutes_to_integer(raw_running_time)



In [None]:
# Check the converted running times.
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])


In [None]:
# List every raw 'Budget' value to see which formats must be parsed.
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

In [None]:
import re

# Magnitude words that may follow a dollar figure.
amounts = r"thousand|million|billion"
# Digits with optional thousands separators and at most one decimal part
# (the earlier "\.*\d*" also accepted "12..5", which float() cannot parse).
number = r"\d+(,\d{3})*(\.\d+)?"

# "$12.2 million", "$12-15 million", "$12 to 15 million", "$12–15 million"
word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
# "$790,000"
value_re = rf"\${number}"

# Multiplier for each magnitude word matched by ``amounts``.
_WORD_VALUES = {"thousand": 1000, "million": 1000000, "billion": 1000000000}


def word_to_value(word):
    """Return the multiplier for a magnitude word (case-insensitive).

    Raises:
        KeyError: If *word* is not thousand, million or billion.
    """
    return _WORD_VALUES[word.lower()]


def parse_word_syntax(string):
    """Convert text such as "$12.2 million" to a float (12200000.0).

    Only the first number in *string* is used, so a range yields its lower
    bound.
    """
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group()
    return value * word_to_value(word)


def parse_value_syntax(string):
    """Convert text such as "$790,000" to a float (790000.0)."""
    value_string = re.search(number, string).group()
    return float(value_string.replace(",", ""))


def money_conversion(money):
    """Convert a dollar amount from an info box to a float.

    Examples:
        money_conversion("$12.2 million") --> 12200000.0  ## Word syntax
        money_conversion("$790,000") --> 790000.0         ## Value syntax

    Args:
        money: A string, a list of strings (only the first entry is used),
            or the placeholder "N/A".

    Returns:
        The amount as a float, or None when the value is "N/A", an empty
        list, not a string, or contains no recognizable dollar amount.
    """
    if isinstance(money, list):
        money = money[0] if money else None
    if not isinstance(money, str) or money == "N/A":
        return None

    # Try the word form first: "$12 million" also matches the plain value
    # pattern, which would yield 12.0.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [None]:
# Add numeric versions of the two money fields to every record.
for movie in movie_info_list:
    raw_budget = movie.get('Budget', "N/A")
    movie['Budget (float)'] = money_conversion(raw_budget)
    raw_box_office = movie.get('Box office', "N/A")
    movie['Box office (float)'] = money_conversion(raw_box_office)

In [None]:
# Spot-check the conversion on one record; str() turns a list-valued budget
# into its printed form before parsing.
money_conversion(str(movie_info_list[-40]["Budget"]))

In [None]:
# Convert dates into datetimes: first list every raw 'Release date' value to
# see which formats occur.
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

In [None]:
# Inspect one record (50th from the end).
movie_info_list[-50]

In [None]:
# June 28, 1950
from datetime import datetime

# Raw release-date values, with 'N/A' for records that have none.
dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Return the part of *date* before the first "(", stripped of whitespace."""
    return date.split("(")[0].strip()


def date_conversion(date):
    """Convert a release-date value to a ``datetime``.

    Args:
        date: A string, a list of strings (only the first entry is used),
            or the placeholder "N/A".

    Returns:
        A datetime parsed from "June 28, 1950" or "28 June 1950" style text
        (anything from the first "(" onward is ignored), or None when the
        value is "N/A", an empty list, not a string, or in another format.
    """
    if isinstance(date, list):
        date = date[0] if date else None
    if not isinstance(date, str) or date == "N/A":
        return None

    date_str = clean_date(date)

    for fmt in ("%B %d, %Y", "%d %B %Y"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # Text does not follow this layout; try the next one.
            continue
    return None


In [None]:
# Attach the parsed release date to every record under a new key.
for movie in movie_info_list:
    raw_release_date = movie.get('Release date', 'N/A')
    movie['Release date (datetime)'] = date_conversion(raw_release_date)

In [None]:
# Inspect one record to confirm the new datetime field.
movie_info_list[50]

In [None]:
import pickle

def save_data_pickle(name, data):
    """Pickle *data* into the binary file *name*, overwriting any existing file."""
    with open(name, mode='wb') as out_file:
        pickle.dump(data, out_file)

In [None]:
import pickle

def load_data_pickle(name):
    """Return the object unpickled from the binary file *name*.

    Only use this on files you created yourself: unpickling can run
    arbitrary code.
    """
    with open(name, mode='rb') as in_file:
        restored = pickle.load(in_file)
    return restored

In [None]:
# Pickle is used here because the records now hold datetime objects.
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [None]:
# Read the pickle back to verify the round trip.
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [None]:
# True when the reloaded data equals the in-memory list.
a == movie_info_list

### Task #4: Attach IMDB/Rotten Tomatoes/Metascore scores

In [None]:
# Start the scoring step from the cleaned records saved to disk.
movie_info_list = load_data_pickle('disney_movie_data_cleaned_more.pickle')

In [None]:
# Inspect one record (60th from the end).
movie_info_list[-60]

In [None]:
# http://www.omdbapi.com/?apikey=[yourkey]&

In [None]:
import requests
import urllib
import os

def get_omdb_info(title, timeout=10):
    """Look up a movie by title in the OMDb API and return the decoded JSON.

    Args:
        title: Movie title, sent as the ``t`` query parameter.
        timeout: Seconds to wait for the server before requests raises a
            timeout error instead of hanging indefinitely.

    Returns:
        The JSON body of the response, decoded by ``Response.json()``.

    Raises:
        KeyError: If the OMDB_API_KEY environment variable is not set.
    """
    parameters = {"apikey": os.environ['OMDB_API_KEY'], 't': title}
    # HTTPS keeps the API key out of clear text, and letting requests encode
    # the query string avoids relying on ``urllib.parse`` having been loaded
    # by a bare ``import urllib``.
    response = requests.get("https://www.omdbapi.com/", params=parameters, timeout=timeout)
    return response.json()

def get_rotten_tomato_score(omdb_info):
    """Return the 'Value' of the first 'Rotten Tomatoes' entry in omdb_info['Ratings'].

    Returns None when there is no 'Ratings' key or no such entry.
    """
    matching_values = (
        entry['Value']
        for entry in omdb_info.get('Ratings', [])
        if entry['Source'] == 'Rotten Tomatoes'
    )
    return next(matching_values, None)

# Try one lookup and display the raw response.
get_omdb_info("into the woods")

In [None]:
# Query OMDb once per record and copy the three scores onto it.
for movie in movie_info_list:
    omdb_info = get_omdb_info(movie['title'])
    for field, omdb_key in (('imdb', 'imdbRating'), ('metascore', 'Metascore')):
        movie[field] = omdb_info.get(omdb_key)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [None]:
# Inspect one record to confirm the scores were attached.
movie_info_list[-30]

In [None]:
def _score_to_float(value):
    """Convert a score such as "7.4" or "71%" to a float, or None if unavailable."""
    if value is None:
        return None
    if isinstance(value, (int, float)):
        # Already numeric, e.g. when this cell is run a second time.
        return float(value)
    try:
        return float(value.strip().rstrip('%'))
    except ValueError:
        # Non-numeric text (for instance a "not available" placeholder).
        return None


# Scores are None when the lookup found nothing, so convert defensively
# instead of calling float() / .strip() on them directly.
for movie in movie_info_list:
    for key in ('imdb', 'metascore', 'rotten_tomatoes'):
        movie[key] = _score_to_float(movie.get(key))

In [None]:
# Save the records with scores attached.
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

### Task #5: Save data as JSON & CSV

In [None]:
# Inspect one record before exporting.
movie_info_list[50]

In [None]:
# Shallow-copy each record so the edits below leave movie_info_list untouched.
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [None]:
# Turn each datetime back into text (e.g. "June 28, 1950") before the records
# are written as JSON; records without a date get None.
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    movie['Release date (datetime)'] = (
        current_date.strftime("%B %d, %Y") if current_date else None
    )

In [None]:
# Write the copy whose dates were converted to strings.
save_data("disney_data_final.json", movie_info_copy)

#### Convert data to CSV

In [None]:
import pandas as pd

# Build a DataFrame from the list of record dicts.
df = pd.DataFrame(movie_info_list)

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Export the table; no index argument is given, so pandas' default applies.
df.to_csv("disney_movie_data_final.csv")

In [None]:
# Longest films first: sort by the parsed running time, descending.
running_times = df.sort_values(by='Running time (int)', ascending=False)
running_times.head(n=20)