#### Import nessecary libraries

In [134]:
import requests
from bs4 import BeautifulSoup as bs
import json
from datetime import datetime
import pickle
import urllib
import pandas as pd

#### Scrape info box data from Toy Story 3 wikipedia web page

In [3]:
# load the web page
url = 'https://en.wikipedia.org/wiki/Toy_Story_3'
r = requests.get(url)

# convert to a beautiful soup object
soup = bs(r.content)

In [4]:
# get info box data
info_box = soup.find('table', class_='infobox vevent')

# gets rows
info_rows = info_box.find_all('tr')
for row in info_rows:
    print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [5]:
# process value data retrieval
def get_value(row_data):
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

# put data into python dictionary
movie_data = {}
for index, row in enumerate(info_rows):
    if index == 0:  # movie title
        movie_data['Title'] = row.find('th').get_text()
    elif index == 1:  # image
        pass
    else:
        key = row.find('th').get_text(' ', strip=True)
        value = get_value(row.find('td'))
        movie_data[key] = value

movie_data

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

#### Scrape info box data from all Disney films wikipedia pages

In [20]:
def remove_tags(soup):
    for tag in soup.find_all(['sup', 'span']):
        tag.decompose()

def get_value(row_data):
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    elif row_data.find('br'):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

def get_info_box(url):
    r = requests.get(url)
    web_page = bs(r.content)
    remove_tags(web_page)
    info_box = web_page.find('table', class_='infobox vevent')
    info_rows = info_box.find_all('tr')
    
    movie_data = {}
    for index, row in enumerate(info_rows):
        if index == 0:  # movie title
            movie_data['Title'] = row.find('th').get_text()
        else:
            header = row.find('th')
            if header:
                key = row.find('th').get_text(' ', strip=True)
                value = get_value(row.find('td'))
                movie_data[key] = value
    return movie_data

In [21]:
# test changes made
get_info_box('https://en.wikipedia.org/wiki/One_Little_Indian_(film)')

{'Title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Produced by': 'Winston Hibler',
 'Written by': 'Harry Spalding',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Music by': 'Jerry Goldsmith',
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [23]:
url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
r = requests.get(url)
soup = bs(r.content)
wiki_url = 'https://en.wikipedia.org'
movies = soup.select('.wikitable.sortable i a')
master_movie_data = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        movie_title = movie['title']
        movie_url = movie['href']
        movie_data = get_info_box(wiki_url + movie_url)
        master_movie_data.append(movie_data)
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430


#### Save & load data using JSON

In [131]:
def save_data_json(title, data):
    with open(title, 'w' , encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# save_data_json('disney_data.json', master_movie_data)

In [132]:
def load_data_json(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

# master_movie_data = load_data_json('disney_data.json')

#### Clean the data
- ~~Remove references ([1], [2], etc)~~
- ~~Split up long strings~~
- ~~Convert running times to integers~~
- ~~Convert budget and box office to floats~~
- ~~Convert release dates to datetime objects~~

In [51]:
# Convert running times to integers
print([movie.get('Running time', 'N/A') for movie in master_movie_data])

['41 minutes (74 minutes 1966 release)', '83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '65 min.', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS version)', '71 minutes (original)'], '127 minutes', '92 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '83 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '74 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 min.', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 Minutes', '110 minutes', '80 

In [52]:
def minutes_to_integer(running_time):
    # gets first element of list
    if isinstance(running_time, list):
        running_time = running_time[0]
    if running_time == 'N/A':
        return None
    return int(running_time.split(' ')[0])

for movie in master_movie_data:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', 'N/A'))

print([movie.get('Running time (int)') for movie in master_movie_data])

[41, 83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 74, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 92, 131, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 95, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 100, 112, 84, 98, 97, 114, 96, 100, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 89, 74, 90, 89, 110, 74, 93, 84, 83, 69, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 93, 92, 98, 95, 93, 87, 93, 87, 128, 86, 95, 114, 93, 

In [53]:
# Convert budget and box office to floats
print([movie.get('Budget', 'N/A') for movie in master_movie_data])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.9 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '

In [59]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"
standard = fr"\${number}(-|\sto\s|–)?({number})?\s({amounts})"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict.get(word.lower(), 1)

def parse_word_syntax(string):
    stripped_string = string.replace(",", "")
    value = float(re.search(number, stripped_string).group())
    modifier = word_to_value(re.search(amounts, string, flags=re.I).group())
    return value * modifier

def parse_value_syntax(string):
    stripped_string = string.replace(",", "")
    return float(re.search(number, stripped_string).group())

def money_conversion(money):
    if money == 'N/A':
        return None
    
    if type(money) == list:
        money = money[0]

    word_syntax = re.search(standard, money, flags=re.I)
    value_syntax = re.search(fr"\${number}", money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    else:
        return None

for movie in master_movie_data:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

print([movie.get('Budget (float)') for movie in master_movie_data])

[None, 1490000.0, 2600000.0, 2280000.0, 600000.0, 950000.0, 858000.0, None, 788000.0, None, 1350000.0, 2125000.0, None, 1500000.0, 1500000.0, None, 2900000.0, 1800000.0, 3000000.0, None, 4000000.0, 2000000.0, 300000.0, 1800000.0, None, 5000000.0, None, 4000000.0, None, None, None, None, None, None, 700000.0, None, None, None, None, None, 6000000.0, 1000000.0, None, 2000000.0, None, None, 2500000.0, None, None, 4000000.0, 3600000.0, None, None, None, None, 3000000.0, None, 3000000.0, None, None, None, None, None, None, None, None, None, 3000000.0, None, None, None, None, 4400000.0, None, None, None, None, None, None, None, None, None, None, None, 4000000.0, None, 5000000.0, None, None, None, None, 5000000.0, None, None, None, None, None, None, 4000000.0, None, None, None, 6300000.0, None, None, None, None, None, None, None, None, 5000000.0, None, None, None, None, 8000000.0, None, None, None, None, None, 1000000.0, None, None, None, None, 5000000.0, None, None, None, 7500000.0, None, 10

In [61]:
# Convert release dates to datetime objects
print([movie.get('Release date', 'N/A') for movie in master_movie_data])

[['May 19, 1937'], ['December 21, 1937 ( Carthay Circle Theatre , Los Angeles , CA )', 'February 4, 1938 (United States)'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], ['November 13, 1940'], ['June 20, 1941'], ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere-London)', 'August 13, 1942 (Premiere-New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere-Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere-Boston)', 'February 19, 1943 (U.S.)'], ['July 17, 1943'], ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'], ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)'], ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946'], ['September 27, 1947'], 'May 27, 1948', ['November 29, 1948 (Chicago, Illinois)', 'January 19, 1949 (Indianapolis, Indiana)'], ['October 5, 1949'], ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'], ['June 22, 1950

In [124]:
def clean_date(date):
    return date.split('(')[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == 'N/A':
        return None
    
    date_str = clean_date(date)
    formats = ['%B %d, %Y', '%d %B %Y', '%Y']
    for format in formats:
        try:
            return datetime.strptime(date_str, format)
        except:
            pass
    return None


for movie in master_movie_data:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

print([movie.get('Release date (datetime)') for movie in master_movie_data])

[datetime.datetime(1937, 5, 19, 0, 0), datetime.datetime(1937, 12, 21, 0, 0), datetime.datetime(1940, 2, 7, 0, 0), datetime.datetime(1940, 11, 13, 0, 0), datetime.datetime(1941, 6, 20, 0, 0), datetime.datetime(1941, 10, 23, 0, 0), datetime.datetime(1942, 8, 9, 0, 0), datetime.datetime(1942, 8, 24, 0, 0), datetime.datetime(1943, 7, 17, 0, 0), datetime.datetime(1944, 12, 21, 0, 0), datetime.datetime(1946, 4, 20, 0, 0), datetime.datetime(1946, 11, 12, 0, 0), datetime.datetime(1947, 9, 27, 0, 0), datetime.datetime(1948, 5, 27, 0, 0), datetime.datetime(1948, 11, 29, 0, 0), datetime.datetime(1949, 10, 5, 0, 0), datetime.datetime(1950, 2, 15, 0, 0), datetime.datetime(1950, 6, 22, 0, 0), datetime.datetime(1951, 7, 26, 0, 0), datetime.datetime(1952, 3, 13, 0, 0), datetime.datetime(1953, 2, 5, 0, 0), datetime.datetime(1953, 8, 8, 0, 0), datetime.datetime(1953, 11, 10, 0, 0), datetime.datetime(1953, 10, 26, 0, 0), datetime.datetime(1954, 8, 17, 0, 0), datetime.datetime(1954, 12, 23, 0, 0), dateti

#### Save & load data using Pickle

In [73]:
def save_data_pickle(title, data):
    with open(title, 'wb') as f:
        pickle.dump(data, f)

save_data_pickle('disney_data_cleaned_more.pickle',  master_movie_data)

In [75]:
def load_data_pickle(title):
    with open(title, 'rb') as f:
        return pickle.load(f)

master_movie_data = load_data_pickle('disney_data_cleaned_more.pickle')

#### Attach IMDB, Rotten Tomatoes and Metascors ratings

In [107]:
def get_omdb_info(title):
    base_url = 'http://www.omdbapi.com/?'
    parameters = {'apikey': 'b376b4a9', 't': title}
    parameters_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + parameters_encoded
    return requests.get(full_url).json()

def get_rotten_tomatoe_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
             return rating['Value']
    return None

In [108]:
for movie in master_movie_data:
    title = movie['Title']
    omdb_info = get_omdb_info(title)
    movie['IMDB'] = omdb_info.get('imdbRating', None)
    movie['Metascore'] = omdb_info.get('Metascore', None)
    movie['Rotten Tomatoes'] = get_rotten_tomatoe_score(omdb_info)

master_movie_data[50]

http://www.omdbapi.com/?apikey=b376b4a9&t=Academy+Award+Review+of+
http://www.omdbapi.com/?apikey=b376b4a9&t=Snow+White+and+the+Seven+Dwarfs
http://www.omdbapi.com/?apikey=b376b4a9&t=Pinocchio
http://www.omdbapi.com/?apikey=b376b4a9&t=Fantasia
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Reluctant+Dragon
http://www.omdbapi.com/?apikey=b376b4a9&t=Dumbo
http://www.omdbapi.com/?apikey=b376b4a9&t=Bambi
http://www.omdbapi.com/?apikey=b376b4a9&t=Saludos+Amigos
http://www.omdbapi.com/?apikey=b376b4a9&t=Victory+Through+Air+Power
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Three+Caballeros
http://www.omdbapi.com/?apikey=b376b4a9&t=Make+Mine+Music
http://www.omdbapi.com/?apikey=b376b4a9&t=Song+of+the+South
http://www.omdbapi.com/?apikey=b376b4a9&t=Fun+and+Fancy+Free
http://www.omdbapi.com/?apikey=b376b4a9&t=Melody+Time
http://www.omdbapi.com/?apikey=b376b4a9&t=So+Dear+to+My+Heart
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Adventures+of+Ichabod+andMr.+Toad
http://www.omdbapi.com/?apikey=b37

http://www.omdbapi.com/?apikey=b376b4a9&t=The+Rescuers
http://www.omdbapi.com/?apikey=b376b4a9&t=Herbie+Goes+to+Monte+Carlo
http://www.omdbapi.com/?apikey=b376b4a9&t=Pete%27s+Dragon
http://www.omdbapi.com/?apikey=b376b4a9&t=Candleshoe
http://www.omdbapi.com/?apikey=b376b4a9&t=Return+from+Witch+Mountain
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Cat+from+Outer+Space
http://www.omdbapi.com/?apikey=b376b4a9&t=Hot+Lead+and+Cold+Feet
http://www.omdbapi.com/?apikey=b376b4a9&t=The+North+Avenue+Irregulars
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Apple+Dumpling+Gang+Rides+Again
http://www.omdbapi.com/?apikey=b376b4a9&t=Unidentified+Flying+Oddball
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Black+Hole
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Omega+Connection
http://www.omdbapi.com/?apikey=b376b4a9&t=Midnight+Madness
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Watcher+in+the+Woods
http://www.omdbapi.com/?apikey=b376b4a9&t=Herbie+Goes+Bananas
http://www.omdbapi.com/?apikey=b376b4a

http://www.omdbapi.com/?apikey=b376b4a9&t=The+Haunted+Mansion
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Young+Black+Stallion
http://www.omdbapi.com/?apikey=b376b4a9&t=Teacher%27s+Pet
http://www.omdbapi.com/?apikey=b376b4a9&t=Miracle
http://www.omdbapi.com/?apikey=b376b4a9&t=Confessions+of+a+TeenageDrama+Queen
http://www.omdbapi.com/?apikey=b376b4a9&t=Home+on+the+Range
http://www.omdbapi.com/?apikey=b376b4a9&t=Sacred+Planet
http://www.omdbapi.com/?apikey=b376b4a9&t=Around+the+World+in+80+Days
http://www.omdbapi.com/?apikey=b376b4a9&t=America%27s+Heart+and+Soul
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Princess+Diaries+2%3ARoyal+Engagement
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Incredibles
http://www.omdbapi.com/?apikey=b376b4a9&t=National+Treasure
http://www.omdbapi.com/?apikey=b376b4a9&t=Aliens+of+the+Deep
http://www.omdbapi.com/?apikey=b376b4a9&t=Pooh%27s+Heffalump+Movie
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Pacifier
http://www.omdbapi.com/?apikey=b376b4a9&t=Ice

http://www.omdbapi.com/?apikey=b376b4a9&t=Cars+3
http://www.omdbapi.com/?apikey=b376b4a9&t=Ghost+of+the+Mountains
http://www.omdbapi.com/?apikey=b376b4a9&t=Jagga+Jasoos
http://www.omdbapi.com/?apikey=b376b4a9&t=Coco
http://www.omdbapi.com/?apikey=b376b4a9&t=A+Wrinkle+in+Time
http://www.omdbapi.com/?apikey=b376b4a9&t=Incredibles+2
http://www.omdbapi.com/?apikey=b376b4a9&t=Christopher+Robin
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Nutcracker+and+the+Four+Realms
http://www.omdbapi.com/?apikey=b376b4a9&t=Ralph+Breaks+the+Internet
http://www.omdbapi.com/?apikey=b376b4a9&t=Mary+Poppins+Returns
http://www.omdbapi.com/?apikey=b376b4a9&t=Dumbo
http://www.omdbapi.com/?apikey=b376b4a9&t=Penguins
http://www.omdbapi.com/?apikey=b376b4a9&t=Aladdin
http://www.omdbapi.com/?apikey=b376b4a9&t=Toy+Story+4
http://www.omdbapi.com/?apikey=b376b4a9&t=The+Lion+King
http://www.omdbapi.com/?apikey=b376b4a9&t=Maleficent%3A+Mistress+of+Evil
http://www.omdbapi.com/?apikey=b376b4a9&t=Lady+and+the+Tramp
http://

#### Save data as JSON & CSV

In [129]:
# make copy of data to convert datetime objects to strings
copy_movie_data = [movie.copy() for movie in master_movie_data]
copy_movie_data[50]

{'Title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0),
 'IMDB': '7.2',
 'Metascore': '83',
 'Rotten Tomatoes': '98%'}

In [130]:
for movie in copy_movie_data:
    date = movie['Release date (datetime)']
    if date:
        movie['Release date (datetime)'] = date.strftime('%B %d, %Y')

copy_movie_data[50]

{'Title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': 'January 25, 1961',
 'IMDB': '7.2',
 'Metascore': '83',
 'Rotten Tomatoes': '98%'}

In [133]:
save_data_json('disney_data_final.json', copy_movie_data)

In [136]:
# convert data to pandas dataframe
df = pd.DataFrame(master_movie_data)
df.head()

Unnamed: 0,Title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Narrated by,Cinematography,Edited by,Screenplay by,Production companies,Japanese,Hepburn,Adaptation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million,126.0,2280000.0,83300000.0,...,Deems Taylor,James Wong Howe,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 20, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,,Bert Giennon,Paul Weatherwax,,,,,,,


In [138]:
# output to csv file
df.to_csv('disney_data_final.csv')