**Disney Dataset Creation**

Scrape & clean a list of disney wikipedia pages to create a dataset to further analyze.

**Task 1: Get Info Box (store in a Python dictionary)**

In [1]:
# important libraries
from bs4 import BeautifulSoup as bs
import requests

In [2]:
r = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')

soup = bs(r.content)

contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Toy Story 3 - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"69459303-91d4-43a5-946a-cbba696bde4d","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Toy_Story_3","wgTitle":"Toy Story 3","wgCurRevisionId":1038331984,"wgRevisionId":1038331984,"wgArticleId":1213838,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description matches Wikidata","Good articles","Wikipedia indefinitely semi-protected pages","Use American English f

In [3]:
 info_box = soup.find(class_="infobox vevent")
 info_rows = info_box.find_all("tr")
 for row in info_rows:
   print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div class="infobox-caption">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th class="infobox-label" scope="row" style="white-space: nowrap; padding-right: 0.65em;">
  Directed by
 </th>
 <td class="

In [4]:
def get_content_value(row_data):
  if row_data.find('li'):
    return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all('li')]
  else:
    return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
for index, row in enumerate(info_rows):
  if index == 0:
    movie_info['title'] = row.find('th').get_text(" ", strip=True)
  elif index == 1:
    continue
  else:
    content_key = row.find('th').get_text(" ", strip=True)
    content_value = get_content_value(row.find("td"))
    movie_info[content_key] = content_value

movie_info

{'Box office': '$1.067 billion [1]',
 'Budget': '$200 million [1]',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Country': 'United States',
 'Directed by': 'Lee Unkrich',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Edited by': 'Ken Schretzmann',
 'Language': 'English',
 'Music by': 'Randy Newman',
 'Produced by': 'Darla K. Anderson',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Screenplay by': 'Michael Arndt',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'title': 'Toy Story 3'}

In [5]:
# Task 2: Get info box for all movies
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
content = soup.prettify()
print(content)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Walt Disney Pictures films - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"6ff12344-9685-4012-a663-9343c256bf4a","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Walt_Disney_Pictures_films","wgTitle":"List of Walt Disney Pictures films","wgCurRevisionId":1039130994,"wgRevisionId":1039130994,"wgArticleId":1970335,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","CS1 maint: archived copy as title","Articles with 

In [6]:
movies = soup.select(".wikitable.sortable i")
movies[0:10]

[<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>,
 <i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>,
 <i><a href="/wiki/Dumbo" title="Dumbo">Dumbo</a></i>,
 <i><a href="/wiki/Bambi" title="Bambi">Bambi</a></i>,
 <i><a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a></i>,
 <i><a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a></i>,
 <i><a href="/wiki/The_Three_Caballeros" title=

In [7]:
def get_content_value(row_data):
  if row_data.find('li'):
    return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all('li')]
  elif row_data.find('br'):
    return [text for text in row_data.stripped_strings]
  else:
    return row_data.get_text( " ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
  for tag in soup.find_all(['sup', 'span']):
    tag.decompose()

def get_info_box(url):

  r = requests.get(url)
  soup = bs(r.content)
  info_box = soup.find(class_ = "infobox vevent")
  info_rows = info_box.find_all('tr')

  clean_tags(soup)

  movie_info = {}
  for index, row in enumerate(info_rows):
    if index == 0:
      movie_info['title'] = row.find('th').get_text(" ", strip=True)
    else:
      header = row.find('th')
      if header:
        content_key = row.find('th').get_text(" ", strip=True)
        content_value = get_content_value(row.find('td'))
        movie_info[content_key] = content_value
  
  return movie_info

In [8]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'Box office': '$2 million',
 'Cinematography': 'Charles F. Wheeler',
 'Country': 'United States',
 'Directed by': 'Bernard McEveety',
 'Distributed by': 'Buena Vista Distribution',
 'Edited by': 'Robert Stafford',
 'Language': 'English',
 'Music by': 'Jerry Goldsmith',
 'Produced by': 'Winston Hibler',
 'Production company': 'Walt Disney Productions',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Written by': 'Harry Spalding',
 'title': 'One Little Indian'}

In [9]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = 'https://en.wikipedia.org/'

movie_info_list = []
for index, movie in enumerate(movies):
  if index % 10 == 0:
    print(index)
  try:
    relative_path = movie['href']
    full_path = base_path + relative_path
    title = movie['title']

    movie_info_list.append(get_info_box(full_path))

  except Exception as e:
    print(movie.get_text())
    print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
The London Connection
'NoneType' object has no attribute 'find'
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Better Nate Than Never
'NoneType' object has no attribute 'find_all'


In [10]:
len(movie_info_list)

449

**Save/Reload Movie Data**

In [11]:
import json

def save_data(title, data):
  with open(title, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
import json

def load_data(title):
  with open(title, encoding='utf-8') as f:
    return json.load(f)

In [13]:
save_data("disney_data_cleaned.json", movie_info_list)

In [14]:
load_data('disney_data_cleaned.json')

[{'Box office': '$45.472',
  'Country': 'United States',
  'Language': 'English',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'title': 'Academy Award Review of'},
 {'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Box office': '$418 million',
  'Budget': '$1.49 million',
  'Country': 'United States',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Distributed by': 'RKO Radio Pictures',
  'Language': 'English',
  'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
  'Produced by': 'Walt Disney',
  'Production company': 'Walt Disney Productions',
  'Release date': ['December 21, 1937 ( Carthay Circle Theatre )'],
  'Running time': '83 minutes',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan'

Task 3: Data Cleaning

In [15]:
movie_info_list = load_data("disney_data_cleaned.json")

**Subtasks**



*   --Clean up references--
*   --Convert running time into an integer--
*   Convert dates into datetime object
*   --Split up the long strings--
*   --Convert Budget & Box office to numbers--



In [16]:
movie_info_list[-40]

{'Based on': ['Mary Poppins', 'by', 'P. L. Travers'],
 'Box office': '$349.5 million',
 'Budget': '$130 million',
 'Cinematography': 'Dion Beebe',
 'Country': 'United States',
 'Directed by': 'Rob Marshall',
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Edited by': 'Wyatt Smith',
 'Language': 'English',
 'Music by': 'Marc Shaiman',
 'Produced by': ['Rob Marshall', 'John DeLuca', 'Marc Platt'],
 'Production companies': ['Walt Disney Pictures',
  'Lucamar Productions',
  'Marc Platt Productions'],
 'Release date': ['November 29, 2018 ( Dolby Theatre )',
  'December 19, 2018 (United States)'],
 'Running time': '131 minutes',
 'Screenplay by': 'David Magee',
 'Starring': ['Emily Blunt',
  'Lin-Manuel Miranda',
  'Ben Whishaw',
  'Emily Mortimer',
  'Julie Walters',
  'Colin Firth',
  'Meryl Streep'],
 'Story by': ['David Magee', 'Rob Marshall', 'John DeLuca'],
 'title': 'Mary Poppins Returns'}

In [17]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['41 minutes (74 minutes 1966 release)', '83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '70 min', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS version)', '71 minutes (original)'], '127 minutes', '92 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '83 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '131 minutes', '79 minutes', '97 minutes', '128 minutes', '74 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 min.', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 Minutes', '110 minutes', '80 m

In [18]:
# "131 minutes"
def minutes_to_integer(running_time):
  if running_time =='N/A':
    return None

  if isinstance(running_time, list):
    return int(running_time[0].split(" ")[0])
  else: # is a string
    return int(running_time.split(" ")[0])

for movie in movie_info_list:
  movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', 'N/A'))

In [19]:
print([movie.get('Runnnig time (int)', 'N/A') for movie in movie_info_list])

['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'

In [20]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '

In [21]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|-)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
  value_dict = {"thousand": 1000, "million": 1000000, 'billion': 1000000000}
  return value_dict[word]

def parse_word_syntax(string):
  value_string = re.search(number, string).group()
  value = float(value_string.replace(',', ''))
  word = re.search(amounts, string, flags=re.I).group().lower()
  word_value = word_to_value(word)
  return value*word_value

def parse_value_syntax(string):
  value_string = re.search(number, string).group()
  value = float(value_string.replace(',', ''))
  return value

'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''

def money_conversion(money):
  if money == "N/A":
    return None
  
  if isinstance(money, list):
    money = money[0]

  word_syntax = re.search(word_re, money, flags=re.I)
  value_syntax = re.search(value_re, money)

  if word_syntax:
    return parse_word_syntax(word_syntax.group())

  elif value_syntax:
    return parse_value_syntax(value_syntax.group())
  
  else:
    return None

In [22]:
for movie in movie_info_list:
  movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
  movie['Box Office (float)'] = money_conversion(movie.get('Box Office', 'N/A'))

In [23]:
money_conversion(str(movie_info_list[-40]['Budget']))

130000000.0

In [24]:
# Convert Dates into datatimes
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

[['May 19, 1937'], ['December 21, 1937 ( Carthay Circle Theatre )'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], ['November 13, 1940'], ['June 27, 1941'], ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere – London)', 'August 13, 1942 (Premiere – New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere – Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere – Boston)', 'February 19, 1943 (U.S.)'], ['July 17, 1943'], ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'], ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)'], ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946', 'March 30, 1947 (Stanford Theatre, Palo Alto, California)'], ['September 27, 1947'], 'May 27, 1948', ['November 29, 1948 (Chicago, Illinois)', 'January 19, 1949 (Indianapolis, Indiana)'], ['October 5, 1949'], ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'], ['

In [25]:
movie_info_list[-50]

{'Based on': ['Characters by Ted Elliott Terry Rossio Stuart Beattie Jay Wolpert',
  'Pirates of the Caribbean by Walt Disney'],
 'Box Office (float)': None,
 'Box office': '$794.9 million',
 'Budget': '$230–320 million',
 'Budget (float)': 230.0,
 'Cinematography': 'Paul Cameron',
 'Country': 'United States',
 'Directed by': ['Joachim Rønning', 'Espen Sandberg'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Edited by': ['Roger Barton', 'Leigh Folsom Boyd'],
 'Language': 'English',
 'Music by': 'Geoff Zanelli',
 'Produced by': 'Jerry Bruckheimer',
 'Production companies': ['Walt Disney Pictures', 'Jerry Bruckheimer Films'],
 'Release date': ['May 11, 2017 ( Shanghai Disney Resort )',
  'May 26, 2017 (United States)'],
 'Running time': '129 minutes',
 'Running time (int)': 129,
 'Screenplay by': 'Jeff Nathanson',
 'Starring': ['Johnny Depp',
  'Javier Bardem',
  'Geoffrey Rush',
  'Brenton Thwaites',
  'Kaya Scodelario',
  'Kevin McNally'],
 'Story by': ['Jeff Nathan

In [26]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
  return date.split('(')[0].strip()

def date_conversion(date):
  if isinstance(date, list):
    date = date[0]

  if date == 'N/A':
    return None
  
  date_str = clean_date(date)

  fmts = ["%B %d %Y", "%d %B %Y"]
  for fmt in fmts:
    try:
      return datetime.strptime(date_str, fmt)
    except:
      pass
  return None

In [27]:
for movie in movie_info_list:
  movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [28]:
movie_info_list[50]

{'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Box Office (float)': None,
 'Box office': '$303 million',
 'Budget': '$3.6 million',
 'Budget (float)': 3600000.0,
 'Country': 'United States',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Distributed by': 'Buena Vista Distribution',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Language': 'English',
 'Music by': 'George Bruns',
 'Produced by': 'Walt Disney',
 'Production company': 'Walt Disney Productions',
 'Release date': ['January 25, 1961'],
 'Release date (datetime)': None,
 'Running time': '79 minutes',
 'Running time (int)': 79,
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Story by': 'Bill Peet',
 'title': 'One Hundred and One Dalmatians'}

In [29]:
import pickle

def save_data_pickle(name, data):
  with open(name, 'wb') as f:
    pickle.dump(data, f)

In [30]:
def load_data_pickle(name):
  with open(name, 'rb') as f:
    return pickle.load(f)

In [31]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

In [32]:
a = load_data_pickle('disney_movie_data_final.pickle')

In [33]:
a == movie_info_list

True

**Task 4: Save data to csv**

In [34]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [35]:
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box Office (float),Release date (datetime),Directed by,Written by,Based on,Produced by,Starring,Music by,Distributed by,Budget,Story by,Narrated by,Cinematography,Edited by,Languages,Screenplay by,Countries,Production companies,Color process,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,,NaT,,,,,,,,,,,,,,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre )]",83 minutes,United States,English,$418 million,83.0,1490000.0,,NaT,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by The, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",RKO Radio Pictures,$1.49 million,,,,,,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,,NaT,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,"[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",RKO Radio Pictures,$2.6 million,"[Ted Sears, Otto Englander, Webb Smith, Willia...",,,,,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,,NaT,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,RKO Radio Pictures,$2.28 million,"[Joe Grant, Dick Huemer]",Deems Taylor,James Wong Howe,,,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,,NaT,"[Alfred Werker, (live action), Hamilton Luske,...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",RKO Radio Pictures,"$600,000",,,Bert Glennon,Paul Weatherwax,,,,,,,,,,,


In [36]:
df.to_csv("disney_movie_data_final.csv")

Save to JSON

In [37]:
movie_info_list[50]

{'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Box Office (float)': None,
 'Box office': '$303 million',
 'Budget': '$3.6 million',
 'Budget (float)': 3600000.0,
 'Country': 'United States',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Distributed by': 'Buena Vista Distribution',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Language': 'English',
 'Music by': 'George Bruns',
 'Produced by': 'Walt Disney',
 'Production company': 'Walt Disney Productions',
 'Release date': ['January 25, 1961'],
 'Release date (datetime)': None,
 'Running time': '79 minutes',
 'Running time (int)': 79,
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Story by': 'Bill Peet',
 'title': 'One Hundred and One Dalmatians'}

In [38]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [39]:
for movie in movie_info_copy:
  current_date = movie['Release date (datetime)']
  if current_date:
    movie['Release date (datetime)'] = current_date.strftime("%B %d %Y")
  else:
    movie['Release date (datetime)'] = None

In [40]:
save_data('disney_data_final.json', movie_info_copy)