## Task #3: Data Cleaning

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import json

def load_data(title):
    """Read the UTF-8 JSON file at path ``title`` and return the parsed object."""
    with open(title, encoding="utf-8") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

In [2]:
movie_info_list = load_data('disney_data_cleaned.json')

In [3]:
movie_info_list[-40]

{'title': 'The Ice Age Adventures of Buck Wild',
 'Directed by': 'John C. Donkin',
 'Screenplay by': ['Jim Hecht', 'William Schifrin', 'Ray DeLaurentis'],
 'Story by': 'Jim Hecht',
 'Based on': ['Characters', 'by', 'Michael J. Wilson'],
 'Produced by': 'Denise L. Rottina',
 'Starring': ['Simon Pegg',
  'Utkarsh Ambudkar',
  'Justina Machado',
  'Vincent Tong',
  'Aaron Harris',
  'Dominique Jennings',
  'Jake Green',
  'Sean Kenin Elias-Reyes',
  'Skyler Stone'],
 'Edited by': 'Braden Oberson',
 'Music by': 'Batu Sener',
 'Production companies': ['Walt Disney Pictures',
  '20th Century Animation',
  'Bardel Entertainment'],
 'Distributed by': 'Disney+',
 'Release dates': ['January 28, 2022 (North America)',
  'March 25, 2022 (International)',
  'September 27, 2022 (Home media; DVD, Blu-ray & 4K)'],
 'Running time': '82 minutes',
 'Countries': ['United States', 'Canada'],
 'Language': 'English'}

#### Converting running times into an integer


In [4]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '65 min', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS and Wild Discovery version)', '71 minutes (original)'], '127 minutes', '93 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '84 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '73 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 minutes', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 minutes', '110 minutes', '80 min.', '74 minutes'

In [5]:
def minutes_to_integer(running_time):
    """Convert a scraped running-time value into a whole number of minutes.

    Parameters
    ----------
    running_time : str or list of str
        A string such as ``'85 minutes'`` or ``'90 min.'``, or a list of such
        strings (only the first entry is used). The sentinel ``"N/A"`` marks
        a missing value.

    Returns
    -------
    int or None
        The integer formed by the leading digits of the (first) string, or
        ``None`` when the value is missing, the list is empty, the value is
        not a string, or the string does not start with a number.
    """
    if isinstance(running_time, list):
        # Several running times were listed, e.g. ['85 minutes', '90 minutes']:
        # keep the first one, and treat an empty list as "missing".
        if not running_time:
            return None
        running_time = running_time[0]

    if not isinstance(running_time, str):
        return None

    text = running_time.strip()

    # Read the leading run of ASCII digits instead of splitting on a space,
    # so '85min' and ' 85 minutes' parse too and 'N/A' simply yields None.
    end = 0
    while end < len(text) and "0" <= text[end] <= "9":
        end += 1

    return int(text[:end]) if end else None


# Store the parsed runtime next to the raw 'Running time' value on every record.
for movie in movie_info_list:
    movie.update({'Running time (int)': minutes_to_integer(movie.get('Running time', "N/A"))})

In [6]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 93, 76, 75, 73, 85, 81, 70, 90, 80, 75, 84, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 73, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 74, 91, 91, 97, 118, 139, 131, 92, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 102, 85, 91, 93, 100, 100, 79, 96, 113, 89, 117, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, None, 91, 112, 115, 95, 91, 97, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 112, 84, 97, 97, 114, 96, 97, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 90, 74, 90, 89, 110, 74, 93, 84, 83, 74, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 96, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 74, 93, 92, 98, 95, 93, 87, 93, 87, 128, 77, 86, 95, 114, 

#### Converting budget and box office to float

In [7]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '$6.3 mi

In [8]:
import re

# Scale words that may follow a dollar figure ("$1.5 million").
amounts = r"thousand|million|billion"

# A figure with optional thousands separators and at most one decimal part,
# e.g. "790,000" or "12.2". The previous `\.*\d*` tail also accepted strings
# like "1..5", which float() cannot parse.
number = r"\d+(?:,\d{3})*(?:\.\d+)?"

# Word syntax: "$12.2 million". Also accepts ranges, whose second figure may
# carry its own dollar sign and optional surrounding spaces
# ("$4.4–6 million", "$3.5 to $4 million", "$1 - 2 million"), and a missing
# space before the scale word ("$5million").
word_re = rf"\${number}(?:\s?(?:-|–|—|to)\s?\$?{number})?\s*({amounts})"

# Value syntax: a plain dollar figure such as "$790,000".
value_re = rf"\${number}"

def word_to_value(word):
    """Return the numeric multiplier for a scale word.

    ``word`` must be exactly "thousand", "million" or "billion"; any other
    key raises ``KeyError``.
    """
    multipliers = dict(thousand=10 ** 3, million=10 ** 6, billion=10 ** 9)
    return multipliers[word]

def parse_word_syntax(string):
    """Turn a word-syntax amount such as "$12.2 million" into a float.

    The first figure found in ``string`` (commas removed) is multiplied by
    the value of the first scale word found, which is matched
    case-insensitively and lower-cased before the lookup.
    """
    figure = float(re.search(number, string).group().replace(",", ""))
    scale_word = re.search(amounts, string, flags=re.I).group().lower()
    return figure * word_to_value(scale_word)

def parse_value_syntax(string):
    """Turn a value-syntax amount such as "$790,000" into a float.

    Uses the first figure found in ``string`` and strips its thousands
    separators before converting.
    """
    return float(re.search(number, string).group().replace(",", ""))

def money_conversion(money):
    """Convert a scraped money value into a float.

    Examples
    --------
    money_conversion("$12.2 million") --> 12200000.0   (word syntax)
    money_conversion("$790,000")      --> 790000.0     (value syntax)

    Parameters
    ----------
    money : str or list of str
        The raw value. For a list only the first entry is converted. The
        sentinel ``"N/A"`` marks a missing value.

    Returns
    -------
    float or None
        The parsed amount, or ``None`` when the value is missing, the list
        is empty, the value is not a string, or no dollar amount is found.
    """
    if isinstance(money, list):
        # An empty list carries no amount; indexing it would raise IndexError.
        if not money:
            return None
        money = money[0]

    # Guard against None / non-string values, which re.search cannot handle.
    if not isinstance(money, str) or money == "N/A":
        return None

    # Word syntax is tried first: "$12.2 million" also matches the value
    # pattern ("$12.2"), which would drop the multiplier.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [9]:
# Add numeric versions of the raw 'Budget' and 'Box office' strings to each record.
for movie in movie_info_list:
    movie.update({'Budget (float)': money_conversion(movie.get('Budget', "N/A"))})
    movie.update({'Box office (float)': money_conversion(movie.get('Box office', "N/A"))})

#### Converting release date to datetime

In [16]:
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

['N/A', 'N/A', ['November 13, 1940'], ['June 27, 1941'], 'N/A', 'N/A', 'N/A', ['July 17, 1943'], 'N/A', 'N/A', 'N/A', ['September 27, 1947'], 'May 27, 1948', 'N/A', ['October 5, 1949'], 'N/A', 'N/A', 'N/A', 'N/A', ['February 5, 1953 (United States)'], ['July 23, 1953 (US)'], ['November 10, 1953'], 'N/A', ['August 17, 1954'], ['December 23, 1954'], 'May 25, 1955', ['June 22, 1955'], ['September 14, 1955'], 'December 22, 1955', 'June 8, 1956', ['July 18, 1956'], ['September 4, 1956'], ['December 20, 1956'], 'June 19, 1957', 'August 28, 1957', ['December 25, 1957'], ['July 8, 1958'], ['August 12, 1958'], ['December 25, 1958'], ['January 29, 1959'], ['March 19, 1959'], 'N/A', ['November 10, 1959'], 'January 21, 1960 ( Sarasota, FL )', ['February 24, 1960'], 'May 19, 1960', 'N/A', ['November 1, 1960'], ['December 21, 1960'], ['January 25, 1961'], 'March 16, 1961', ['June 21, 1961'], ['July 12, 1961'], ['July 17, 1961'], ['December 14, 1961'], 'April 5, 1962', ['May 17, 1962'], ['June 6, 196

In [17]:
from datetime import datetime

# Raw 'Release date' value of every movie ('N/A' where the key is absent).
# NOTE(review): `dates` is not referenced again in the visible cells — confirm before removing.
dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Drop everything from the first "(" onward and trim surrounding whitespace.

    Example: "February 5, 1953 (United States)" -> "February 5, 1953".
    """
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

def date_conversion(date):
    """Convert a scraped release-date value into a ``datetime``.

    Parameters
    ----------
    date : str or list of str
        The raw value. For a list only the first entry is converted. The
        sentinel ``"N/A"`` marks a missing value.

    Returns
    -------
    datetime or None
        The parsed date, or ``None`` when the value is missing, the list is
        empty, the value is not a string, or the cleaned text matches none
        of the supported formats.
    """
    if isinstance(date, list):
        # An empty list carries no date; indexing it would raise IndexError.
        if not date:
            return None
        date = date[0]

    if not isinstance(date, str) or date == "N/A":
        return None

    date_str = clean_date(date)

    # Supported layouts: "March 16, 1961" and "21 December 2016".
    fmts = ("%B %d, %Y", "%d %B %Y")
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # strptime raises ValueError on a format mismatch; try the next
            # layout. Other exceptions are left to propagate instead of
            # being hidden by a bare `except`.
            continue
    return None

In [18]:
# Records use either the singular 'Release date' key or, when several
# releases are listed, the plural 'Release dates' key. Check both so that
# films stored under the plural key are not left without a parsed date.
# Empty or missing values fall back to the "N/A" sentinel.
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(
        movie.get('Release date') or movie.get('Release dates') or 'N/A'
    )

In [22]:
movie_info_list[50]

{'title': 'The Absent-Minded Professor',
 'Directed by': 'Robert Stevenson',
 'Screenplay by': 'Bill Walsh',
 'Based on': ['"A Situation of Gravity"',
  '1943 short story',
  'by',
  'Samuel W. Taylor'],
 'Produced by': ['Walt Disney', 'Bill Walsh'],
 'Starring': ['Fred MacMurray',
  'Nancy Olson',
  'Keenan Wynn',
  'Tommy Kirk',
  'Leon Ames',
  'Elliott Reid',
  'Edward Andrews'],
 'Cinematography': 'Edward Colman',
 'Edited by': 'Cotton Warburton',
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'March 16, 1961',
 'Running time': '97 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$11.426 million (U.S. and Canada rentals)',
 'Running time (int)': 97,
 'Budget (float)': None,
 'Box office (float)': 11426000.0,
 'Release date (datetime)': datetime.datetime(1961, 3, 16, 0, 0)}

### Saving the data

In [23]:
import pickle  # using pickle because save_data method throws an error with realease date not serializable

def save_data_pickle(name, data):
    """Pickle ``data`` into the file ``name``, opened in binary write mode."""
    with open(name, mode='wb') as pickle_file:
        pickle.dump(data, pickle_file)

In [24]:
import pickle

def load_data_pickle(name):
    """Read the file ``name`` in binary mode and return the unpickled object.

    Note: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(name, mode='rb') as pickle_file:
        return pickle.loads(pickle_file.read())

In [25]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [26]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [27]:
a == movie_info_list

True

## Task #4: Save data as CSV using Pandas


In [28]:
movie_info_list[50]

{'title': 'The Absent-Minded Professor',
 'Directed by': 'Robert Stevenson',
 'Screenplay by': 'Bill Walsh',
 'Based on': ['"A Situation of Gravity"',
  '1943 short story',
  'by',
  'Samuel W. Taylor'],
 'Produced by': ['Walt Disney', 'Bill Walsh'],
 'Starring': ['Fred MacMurray',
  'Nancy Olson',
  'Keenan Wynn',
  'Tommy Kirk',
  'Leon Ames',
  'Elliott Reid',
  'Edward Andrews'],
 'Cinematography': 'Edward Colman',
 'Edited by': 'Cotton Warburton',
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'March 16, 1961',
 'Running time': '97 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$11.426 million (U.S. and Canada rentals)',
 'Running time (int)': 97,
 'Budget (float)': None,
 'Box office (float)': 11426000.0,
 'Release date (datetime)': datetime.datetime(1961, 3, 16, 0, 0)}

In [32]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [33]:
df.head()

Unnamed: 0,title,Directed by,Written by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,...,Simplified,Original title,Suggested by,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by The, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",...,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,"[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",...,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,Walt Disney Productions,RKO Radio Pictures,,...,,,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,...,,,,,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...",,"[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Edward Brophy, Verna Felton, Cliff Edwards, H...","[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",...,,,,,,,,,,


In [34]:
# Write the frame to CSV. With pandas' default `index=True` the row index is
# written as an extra, unnamed first column.
# NOTE(review): pass `index=False` if downstream readers should not see that column — confirm.
df.to_csv("disney_movie_data_final.csv")

In [35]:
# Order the films by parsed running time, longest first, and show the top 20.
running_times = df.sort_values(by='Running time (int)', ascending=False)
running_times.head(20)

Unnamed: 0,title,Directed by,Written by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,...,Simplified,Original title,Suggested by,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
519,Tinker Bell,"[Bradley Raymond ( 1 , 3 & 4 ), Klay Hall ( 2 ...",,,,"[Mae Whitman, Lucy Liu, Raven-Symoné, Megan Hi...",Joel McNeely,DisneyToon Studios,"[Walt Disney Studios, Home Entertainment]","[1, : October 28, 2008, 2, :, 3, :, 4, :, 5, :...",...,,,,,,,,,,
327,Pirates of the Caribbean: At World's End,Gore Verbinski,"[Ted Elliott, Terry Rossio]",[Characters by Ted Elliott Terry Rossio Stuart...,Jerry Bruckheimer,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Hans Zimmer,,Buena Vista Pictures,"[May 19, 2007 ( Disneyland Resort ), May 25, 2...",...,,,,,,,,,,
85,The Happiest Millionaire,Norman Tokar,,"[My Philadelphia Father, by Cordelia Drexel Bi...","[Walt Disney, Bill Anderson]","[Fred MacMurray, Tommy Steele, Greer Garson, G...",Jack Elliott,Walt Disney Productions,Buena Vista Distribution,"[June 23, 1967, November 30, 1967]",...,,,,,,,,,,
440,Jagga Jasoos,Anurag Basu,"[Screenplay:, Anurag Basu, Dialogues in Rhyme:...",,"[Siddharth Roy Kapur, Anurag Basu, Ranbir Kapoor]","[Ranbir Kapoor, Katrina Kaif, Saswata Chatterj...",Pritam,,UTV Motion Pictures,,...,,,,,,,,,,
433,Dangal,Nitesh Tiwari,"[Nitesh Tiwari, Piyush Gupta, Shreyas Jain, Ni...",,"[Aamir Khan, Kiran Rao, Siddharth Roy Kapur]","[Aamir Khan, Sakshi Tanwar, Fatima Sana Shaikh...",Pritam,,UTV Motion Pictures,"[21 December 2016 (United States), 23 December...",...,,,,,,,,,,
465,Hamilton,Thomas Kail,Lin-Manuel Miranda,"[Alexander Hamilton, by, Ron Chernow]","[Thomas Kail, Lin-Manuel Miranda, Jeffrey Seller]","[Daveed Diggs, Renée Elise Goldsberry, Jonatha...",Lin-Manuel Miranda,,Walt Disney Studios Motion Pictures,,...,,,,,,,,,,
421,ABCD 2,Remo D'Souza,"[Dialogues and Lyrics:, Mayur Puri, Screenplay...","[Suresh and Vernon, of, The Fictitious Crew]",Siddharth Roy Kapur,"[Prabhu Deva, Varun Dhawan, Shraddha Kapoor, L...",Sachin–Jigar,Walt Disney Pictures,UTV Motion Pictures,,...,,,,,,,,,,
318,Pirates of the Caribbean: Dead Man's Chest,Gore Verbinski,"[Ted Elliott, Terry Rossio]",[Characters by Ted Elliott Terry Rossio Stuart...,Jerry Bruckheimer,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Hans Zimmer,,Buena Vista Pictures,"[June 16, 2006 ( Disneyland Resort ), July 7, ...",...,,,,,,,,,,
337,The Chronicles of Narnia: Prince Caspian,Andrew Adamson,,"[Prince Caspian, by, C. S. Lewis]","[Mark Johnson, Andrew Adamson, Philip Steuer]","[Georgie Henley, Skandar Keynes, William Mosel...",Harry Gregson-Williams,,Walt Disney Studios Motion Pictures,"[May 7, 2008 ( New York City ), May 16, 2008 (...",...,,,,,,,,,,
400,The Lone Ranger,Gore Verbinski,,"[Fran Striker, George W. Trendle]","[Jerry Bruckheimer, Gore Verbinski]","[Johnny Depp, Armie Hammer, Tom Wilkinson, Wil...",Hans Zimmer,,"[Walt Disney Studios, Motion Pictures]","[June 22, 2013 ( Hyperion Theatre ), July 3, 2...",...,,,,,,,,,,
