# Reload Data

In [54]:
import json

def saveData(title, data):
    """Write ``data`` as JSON to the file at path ``title``.

    The file is opened for writing as UTF-8, non-ASCII characters are written
    as-is rather than escaped, and the output is indented by two spaces.
    """
    with open(title, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)

def loadData(title):
    """Read the UTF-8 JSON file at path ``title`` and return the decoded object."""
    with open(title, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

In [55]:
# Load the scraped movie records from disk; every later cell reads and
# extends the entries of this `movies` collection.
movies = loadData('MovieDatasetScraped.json') # Total 3254 Movies

# Data Cleaning

### Convert Running Time To Integer

In [56]:
def minutesToInteger(runningTime):
    """Convert a scraped running-time value such as ``"90 minutes"`` to an int.

    Args:
        runningTime: A string whose first whitespace-separated token is the
            number of minutes, a list of such strings (only the first element
            is used), or the placeholder ``"N/A"``.

    Returns:
        The leading integer, or ``None`` when the value is ``"N/A"``, an empty
        list, not a string, blank, or does not start with an integer. Returning
        ``None`` for unparseable input matches the other converters in this
        file (``moneyConversion`` and ``dateConversion``).
    """
    if isinstance(runningTime, list):
        # Multi-valued fields: only the first entry is considered.
        runningTime = runningTime[0] if runningTime else None

    if not isinstance(runningTime, str) or runningTime == "N/A":
        return None

    # split() without an argument also splits on tabs and non-breaking spaces
    # and ignores leading whitespace, unlike split(" ").
    tokens = runningTime.split()
    if not tokens:
        return None

    try:
        return int(tokens[0])
    except ValueError:
        # First token is not an integer (e.g. "unknown" or "1h").
        return None

# Add a 'Running Time (int)' entry to every movie. A missing 'Running time'
# key falls back to "N/A", which minutesToInteger maps to None.
for movie in movies:
    movie['Running Time (int)'] = minutesToInteger(movie.get('Running time', "N/A"))


In [57]:
# print([movie.get('Running time', 'N/A') for movie in movies])

In [58]:
# print([movie.get('Running Time (int)', 'N/A') for movie in movies])

### Convert Budget & Box Office To Float

In [59]:
import re

# Scale words that may follow a dollar figure; the code that uses this pattern
# searches with re.I, so capitalised forms such as "Million" match as well.
amounts = r"thousand|million|billion"
# Digits with optional ",ddd" thousands groups and an optional decimal part.
# At most ONE dot is accepted, so a match (after removing commas) is always a
# valid float() literal; a run of dots such as "12..." matches only "12.".
number = r"\d+(,\d{3})*(?:\.\d*)?"

# Word syntax: "$<number>", optionally a range ("-", " to " or an en dash
# followed by a second number), one whitespace character, then a scale word.
word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
# Value syntax: a plain "$<number>".
value_re = rf"\${number}"

def wordToValue(word):
    """Return the integer multiplier for a scale word.

    ``word`` must be exactly "thousand", "million" or "billion" (lower case);
    any other key raises ``KeyError``.
    """
    multipliers = dict(thousand=10 ** 3, million=10 ** 6, billion=10 ** 9)
    return multipliers[word]

def parseWordSyntax(string):
    """Turn a word-syntax amount such as "$12.2 million" into a float.

    The first number found in ``string`` (commas removed) is multiplied by the
    value of the first scale word found, matched case-insensitively.
    """
    numeric_text = re.search(number, string).group()
    base = float("".join(numeric_text.split(",")))
    scale_name = re.search(amounts, string, flags=re.I).group().lower()
    return base * wordToValue(scale_name)

def parseValueSyntax(string):
    """Turn a value-syntax amount such as "$790,000" into a float.

    The first number found in ``string`` is used, with its commas removed.
    """
    numeric_text = re.search(number, string).group()
    return float("".join(numeric_text.split(",")))

def moneyConversion(money):
    """Convert a scraped money value to a float.

    Examples:
        moneyConversion("$12.2 million") --> 12200000.0  ## Word syntax
        moneyConversion("$790,000")      --> 790000.0    ## Value syntax

    Args:
        money: A string containing a dollar amount, a list of such strings
            (only the first element is used), or the placeholder ``"N/A"``.

    Returns:
        The amount as a float, or ``None`` when the value is ``"N/A"``, an
        empty list, not a string, or contains no recognisable dollar amount.
    """
    if isinstance(money, list):
        # Multi-valued fields: only the first entry is considered.
        money = money[0] if money else None

    # Anything that is not text (e.g. a JSON null) cannot be searched.
    if not isinstance(money, str) or money == "N/A":
        return None

    # Word syntax is tried first: "$12.2 million" also contains a plain
    # "$12.2", which the value pattern alone would read as 12.2.
    wordSyntax = re.search(word_re, money, flags=re.I)
    if wordSyntax:
        return parseWordSyntax(wordSyntax.group())

    valueSyntax = re.search(value_re, money)
    if valueSyntax:
        return parseValueSyntax(valueSyntax.group())

    return None

In [60]:
# Add float versions of the 'Budget' and 'Box office' fields to every movie;
# a missing key falls back to "N/A", which moneyConversion maps to None.
for movie in movies:
    movie['Budget (float)'] = moneyConversion(movie.get('Budget', "N/A"))
    movie['Box Office (float)'] = moneyConversion(movie.get('Box office', "N/A"))

### Convert Release Date To Datetime Object

In [61]:
from datetime import datetime

# Raw 'Release date' value of every movie ('N/A' when the key is absent).
# NOTE(review): `dates` is not read again in the visible cells; presumably it
# was kept for manual inspection. Confirm before removing.
dates = [movie.get('Release date', 'N/A') for movie in movies]

def cleanDate(date):
    """Return the part of ``date`` before the first "(", stripped of whitespace."""
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

def dateConversion(date):
    """Convert a scraped release-date value to a ``datetime``.

    Args:
        date: A date string such as "June 28, 1950 (United States)" or
            "28 June 1950", a list of such strings (only the first element is
            used), or the placeholder ``"N/A"``.

    Returns:
        The parsed ``datetime``, or ``None`` when the value is ``"N/A"``, an
        empty list, not a string, or matches none of the supported formats.
    """
    if isinstance(date, list):
        # Multi-valued fields: only the first entry is considered.
        date = date[0] if date else None

    if not isinstance(date, str) or date == "N/A":
        return None

    date_str = cleanDate(date)

    # Supported layouts: "Month day, year" and "day Month year".
    for fmt in ("%B %d, %Y", "%d %B %Y"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # strptime signals a format mismatch with ValueError; try the next
            # layout. Other exceptions are real errors and must propagate.
            continue
    return None


In [62]:
# Add a parsed 'Release Date (datetime)' entry to every movie. The
# 'Release dates' key is preferred; if it is absent the 'Release date' key is
# used, and if both are absent "N/A" is passed, which yields None.
for movie in movies:
    movie['Release Date (datetime)'] = dateConversion(movie.get('Release dates', movie.get('Release date', 'N/A')))


# Save Data Using Pickle

In [63]:
import pickle

def saveDataPickle(name, data):
    """Pickle ``data`` into the file at path ``name`` (opened in binary write mode)."""
    with open(name, mode='wb') as sink:
        pickle.dump(data, sink)

In [64]:
def loadDataPickle(name):
    """Return the object unpickled from the file at path ``name``.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [65]:
# Persist the cleaned movies, including the datetime objects added above, as
# a pickle file.
saveDataPickle("MovieDatasetCleaned.pickle", movies)

# Save Data as JSON

In [66]:
# Shallow-copy each movie dict so that replacing the datetime entry with a
# string for the JSON export does not modify the dicts in `movies`.
moviesCopy = [movie.copy() for movie in movies]

In [67]:
# Replace each datetime with its "%B %d, %Y" string form so the copies can be
# written as JSON; movies without a parsed date keep None.
for movie in moviesCopy:
    currentDate = movie['Release Date (datetime)']
    if currentDate:
        movie['Release Date (datetime)'] = currentDate.strftime("%B %d, %Y")
    else:
        movie['Release Date (datetime)'] = None

In [68]:
# Write the JSON-friendly copies (dates already converted to strings) to disk.
saveData("MovieDatasetCleaned.json", moviesCopy)