In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import re

# Movies_Metadata

### Drop NULLs

In [2]:
# read data
# Load the raw movies metadata CSV into the notebook-wide frame `df_mm`.
# low_memory=False is passed through to pandas (see the read_csv docs: it
# avoids chunked parsing and the mixed dtype inference that can come with it).
df_mm = pd.read_csv('../data/raw/movies_metadata.csv',low_memory=False)
# Print each column's non-null count and dtype.
df_mm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [3]:
# check null values in percentage
print(df_mm.isna().sum(axis=0)/len(df_mm))
# drop the columns with too many null values and the irrelevant columns
# errors='ignore' keeps this cell re-runnable: df_mm is rebound to the trimmed
# frame, so a second run would otherwise raise KeyError for labels that are
# already gone.
df_mm = df_mm.drop(columns=['belongs_to_collection','homepage','tagline',
                            'poster_path'],
                   errors='ignore')

adult                    0.000000
belongs_to_collection    0.901157
budget                   0.000000
genres                   0.000000
homepage                 0.828839
id                       0.000000
imdb_id                  0.000374
original_language        0.000242
original_title           0.000000
overview                 0.020983
popularity               0.000110
poster_path              0.008490
production_companies     0.000066
production_countries     0.000066
release_date             0.001914
revenue                  0.000132
runtime                  0.005785
spoken_languages         0.000132
status                   0.001914
tagline                  0.551049
title                    0.000132
video                    0.000132
vote_average             0.000132
vote_count               0.000132
dtype: float64


### Convert JSON to list

In [4]:
def Spok_Lang(x):
    '''
    For spoken_languages feature:
    Input a string holding a JSON-like list of dicts and return the
    list of values stored under the "iso_639_1" key (the language
    code) of each dict.

    Params:
        x: raw cell value, a string or a missing value (None / NaN)
    Returns:
        list of "iso_639_1" values, or np.nan when x is missing or
        is the empty list '[]'
    Raises:
        json.JSONDecodeError: the string is still not valid JSON
            after the clean-up below
        KeyError: an entry has no "iso_639_1" key
        TypeError: x is neither a string nor a missing value
    '''
    # `x is np.nan` only matches the np.nan singleton; check by value so
    # that any float NaN (and None) is treated as missing.
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing or x == '[]':
        return np.nan

    # strip backslashes first: json.loads would otherwise fail with an
    # invalid escape error
    x = re.sub(r'\\','',x)
    # the raw text uses single quotes; JSON requires double quotes
    x_json = json.loads(x.replace("\'","\""))
    return [i['iso_639_1'] for i in x_json]

# Replace each raw spoken_languages string with the list returned by
# Spok_Lang (NaN where the value is missing or '[]').
df_mm['spoken_languages'] = df_mm['spoken_languages'].apply(Spok_Lang)

In [5]:
def Gen(x):
    '''
    For genres feature:
    Input a string holding a JSON-like list of dicts and return the
    list of values stored under the "name" key of each dict.

    Params:
        x: raw cell value, a string or a missing value (None / NaN)
    Returns:
        list of "name" values, or np.nan when x is missing or is the
        empty list '[]'
    Raises:
        json.JSONDecodeError: the string is not valid JSON after the
            quote replacement
        KeyError: an entry has no "name" key
        AttributeError: x is neither a string nor a missing value
    '''
    # `x is np.nan` only matches the np.nan singleton; check by value so
    # that any float NaN (and None) is treated as missing.
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing or x == '[]':
        return np.nan

    # the raw text uses single quotes; JSON requires double quotes
    x_json = json.loads(x.replace("\'","\""))
    return [i['name'] for i in x_json]

# Replace each raw genres string with the list returned by Gen
# (NaN where the value is missing or '[]').
df_mm['genres'] = df_mm['genres'].apply(Gen)

In [6]:
def Prod_Count(x):
    '''
    For production_countries feature:
    Input a string holding a JSON-like list of dicts and return the
    list of values stored under the "iso_3166_1" key (the country
    code) of each dict.

    Params:
        x: raw cell value, a string or a missing value (None / NaN)
    Returns:
        list of "iso_3166_1" values, or np.nan when x is missing, is
        the empty list '[]', or cannot be parsed into that shape
    '''
    # `x is np.nan` only matches the np.nan singleton; check by value so
    # that any float NaN (and None) is treated as missing.
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing or x == '[]':
        return np.nan

    try:
        # some observations have ill-formatted values
        # just drop them
        # (parse once; the raw text uses single quotes, JSON needs double)
        x_json = json.loads(x.replace("\'","\""))
        return [i['iso_3166_1'] for i in x_json]
    except (AttributeError, ValueError, TypeError, KeyError):
        # AttributeError: x is not a string
        # ValueError:     invalid JSON (json.JSONDecodeError subclasses it)
        # TypeError:      parsed value is not a list of dicts
        # KeyError:       an entry lacks "iso_3166_1"
        # Anything else (e.g. KeyboardInterrupt) is left to propagate.
        return np.nan

# Replace each raw production_countries string with the list returned by
# Prod_Count (NaN where the value is missing, '[]' or cannot be parsed).
df_mm['production_countries'] = df_mm['production_countries'].apply(Prod_Count)

In [7]:
def Prod_Com(x):
    '''
    For production_companies feature:
    Input a string holding a JSON-like list of dicts and return the
    list of values stored under the "name" key of each dict.

    Params:
        x: raw cell value, a string or a missing value (None / NaN)
    Returns:
        list of "name" values, or np.nan when x is missing, is the
        empty list '[]', or cannot be parsed into that shape

    Note: every single quote is turned into a double quote before
    parsing, so a value whose text itself contains an apostrophe
    becomes invalid JSON and is returned as np.nan.
    '''
    # `x is np.nan` only matches the np.nan singleton; check by value so
    # that any float NaN (and None) is treated as missing.
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing or x == '[]':
        return np.nan

    try:
        # some observations have ill-formatted values
        # just drop them
        # (parse once; the raw text uses single quotes, JSON needs double)
        x_json = json.loads(x.replace("\'","\""))
        return [i['name'] for i in x_json]
    except (AttributeError, ValueError, TypeError, KeyError):
        # AttributeError: x is not a string
        # ValueError:     invalid JSON (json.JSONDecodeError subclasses it)
        # TypeError:      parsed value is not a list of dicts
        # KeyError:       an entry lacks "name"
        # Anything else (e.g. KeyboardInterrupt) is left to propagate.
        return np.nan

# Replace each raw production_companies string with the list returned by
# Prod_Com (NaN where the value is missing, '[]' or cannot be parsed).
df_mm['production_companies'] = df_mm['production_companies'].apply(Prod_Com)

### Convert Other Features

In [8]:
# parse the release dates; entries that cannot be parsed are coerced to NaT
df_mm['release_date'] = pd.to_datetime(
    df_mm['release_date'],
    errors='coerce',
)

# only the exact string 'True' maps to True; every other value maps to False
df_mm['adult'] = df_mm['adult'].apply(lambda flag: flag == 'True')

In [9]:
def ConvertToFloat(x):
    '''
    Convert a raw value into a float (used for the budget and
    popularity features).

    Params:
        x: value to convert (typically a numeric string)
    Returns:
        float(x), or np.nan if the input is ill-formatted and
        cannot be converted
    '''
    try:
        # convert once and return the result directly
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError:     x is not a string or number (e.g. None)
        # ValueError:    x is a string that is not a number
        # OverflowError: x is an int too large for a float
        # Anything else (e.g. KeyboardInterrupt) is left to propagate.
        return np.nan

# Convert both columns to float; values that float() cannot parse become NaN.
df_mm['budget'] = df_mm['budget'].apply(ConvertToFloat)
df_mm['popularity'] = df_mm['popularity'].apply(ConvertToFloat)

### Combine into One Function

In [10]:
def PreprocessMM(inpath,outpath=None,save=False):
    '''
    clean the raw movies_metadata.csv dataset

    Params:
        inpath: path of the input dataset (Ex. '../data/raw/movies_metadata.csv')
        outpath: path the cleaned dataset is written to when save is True;
                 may be omitted when save is False
        save: specify if the cleaned dataset needs to be written to outpath
    Returns:
        the cleaned DataFrame
    Raises:
        ValueError: save is True but no outpath was given
    '''
    # fail fast, before any reading/parsing work is done
    if save and outpath is None:
        raise ValueError('outpath must be given when save=True')

    # read data (from the `inpath` argument, not a notebook-global name)
    df_mm = pd.read_csv(inpath,low_memory=False)
    # drop columns
    df_mm = df_mm.drop(columns=['belongs_to_collection','homepage','tagline','poster_path'])

    # convert JSON in string to list
    df_mm['spoken_languages'] = df_mm['spoken_languages'].apply(Spok_Lang)
    df_mm['genres'] = df_mm['genres'].apply(Gen)
    df_mm['production_countries'] = df_mm['production_countries'].apply(Prod_Count)
    df_mm['production_companies'] = df_mm['production_companies'].apply(Prod_Com)

    # convert the date string to datetime (unparseable dates become NaT)
    df_mm['release_date'] = pd.to_datetime(df_mm['release_date'],errors='coerce')
    # convert to boolean values (only the string 'True' maps to True)
    df_mm['adult'] = df_mm['adult'].apply(lambda x: True if x == 'True' else False)
    # convert from string to float
    df_mm['budget'] = df_mm['budget'].apply(ConvertToFloat)
    df_mm['popularity'] = df_mm['popularity'].apply(ConvertToFloat)

    if save:
        df_mm.to_csv(outpath)

    return df_mm

In [11]:
# NOTE(review): the second positional argument binds to `outpath`, so `True`
# is passed as the output path and `save` keeps its default of False; this
# call therefore does not write a file. If saving was intended, pass the
# output path and save=True explicitly.
PreprocessMM('../data/raw/movies_metadata.csv',True)

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count
0,False,30000000.0,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[US],1995-10-30,373554033.0,81.0,[en],Released,Toy Story,False,7.7,5415.0
1,False,65000000.0,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[US],1995-12-15,262797249.0,104.0,"[en, fr]",Released,Jumanji,False,6.9,2413.0
2,False,0.0,"[Romance, Comedy]",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,"[Warner Bros., Lancaster Gate]",[US],1995-12-22,0.0,101.0,[en],Released,Grumpier Old Men,False,6.5,92.0
3,False,16000000.0,"[Comedy, Drama, Romance]",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[Twentieth Century Fox Film Corporation],[US],1995-12-22,81452156.0,127.0,[en],Released,Waiting to Exhale,False,6.1,34.0
4,False,0.0,[Comedy],11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[Sandollar Productions, Touchstone Pictures]",[US],1995-02-10,76578911.0,106.0,[en],Released,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,0.0,"[Drama, Family]",439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,,[IR],NaT,0.0,90.0,[fa],Released,Subdue,False,4.0,1.0
45462,False,0.0,[Drama],111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,[Sine Olivia],[PH],2011-11-17,0.0,360.0,[tl],Released,Century of Birthing,False,9.0,3.0
45463,False,0.0,"[Action, Drama, Thriller]",67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,[American World Pictures],[US],2003-08-01,0.0,90.0,[en],Released,Betrayal,False,3.8,6.0
45464,False,0.0,,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,[Yermoliev],[RU],1917-10-21,0.0,87.0,,Released,Satan Triumphant,False,0.0,0.0
