# Data Cleaning

<font size="3">Import libraries</font>

In [2]:
# import standard libraries
from ast import literal_eval
# import third-party libraries
import pandas as pd
from IPython.display import display
# import local libraries

pd.options.display.max_columns = None

<font size="3">Import data</font>

In [3]:
movies = pd.read_csv('movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [4]:
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


<font size="3">Drop columns that are not needed</font>

In [5]:
# clean movies, some columns are too sparse or unnecessary
col2drop = ['homepage', 'overview', 'poster_path', 'tagline']
movies = movies.drop(columns = col2drop)

<font size="3">The 'belongs_to_collection' column is a dictionary, we expand the items into new columns and append them to movies. We choose to keep only 'id' and 'name' and we will check later if 'id' and 'name' correspond to each other, in which case we will create a mapper.</font>

In [6]:
# clean columns belongs_to_collection, expand dictionary in cells into new columns
tag = 'belongs_to_collection'
movies[tag] = movies[tag].fillna(value = '{}')
try:
    movies[tag] = movies[tag].apply(literal_eval) # evaluate strings as dictionaries
except: # if it has already been done
    pass
b2c = movies[tag].apply(pd.Series)

# we can keep only id and name columns
b2c = b2c[['id', 'name']]

# rename columns of b2c with tag
b2c = b2c.add_prefix('{}_'.format(tag))

# replace tag columns in movies by expanded columns
movies = pd.concat([movies.drop(columns = tag), b2c], axis = 1)

<font size="3">The 'genres' column's cells are a list of dictionaries. We expand each item in the list as new columns in movies. We keep columns id and name and creaete a mapper as well laster. We make a function for this.</font>

In [7]:
# function to expand lists of dictionaries and append them to movies data
def expandColumn(movies, tag, keep):
    """Expand columns whose cells contain lists of dictionaries
    
    Parameters
    ----------
    movies : pands.DataFrame
        Dataframe to expand.
    tag : str
        Column name to expand.
    keep : list of str
        Expanded columns to keep.
    
    """
    movies[tag] = movies[tag].fillna(value = '[]')
    try:
        movies[tag] = movies[tag].apply(literal_eval) # evaluate strings as lists
    except: # if it has already been done
        pass
    subcol = movies[tag].apply(pd.Series)

    # for each subcol we separate the dataframe into more subcols 
    for col in subcol:
        subcol[col] = subcol[col].fillna(value = '{}')
        try:
            subcol[col] = subcol[col].apply(literal_eval)
        except:
            pass
        subsubcol = subcol[col].apply(pd.Series)

        # keep only what to keep
        subsubcol = subsubcol[keep]

        # add prefix and suffix to new columns names
        subsubcol = subsubcol.add_prefix('{}_'.format(tag))
        subsubcol = subsubcol.add_suffix('_{}'.format(col))

        # concat to movies data
        movies = pd.concat([movies, subsubcol], axis = 1)

    # drop tag column from original data, since it has been expanded and appended to movies
    movies = movies.drop(columns = tag)
    
    return movies

In [None]:
# clean columns genres, expand list of dictonaries in cells into new columns
movies = expandColumn(movies, 'genres', ['id', 'name'])

In [15]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
movies = pd.read_csv('movies_metadata_cleaned.csv')

  interactivity=interactivity, compiler=compiler, result=result)


<font size="3">We repeat the same procedure with 'production_companies' column.</font>

In [17]:
movies = expandColumn(movies, 'production_companies', ['id', 'name'])

In [22]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
movies = pd.read_csv('movies_metadata_cleaned.csv')

  interactivity=interactivity, compiler=compiler, result=result)


<font size="3">We repeat the same procedure with 'production_countries' column.</font>

In [32]:
movies = expandColumn(movies, 'production_countries', ['iso_3166_1', 'name'])

In [34]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
movies = pd.read_csv('movies_metadata_cleaned.csv')

  interactivity=interactivity, compiler=compiler, result=result)


<font size="3">We repeat the same procedure with 'spoken_languages' column.</font>

In [37]:
movies = expandColumn(movies, 'spoken_languages', ['iso_639_1', 'name'])

In [39]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
movies = pd.read_csv('movies_metadata_cleaned.csv')

  interactivity=interactivity, compiler=compiler, result=result)
