# Data Cleaning

<font size="3">The goal of this notebook is to transform the raw data into a form we can work with directly, so that no cleaning has to be done in the EDA part.</font>

<font size="3">Import libraries</font>

In [1]:
# import standard libraries
from ast import literal_eval
# import third-party libraries
import pandas as pd
from IPython.display import display
# import local libraries

# show every column when a DataFrame is rendered (None removes the column limit)
pd.set_option('display.max_columns', None)

<font size="3">Import data</font>

In [2]:
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning
# (the warning recorded under this cell appears to be that one -- confirm on re-run)
movies = pd.read_csv('movies_metadata.csv', low_memory=False)

# the movie id is used later as the key to align movies with credits
movies['id'] = movies['id'].astype(int)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# list every column with its non-null count and dtype to spot sparse or mistyped columns
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  10 non-null     object 
 1   belongs_to_collection  4 non-null      object 
 2   budget                 10 non-null     object 
 3   genres                 10 non-null     object 
 4   homepage               2 non-null      object 
 5   id                     10 non-null     int64  
 6   imdb_id                10 non-null     object 
 7   original_language      10 non-null     object 
 8   original_title         10 non-null     object 
 9   overview               10 non-null     object 
 10  popularity             10 non-null     object 
 11  poster_path            10 non-null     object 
 12  production_companies   10 non-null     object 
 13  production_countries   10 non-null     object 
 14  release_date           10 non-null     object 
 15  revenue  

In [4]:
# preview the first five rows of the raw table
movies.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


<font size="3">Drop columns that are not needed</font>

In [5]:
# remove the columns that are too sparse or not needed for the analysis
col2drop = ['homepage', 'overview', 'poster_path', 'tagline']
movies = movies.drop(col2drop, axis='columns')

<font size="3">Each cell of the 'belongs_to_collection' column holds a dictionary (stored as a string). We expand its items into new columns and append them to movies. We keep only 'id' and 'name'; later we will check whether 'id' and 'name' correspond to each other, in which case we will create a mapper.</font>

In [6]:
# clean column belongs_to_collection: expand the dictionary in each cell into new columns
tag = 'belongs_to_collection'


def _parse_collection(cell):
    """Return the cell as a dict; missing or unparsable cells become an empty dict."""
    if isinstance(cell, dict):  # already evaluated, nothing to do
        return cell
    if isinstance(cell, str):  # raw CSV value: a dict written as a string
        try:
            parsed = literal_eval(cell)
        except (ValueError, SyntaxError):
            # a malformed cell is treated as "no collection" instead of
            # silently aborting the conversion of the whole column
            return {}
        return parsed if isinstance(parsed, dict) else {}
    return {}  # NaN / None: the movie does not belong to a collection


# build the expanded frame in one pass (one column per dictionary key),
# aligned on the index of movies
b2c = pd.DataFrame(
    [_parse_collection(cell) for cell in movies[tag]],
    index=movies.index,
)

# we can keep only id and name columns; reindex creates them as all-NaN
# columns if no cell provides the key, instead of raising a KeyError
b2c = b2c.reindex(columns=['id', 'name'])

# rename columns of b2c with tag
b2c = b2c.add_prefix('{}_'.format(tag))

# replace tag column in movies by the expanded columns
movies = pd.concat([movies.drop(columns=tag), b2c], axis=1)

<font size="3">Each cell of the 'genres' column holds a list of dictionaries. We expand each item of the list into new columns in movies. We write a function for this.</font>

In [7]:
# function to expand lists of dictionaries and append them to movies data
def expandColumn(movies, tag, keep):
    """Expand a column whose cells contain lists of dictionaries.

    For the i-th dictionary of each list and for every key in ``keep``, a
    column named ``'<tag>_<key>_<i>'`` is appended to the data. The original
    ``tag`` column is removed from the result. Rows whose list has fewer than
    ``i + 1`` items get missing values in the corresponding columns.

    Parameters
    ----------
    movies : pandas.DataFrame
        Dataframe to expand. It is left unmodified.
    tag : str
        Column name to expand. Cells may be lists written as strings (as read
        from a CSV file), already evaluated lists, or missing values.
    keep : list of str
        Dictionary keys to keep as expanded columns. A key that never occurs
        yields columns filled with missing values.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the columns of ``movies`` except ``tag``,
        followed by the expanded columns ordered by list position and then by
        the order of ``keep``.

    Notes
    -----
    Cells that cannot be evaluated as a list (malformed strings, scalars) are
    treated as empty lists rather than aborting the whole conversion.
    """
    def _to_list(cell):
        # strings come straight from the CSV and have to be evaluated first
        if isinstance(cell, str):
            try:
                cell = literal_eval(cell)
            except (ValueError, SyntaxError):
                return []
        # NaN/None and any other non-list value mean "no items"
        return list(cell) if isinstance(cell, (list, tuple)) else []

    items = [_to_list(cell) for cell in movies[tag]]

    # number of list positions to expand = length of the longest list
    width = max((len(entry) for entry in items), default=0)

    # collect all pieces and concatenate once, instead of growing the
    # dataframe inside the loop
    pieces = [movies.drop(columns=tag)]
    for pos in range(width):
        # dictionary found at this position in each row, {} when absent
        records = [
            entry[pos] if pos < len(entry) and isinstance(entry[pos], dict) else {}
            for entry in items
        ]

        # keep only what to keep (reindex tolerates keys that never occur)
        block = pd.DataFrame(records, index=movies.index).reindex(columns=keep)

        # add prefix and suffix to new columns names
        block.columns = ['{}_{}_{}'.format(tag, key, pos) for key in keep]
        pieces.append(block)

    return pd.concat(pieces, axis=1)

In [8]:
# clean column genres: expand the list of dictionaries in each cell into new columns
movies = expandColumn(movies, tag='genres', keep=['id', 'name'])

In [9]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
#movies = pd.read_csv('movies_metadata_cleaned.csv')

<font size="3">We repeat the same procedure with 'production_companies' column.</font>

In [10]:
# one pair of id/name columns per production company listed for a movie
movies = expandColumn(movies, tag='production_companies', keep=['id', 'name'])

In [11]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
#movies = pd.read_csv('movies_metadata_cleaned.csv')

<font size="3">We repeat the same procedure with 'production_countries' column.</font>

In [12]:
# one pair of iso_3166_1/name columns per production country listed for a movie
movies = expandColumn(movies, tag='production_countries', keep=['iso_3166_1', 'name'])

In [13]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
#movies = pd.read_csv('movies_metadata_cleaned.csv')

<font size="3">We repeat the same procedure with 'spoken_languages' column.</font>

In [14]:
# one pair of iso_639_1/name columns per spoken language listed for a movie
movies = expandColumn(movies, tag='spoken_languages', keep=['iso_639_1', 'name'])

In [15]:
# take a break here
#movies.to_csv('movies_metadata_cleaned.csv', index = False)
#movies = pd.read_csv('movies_metadata_cleaned.csv')

<font size="3">The final dataset looks like this.</font>

In [16]:
# preview the first rows only; rendering the whole frame is unnecessary
# to show what the expanded columns look like
movies.head(10)

Unnamed: 0,adult,budget,id,imdb_id,original_language,original_title,popularity,release_date,revenue,runtime,status,title,video,vote_average,vote_count,belongs_to_collection_id,belongs_to_collection_name,genres_id_0,genres_name_0,genres_id_1,genres_name_1,genres_id_2,genres_name_2,genres_id_3,genres_name_3,production_companies_id_0,production_companies_name_0,production_companies_id_1,production_companies_name_1,production_companies_id_2,production_companies_name_2,production_companies_id_3,production_companies_name_3,production_companies_id_4,production_companies_name_4,production_companies_id_5,production_companies_name_5,production_companies_id_6,production_companies_name_6,production_countries_iso_3166_1_0,production_countries_name_0,production_countries_iso_3166_1_1,production_countries_name_1,spoken_languages_iso_639_1_0,spoken_languages_name_0,spoken_languages_iso_639_1_1,spoken_languages_name_1,spoken_languages_iso_639_1_2,spoken_languages_name_2
0,False,30000000,862,tt0114709,en,Toy Story,21.9469,1995-10-30,373554033.0,81.0,Released,Toy Story,False,7.7,5415.0,10194.0,Toy Story Collection,16,Animation,35.0,Comedy,10751.0,Family,,,3,Pixar Animation Studios,,,,,,,,,,,,,US,United States of America,,,en,English,,,,
1,False,65000000,8844,tt0113497,en,Jumanji,17.0155,1995-12-15,262797249.0,104.0,Released,Jumanji,False,6.9,2413.0,,,12,Adventure,14.0,Fantasy,10751.0,Family,,,559,TriStar Pictures,2550.0,Teitler Film,10201.0,Interscope Communications,,,,,,,,,US,United States of America,,,en,English,fr,Français,,
2,False,0,15602,tt0113228,en,Grumpier Old Men,11.7129,1995-12-22,0.0,101.0,Released,Grumpier Old Men,False,6.5,92.0,119050.0,Grumpy Old Men Collection,10749,Romance,35.0,Comedy,,,,,6194,Warner Bros.,19464.0,Lancaster Gate,,,,,,,,,,,US,United States of America,,,en,English,,,,
3,False,16000000,31357,tt0114885,en,Waiting to Exhale,3.85949,1995-12-22,81452156.0,127.0,Released,Waiting to Exhale,False,6.1,34.0,,,35,Comedy,18.0,Drama,10749.0,Romance,,,306,Twentieth Century Fox Film Corporation,,,,,,,,,,,,,US,United States of America,,,en,English,,,,
4,False,0,11862,tt0113041,en,Father of the Bride Part II,8.38752,1995-02-10,76578911.0,106.0,Released,Father of the Bride Part II,False,5.7,173.0,96871.0,Father of the Bride Collection,35,Comedy,,,,,,,5842,Sandollar Productions,9195.0,Touchstone Pictures,,,,,,,,,,,US,United States of America,,,en,English,,,,
5,False,60000000,949,tt0113277,en,Heat,17.9249,1995-12-15,187436818.0,170.0,Released,Heat,False,7.7,1886.0,,,28,Action,80.0,Crime,18.0,Drama,53.0,Thriller,508,Regency Enterprises,675.0,Forward Pass,6194.0,Warner Bros.,,,,,,,,,US,United States of America,,,en,English,es,Español,,
6,False,58000000,11860,tt0114319,en,Sabrina,6.67728,1995-12-15,0.0,127.0,Released,Sabrina,False,6.2,141.0,,,35,Comedy,10749.0,Romance,,,,,4,Paramount Pictures,258.0,Scott Rudin Productions,932.0,Mirage Enterprises,5842.0,Sandollar Productions,14941.0,Constellation Entertainment,55873.0,Worldwide,58079.0,Mont Blanc Entertainment GmbH,DE,Germany,US,United States of America,fr,Français,en,English,,
7,False,0,45325,tt0112302,en,Tom and Huck,2.56116,1995-12-22,0.0,97.0,Released,Tom and Huck,False,5.4,45.0,,,28,Action,12.0,Adventure,18.0,Drama,10751.0,Family,2,Walt Disney Pictures,,,,,,,,,,,,,US,United States of America,,,en,English,de,Deutsch,,
8,False,35000000,9091,tt0114576,en,Sudden Death,5.23158,1995-12-22,64350171.0,106.0,Released,Sudden Death,False,5.5,174.0,,,28,Action,12.0,Adventure,53.0,Thriller,,,33,Universal Pictures,21437.0,Imperial Entertainment,23770.0,Signature Entertainment,,,,,,,,,US,United States of America,,,en,English,,,,
9,False,58000000,710,tt0113189,en,GoldenEye,14.686,1995-11-16,352194034.0,130.0,Released,GoldenEye,False,6.6,1194.0,645.0,James Bond Collection,12,Adventure,28.0,Action,53.0,Thriller,,,60,United Artists,7576.0,Eon Productions,,,,,,,,,,,GB,United Kingdom,US,United States of America,en,English,ru,Pусский,es,Español


In [17]:
# index movies by their id so that they can be aligned with credits on it
movies = movies.set_index(keys='id')

<font size="3">Load crew and cast data.</font>

In [18]:
# raw cast and crew table; the cells below use its 'id', 'crew' and 'cast' columns
credits = pd.read_csv('credits.csv')

In [19]:
# cast the key to int, as done for movies
credits = credits.astype({'id': int})

# one pair of name/job columns per crew member listed for a movie
credits = expandColumn(credits, tag='crew', keep=['name', 'job'])

In [20]:
# one name column per cast member listed for a movie
credits = expandColumn(credits, tag='cast', keep=['name'])

In [21]:
# index credits by movie id so that they can be aligned with movies on it
credits = credits.set_index(keys='id')

In [22]:
# merge movies with credits: both frames are indexed by 'id', so a
# column-wise concat aligns their rows on that index
# NOTE(review): this presumably relies on ids being unique in both frames -- confirm
movies_credits = pd.concat((movies, credits), axis='columns')

In [23]:
# turn the index back into a regular column so that it is kept when saving without the index
movies_credits = movies_credits.reset_index(drop=False)

In [None]:
# write the merged table to disk; index=False leaves the row index out of the file
movies_credits.to_csv('movies_credits_metadata_cleaned.csv', index = False)