# Data processing for machine learning

Code for keyword stemming and low frequency word removal was adapted and modified from FabienDaniel's code
https://www.kaggle.com/fabiendaniel/film-recommendation-engine 

In [1]:
import numpy as np
import pandas as pd
import json
import seaborn as sb
import matplotlib.pyplot as plt
csv_file = pd.read_csv('movie_data.csv')


In [2]:
csv_file.head()

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords
0,0,"[{""id"": 35, ""name"": ""Comedy""}]",3924,en,Blondie,Blondie and Dagwood are about to celebrate the...,4.074,"[{""id"": 5, ""logo_path"": ""/71BqEFAF4V3qjjMPCpLu...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1938-11-30,0,70.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,,Blondie,7.1,5,"[{""id"": 190801, ""name"": ""blondie""}]"
1,0,[],25449,en,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,1.968,[],[],2008-12-08,0,69.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,,New World Disorder 9: Never Enough,4.5,2,"[{""id"": 6075, ""name"": ""sports""}, {""id"": 10192,..."
2,0,"[{""id"": 10751, ""name"": ""Family""}]",31975,en,Sesame Street: Elmo Loves You!,"Elmo is making a very, very super special surp...",1.174,[],[],2010-01-05,0,46.0,[],Released,,Sesame Street: Elmo Loves You!,0.0,0,[]
3,4000000,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...",5,en,Four Rooms,It's Ted the Bellhop's first night on the job....,11.574,"[{""id"": 14, ""logo_path"": ""/m6AHu84oZQxvq7n1rsv...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1995-12-09,4257354,98.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,5.7,1919,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na..."
4,21000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...",6,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",9.88,"[{""id"": 33, ""logo_path"": ""/8lvHyhjr8oUKOOy2dKX...","[{""iso_3166_1"": ""JP"", ""name"": ""Japan""}, {""iso_...",1993-10-15,12136938,110.0,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.4,186,"[{""id"": 520, ""name"": ""chicago, illinois""}, {""i..."


# Drop all other columns. Leave only title, id, genres and keywords

In [3]:
df = csv_file[['title','id','genres','keywords']]

In [4]:
df.head()

Unnamed: 0,title,id,genres,keywords
0,Blondie,3924,"[{""id"": 35, ""name"": ""Comedy""}]","[{""id"": 190801, ""name"": ""blondie""}]"
1,New World Disorder 9: Never Enough,25449,[],"[{""id"": 6075, ""name"": ""sports""}, {""id"": 10192,..."
2,Sesame Street: Elmo Loves You!,31975,"[{""id"": 10751, ""name"": ""Family""}]",[]
3,Four Rooms,5,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...","[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na..."
4,Judgment Night,6,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...","[{""id"": 520, ""name"": ""chicago, illinois""}, {""i..."


In [5]:
df.shape

(28436, 4)

# Clear rows with missing genres or keywords
There are some movies with missing genres and keywords

In [6]:
df = df.loc[(df['genres']!='[]') & (df['keywords']!='[]')]

In [7]:
df.head()

Unnamed: 0,title,id,genres,keywords
0,Blondie,3924,"[{""id"": 35, ""name"": ""Comedy""}]","[{""id"": 190801, ""name"": ""blondie""}]"
3,Four Rooms,5,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...","[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na..."
4,Judgment Night,6,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...","[{""id"": 520, ""name"": ""chicago, illinois""}, {""i..."
5,Life in Loops (A Megacities RMX),8,"[{""id"": 99, ""name"": ""Documentary""}]","[{""id"": 215272, ""name"": ""megacities""}]"
6,Star Wars,11,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...","[{""id"": 803, ""name"": ""android""}, {""id"": 4270, ..."


In [8]:
df.shape

(19593, 4)

No missing data.

# Clean the data. Make it more readable

In [9]:
from ast import literal_eval
df['genres'] = df['genres'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] 
                                                                   if isinstance(x, list) else [])

In [10]:
df.head()

Unnamed: 0,title,id,genres,keywords
0,Blondie,3924,[Comedy],"[{""id"": 190801, ""name"": ""blondie""}]"
3,Four Rooms,5,"[Crime, Comedy]","[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na..."
4,Judgment Night,6,"[Action, Thriller, Crime]","[{""id"": 520, ""name"": ""chicago, illinois""}, {""i..."
5,Life in Loops (A Megacities RMX),8,[Documentary],"[{""id"": 215272, ""name"": ""megacities""}]"
6,Star Wars,11,"[Adventure, Action, Science Fiction]","[{""id"": 803, ""name"": ""android""}, {""id"": 4270, ..."


In [11]:
from ast import literal_eval
df['keywords'] = df['keywords'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] 
                                                                   if isinstance(x, list) else [])

In [12]:
df.head()

Unnamed: 0,title,id,genres,keywords
0,Blondie,3924,[Comedy],[blondie]
3,Four Rooms,5,"[Crime, Comedy]","[hotel, new year's eve, witch, bet, hotel room..."
4,Judgment Night,6,"[Action, Thriller, Crime]","[chicago, illinois, drug dealer, escape, one n..."
5,Life in Loops (A Megacities RMX),8,[Documentary],[megacities]
6,Star Wars,11,"[Adventure, Action, Science Fiction]","[android, galaxy, hermit, death star, lightsab..."


# Import nltk

In [13]:
import nltk

In [14]:
# Collect the unique keywords by stemming
#----------------------
def keywords_inventory(dataframe, column = 'keywords'):
    PS = nltk.stem.PorterStemmer()
    keywords_roots  = dict()  # collect the words / root
    keywords_select = dict()  # association: root <-> keyword
    category_keys = []
    icount = 0
    for s in dataframe[column]:
        for t in s:
            t = t.lower() ; stemWord = PS.stem(t)
            if stemWord in keywords_roots:    # add in t into already existing set            
                keywords_roots[stemWord].add(t)
            else:
                keywords_roots[stemWord] = {t} #Create new key = stemWord;value = t
    
    for s in keywords_roots.keys():
        if len(keywords_roots[s]) > 1:  # More than 1 value
            min_length = 1000
            for k in keywords_roots[s]:
                if len(k) < min_length:
                    clef = k ; min_length = len(k)            
            category_keys.append(clef)
            keywords_select[s] = clef
        else: # Only 1 value
            category_keys.append(list(keywords_roots[s])[0])
            keywords_select[s] = list(keywords_roots[s])[0]
                   
    print("Number of keywords in variable '{}': {}".format(column,len(category_keys)))
    return category_keys, keywords_roots, keywords_select

In [15]:
keywords, keywords_roots, keywords_select = keywords_inventory(df, column = 'keywords')

Number of keywords in variable 'keywords': 15470


In [None]:
# Plot of a sample of keywords that appear in close varieties 
#------------------------------------------------------------
icount = 0
for s in keywords_roots.keys():
    if len(keywords_roots[s]) > 1: 
        icount += 1
        if icount < 50: print(icount, keywords_roots[s], len(keywords_roots[s]))

# Replace duplicate keywords

For words of same group like 'gunfight' and 'gunfighter', the root word 'gunfight' is used

In [17]:
# Replacement of the keywords by the main form
#----------------------------------------------
def remplacement_df_keywords(df, remplacement, roots = False):
    PS = nltk.stem.PorterStemmer()
    df_new = df.copy(deep = True)
    for index, row in df_new.iterrows():
        chain = row['keywords']
        new_list = []
        for s in chain: 
            clef = PS.stem(s) if roots else s
            if clef in remplacement.keys():
                new_list.append(clef)
                print(s, ' is changed to ', clef)
            else:
                new_list.append(s)
        df_new.at[index,"keywords"] = new_list
    return df_new

In [None]:
# Replacement of the keywords by the main keyword
#-------------------------------------------------
df_keywords_cleaned = remplacement_df_keywords(df, keywords_select,
                                               roots = True)

In [19]:
df_keywords_cleaned.head()

Unnamed: 0,title,id,genres,keywords
0,Blondie,3924,[Comedy],[blondi]
3,Four Rooms,5,"[Crime, Comedy]","[hotel, new year's ev, witch, bet, hotel room,..."
4,Judgment Night,6,"[Action, Thriller, Crime]","[chicago, illinoi, drug deal, escap, one night..."
5,Life in Loops (A Megacities RMX),8,[Documentary],[megac]
6,Star Wars,11,"[Adventure, Action, Science Fiction]","[android, galaxi, hermit, death star, lightsab..."


In [20]:
#df_keywords_cleaned.to_csv( 'cleaned_data.csv', index=False, mode='a+')

In [21]:
df_keywords_cleaned.shape

(19593, 4)

## New set of keywords based on cleaned list

In [22]:
clean_keywords, clean_keywords_roots, clean_keywords_select = keywords_inventory(df_keywords_cleaned, column = 'keywords')

Number of keywords in variable 'keywords': 15437


## Remove low frequency words 
1) Delete all the keywords that appear in less than 3 films

In [23]:
# deletion of keywords that appear in less than 3 films
#-------------------------------------------
def remplacement_df_low_frequency_keywords(df, keyword_occurences):
    df_new = df.copy(deep = True)
    key_count = dict()
    for s in keyword_occurences: # fit keyword_occurence to dict
        key_count[s[0]] = s[1]    
    for index, row in df_new.iterrows():
        chain = row['keywords']
        new_list = []
        for s in chain: 
            if key_count.get(s, 4) > 3: new_list.append(s)
        df_new.at[index,"keywords"] = new_list
    return df_new

In [24]:
def count_word(df, column, liste):
    keyword_count = dict()
    for s in liste: keyword_count[s] = 0 # Set keys, then their value = 0
    for list_of_keywords in df[column]:                
        for s in [s for s in list_of_keywords if s in liste]: 
            if pd.notnull(s): keyword_count[s] += 1 # Increment keycount
    #______________________________________________________________________
    # convert the dictionary in a list to sort the keywords by frequency
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

In [25]:
# New count of keyword occurences
#-------------------------------------
new_keyword_occurences, keywords_count = count_word(df_keywords_cleaned,
                                                    'keywords',clean_keywords)
new_keyword_occurences[:10]

[['woman director', 1216],
 ['murder', 1043],
 ['based on novel or book', 996],
 ['music', 855],
 ['sport', 602],
 ['new york c', 518],
 ['violenc', 509],
 ['noir', 488],
 ['reveng', 441],
 ['short film', 433]]

Top keywords include 'woman director', 'murder', 'based on novel or book' and 'music'

In [26]:
# Creation of a dataframe where keywords of low frequencies are deleted
#-------------------------------------------------------------------------
df_keywords_occurence = remplacement_df_low_frequency_keywords(df_keywords_cleaned, new_keyword_occurences)
freq_keywords, freq_keywords_roots, freq_keywords_select = keywords_inventory(df_keywords_occurence, column = 'keywords')  

Number of keywords in variable 'keywords': 5187


# Low frequency keywords are removed

In [27]:
df_keywords_occurence.head()

Unnamed: 0,title,id,genres,keywords
0,Blondie,3924,[Comedy],[blondi]
3,Four Rooms,5,"[Crime, Comedy]","[hotel, new year's ev, witch, bet, hotel room,..."
4,Judgment Night,6,"[Action, Thriller, Crime]","[chicago, illinoi, drug deal, escap, one night..."
5,Life in Loops (A Megacities RMX),8,[Documentary],[]
6,Star Wars,11,"[Adventure, Action, Science Fiction]","[android, galaxi, hermit, death star, jedi, re..."


There are some rows with empty keywords. This is due to the removal of low frequency keywords. 
We will delete these rows with empty keywords with Microsoft Excel.

In [28]:
df_keywords_occurence.to_csv( 'occur_cleaned_data.csv', index=False, mode='a+')