In [1]:
import numpy as np
import sys, getopt
import pandas as pd
from ast import literal_eval
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
#nltk.download('stopwords')

from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_csv('datasets/movies_metadata.csv')
print(df.shape)

(45466, 24)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# keep only the columns that are relevant to this application
relCols = ['original_title', 'overview', 'genres']
df = df[relCols].copy()  #(45466, 24) --> (45466, 3)
print(df.shape)

(45466, 3)


In [4]:
def preprocessText(text):
    
    # tokenize text and set to lower case
    tokens = [x.strip().lower() for x in nltk.word_tokenize(text)]
    
     # get stopwords from nltk, remove them from our list as well as all punctuations and numbers
    stop_words = stopwords.words('english')
    output = [word for word in tokens if (word not in stop_words and word.isalpha())]
    
    return " ".join(output)

In [5]:
df["overview"] = df["overview"].astype(str).apply(lambda x: preprocessText(x)) #(45466, 3) --> (45466, 3)
print(df.shape)

(45466, 3)


In [6]:
df.head()

Unnamed: 0,original_title,overview,genres
0,Toy Story,led woody andy toys live happily room andy bir...,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,siblings judy peter discover enchanted board g...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,family wedding reignites ancient feud neighbor...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...,"[{'id': 35, 'name': 'Comedy'}]"


## Genres_df

In [7]:
# extract the names of the genres
df["genres"] = df["genres"].fillna('[]').apply(literal_eval).apply(lambda x: [i["name"] for i in x] if isinstance(x, list) else [])
print(df.shape)

# drop rows where no genre is specified # (45'466 --> 43'024)
df.drop(df.loc[(df['genres'].str.len() == 0),:].copy().index, inplace = True)
print(df.shape)

(45466, 3)
(43024, 3)


In [8]:
df.head()

Unnamed: 0,original_title,overview,genres
0,Toy Story,led woody andy toys live happily room andy bir...,"[Animation, Comedy, Family]"
1,Jumanji,siblings judy peter discover enchanted board g...,"[Adventure, Fantasy, Family]"
2,Grumpier Old Men,family wedding reignites ancient feud neighbor...,"[Romance, Comedy]"
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,"[Comedy, Drama, Romance]"
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...,[Comedy]


In [9]:
import nltk 
from nltk import FreqDist

all_genres = df["genres"].tolist()
full_list_genres = [genre for movie_genres in all_genres for genre in movie_genres] # 32 genres

freqdist = FreqDist(full_list_genres)
over_1000 = sorted(w for w in set(full_list_genres) if freqdist[w] > 1000) # 19 genres over 1000 appearances

print(over_1000)

['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']


In [10]:
#old_df = df.copy()

In [16]:
df = old_df.copy()

### In process

In [17]:
def set_genre(serie, over_1000):
    for genre in serie:
        if genre in over_1000:
            return genre
    return np.nan

In [18]:
df["genres"] = df["genres"].apply(lambda x: set_genre(x, over_1000))
print(df.shape)

(43024, 3)


In [20]:
df = df.dropna() # (43024, 5) --> (43014, 5)
print(df.shape)

(43014, 3)


In [21]:
df.head()

Unnamed: 0,original_title,overview,genres
0,Toy Story,led woody andy toys live happily room andy bir...,Animation
1,Jumanji,siblings judy peter discover enchanted board g...,Adventure
2,Grumpier Old Men,family wedding reignites ancient feud neighbor...,Romance
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,Comedy
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...,Comedy


In [29]:
def preprocessGenres(df):
    
    # determine the unique genres contained within our dataset
    genres = df["genres"]
    genres = set(genres)
    
    genresDic = {}
    labelNb = 0
    
    # assign a label to each genre
    for genre in genres:
        genresDic[genre] = labelNb
        labelNb += 1
        
    # create a column and insert the value corresponding to the genre
    df["genre label"] = df["genres"].apply(lambda x: genresDic[x])
    
    return df, genresDic

In [30]:
df, genresDic = preprocessGenres(df)

In [31]:
genresDic

{'Science Fiction': 0,
 'Romance': 1,
 'Thriller': 2,
 'Western': 3,
 'Action': 4,
 'Documentary': 5,
 'Crime': 6,
 'Drama': 7,
 'Mystery': 8,
 'Fantasy': 9,
 'Foreign': 10,
 'Music': 11,
 'Horror': 12,
 'Family': 13,
 'Adventure': 14,
 'History': 15,
 'War': 16,
 'Animation': 17,
 'Comedy': 18}

In [32]:
df.head()

Unnamed: 0,original_title,overview,genres,genre label
0,Toy Story,led woody andy toys live happily room andy bir...,Animation,17
1,Jumanji,siblings judy peter discover enchanted board g...,Adventure,14
2,Grumpier Old Men,family wedding reignites ancient feud neighbor...,Romance,1
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,Comedy,18
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...,Comedy,18


# CLEAN

In [None]:
import numpy as np
import pandas as pd
from ast import literal_eval

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
def preprocessOverview(text):
    
    # tokenize text and set to lower case
    tokens = [x.strip().lower() for x in nltk.word_tokenize(text)]
    
     # get stopwords from nltk, remove them from our list as well as all punctuations and numbers
    stop_words = stopwords.words('english')
    output = [word for word in tokens if (word not in stop_words and word.isalpha())]
    
    return " ".join(output)

In [4]:
def preprocessGenres(df):
    # extract the names of the genres
    df["genres"] = df["genres"].fillna('[]').apply(literal_eval).apply(lambda x: [i["name"] for i in x] if isinstance(x, list) else [])

    # drop rows where no genre is specified # (45'466 --> 43'024)
    df.drop(df.loc[(df['genres'].str.len() == 0),:].copy().index, inplace = True)
    
    # keep relevant genres (more than 1000 appearances) (32 genres to 19)
    all_genres = df["genres"].tolist()
    full_list_genres = [genre for movie_genres in all_genres for genre in movie_genres] 
    freqdist = FreqDist(full_list_genres)
    over_1000 = sorted(w for w in set(full_list_genres) if freqdist[w] > 1000) 
    
    # keep only one genre per movie
    def set_genre(serie, over_1000):
        for genre in serie:
            if genre in over_1000:
                return genre
        return np.nan
    df["genres"] = df["genres"].apply(lambda x: set_genre(x, over_1000))
    
    # remove movies without relevant genre (43024, 5) --> (43014, 5)
    df = df.dropna()
    
    return df

In [5]:
def genresLabel(df):
    
    # determine the unique genres contained within our dataset
    genres = df["genres"]
    genres = set(genres)
    
    genresDic = {}
    labelNb = 0
    
    # assign a label to each genre
    for genre in genres:
        genresDic[genre] = labelNb
        labelNb += 1
        
    # create a column and insert the value corresponding to the genre
    df["genre label"] = df["genres"].apply(lambda x: genresDic[x])
    
    return df, genresDic

In [6]:
def preprocessDataset(df):
    
    # keep only the columns that are relevant to this application
    relCols = ['original_title', 'overview', 'genres']
    df = df[relCols].copy()      #(45466, 24) --> (45466, 3)
    
    # preprocess overview
    df["overview"] = df["overview"].astype(str).apply(lambda x: preprocessOverview(x))
    
    # preprocess genres label
    df = preprocessGenres(df)
    
    # give genres a label
    df, genresDic = genresLabel(df)
    
    return df, genresDic

In [7]:
df, genresDic = preprocessDataset(df)

In [11]:
def label_file(genresDic):
    genreLabel = pd.DataFrame.from_dict(genresDic, orient='index')
    genreLabel.reset_index(level=0, inplace=True)
    genreLabel.columns = ['genre', 'label']
    return genreLabel

In [12]:
genreLabel = label_file(genresDic)

In [13]:
genreLabel.head(len(genresDic))

Unnamed: 0,genre,label
0,Family,0
1,Fantasy,1
2,Comedy,2
3,History,3
4,Science Fiction,4
5,Thriller,5
6,Music,6
7,Action,7
8,Horror,8
9,Documentary,9


In [14]:
df.head(5)

Unnamed: 0,original_title,overview,genres,genre label
0,Toy Story,led woody andy toys live happily room andy bir...,Animation,13
1,Jumanji,siblings judy peter discover enchanted board g...,Adventure,18
2,Grumpier Old Men,family wedding reignites ancient feud neighbor...,Romance,15
3,Waiting to Exhale,cheated mistreated stepped women holding breat...,Comedy,2
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...,Comedy,2


In [15]:
df.to_csv('datasets/preprocessed.csv', index=False)
genreLabel.to_csv('datasets/genreLabels.csv', index = False)

In [16]:
allGenres = df['genres'].tolist()

freq = FreqDist(allGenres)

In [18]:
freq.most_common(50)

[('Drama', 12099),
 ('Comedy', 8869),
 ('Action', 4521),
 ('Documentary', 3428),
 ('Horror', 2634),
 ('Crime', 1707),
 ('Thriller', 1680),
 ('Adventure', 1522),
 ('Romance', 1220),
 ('Animation', 1131),
 ('Fantasy', 714),
 ('Science Fiction', 655),
 ('Mystery', 561),
 ('Family', 542),
 ('Music', 491),
 ('Western', 458),
 ('War', 381),
 ('History', 283),
 ('Foreign', 118)]