In [1]:
import pandas as pd

In [2]:
df_input = pd.read_pickle("netflix_prize.pkl")
df_mt = pd.read_pickle("data_genres.pkl")

In [3]:
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   index       int64  
 1   userID      object 
 2   rating      float64
 3   ratingDate  object 
 4   movieID     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.7+ GB


In [4]:
df_mt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieID        17770 non-null  object 
 1   releaseYear    17770 non-null  int64  
 2   title          17770 non-null  object 
 3   type           17770 non-null  object 
 4   genres         11881 non-null  object 
 5   averageRating  17770 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 833.1+ KB


In [5]:
df_mt.head()

Unnamed: 0,movieID,releaseYear,title,type,genres,averageRating
0,1,2003,Dinosaur Planet,movie,,0.0
1,10,2001,Fighter,movie,Documentary,7.3
2,100,1993,Sam the Iron Bridge,movie,,0.0
3,1000,2002,Jim Breuer: Hardcore,movie,"Comedy,Documentary",7.2
4,10000,1995,Dr. Jekyll and Ms. Hyde,movie,"Comedy,Romance,Sci-Fi",4.5


### Get keywords from movie title

In [6]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
 
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

df_mt['keywords'] = ""

for i in range(len(df_mt)):
    words = word_tokenize(df_mt['title'][i])
    filtered_words = [word for word in words if word.isalpha() and not word.lower() in stop_words]
    df_mt['keywords'][i] = [ps.stem(word) for word in filtered_words]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mt['keywords'][i] = [ps.stem(word) for word in filtered_words]


In [7]:
df_mt.head(20)

Unnamed: 0,movieID,releaseYear,title,type,genres,averageRating,keywords
0,1,2003,Dinosaur Planet,movie,,0.0,"[dinosaur, planet]"
1,10,2001,Fighter,movie,Documentary,7.3,[fighter]
2,100,1993,Sam the Iron Bridge,movie,,0.0,"[sam, iron, bridg]"
3,1000,2002,Jim Breuer: Hardcore,movie,"Comedy,Documentary",7.2,"[jim, breuer, hardcor]"
4,10000,1995,Dr. Jekyll and Ms. Hyde,movie,"Comedy,Romance,Sci-Fi",4.5,"[jekyl, hyde]"
5,10001,1994,Chef!,tv show,Comedy,8.1,[chef]
6,10002,1937,The Three Stooges: Dizzy Doctors,movie,,0.0,"[three, stoog, dizzi, doctor]"
7,10003,1981,Charlie Chan and the Curse of the Dragon Queen,movie,"Comedy,Mystery",4.0,"[charli, chan, curs, dragon, queen]"
8,10004,1995,Ninja Scroll,movie,"Action,Adventure,Animation",7.8,"[ninja, scroll]"
9,10005,2004,Typhoid Mary: The Most Dangerous Woman in Amer...,movie,,0.0,"[typhoid, mari, danger, woman, america, nova]"


### Merge the imported datasets into one based on movieID

In [8]:
df = df_input.merge(df_mt)
df.drop('index', axis = 1, inplace=True)

In [9]:
df.head()

Unnamed: 0,userID,rating,ratingDate,movieID,releaseYear,title,type,genres,averageRating,keywords
0,1488844,3.0,2005-09-06,1,2003,Dinosaur Planet,movie,,0.0,"[dinosaur, planet]"
1,822109,5.0,2005-05-13,1,2003,Dinosaur Planet,movie,,0.0,"[dinosaur, planet]"
2,885013,4.0,2005-10-19,1,2003,Dinosaur Planet,movie,,0.0,"[dinosaur, planet]"
3,30878,4.0,2005-12-26,1,2003,Dinosaur Planet,movie,,0.0,"[dinosaur, planet]"
4,823519,3.0,2004-05-03,1,2003,Dinosaur Planet,movie,,0.0,"[dinosaur, planet]"


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100480507 entries, 0 to 100480506
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   userID         object 
 1   rating         float64
 2   ratingDate     object 
 3   movieID        object 
 4   releaseYear    int64  
 5   title          object 
 6   type           object 
 7   genres         object 
 8   averageRating  float64
 9   keywords       object 
dtypes: float64(2), int64(1), object(7)
memory usage: 8.2+ GB


### How many NaN values in each column?

In [11]:
for column in df.columns:
    print("NaN sum in column:", column, "equals", df[column].isna().sum())

NaN sum in column: userID equals 0
NaN sum in column: rating equals 0
NaN sum in column: ratingDate equals 0
NaN sum in column: movieID equals 0
NaN sum in column: releaseYear equals 0
NaN sum in column: title equals 0
NaN sum in column: type equals 0
NaN sum in column: genres equals 12293249
NaN sum in column: averageRating equals 0
NaN sum in column: keywords equals 0


In [12]:
12293249/len(df)

0.12234461555812014

In [13]:
numeric_cols = ['userID', 'rating', 'movieID', 'releaseYear', 'averageRating']

In [14]:
## Are all values in numeric columns numbers?

for column in numeric_cols:
    print("NaN sum in column when numeric:", column, "equals", pd.to_numeric(df[column], errors='coerce').isna().sum())

NaN sum in column when numeric: userID equals 0
NaN sum in column when numeric: rating equals 0
NaN sum in column when numeric: movieID equals 0
NaN sum in column when numeric: releaseYear equals 0
NaN sum in column when numeric: averageRating equals 0


In [15]:
## Which rows in column 'releaseYear' have invalid value (-999, set in genres_data_prep.py)?

year_index = list(df.loc[df['releaseYear'] == -999, :].index)
len(year_index)

965

In [16]:
## Which titles have the '-999'?

df.loc[year_index, 'title'].value_counts()

Ancient Civilizations: Athens and Greece       195
Ancient Civilizations: Rome and Pompeii        189
Jimmy Hollywood                                189
Eros Dance Dhamaka                             116
Ancient Civilizations: Land of the Pharaohs    113
Hote Hote Pyaar Ho Gaya                         88
Roti Kapada Aur Makaan                          75
Name: title, dtype: int64

In [17]:
df.loc[year_index, :].sample(n=10)

Unnamed: 0,userID,rating,ratingDate,movieID,releaseYear,title,type,genres,averageRating,keywords
41121955,1118119,4.0,2004-12-20,7241,-999,Ancient Civilizations: Athens and Greece,movie,,0.0,"[ancient, civil, athen, greec]"
41122099,1211556,3.0,2005-07-13,7241,-999,Ancient Civilizations: Athens and Greece,movie,,0.0,"[ancient, civil, athen, greec]"
93991386,1014939,3.0,2005-11-02,16678,-999,Jimmy Hollywood,movie,"Comedy,Crime,Drama",5.3,"[jimmi, hollywood]"
41122043,319644,2.0,2004-11-29,7241,-999,Ancient Civilizations: Athens and Greece,movie,,0.0,"[ancient, civil, athen, greec]"
41121972,2644677,3.0,2005-04-24,7241,-999,Ancient Civilizations: Athens and Greece,movie,,0.0,"[ancient, civil, athen, greec]"
58726782,970031,2.0,2004-04-14,10782,-999,Roti Kapada Aur Makaan,movie,"Action,Drama,Musical",6.7,"[roti, kapada, aur, makaan]"
41122094,893184,4.0,2003-09-05,7241,-999,Ancient Civilizations: Athens and Greece,movie,,0.0,"[ancient, civil, athen, greec]"
93991428,2056022,3.0,2004-11-22,16678,-999,Jimmy Hollywood,movie,"Comedy,Crime,Drama",5.3,"[jimmi, hollywood]"
23435161,1879238,3.0,2004-11-29,4388,-999,Ancient Civilizations: Rome and Pompeii,movie,,0.0,"[ancient, civil, rome, pompeii]"
23435077,328791,2.0,2003-01-22,4388,-999,Ancient Civilizations: Rome and Pompeii,movie,,0.0,"[ancient, civil, rome, pompeii]"


In [18]:
## Get the titles of movies

null_ids = df.loc[year_index, 'title'].value_counts()
null_ids = null_ids.reset_index()

In [19]:
## Set the release year value on Google-based information

df.loc[df.title == null_ids['index'].unique()[2], 'releaseYear'] = 1994
df.loc[df.title == null_ids['index'].unique()[3], 'releaseYear'] = 1999
df.loc[df.title == null_ids['index'].unique()[5], 'releaseYear'] = 1999
df.loc[df.title == null_ids['index'].unique()[6], 'releaseYear'] = 1974

In [20]:
## Set the release year based on the earliest rating date corresponding to the movie

df.ratingDate = pd.to_datetime(df['ratingDate'])

for movie in null_ids['index'].unique()[[0,1,4]]:
    df.loc[df.title == movie, 'releaseYear'] = min(df.loc[df['title'] == movie, 'ratingDate']).year

In [21]:
df.loc[year_index, :].sample(n=10)

Unnamed: 0,userID,rating,ratingDate,movieID,releaseYear,title,type,genres,averageRating,keywords
100076108,1534243,3.0,2004-02-23,17667,1999,Eros Dance Dhamaka,movie,,0.0,"[ero, danc, dhamaka]"
89557122,2360109,3.0,2005-04-03,15918,1999,Hote Hote Pyaar Ho Gaya,movie,,0.0,"[hote, hote, pyaar, ho, gaya]"
58726834,1929178,4.0,2005-04-10,10782,1974,Roti Kapada Aur Makaan,movie,"Action,Drama,Musical",6.7,"[roti, kapada, aur, makaan]"
100076076,117368,3.0,2004-07-07,17667,1999,Eros Dance Dhamaka,movie,,0.0,"[ero, danc, dhamaka]"
25799167,21722,3.0,2005-07-20,4794,2001,Ancient Civilizations: Land of the Pharaohs,movie,,0.0,"[ancient, civil, land, pharaoh]"
93991454,1654721,3.0,2005-11-20,16678,1994,Jimmy Hollywood,movie,"Comedy,Crime,Drama",5.3,"[jimmi, hollywood]"
23435007,1477923,4.0,2004-09-02,4388,2001,Ancient Civilizations: Rome and Pompeii,movie,,0.0,"[ancient, civil, rome, pompeii]"
23435090,1662097,3.0,2004-10-08,4388,2001,Ancient Civilizations: Rome and Pompeii,movie,,0.0,"[ancient, civil, rome, pompeii]"
100076018,2176303,4.0,2005-05-01,17667,1999,Eros Dance Dhamaka,movie,,0.0,"[ero, danc, dhamaka]"
23435148,945068,3.0,2005-05-27,4388,2001,Ancient Civilizations: Rome and Pompeii,movie,,0.0,"[ancient, civil, rome, pompeii]"


### Save the cleaned data to pickle

In [22]:
cols = df.columns.tolist()
cols.remove('rating')
cols.append('rating')
cols

['userID',
 'ratingDate',
 'movieID',
 'releaseYear',
 'title',
 'type',
 'genres',
 'averageRating',
 'keywords',
 'rating']

In [23]:
df = df[cols]

In [24]:
df.to_pickle("input_data.pkl")