In [1]:
import pandas as pd

In [2]:
## Load the ratings (pickled DataFrame) and the raw movie-title records.
## NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
df_input = pd.read_pickle("netflix_prize.pkl")
## Forward slash keeps the path portable: the previous "netflix-prize\movie_titles.csv" relied on
## the unrecognised escape "\m" and only resolved as a directory separator on Windows.
## Each record ("id;year;title") is read into the single column 'movieID' and split later.
df_mt = pd.read_csv("netflix-prize/movie_titles.csv", header=None, names=["movieID"])

In [3]:
## Summarise the column dtypes and memory usage of the ratings frame
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   index       int64  
 1   userID      object 
 2   rating      float64
 3   ratingDate  object 
 4   movieID     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.7+ GB


In [4]:
## Preview the raw movie records before they are split into separate columns
df_mt.head()

Unnamed: 0,movieID
0,1;2003;Dinosaur Planet
1,2;2004;Isle of Man TT 2004 Review
2,3;1997;Character
3,4;1994;Paula Abdul's Get Up & Dance
4,5;2004;The Rise and Fall of ECW


### Splitting the contents of 'movieID' column to get movie ID, its release year and the title

In [5]:
## Split each raw "id;year;title" record into its three fields in a single pass.
## n=2 limits the split to the first two separators, so a title that itself contains ';'
## is kept whole instead of being truncated at its first ';'.
## A record with fewer than three fields yields a missing value in the absent column(s).
record_parts = df_mt['movieID'].str.split(";", n=2, expand=True)
df_mt["releaseYear"] = record_parts[1]
df_mt["title"] = record_parts[2]
df_mt["movieID"] = record_parts[0]

In [6]:
## Get the content type from the title (e.g. TV shows have ": Series " or ": Season " in the title)
## and strip that suffix from the title.
## The marker is matched case-insensitively on the original string; the title is cut at the marker
## itself rather than at the first ": s", which could occur earlier in the title.
tv_marker = r": (?:series|season) "
is_tv_show = df_mt["title"].str.contains(tv_marker, case=False, regex=True, na=False)
df_mt['type'] = is_tv_show.map({True: "tv show", False: "movie"})
## [\s\S]* removes everything from the marker to the end of the title; titles without a marker are unchanged
df_mt['title'] = df_mt['title'].str.replace(tv_marker + r"[\s\S]*", "", case=False, regex=True)
df_mt.head()

Unnamed: 0,movieID,releaseYear,title,type
0,1,2003,Dinosaur Planet,movie
1,2,2004,Isle of Man TT 2004 Review,movie
2,3,1997,Character,movie
3,4,1994,Paula Abdul's Get Up & Dance,movie
4,5,2004,The Rise and Fall of ECW,movie


In [7]:
## Preview the first ratings rows
df_input.head()

Unnamed: 0,index,userID,rating,ratingDate,movieID
0,1,1488844,3.0,2005-09-06,1
1,2,822109,5.0,2005-05-13,1
2,3,885013,4.0,2005-10-19,1
3,4,30878,4.0,2005-12-26,1
4,5,823519,3.0,2004-05-03,1


### Get keywords from movie title

In [8]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
 
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))


def extract_keywords(title):
    """Return the stemmed keywords of a title.

    The title is tokenised, tokens that are not purely alphabetic or that are
    English stop words are discarded, and the remaining tokens are stemmed.
    """
    return [
        ps.stem(word)
        for word in word_tokenize(title)
        if word.isalpha() and word.lower() not in stop_words
    ]


## Assign the whole column at once: the previous element-wise `df_mt['keywords'][i] = ...`
## was chained assignment (SettingWithCopyWarning; it writes nothing under pandas Copy-on-Write)
## and looked rows up by label assuming a 0..n-1 index.
df_mt['keywords'] = df_mt['title'].apply(extract_keywords)

In [9]:
## Inspect the first 20 titles together with their derived type and keywords
df_mt.head(20)

Unnamed: 0,movieID,releaseYear,title,type,keywords
0,1,2003,Dinosaur Planet,movie,"[dinosaur, planet]"
1,2,2004,Isle of Man TT 2004 Review,movie,"[isl, man, tt, review]"
2,3,1997,Character,movie,[charact]
3,4,1994,Paula Abdul's Get Up & Dance,movie,"[paula, abdul, get, danc]"
4,5,2004,The Rise and Fall of ECW,movie,"[rise, fall, ecw]"
5,6,1997,Sick,movie,[sick]
6,7,1992,8 Man,movie,[man]
7,8,2004,What the #$*! Do We Know!?,movie,[know]
8,9,1991,Class of Nuke 'Em High 2,movie,"[class, nuke, high]"
9,10,2001,Fighter,movie,[fighter]


### Merge the imported datasets into one based on movieID

In [10]:
## Attach the movie attributes to every rating.
## The join key is stated explicitly instead of relying on whichever column names happen to overlap,
## and validate="many_to_one" raises if a movieID occurs more than once in df_mt
## (which would silently duplicate ratings).
## The 'index' column of the ratings frame is dropped from the result.
df = (
    df_input
    .merge(df_mt, on="movieID", how="inner", validate="many_to_one")
    .drop(columns="index")
)

In [11]:
## Preview the merged frame
df.head()

Unnamed: 0,userID,rating,ratingDate,movieID,releaseYear,title,type,keywords
0,1488844,3.0,2005-09-06,1,2003,Dinosaur Planet,movie,"[dinosaur, planet]"
1,822109,5.0,2005-05-13,1,2003,Dinosaur Planet,movie,"[dinosaur, planet]"
2,885013,4.0,2005-10-19,1,2003,Dinosaur Planet,movie,"[dinosaur, planet]"
3,30878,4.0,2005-12-26,1,2003,Dinosaur Planet,movie,"[dinosaur, planet]"
4,823519,3.0,2004-05-03,1,2003,Dinosaur Planet,movie,"[dinosaur, planet]"


In [12]:
## Summarise the column dtypes and memory usage of the merged frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100480507 entries, 0 to 100480506
Data columns (total 8 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userID       object 
 1   rating       float64
 2   ratingDate   object 
 3   movieID      object 
 4   releaseYear  object 
 5   title        object 
 6   type         object 
 7   keywords     object 
dtypes: float64(1), object(7)
memory usage: 6.7+ GB


### How many NaN values in each column?

In [13]:
## Report the number of missing values column by column
for column, values in df.items():
    missing_count = values.isna().sum()
    print("NaN sum in column:", column, "equals", missing_count)

NaN sum in column: userID equals 0
NaN sum in column: rating equals 0
NaN sum in column: ratingDate equals 0
NaN sum in column: movieID equals 0
NaN sum in column: releaseYear equals 0
NaN sum in column: title equals 0
NaN sum in column: type equals 0
NaN sum in column: keywords equals 0


In [14]:
## Columns expected to hold only numeric values (checked with pd.to_numeric below)
numeric_cols = ['userID', 'rating', 'movieID', 'releaseYear']

In [15]:
## Are all values in numeric columns numbers?
## Values that cannot be parsed become NaN under errors='coerce', so the NaN count
## is the number of non-numeric entries in the column.

for column in numeric_cols:
    as_number = pd.to_numeric(df[column], errors='coerce')
    unparsable_count = as_number.isna().sum()
    print("NaN sum in column when numeric:", column, "equals", unparsable_count)

NaN sum in column when numeric: userID equals 0
NaN sum in column when numeric: rating equals 0
NaN sum in column when numeric: movieID equals 0
NaN sum in column when numeric: releaseYear equals 965


In [16]:
## Which rows contain non-numeric values in column 'releaseYear'?
## year_index holds the index labels of those rows.

release_year_as_number = pd.to_numeric(df['releaseYear'], errors='coerce')
year_is_missing = release_year_as_number.isna().to_numpy()
year_index = list(df.index[year_is_missing])
len(year_index)

965

In [17]:
## Which titles are missing the release year, and how many rating rows does each one have?

df.loc[year_index, 'title'].value_counts()

Ancient Civilizations: Athens and Greece       195
Ancient Civilizations: Rome and Pompeii        189
Jimmy Hollywood                                189
Eros Dance Dhamaka                             116
Ancient Civilizations: Land of the Pharaohs    113
Hote Hote Pyaar Ho Gaya                         88
Roti Kapada Aur Makaan                          75
Name: title, dtype: int64

In [18]:
## Spot-check 10 randomly sampled rows among those flagged in year_index
## (no random_state is passed, so the sampled rows differ between runs)
df.loc[year_index, :].sample(n=10)

Unnamed: 0,userID,rating,ratingDate,movieID,releaseYear,title,type,keywords
93991314,2439493,1.0,2005-03-10,16678,,Jimmy Hollywood,movie,"[jimmi, hollywood]"
58726816,2195326,3.0,2000-12-09,10782,,Roti Kapada Aur Makaan,movie,"[roti, kapada, aur, makaan]"
23435151,2126435,3.0,2004-08-21,4388,,Ancient Civilizations: Rome and Pompeii,movie,"[ancient, civil, rome, pompeii]"
23435071,573434,3.0,2005-09-28,4388,,Ancient Civilizations: Rome and Pompeii,movie,"[ancient, civil, rome, pompeii]"
93991401,305344,1.0,2004-10-15,16678,,Jimmy Hollywood,movie,"[jimmi, hollywood]"
41122100,2242431,4.0,2003-11-20,7241,,Ancient Civilizations: Athens and Greece,movie,"[ancient, civil, athen, greec]"
23435177,30298,1.0,2004-05-25,4388,,Ancient Civilizations: Rome and Pompeii,movie,"[ancient, civil, rome, pompeii]"
58726847,1514144,4.0,2004-09-20,10782,,Roti Kapada Aur Makaan,movie,"[roti, kapada, aur, makaan]"
89557170,724374,2.0,2005-10-29,15918,,Hote Hote Pyaar Ho Gaya,movie,"[hote, hote, pyaar, ho, gaya]"
58726795,1282495,3.0,2004-03-25,10782,,Roti Kapada Aur Makaan,movie,"[roti, kapada, aur, makaan]"


In [19]:
## Get the titles of movies with a missing release year, with their rating counts.
## The column names are pinned explicitly ('index' = title, 'title' = count) because the names
## produced by value_counts().reset_index() differ between pandas versions (pandas >= 2.0 yields
## 'title'/'count'), which would break the later null_ids['index'] lookups.

null_ids = (
    df.loc[year_index, 'title']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='title')
)

In [20]:
## Set the release year from manually looked-up (Google) information.
## Titles are named explicitly instead of being picked by their position in the value_counts
## ordering: two of the affected titles have the same count (189), so their relative position
## is not guaranteed and a positional lookup could assign a year to the wrong movie.
## NOTE(review): rows are matched by title, so any other movie sharing one of these titles
## would be overwritten as well - confirm the titles are unique.

known_release_years = {
    "Jimmy Hollywood": 1994,
    "Eros Dance Dhamaka": 1999,
    "Hote Hote Pyaar Ho Gaya": 1999,
    "Roti Kapada Aur Makaan": 1974,
}

for known_title, known_year in known_release_years.items():
    df.loc[df['title'] == known_title, 'releaseYear'] = known_year

In [21]:
## Set the release year based on the earliest rating date corresponding to the movie.
## Used for the titles that have no manually looked-up year; they are named explicitly because
## their position in the value_counts ordering is not stable (two titles tie on count).
## NOTE(review): 'releaseYear' stays an object column that now mixes the original string values
## with the integers written here - consider converting it to a numeric dtype before saving.

df['ratingDate'] = pd.to_datetime(df['ratingDate'])

titles_dated_by_first_rating = [
    "Ancient Civilizations: Athens and Greece",
    "Ancient Civilizations: Rome and Pompeii",
    "Ancient Civilizations: Land of the Pharaohs",
]

for movie in titles_dated_by_first_rating:
    ## Build the row mask once per title and use the vectorised Series.min()
    is_movie = df['title'] == movie
    df.loc[is_movie, 'releaseYear'] = df.loc[is_movie, 'ratingDate'].min().year

In [22]:
## Re-sample the rows flagged in year_index to check the filled-in release years
## (no random_state is passed, so the sampled rows differ between runs)
df.loc[year_index, :].sample(n=10)

Unnamed: 0,userID,rating,ratingDate,movieID,releaseYear,title,type,keywords
100076064,780058,5.0,2004-10-13,17667,1999,Eros Dance Dhamaka,movie,"[ero, danc, dhamaka]"
23435021,2493000,2.0,2005-11-14,4388,2001,Ancient Civilizations: Rome and Pompeii,movie,"[ancient, civil, rome, pompeii]"
41122031,2082114,3.0,2002-08-14,7241,2001,Ancient Civilizations: Athens and Greece,movie,"[ancient, civil, athen, greec]"
25799151,851671,3.0,2003-04-01,4794,2001,Ancient Civilizations: Land of the Pharaohs,movie,"[ancient, civil, land, pharaoh]"
41121953,1048128,3.0,2003-10-06,7241,2001,Ancient Civilizations: Athens and Greece,movie,"[ancient, civil, athen, greec]"
93991317,1824389,2.0,2005-12-21,16678,1994,Jimmy Hollywood,movie,"[jimmi, hollywood]"
58726798,1153564,4.0,2000-12-28,10782,1974,Roti Kapada Aur Makaan,movie,"[roti, kapada, aur, makaan]"
58726787,472483,1.0,2005-08-03,10782,1974,Roti Kapada Aur Makaan,movie,"[roti, kapada, aur, makaan]"
23435154,1415862,3.0,2003-04-23,4388,2001,Ancient Civilizations: Rome and Pompeii,movie,"[ancient, civil, rome, pompeii]"
93991311,2588107,3.0,2005-05-17,16678,1994,Jimmy Hollywood,movie,"[jimmi, hollywood]"


### Save the cleaned data to pickle

In [23]:
## Write the merged, cleaned frame to a pickle file (path is relative to the working directory)
df.to_pickle("input_data.pkl")