In [285]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from sklearn.model_selection import train_test_split

In [9]:
# Directory that holds the MovieLens CSV files read by the cells below.
path = "./datasets/movielens"

In [10]:
# List what is available in the dataset directory before loading anything.
os.listdir(path=path)

['links.csv',
 'tags.csv',
 'ratings.csv',
 'README.txt',
 '.ipynb_checkpoints',
 'movies.csv']

## ratings

In [79]:
# Load the user/movie ratings table from the dataset directory.
ratings_df = pd.read_csv(
    filepath_or_buffer=os.path.join(path, "ratings.csv"),
    encoding="utf-8",
)

In [80]:
# Peek at the first five raw rating rows.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [223]:
# The rating timestamp is not used as a feature, so remove it in place.
ratings_df.drop(columns=['timestamp'], inplace=True)

In [224]:
# Confirm that only userId, movieId and rating remain.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


## movies

In [186]:
# Load the movie metadata, using movieId as the row index.
movies_df = pd.read_csv(
    filepath_or_buffer=os.path.join(path, "movies.csv"),
    index_col="movieId",
    encoding="utf-8",
)

In [187]:
# Show the movies frame (indexed by movieId) with its title and
# pipe-separated genres columns.
movies_df

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [188]:
# (rows, columns) of the raw movies frame.
movies_df.shape

(9742, 2)

In [189]:
# Count missing values per column before deriving features from them.
movies_df.isnull().sum()

title     0
genres    0
dtype: int64

### movies-genre

In [190]:
# Expand the pipe-separated genres string into one 0/1 indicator column per
# genre; the result keeps the movieId index of movies_df.
genre_df = movies_df.genres.str.get_dummies(sep='|')

In [191]:
# Show the per-genre indicator columns produced from movies_df['genres'].
genre_df

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193583,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
193587,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [193]:
# Attach the genre indicator columns to the movie metadata, aligned on the
# shared movieId index.
movies_df = pd.concat(objs=[movies_df, genre_df], axis="columns")

In [196]:
# The raw genres string is now redundant with the indicator columns.
movies_df.drop(columns=["genres"], inplace=True)

In [197]:
# Check the first five movies after the genre expansion.
movies_df.head(n=5)

Unnamed: 0_level_0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### movies-year

In [212]:
# Extract the parenthesised four-digit release year, e.g. "(1995)", from the
# title. The pattern is a raw string so that `\(` and `\d` reach the regex
# engine unchanged instead of being parsed as (invalid) Python string escapes.
# expand=False makes str.extract return a Series for the single capture group.
# The parentheses are kept here; they are stripped in a later cell.
movies_df['year'] = movies_df["title"].str.extract(r'(\(\d{4}\))', expand=False)

In [228]:
# Inspect the first five movies, including the year column.
movies_df.head(n=5)

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [215]:
# Remove the surrounding parentheses from the extracted year ("(1995)" -> "1995").
# The vectorised .str accessor is used instead of apply(str(...)) so that a
# missing year stays missing rather than becoming the literal string 'nan'.
# regex=False treats '(' and ')' as plain characters.
movies_df['year'] = (
    movies_df['year']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [216]:
# Verify that the year column no longer carries parentheses.
movies_df.head(n=5)

Unnamed: 0_level_0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [218]:
# The year has been extracted, so the free-text title is dropped in place.
movies_df.drop(columns=['title'], inplace=True)

In [219]:
# Show the movies frame after dropping title: genre indicators plus year,
# still indexed by movieId.
movies_df

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
2,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1995
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,1,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2017
193583,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2017
193585,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2017
193587,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2018


In [226]:
# Turn the movieId index back into a regular column so it can serve as the
# merge key with ratings_df.
movies_df = movies_df.reset_index(drop=False)

In [227]:
# Show the movies frame with movieId as an ordinary column.
movies_df

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2017
9738,193583,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2017
9739,193585,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2017
9740,193587,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2018


In [231]:
# (rows, columns) of the ratings frame going into the merge.
ratings_df.shape

(100836, 3)

In [232]:
# (rows, columns) of the movie feature frame going into the merge.
movies_df.shape

(9742, 22)

In [233]:
# Attach each rated movie's genre/year features to its rating row; the inner
# join keeps only ratings whose movieId is present in movies_df.
feature_vector = ratings_df.merge(movies_df, on="movieId", how="inner")

In [254]:
# Show the merged frame: one row per rating with the movie's features attached.
feature_vector

Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,1,4.0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1995
1,5,1,4.0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1995
2,7,1,4.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1995
3,15,1,2.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1995
4,17,1,4.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1997
100832,610,160527,4.5,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1971
100833,610,160836,3.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,2005
100834,610,163937,3.5,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,2016


In [259]:
# One-hot encode the user id: one "user_<id>" column per distinct userId.
user_onehot = pd.get_dummies(data=feature_vector['userId'], prefix='user')

In [265]:
# (rows, columns) of the user one-hot block; columns = number of distinct users.
user_onehot.shape

(100836, 610)

In [261]:
# One-hot encode the movie id: one "movie_<id>" column per distinct movieId.
item_onehot = pd.get_dummies(data=feature_vector['movieId'], prefix='movie')

In [266]:
# (rows, columns) of the movie one-hot block; columns = number of distinct rated movies.
item_onehot.shape

(100836, 9724)

In [274]:
# Place the merged features next to the user and movie one-hot blocks, then
# remove the raw id columns that the one-hot blocks replace.
concated_feature_vector = pd.concat(
    [feature_vector, user_onehot, item_onehot], axis="columns"
).drop(columns=["userId", "movieId"])

## final feature vector

```
user one-hot 610 + item one-hot 9724 + genre columns 20 + year 1 + rating 1
== 10356 columns (10355 feature columns once the rating target is split off)
```

In [276]:
# Expected column count of the combined frame:
# user one-hot, movie one-hot, genre indicators, year, rating.
sum((610, 9724, 20, 1, 1))

10356

In [275]:
# Show the combined frame: rating, genre indicators, year and both one-hot blocks.
concated_feature_vector

Unnamed: 0,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,movie_193565,movie_193567,movie_193571,movie_193573,movie_193579,movie_193581,movie_193583,movie_193585,movie_193587,movie_193609
0,4.0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.5,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.5,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.5,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,2.5,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100832,4.5,0,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
100833,3.0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
100834,3.5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### target

In [280]:
# Take the rating column as the prediction target before it is removed from
# the feature frame.
target_rating = concated_feature_vector.loc[:, "rating"]

In [281]:
# Show the target Series (one rating per row of the feature frame).
target_rating

0         4.0
1         4.0
2         4.5
3         2.5
4         4.5
         ... 
100831    2.5
100832    4.5
100833    3.0
100834    3.5
100835    3.5
Name: rating, Length: 100836, dtype: float64

In [283]:
# Remove the target from the feature frame so it cannot leak into the inputs.
concated_feature_vector.drop(columns=["rating"], inplace=True)

In [284]:
# Show the final feature frame with the rating column removed.
concated_feature_vector

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,movie_193565,movie_193567,movie_193571,movie_193573,movie_193579,movie_193581,movie_193583,movie_193585,movie_193587,movie_193609
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100832,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
100833,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
100834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [286]:
# Hold out 10% of the rows for testing. random_state is fixed so the shuffle,
# and therefore the train/test membership, is identical on every re-run.
X_train, X_test, y_train, y_test = train_test_split(
    concated_feature_vector,
    target_rating,
    test_size=0.1,
    random_state=42,
)

In [287]:
# (rows, columns) of the training features.
X_train.shape

(90752, 10355)

In [288]:
# (rows, columns) of the held-out test features.
X_test.shape

(10084, 10355)