In [1]:
import pandas as pd
from math import sqrt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input, Dense, Activation, Embedding, Flatten, Reshape, Dropout, Lambda
from keras.layers.merge import Concatenate, Dot
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam, SGD, RMSprop
from keras.regularizers import l2
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error
from sklearn.model_selection import StratifiedShuffleSplit

Using TensorFlow backend.


## Read Data

In [2]:
# read movie data
moviedf = pd.read_csv('Data/movies.csv')
moviedf.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# read rating data
ratingdf = pd.read_csv('Data/ratings.csv')
ratingdf.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Preprocess

#### - Movie Table

In [4]:
# move year from title column into year column
moviedf['year'] = moviedf.title.str.extract('(\(\d\d\d\d\))', expand = False)
moviedf['year'] = moviedf.year.str.extract('(\d\d\d\d)', expand = False)
moviedf['title'] = moviedf.title.str.replace('(\(\d\d\d\d\))', '')
moviedf['title'] = moviedf['title'].apply(lambda x: x.strip())
moviedf.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [5]:
# split genres data
moviedf['genres'] = moviedf.genres.str.split('|')
moviedf.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [6]:
# make every genre from genres into column
movieGenres = moviedf.copy()
for i, row in moviedf.iterrows():
    for genre in row['genres']:
        movieGenres.at[i, genre] = 1

movieGenres = movieGenres.fillna(0)
movieGenres.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### - Rating Table 

In [7]:
# remove timestap column
ratingdf = ratingdf.drop('timestamp', 1)
ratingdf.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
# copy rating dafatframe
rating = ratingdf.copy()
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [9]:
#find the amount of user id
user = rating['userId'].unique()
len(user)

610

In [10]:
#find the amount of movie id
movie = rating['movieId'].unique()
len(movie)

9724

## Collaborative Filtering (Deep Learning)

In [11]:
rating_group = rating.groupby('userId')['rating'].count()
top_users = rating_group.sort_values(ascending=False)[:15]
rating_group = rating.groupby('movieId')['rating'].count()
top_movies = rating_group.sort_values(ascending=False)[:15]
top_r = rating.join(top_users, rsuffix='_r', how='inner', on='userId')
top_r = top_r.join(top_movies, rsuffix='_r', how='inner', on='movieId')
pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)

movieId,1,50,110,260,296,318,356,480,527,589,593,1196,2571,2858,2959
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
68,2.5,3.0,2.5,5.0,2.0,3.0,3.5,3.5,4.0,3.5,3.5,5.0,4.5,5.0,2.5
182,4.0,4.5,3.5,3.5,5.0,4.5,5.0,3.5,4.0,2.0,4.5,3.0,5.0,5.0,5.0
249,4.0,4.0,5.0,5.0,4.0,4.5,4.5,4.0,4.5,4.0,4.0,5.0,5.0,4.5,5.0
274,4.0,4.0,4.5,3.0,5.0,4.5,4.5,3.5,4.0,4.5,4.0,4.5,4.0,5.0,5.0
288,4.5,,5.0,5.0,5.0,5.0,5.0,2.0,5.0,4.0,5.0,4.5,3.0,,3.5
307,4.0,4.5,3.5,3.5,4.5,4.5,4.0,3.5,4.5,2.5,4.5,3.0,3.5,4.0,4.0
380,5.0,4.0,4.0,5.0,5.0,3.0,5.0,5.0,,5.0,5.0,5.0,4.5,,4.0
387,,4.5,3.5,4.5,5.0,3.5,4.0,3.0,,3.5,4.0,4.5,4.0,4.5,4.5
414,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0
448,5.0,4.0,,5.0,5.0,,3.0,3.0,,3.0,5.0,5.0,2.0,4.0,4.0


In [12]:
encoder = LabelEncoder()
rating['user'] = encoder.fit_transform(rating['userId'].values)
users = rating['user'].nunique()
print("Total user : ",users)

Total user :  610


In [13]:
rating['movie'] = encoder.fit_transform(rating['movieId'].values)
rating
movies = rating['movie'].nunique()
print("Total movie : ",movies)

Total movie :  9724


In [14]:
rating.head()

Unnamed: 0,userId,movieId,rating,user,movie
0,1,1,4.0,0,0
1,1,3,4.0,0,2
2,1,6,4.0,0,5
3,1,47,5.0,0,43
4,1,50,5.0,0,46


In [15]:
rating['rating'] = rating['rating'].values.astype(np.float32)
min_rating = min(rating['rating'])
max_rating = max(rating['rating'])
ratings = rating['rating'].nunique()
print("Total rating : ",ratings)
print("Min-rating : ",min_rating, ", Max-rating : ",max_rating)

Total rating :  10
Min-rating :  0.5 , Max-rating :  5.0


In [29]:
x_user = rating['user'].values
x_movie = rating['movie'].values
y = rating['rating'].values
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=10)
for train_index, test_index in sss.split(x_user, y):
    x_train_user, x_test_user = x_user[train_index], x_user[test_index]
    x_train_movie, x_test_movie = x_movie[train_index], x_movie[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [30]:
user = Input(shape=(1,))
embd1 = Embedding(input_dim = users, output_dim = 20)(user)
dpe1 = Dropout(0.3)(embd1)
flat1 = Flatten()(dpe1)

movie = Input(shape=(1,))
embd2 = Embedding(input_dim = movies, output_dim = 20)(movie)
# embd2 = Reshape((20,))(embd2)
dpe2 = Dropout(0.3)(embd2)
flat2 = Flatten()(dpe2)

con = Concatenate()([flat1, flat2])
dp = Dropout(0.4)(con)
dense1 = Dense(64, activation='relu')(dp)
dp2 = Dropout(0.2)(dense1)
dense2 = Dense(32, activation='relu')(dp2)
dense3 = Dense(16, activation='relu')(dense2)
output = Dense(1, activation='sigmoid')(dense3)
output = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(output)
model = Model(inputs=[user, movie], outputs=output)
model.summary()
model.compile(optimizer= 'adam', loss='mean_squared_error')
model.fit([x_train_user, x_train_movie], y_train, batch_size=128, epochs=30, verbose=2, validation_data = ([x_test_user, x_test_movie], y_test))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 20)        12200       input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 20)        194480      input_4[0][0]                    
__________________________________________________________________________________________________
dropout_5 

<keras.callbacks.History at 0x2891d9ae208>

In [31]:
y_pred = model.predict([x_test_user, x_test_movie])
y_pred

array([[2.7275114],
       [3.8686657],
       [4.1327963],
       ...,
       [3.3556085],
       [4.2411118],
       [3.6109815]], dtype=float32)

In [32]:
print("MAE : ", mean_absolute_error(y_test, y_pred))

MAE :  0.65910566


## Content Based Filtering

In [20]:
userInput = [
            {'title':'Waiting to Exhale', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Waiting to Exhale,5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [21]:
inputId = movieGenres[movieGenres['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5


In [22]:
genreTable = movieGenres[~movieGenres.title.isin(inputMovies.title)]
genreTable

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat,"[Action, Crime, Thriller]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck,"[Adventure, Children]",1995,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,193585,Flint,[Drama],2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
inputMovies = inputMovies.reset_index(drop=True)
inputMovies = inputMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1).drop('rating', 1)
inputMovies

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
genreTable = movieGenres.reset_index(drop=True)
genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
genreTable.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
x = cosine_similarity(genreTable, inputMovies)

In [26]:
similarity = pd.DataFrame(x).sum(axis=1)

In [27]:
similarity = similarity.sort_values(ascending=False)
similarity.head(10)

8900    2.907879
9169    2.725305
8349    2.725305
7530    2.703755
3194    2.703755
7805    2.703755
8927    2.703616
7760    2.703616
6486    2.703616
9430    2.703616
dtype: float64

In [28]:
moviedf.loc[moviedf.index.isin(similarity.head(10).keys())]

Unnamed: 0,movieId,title,genres,year
3194,4306,Shrek,"[Adventure, Animation, Children, Comedy, Fanta...",2001
6486,53121,Shrek the Third,"[Adventure, Animation, Children, Comedy, Fantasy]",2007
7530,84637,Gnomeo & Juliet,"[Adventure, Animation, Children, Comedy, Fanta...",2011
7760,91355,Asterix and the Vikings (Astérix et les Vikings),"[Adventure, Animation, Children, Comedy, Fantasy]",2006
7805,92348,Puss in Boots (Nagagutsu o haita neko),"[Adventure, Animation, Children, Comedy, Fanta...",1969
8349,108540,Ernest & Célestine (Ernest et Célestine),"[Adventure, Animation, Children, Comedy, Drama...",2012
8900,134853,Inside Out,"[Adventure, Animation, Children, Comedy, Drama...",2015
8927,136016,The Good Dinosaur,"[Adventure, Animation, Children, Comedy, Fantasy]",2015
9169,148775,Wizards of Waverly Place: The Movie,"[Adventure, Children, Comedy, Drama, Fantasy, ...",2009
9430,166461,Moana,"[Adventure, Animation, Children, Comedy, Fantasy]",2016
