# Building a recommender system using the movie dataset
In this project, we build a recommender system by joining the **latent factors** for each user and movie. latent factors are like word embeddings in that each user and each movie has its own latent factors. These factors are updated through fitting data to a fuction that maps both latent factors to the rating variable.

In [94]:
import os
import glob
import numpy as np
import pandas as pd
from keras.layers import Input, Dense, merge, Embedding, Dot, dot, concatenate, Flatten, add
from keras.models import Model
from keras.optimizers import Adam, SGD
from keras.utils import to_categorical

data_dir = os.path.join(os.path.expanduser('~'),'school/data/ml-latest-small')

### dataset contains each user's ratings for movies

In [9]:
# load data
data = pd.read_csv(data_dir + '/ratings.csv')
data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [10]:
print("there are {} ratings in total".format(data.shape[0]))

there are 100004 ratings in total


In [28]:
max_userid = max(data.userId.unique())
max_movid = max(data.movieId.unique())
print("max userid: ", max_userid)
print("max movieid: ", max_movid)
print("total unique userids: ", len(data.userId.unique()))
print("total unique movieids: ", len(data.movieId.unique()))

max userid:  671
max movieid:  163949
total unique userids:  671
total unique movieids:  9066


In [30]:
user_dict = {k:v for v,k in enumerate(data.userId.unique())}
movie_dict = {k:v for v,k in enumerate(data.movieId.unique())}

In [38]:
def look_up(array, dictionary):
    """return array with values looked up from dictionary
    """
    return np.array([dictionary[x] for x in array])

# postprocessed data
user_p = look_up(data.userId.values, user_dict)
movie_p = look_up(data.movieId.values, movie_dict)
# postprocessed predictors
data_p1 = np.column_stack([user_p, movie_p])
# postprocessed responses
data_p2 = data.rating.values

In [42]:
data_p2[train_idx].shape

(80003,)

In [87]:
data.rating.unique()

array([ 2.5,  3. ,  2. ,  4. ,  3.5,  1. ,  5. ,  4.5,  1.5,  0.5])

# prepare data

In [17]:
idx = np.random.choice(range(data.shape[0]), data.shape[0], replace=False)
train_idx = idx[:int(len(idx)*0.8)]
valid_idx = idx[int(len(idx)*0.8):int(len(idx)*0.92)]
test_idx = idx[int(len(idx)*0.92):]
print("there are {} training samples...".format(len(train_idx)))
print("there are {} validation samples...".format(len(valid_idx)))
print("there are {} test samples...".format(len(test_idx)))

there are 80003 training samples...
there are 12000 validation samples...
there are 8001 test samples...


In [77]:
train_predictors = data_p1[train_idx]
train_responses = data_p2[train_idx]
valid_predictors = data_p1[valid_idx]
valid_responses = data_p2[valid_idx]
test_predictors = data_p1[test_idx]
test_responses = data_p2[test_idx]
# reshape to match the network output shape
train_responses = train_responses[:, np.newaxis, np.newaxis]
valid_responses = valid_responses[:, np.newaxis, np.newaxis]
test_responses = test_responses[:, np.newaxis, np.newaxis]

# create encoders for both user and movie. 
* Rating is  product of the user encoded vector and the movie encoded vector

In [95]:
n_hidden = 5

def movie_model():
    inp1 = Input((1,))
    inp2 = Input((1,))
    emb1 = Embedding(max_userid, n_hidden)(inp1)
    emb2 = Embedding(max_movid, n_hidden)(inp2)
    # bias for each user and each movie
    b1 = Embedding(max_userid, 1)(inp1)
    b2 = Embedding(max_movid, 1)(inp2)
    x = dot([emb1, emb2], axes=2)
    x = add([x, b1, b2])
#     x = Flatten()(x)
    return Model([inp1, inp2], x)

model = movie_model()

In [96]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_53 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_54 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_59 (Embedding)        (None, 1, 5)         3355        input_53[0][0]                   
__________________________________________________________________________________________________
embedding_60 (Embedding)        (None, 1, 5)         819745      input_54[0][0]                   
__________________________________________________________________________________________________
dot_19 (Do

In [97]:
model.compile(loss='mse',optimizer=Adam(1e-2))
model.fit([train_predictors[:,0], train_predictors[:,1]], train_responses, epochs=5, 
          validation_data=[[valid_predictors[:,0], valid_predictors[:,1]], valid_responses])

Train on 80003 samples, validate on 12000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12a3a0080>

In [100]:
# performance on test set
out = model.predict([test_predictors[:,0], test_predictors[:,1]])
print("mse on test set is : ", np.mean(np.square(out - test_responses)))

mse on test set is :  1.00843508245


# more complex model

In [104]:
# change the way 
def movie_model_complex():
    inp1 = Input((1,))
    inp2 = Input((1,))
    emb1 = Embedding(max_userid, n_hidden)(inp1)
    emb2 = Embedding(max_movid, n_hidden)(inp2)
    # bias for each user and each movie
    b1 = Embedding(max_userid, 1)(inp1)
    b2 = Embedding(max_movid, 1)(inp2)
    x = concatenate([emb1, emb2])
    x = Dense(5)(x)
    x = Dense(1)(x)
    x = add([x, b1, b2])
#     x = Flatten()(x)
    return Model([inp1, inp2], x)

model1 = movie_model_complex()

In [105]:
model1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_59 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_60 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_71 (Embedding)        (None, 1, 5)         3355        input_59[0][0]                   
__________________________________________________________________________________________________
embedding_72 (Embedding)        (None, 1, 5)         819745      input_60[0][0]                   
__________________________________________________________________________________________________
concatenat

In [107]:
model1.compile(loss='mse',optimizer=Adam(1e-2))
model1.fit([train_predictors[:,0], train_predictors[:,1]], train_responses, epochs=5, batch_size=64,
          validation_data=[[valid_predictors[:,0], valid_predictors[:,1]], valid_responses])

Train on 80003 samples, validate on 12000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12a7d4860>

In [108]:
# performance on test set
out = model1.predict([test_predictors[:,0], test_predictors[:,1]])
print("mse on test set is : ", np.mean(np.square(out - test_responses)))

mse on test set is :  0.842478327883
