In [115]:
import numpy as np
import pandas as pd
from keras import backend as K
from keras.layers import Input, Embedding, merge, Flatten
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

In [1]:
# Directory prefix for the CSV inputs; file names are string-concatenated onto it.
path = "data/"

In [3]:
# Load the ratings table (ratings.csv under `path`) into a DataFrame.
ratings = pd.read_csv(path + "ratings.csv")

In [4]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Total number of rating rows.
len(ratings)

100004

In [21]:
# Load the movie metadata table (movies.csv under `path`) into a DataFrame.
movies_info = pd.read_csv(path + 'movies.csv')

In [22]:
# Preview the first rows of the movie metadata table.
movies_info.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
# Lookup dict from movieId to title. Built from the already-loaded
# `movies_info` frame rather than reading movies.csv from disk a second time;
# set_index returns a new frame, so `movies_info` itself is left unchanged.
movie_names = movies_info.set_index('movieId')['title'].to_dict()

In [25]:
# Distinct user ids and movie ids that occur in the ratings table.
users = ratings.userId.unique()
movies = ratings.movieId.unique()

In [32]:
# Give every raw id a contiguous zero-based index, numbered in the order the
# ids appear in `users` / `movies`.
useridtoidx = dict(zip(users, range(len(users))))
movieidtoidx = dict(zip(movies, range(len(movies))))

In [34]:
# Overwrite the raw ids in `ratings` with their contiguous indices. An id that
# is absent from the lookup raises KeyError.
# NOTE: this mutates `ratings` in place, so the cell is not idempotent --
# running it twice would look up values that are already indices.
ratings.userId = ratings.userId.map(lambda raw_user_id: useridtoidx[raw_user_id])
ratings.movieId = ratings.movieId.map(lambda raw_movie_id: movieidtoidx[raw_movie_id])

In [35]:
# Smallest and largest remapped index for users and for movies.
user_min, user_max = ratings.userId.min(), ratings.userId.max()
movie_min, movie_max = ratings.movieId.min(), ratings.movieId.max()
(user_min, user_max, movie_min, movie_max)

(0, 670, 0, 9065)

In [36]:
# Number of distinct users and movies; these size the embedding tables below.
n_users, n_movies = ratings.userId.nunique(), ratings.movieId.nunique()
(n_users, n_movies)

(671, 9066)

In [37]:
# Length of each user / movie embedding vector (passed as the Embedding output size).
n_factors = 50

In [40]:
# Seed NumPy's global RNG so the random train/test mask is reproducible.
# `seed` must be *called*: assigning to it would replace the function with an
# int, seed nothing, and break every later np.random.seed(...) call.
np.random.seed(40)

In [41]:
# Random row-level split: a rating goes to `train` when its uniform draw is
# below 0.8, otherwise to `test`.
msk = np.random.rand(len(ratings)) < 0.8
train, test = ratings[msk], ratings[~msk]

In [42]:
# Row counts of the two splits.
len(train), len(test)

(79803, 20201)

In [47]:
# Rating counts per user, sorted descending and truncated to the first 15 entries.
top_users = (
    ratings.groupby('userId')['rating']
    .count()
    .sort_values(ascending=False)[:15]
)

In [48]:
# Rating counts per movie, sorted descending and truncated to the first 15 entries.
top_movies = (
    ratings.groupby('movieId')['rating']
    .count()
    .sort_values(ascending=False)[:15]
)

In [64]:
# Keep only the ratings made by the users listed in `top_users`.
ratings_top_users = ratings.loc[ratings['userId'].isin(top_users.index)]

In [66]:
# Narrow further to ratings of the movies listed in `top_movies`.
ratings_top_movies_users = ratings_top_users.loc[
    ratings_top_users['movieId'].isin(top_movies.index)
]

In [68]:
# NOTE(review): DataFrame.size is rows x columns (total cells), not the row
# count; use len(...) if the number of ratings is what is wanted -- confirm intent.
ratings_top_movies_users.size

824

In [69]:
# Preview the filtered ratings (top users x top movies).
ratings_top_movies_users.head()

Unnamed: 0,userId,movieId,rating,timestamp
962,14,417,2.0,997938310
991,14,27,3.0,1040205792
1032,14,143,5.0,997938437
1037,14,49,5.0,997938771
1044,14,99,2.0,997938727


In [81]:
# Pivot the filtered ratings into a user-by-movie grid of rating values.
# aggfunc=np.max picks the highest rating if a (user, movie) pair occurs more than once.
pd.crosstab(ratings_top_movies_users.userId, ratings_top_movies_users.movieId, ratings_top_movies_users.rating, aggfunc=np.max)

movieId,27,49,57,72,79,89,92,99,143,179,180,197,402,417,505
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
14,3.0,5.0,1.0,3.0,4.0,4.0,5.0,2.0,5.0,5.0,4.0,5.0,5.0,2.0,5.0
29,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,3.0,4.0,5.0
72,4.0,5.0,5.0,4.0,5.0,3.0,4.5,5.0,4.5,5.0,5.0,5.0,4.5,5.0,4.0
211,5.0,4.0,4.0,3.0,5.0,3.0,4.0,4.5,4.0,,3.0,3.0,5.0,3.0,
212,2.5,,2.0,5.0,,4.0,2.5,,5.0,5.0,3.0,3.0,4.0,3.0,2.0
293,3.0,,4.0,4.0,4.0,3.0,,3.0,4.0,4.0,4.5,4.0,4.5,4.0,
310,3.0,3.0,5.0,4.5,5.0,4.5,2.0,4.5,4.0,3.0,4.5,4.5,4.0,3.0,4.0
379,5.0,5.0,5.0,4.0,,4.0,5.0,4.0,4.0,4.0,,3.0,5.0,4.0,4.0
451,4.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,4.0,2.0,3.5,5.0
467,3.0,3.5,3.0,2.5,,,3.0,3.5,3.5,3.0,3.5,3.0,3.0,4.0,4.0


Functional Model

In [94]:
# One integer id per sample for each of user and movie, each fed through an
# Embedding that maps the id to an n_factors-long vector. Both embedding weight
# matrices carry an L2 penalty of 1e-4.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, W_regularizer=l2(1e-4))(user_in)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, W_regularizer=l2(1e-4))(movie_in)

In [104]:
# Model output is the dot product of the user and movie embeddings, flattened.
x = merge([u, m], mode='dot')
x = Flatten()(x)
model = Model([user_in, movie_in], x)
# Use the imported `Adam` optimizer class; the lowercase name `adam` is not
# imported anywhere in this notebook and raises NameError on a fresh kernel.
model.compile(Adam(0.001), loss='mse')

In [105]:
# Train for 1 epoch (batch size 64) on the train split, reporting loss on the test split.
model.fit([train.userId, train.movieId], train.rating, batch_size=64, nb_epoch=1, 
          validation_data=([test.userId, test.movieId], test.rating))

Train on 79803 samples, validate on 20201 samples
Epoch 1/1


<keras.callbacks.History at 0x11b05f2e8>

In [106]:
# Raise the learning rate to 0.01 by updating the optimizer's backend variable
# in place. Plain attribute assignment (`model.optimizer.lr = 0.01`) would only
# rebind the attribute to a Python float, leaving the variable referenced by the
# already-built training function unchanged.
K.set_value(model.optimizer.lr, 0.01)

In [107]:
# Train for 3 more epochs (batch size 64), validating on the test split.
model.fit([train.userId, train.movieId], train.rating, batch_size=64, nb_epoch=3, 
          validation_data=([test.userId, test.movieId], test.rating))

Train on 79803 samples, validate on 20201 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x11b02ef60>

In [108]:
# Lower the learning rate to 0.001 by updating the optimizer's backend variable
# in place; assigning a float to `model.optimizer.lr` would not reach the
# variable used by the already-built training function.
K.set_value(model.optimizer.lr, 0.001)

In [109]:
# Train for 6 more epochs (batch size 64), validating on the test split.
model.fit([train.userId, train.movieId], train.rating, batch_size=64, nb_epoch=6, 
          validation_data=([test.userId, test.movieId], test.rating))

Train on 79803 samples, validate on 20201 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x11b05f668>

In [110]:
def embedding_input(name, n_in, n_out, reg):
    """Create an integer id input together with its L2-regularised embedding.

    Args:
        name: Name given to the Input layer.
        n_in: Number of rows in the embedding table (first Embedding argument).
        n_out: Length of each embedding vector (second Embedding argument).
        reg: L2 penalty factor applied to the embedding weights.

    Returns:
        A tuple ``(input_tensor, embedded_tensor)``.
    """
    id_input = Input(shape=(1,), dtype='int64', name=name)
    embedding_layer = Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))
    return id_input, embedding_layer(id_input)

In [111]:
# Fresh inputs and embeddings for the bias model, built via the helper; this
# rebinds user_in / u / movie_in / m used by the earlier dot-product model.
user_in, u = embedding_input('user_in', n_users, n_factors, 1e-4)
movie_in, m = embedding_input('movie_in', n_movies, n_factors, 1e-4)

In [112]:
def create_bias(inp, n_in):
    """Attach a learned per-id scalar to an id input.

    Args:
        inp: Input tensor of integer ids.
        n_in: Number of rows in the bias table (first Embedding argument).

    Returns:
        The flattened output of a width-1 Embedding applied to ``inp``.
    """
    per_id_bias = Embedding(n_in, 1, input_length=1)(inp)
    flatten = Flatten()
    return flatten(per_id_bias)

In [113]:
# Per-user and per-movie bias terms, each a width-1 embedding of the same id inputs.
ub = create_bias(user_in, n_users)
mb = create_bias(movie_in, n_movies)

In [116]:
# Model output = dot(user embedding, movie embedding) + user bias + movie bias.
x = merge([u, m], mode='dot')
x = Flatten()(x)
x = merge([x, ub], mode='sum')
x = merge([x, mb], mode='sum')
# Rebinds `model`: the earlier dot-product-only model is no longer reachable by this name.
model = Model([user_in, movie_in], x)
model.compile(Adam(0.001), loss='mse')

In [118]:
# Train the bias model for 1 epoch (batch size 64), validating on the test split.
model.fit([train.userId, train.movieId], train.rating, batch_size=64, nb_epoch=1, 
          validation_data=([test.userId, test.movieId], test.rating))

Train on 79803 samples, validate on 20201 samples
Epoch 1/1


<keras.callbacks.History at 0x11c443c88>

In [119]:
# Raise the bias model's learning rate to 0.01 by updating the optimizer's
# backend variable in place; assigning a float to `model.optimizer.lr` would not
# reach the variable used by the already-built training function.
K.set_value(model.optimizer.lr, 0.01)

In [120]:
# Train for 6 more epochs (batch size 64), validating on the test split.
model.fit([train.userId, train.movieId], train.rating, batch_size=64, nb_epoch=6, 
          validation_data=([test.userId, test.movieId], test.rating))

Train on 79803 samples, validate on 20201 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x11c4484a8>

In [127]:
# Lower the bias model's learning rate to 0.001 by updating the optimizer's
# backend variable in place; assigning a float to `model.optimizer.lr` would not
# reach the variable used by the already-built training function.
K.set_value(model.optimizer.lr, 0.001)

In [128]:
# Train for 10 more epochs (batch size 64), validating on the test split.
model.fit([train.userId, train.movieId], train.rating, batch_size=64, nb_epoch=10, 
          validation_data=([test.userId, test.movieId], test.rating))

Train on 79803 samples, validate on 20201 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11c601470>

In [129]:
# Predict the rating for user index 3 and movie index 6. These are the
# contiguous indices produced by the remapping step, not the raw CSV ids.
model.predict([np.array([3]), np.array([6])])

array([[ 4.80345917]], dtype=float32)