In [1]:
import pandas as pd
import numpy as np
import keras

Using TensorFlow backend.


# Prework

Following this fast.ai [lesson](https://github.com/fastai/courses/blob/master/deeplearning1/nbs/lesson4.ipynb) and [video](https://www.youtube.com/watch?v=V2h3IOBDvrA&feature=youtu.be&t=5761).

In [2]:
# get data
# !wget -O data/ml-latest-small.zip http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip data/ml-latest-small.zip

# Preprocessing

In [3]:
path = '/opt/notebooks/data/movielens/ml-latest-small/'

In [54]:
ratings = pd.read_csv(path+'ratings.csv')

In [55]:
ratings.shape

(100004, 4)

In [56]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [57]:
users = ratings.userId.unique()
movies = ratings.movieId.unique()

In [58]:
userid2idx = {o: i for i, o in enumerate(users)}
movieid2idx = {o: i for i, o in enumerate(movies)}

In [59]:
ratings.userId = ratings.userId.apply(lambda x: userid2idx[x])
ratings.movieId = ratings.movieId.apply(lambda x: movieid2idx[x])

In [60]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205


In [61]:
n_users = ratings.userId.nunique()
n_movies = ratings.movieId.nunique()

In [62]:
n_factors = 50

In [63]:
np.random.seed = 42

In [64]:
msk = np.random.rand(len(ratings)) < 0.8
trn = ratings[msk]
val = ratings[~msk]

# Dot product

In [65]:
user_in = keras.layers.Input(shape=(1, ), dtype='int64', name='user_in')

In [66]:
u = keras.layers.Embedding(
    input_dim=n_users,
    output_dim=n_factors,
    input_length=1,
    embeddings_regularizer=keras.regularizers.l2(l=1e-4))(user_in)

In [67]:
movie_in = keras.layers.Input(shape=(1, ), dtype='int64', name='movie_in')

In [68]:
m = keras.layers.Embedding(
    input_dim=n_movies,
    output_dim=n_factors,
    input_length=1,
    embeddings_regularizer=keras.regularizers.l2(l=1e-4))(movie_in)

In [69]:
x = keras.layers.dot([u, m], axes=[2,2])

In [70]:
x.shape

TensorShape([Dimension(None), Dimension(1), Dimension(1)])

In [71]:
x = keras.layers.Flatten()(x)

In [72]:
x.shape

TensorShape([Dimension(None), Dimension(None)])

In [73]:
model = keras.models.Model(inputs=[user_in, movie_in], outputs=x)

In [74]:
model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss='mse')

In [26]:
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          epochs=1,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80188 samples, validate on 19816 samples
Epoch 1/1


<keras.callbacks.History at 0x7f5309c4fb38>

In [27]:
model.optimizer.lr = 0.01

In [28]:
model.fit(
    x=[trn.userId, trn.movieId],
    y=trn.rating,
    batch_size=64,
    epochs=3,
    validation_data=([val.userId, val.movieId], val.rating), )

Train on 80188 samples, validate on 19816 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f5304283278>

In [29]:
model.optimizer.lr = 0.001

In [30]:
model.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          epochs=6,
          validation_data=([val.userId, val.movieId], val.rating),
          shuffle=False)

Train on 80188 samples, validate on 19816 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f5309f92ac8>

# Bias

In [31]:
user_bias = keras.layers.Embedding(input_dim=n_users, output_dim=1, input_length=1)(user_in)
user_bias = keras.layers.Flatten()(user_bias)

In [34]:
movie_bias = keras.layers.Embedding(input_dim=n_movies, output_dim=1, input_length=1)(movie_in)
movie_bias = keras.layers.Flatten()(movie_bias)

In [35]:
# x = keras.layers.merge([u, m], mode='dot')
x = keras.layers.dot([u, m], axes=[2, 2])
x = keras.layers.Flatten()(x)

In [37]:
x = keras.layers.add([x, user_bias])
x = keras.layers.add([x, movie_bias])
# x = keras.layers.merge([x, user_bias], mode='sum')
# x = keras.layers.merge([x, movie_bias], mode='sum')

In [39]:
model = keras.models.Model(inputs=[user_in, movie_in], outputs=x)

In [40]:
model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss='mse')

In [41]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        33550       user_in[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        453300      movie_in[0][0]                   
__________________________________________________________________________________________________
dot_2 (Dot

In [35]:
model.fit(x=[trn.userId, trn.movieId], y=trn.rating, batch_size=64, validation_data=([val.userId, val.movieId], val.rating))

Train on 79795 samples, validate on 20209 samples
Epoch 1/1


<keras.callbacks.History at 0x7f6d7c377b00>

In [36]:
model.optimizer.lr=0.01

In [42]:
model.fit(x=[trn.userId, trn.movieId], y=trn.rating, batch_size=64, epochs=10, validation_data=([val.userId, val.movieId], val.rating))

Train on 80188 samples, validate on 19816 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5304269f98>

In [43]:
model.optimizer.lr=0.001

In [44]:
model.fit(x=[trn.userId, trn.movieId], y=trn.rating, batch_size=64, epochs=5, validation_data=([val.userId, val.movieId], val.rating))

Train on 80188 samples, validate on 19816 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f52fe28b9b0>

# Inspect

In [45]:
type(user_in)

tensorflow.python.framework.ops.Tensor

In [46]:
user_in_layer = model.get_layer(name='user_in')

In [47]:
user_in_layer.input_shape

(None, 1)

In [48]:
user_in_layer.output_shape

(None, 1)

In [49]:
model.get_layer(index=2).output_shape

(None, 1, 50)

# NN

In [50]:
# x = keras.layers.merge([u, m], mode='concat')
x = keras.layers.concatenate([u, m])

x = keras.layers.Flatten()(x)
x = keras.layers.Dropout(0.3)(x)
x = keras.layers.Dense(70, activation='relu')(x)
x = keras.layers.Dropout(0.75)(x)
x = keras.layers.Dense(1)(x)

In [51]:
nn = keras.models.Model(inputs=[user_in, movie_in], outputs=x)

In [52]:
nn.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss='mse')

In [53]:
nn.fit(x=[trn.userId, trn.movieId],
          y=trn.rating,
          batch_size=64,
          epochs=10,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80188 samples, validate on 19816 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f52fe13af98>

# Get parts of model

## Get bias

In [49]:
mdl_movie_bias = keras.models.Model(inputs=movie_in, outputs=movie_bias)

In [50]:
mdl_movie_bias.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
movie_in (InputLayer)        (None, 1)                 0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 1, 1)              9066      
_________________________________________________________________
flatten_3 (Flatten)          (None, 1)                 0         
Total params: 9,066
Trainable params: 9,066
Non-trainable params: 0
_________________________________________________________________


In [51]:
movies

array([  31, 1029, 1061, ...,  129, 4736, 6425])

In [52]:
mdl_movie_bias.predict(np.random.choice(ratings.movieId, 5))

array([[ 0.34335878],
       [ 0.33696863],
       [ 0.37269083],
       [ 0.34500208],
       [ 0.25692898]], dtype=float32)

In [53]:
predicted_movies_bias = mdl_movie_bias.predict(ratings.movieId)

In [54]:
predicted_movies_bias.shape

(100004, 1)

In [55]:
predicted_movies_bias[:10]

array([[ 0.52784991],
       [ 0.53162074],
       [ 0.65392113],
       [ 0.48458192],
       [ 0.61973959],
       [ 0.3011196 ],
       [ 0.39460009],
       [ 0.45963573],
       [ 0.52518362],
       [ 0.65834099]], dtype=float32)

In [56]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        33550       user_in[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        453300      movie_in[0][0]                   
__________________________________________________________________________________________________
merge_2 (M

In [57]:
model.layers[8].get_weights()[:10]

[array([[ 0.52784991],
        [ 0.53162074],
        [ 0.65392113],
        ..., 
        [ 0.42122352],
        [-0.04925413],
        [-0.02676388]], dtype=float32)]

## Get embedding

In [58]:
mdl_movie_embedding = keras.models.Model(inputs=movie_in, outputs=m)

In [59]:
mdl_movie_embedding.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
movie_in (InputLayer)        (None, 1)                 0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1, 50)             453300    
Total params: 453,300
Trainable params: 453,300
Non-trainable params: 0
_________________________________________________________________


In [60]:
mdl_movie_embedding.predict(ratings.movieId)[0]

array([[-0.00470133, -0.01742448,  0.06867763, -0.0124195 , -0.00975819,
         0.02067142, -0.00198756, -0.01112133,  0.03149248,  0.07239415,
        -0.08041882, -0.02958596,  0.01993454,  0.01273143, -0.04479582,
         0.01029354, -0.0482171 ,  0.02602037,  0.00675973,  0.04645941,
        -0.01937358, -0.06726788,  0.0291614 ,  0.05915263, -0.00011156,
        -0.03344529,  0.00588805, -0.01764589,  0.01082519, -0.01900363,
         0.0232577 ,  0.03231759, -0.05621422, -0.00175879, -0.06605551,
         0.0041251 ,  0.0214821 , -0.01076169, -0.03720035, -0.03366563,
        -0.03591022,  0.05150328,  0.02213801,  0.0365787 , -0.04870336,
         0.01013623,  0.00620029,  0.03192104,  0.01572293,  0.02237591]], dtype=float32)

In [61]:
model.layers[3].get_weights()[0][0]

array([-0.00470133, -0.01742448,  0.06867763, -0.0124195 , -0.00975819,
        0.02067142, -0.00198756, -0.01112133,  0.03149248,  0.07239415,
       -0.08041882, -0.02958596,  0.01993454,  0.01273143, -0.04479582,
        0.01029354, -0.0482171 ,  0.02602037,  0.00675973,  0.04645941,
       -0.01937358, -0.06726788,  0.0291614 ,  0.05915263, -0.00011156,
       -0.03344529,  0.00588805, -0.01764589,  0.01082519, -0.01900363,
        0.0232577 ,  0.03231759, -0.05621422, -0.00175879, -0.06605551,
        0.0041251 ,  0.0214821 , -0.01076169, -0.03720035, -0.03366563,
       -0.03591022,  0.05150328,  0.02213801,  0.0365787 , -0.04870336,
        0.01013623,  0.00620029,  0.03192104,  0.01572293,  0.02237591], dtype=float32)