In [1]:
# (Re)load the autoreload extension; mode 2 re-imports modules before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

# Movielens model
In this notebook we build a movie recommendation model using the MovieLens dataset.
The model uses entity embeddings for categorical variables from [this paper](https://arxiv.org/abs/1604.06737) to embed users and movies into two 50-dimensional spaces.
Hence we have 3 methods to recommend movies: 
1. evaluating the model,
2. looking at the close neighbors of a movie in the embedding space,
3. looking at the close neighbors of a user in the embedding space, and recommending those neighbors' top movies.

The code works on Linux and Windows.

## Imports

In [2]:
import re
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import pickle
from pathlib import Path
import seaborn as sns

In [4]:
# from movienet import MovieNet

from keras.models import load_model, model_from_json
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape, Dropout
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras import optimizers
from pathlib import Path
from sklearn.metrics import mean_squared_error

class MovieNet:
    """Rating-prediction network built on user and movie embeddings.

    Each (user index, movie index) pair is embedded, the two embeddings are
    concatenated, passed through a stack of Dense/ReLU/Dropout layers, and
    reduced to a single value squashed by ``custom_activation``.
    """

    def __init__(self, n_users, n_movies, min_rating=0.5, max_rating=5):
        """Store the vocabulary sizes and the rating bounds.

        Args:
            n_users: number of distinct user indices (embedding input size).
            n_movies: number of distinct movie indices (embedding input size).
            min_rating: lowest rating value; stored but not used by the model.
            max_rating: highest rating value; used to scale the output.
        """
        self.min_rating = min_rating
        self.max_rating = max_rating
        self.n_users = n_users
        self.n_movies = n_movies

    def rmse(self, y, y_pred):
        """Keras metric: root of the mean squared difference of the two tensors."""
        return K.sqrt(K.mean(K.square(y_pred - y)))

    def custom_activation(self, x):
        """Scale a sigmoid so the output lies in (0, max_rating + 1)."""
        # NOTE(review): min_rating is not used here, so the lower bound is 0 and
        # not min_rating. Left unchanged because saved weights depend on this
        # scaling -- confirm the intent before altering it.
        return K.sigmoid(x) * (self.max_rating+1)

    def build_model(self, emb_size=(50, 50), hl=(10,), drop=(0.25,), emb_trainable=True):
        """Build and compile the Keras model, storing it in ``self.model``.

        Args:
            emb_size: (user embedding size, movie embedding size).
            hl: number of units of each hidden Dense layer, in order.
            drop: dropout rate applied after each hidden layer; needs at least
                as many entries as ``hl``.
            emb_trainable: whether the embedding layers are trainable.

        Raises:
            ValueError: if ``emb_size`` has fewer than two entries or ``drop``
                has fewer entries than ``hl``.
        """
        # Validate before creating any layer, so a bad call fails with a clear
        # message instead of an IndexError halfway through graph construction.
        if len(emb_size) < 2:
            raise ValueError("emb_size needs two entries: (user size, movie size)")
        if len(drop) < len(hl):
            raise ValueError("drop needs one dropout rate per hidden layer in hl")

        inputs = [Input(shape=(1,)), Input(shape=(1,))] #, Input(shape=(1,))]
        users_emb = Embedding(self.n_users, emb_size[0], name='users', trainable=emb_trainable)(inputs[0])
        movies_emb = Embedding(self.n_movies, emb_size[1], name='movies', trainable=emb_trainable)(inputs[1])
        # Embedding outputs carry an extra length-1 axis; flatten it away.
        outputs_emb = [Reshape(target_shape=(emb_size[0],))(users_emb), Reshape(target_shape=(emb_size[1],))(movies_emb)]

        output_model = Concatenate()(outputs_emb)
        # zip stops after len(hl) pairs because drop is at least as long as hl.
        for units, rate in zip(hl, drop):
            output_model = Dense(units, kernel_initializer='uniform')(output_model)
            output_model = Activation('relu')(output_model)
            output_model = Dropout(rate)(output_model)

        output_model = Dense(1)(output_model)
        output_model = Activation(self.custom_activation)(output_model)

        self.model = KerasModel(inputs=inputs, outputs=output_model)

        opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
        self.model.compile(loss='mse', optimizer=opt, metrics=[self.rmse])

    def prepare_input(self, _X):
        """Return ``[userId values, movieId values]`` taken from the frame ``_X``."""
        X = [_X.userId.values, _X.movieId.values]#, _X.ratingWeight]
        return X

    def evaluate(self, X, y):
        """Return the mean squared error (not its root) of the predictions for ``X``."""
        y_pred = self.predict(X)
        return mean_squared_error(y, y_pred)

    def fit(self, X_train, y_train, X_valid, y_valid, epochs=50, batch_size=32, verbose=1):
        """Train ``self.model``, using (X_valid, y_valid) as validation data."""
        self.model.fit(self.prepare_input(X_train), y_train,
                       validation_data=(self.prepare_input(X_valid), y_valid),
                       epochs=epochs, batch_size=batch_size, verbose=verbose)
        # print("Result on validation data: ", self.evaluate(X_valid, y_valid))

    def predict(self, X):
        """Return the model predictions for the frame ``X`` as a flat array."""
        y_pred = self.model.predict(self.prepare_input(X))
        return y_pred.flatten()

    def save_model(self, path=Path(""), name="MovieModel"):
        """Write ``<name>_weights.h5`` and ``<name>_arch.json`` under ``path``."""
        # Accept plain strings as well as Path objects.
        path = Path(path)
        # Hand Keras a plain string rather than a Path object.
        self.model.save_weights(str(path / (name + "_weights.h5")))
        with open(str(path / (name + '_arch.json')), 'w') as f:
            f.write(self.model.to_json())

    def load_model(self, path=Path(""), name="MovieModel"):
        """Rebuild ``self.model`` from the files written by ``save_model``."""
        path = Path(path)
        with open(str(path / (name + '_arch.json')), 'r') as f:
            # The architecture references the custom activation by name.
            self.model = model_from_json(f.read(), custom_objects={"custom_activation": self.custom_activation})
        self.model.load_weights(str(path / (name + "_weights.h5")))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
from sklearn.model_selection import train_test_split

## Loading datasets

In [5]:
# PATH = Path("data/ml-latest-small")
# PATH = Path("data/ml-20m")

In [6]:
# NOTE: machine-specific location of the MovieLens 20M CSV files; point it at your own copy.
data_path = 'F:/recommender_jk/movielens/ml-20m/ml-20m/'
# Join with pathlib so the trailing slash above does not yield a doubled separator.
ratings_raw = pd.read_csv(str(Path(data_path) / 'ratings.csv'))
movies_raw = pd.read_csv(str(Path(data_path) / 'movies.csv'))

In [7]:
# Preview the first rows of both raw tables.
display(ratings_raw.head(), movies_raw.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Features engineering
The datasets are clean, so we only create dictionaries to convert between ids and indexes.

In [8]:
# Work on an independent copy so the raw ratings stay untouched.
ratings_train = ratings_raw.copy(deep=True)

Creating dictionaries to convert userId and movieId into index and vice versa.

In [9]:
# Derive the lookups from the untouched raw frame (ratings_train is a copy of it,
# so both share the same index). Re-running this cell then cannot rebuild the
# lookups from ids that were already converted to indices.
users_uniq = ratings_raw.userId.unique()
user2idx = {user_id: idx for idx, user_id in enumerate(users_uniq)}
idx2user = dict(enumerate(users_uniq))
# Series.map with a dict does the lookup without calling a Python lambda per row.
ratings_train['userId'] = ratings_raw.userId.map(user2idx)

movies_uniq = ratings_raw.movieId.unique()
movie2idx = {movie_id: idx for idx, movie_id in enumerate(movies_uniq)}
idx2movie = dict(enumerate(movies_uniq))
ratings_train['movieId'] = ratings_raw.movieId.map(movie2idx)

# Every distinct id received exactly one index, so the counts are the lookup sizes.
n_users = len(users_uniq)
n_movies = len(movies_uniq)
n_users, n_movies

(138493, 26744)

In [10]:
# First five rows after the id -> index conversion.
ratings_train.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,3.5,1112486027
1,0,1,3.5,1112484676
2,0,2,3.5,1112484819
3,0,3,3.5,1112484727
4,0,4,3.5,1112484580


In [11]:
def save_obj(obj, name, prefix='./model'):
    """Pickle ``obj`` to the file ``prefix + name + '.pkl'``.

    Args:
        obj: any picklable object.
        name: base name of the output file (without extension).
        prefix: string prepended verbatim to ``name``; pass e.g. ``'./model/'``
            to write inside a directory.
    """
    # NOTE(review): the default prefix has no trailing separator, so files are
    # written as e.g. './modeluser2idx.pkl', not inside a 'model' directory.
    # Kept as the default so existing files keep their names -- confirm the intent.
    with open(prefix + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

# Persist the four lookup tables, each under its own variable name.
lookups_to_save = [
    ("user2idx", user2idx),
    ("idx2user", idx2user),
    ("movie2idx", movie2idx),
    ("idx2movie", idx2movie),
]
for lookup_name, lookup in lookups_to_save:
    save_obj(lookup, lookup_name)

## Keras Model
The model works as follows:
1. Embeds the user and movie ids.
2. Concatenates the user embedding and the movie embedding into one vector (the weighted-rating input is currently commented out in `MovieNet`).
3. Passes the result through dense layers with dropout.

The architecture takes as parameters the embedding sizes, the sizes of the hidden layers, and the dropout probability associated with each of them.

Splitting data into train and validation sets.

In [12]:
# Two 50-d embeddings feeding hidden layers of 70 and 10 units,
# with dropout rates 0.4 and 0.3 respectively.
movie_model = MovieNet(n_users=n_users, n_movies=n_movies)
movie_model.build_model(
    emb_size=[50, 50],
    hl=[70, 10],
    drop=[0.4, 0.3],
)

In [13]:
# Features are the (user index, movie index) pairs; the target is the rating.
X = ratings_train.drop(['timestamp', 'rating'], axis=1)
y = ratings_train['rating']
# Fixed random_state so the split, and every count derived from it, is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(X_valid), len(y_train), len(y_valid)

(16000210, 4000053, 16000210, 4000053)

It's important that every movie is in the training set, so that each of them has a trained embedding.

In [14]:
# (movies seen in training, total movies, movies absent from training)
n_train_movies = X_train["movieId"].nunique()
n_train_movies, n_movies, n_movies - n_train_movies

(25829, 26744, 915)

In [15]:
# Movie indices that occur in the ratings but ended up in no training row.
train_movie_ids = X_train["movieId"].unique()
not_in_train = ~ratings_train["movieId"].isin(train_movie_ids)
miss_movies = ratings_train.loc[not_in_train, "movieId"].unique()

In [16]:
# Filter once to the rows of the missing movies, rather than scanning the full
# ratings frame once per movie.
missing_rows = ratings_train[ratings_train.movieId.isin(miss_movies)]
# One random rating per missing movie, assembled with a single pd.concat.
# DataFrame.append in a loop re-copies the accumulated frame on every iteration
# and no longer exists in pandas 2.x. The seed keeps the pick reproducible.
sampled = [
    group.sample(1, random_state=42)
    for _, group in missing_rows.groupby("movieId", sort=False)
]
# With nothing missing, keep an empty frame that still has the rating columns,
# so the later column drops do not fail.
concat = pd.concat(sampled) if sampled else ratings_train.iloc[:0]

In [17]:
# Peek at the rows that will be moved into the training set.
concat.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
65472,458,7277,3.5,1337498446
146083,978,9658,0.5,1420447834
254883,1754,10647,1.0,1370447115
387761,2649,11621,4.5,1422306879
387843,2649,11652,4.5,1347585021


In [18]:
# Remove the sampled rows from the validation set by index label.
# Rebinding instead of an in-place drop avoids the SettingWithCopyWarning on the
# split result, and the mask makes the cell safe to re-run (a second drop of the
# same labels would raise KeyError).
X_valid = X_valid[~X_valid.index.isin(concat.index)]
y_valid = y_valid[~y_valid.index.isin(concat.index)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
# Append the sampled rows to the training set. Rows whose index label is already
# present are skipped, so re-running the cell does not duplicate them.
new_rows = concat[~concat.index.isin(X_train.index)]
X_train = pd.concat([X_train, new_rows.drop(["rating", "timestamp"], axis=1)])
y_train = pd.concat([y_train, new_rows["rating"]])

In [20]:
# Both numbers should now match: every movie occurs in the training set.
X_train["movieId"].nunique(), n_movies

(26744, 26744)

In [21]:
# First training pass: 5 epochs, batches of 512.
movie_model.fit(
    X_train=X_train, y_train=y_train,
    X_valid=X_valid, y_valid=y_valid,
    epochs=5, batch_size=512,
)

Train on 16001125 samples, validate on 3999138 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Checkpoint weights and architecture in the working directory.
movie_model.save_model(path=Path(""), name="movie_model")

In [None]:
# One additional epoch with the same batch size.
movie_model.fit(
    X_train=X_train, y_train=y_train,
    X_valid=X_valid, y_valid=y_valid,
    epochs=1, batch_size=512,
)

Train on 16001125 samples, validate on 3999138 samples
Epoch 1/1
 2124800/16001125 [==>...........................] - ETA: 8:02 - loss: 0.6625 - rmse: 0.8133

In [56]:
# Checkpoint again after the extra epoch.
movie_model.save_model(path=Path(""), name="movie_model")

In [58]:
# Longer run with smaller batches: 12 epochs, batches of 128.
movie_model.fit(
    X_train=X_train, y_train=y_train,
    X_valid=X_valid, y_valid=y_valid,
    epochs=12, batch_size=128,
)

Train on 16001141 samples, validate on 3999122 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [59]:
# Final checkpoint of the trained model.
movie_model.save_model(path=Path(""), name="movie_model")

The current state-of-the-art models use either [matrix factorization](https://docs.treasuredata.com/articles/hivemall-movielens20m-fm) with an RMSE of 0.80 or [autoencoders](https://arxiv.org/pdf/1606.07659.pdf) with an RMSE of 0.81.

**Our model has an RMSE of ~0.80, on par with state-of-the-art models.
The approach of entity embeddings is simple but efficient.**