In [72]:
import sys
sys.path.append('..')
from src import envs

import numpy as np
from keras.callbacks import EarlyStopping
from keras.layers import Input, Embedding, Flatten, Dot
from keras.models import Model

from time import sleep
from tqdm import tqdm_notebook as tqdm

import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD

 

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

In [73]:
data = envs.req_reset(envs='first', verbose=True)

------Summary-----
key:item_history, 	size:10000, 	type:<class 'int'>
key:	nb_items, 	value:	300
key:	nb_users, 	value:	100
key:	next_item, 	value:	58
key:	next_user, 	value:	93
key:rating_history, 	size:10000, 	type:<class 'int'>
key:user_history, 	size:10000, 	type:<class 'int'>


# Model

## BaseModel

In [74]:
class BaseModel(object):
    
    def __init__(self,nb_user, nb_items):
        pass
        
    def update(self, X, y ):
        '''
        This function used to update model with new data.
        Param:
            X : [user, item], user and item should be a list
            y : rating, rating should be a list
            
        '''
        pass
        
        
    def fit(self, X, y, verbose=3):
        """
        embedding model, with inputs of "user_id", "item_id"
        
        Param:
            X : [user_history, item_history]
            y : rating_history
        """
        pass

    def predict(self, X):
        '''
        Give the prediction for the given input
        Param:
            X : [user_history, item_history],  user and item should be a list or array-like
        Return:
            prediction: prediction of action
        
        '''
        return 2

## EmbeddingModel

In [75]:
class EmbeddingModel(BaseModel):
    
    def __init__(self, nb_user, nb_items, embedding_size=30):
        self._build_model(nb_user, nb_items, embedding_size=30)
        
    
    def _build_model(self, nb_users, nb_items, embedding_size=30):
        user_id_input = Input(shape=[1],name='user')
        item_id_input = Input(shape=[1], name='item')
        user_embedding = Embedding(output_dim=embedding_size, input_dim=nb_users+1,
                           input_length=1, name='user_embedding')(user_id_input)

        item_embedding = Embedding(output_dim=embedding_size, input_dim=nb_items+1,
                           input_length=1, name='item_embedding')(item_id_input)
        
        user_vecs = Flatten()(user_embedding)
        item_vecs = Flatten()(item_embedding)

        y = Dot(axes=1)([user_vecs, item_vecs])
        
        self.model = Model(inputs=[user_id_input, item_id_input], outputs=y)
        self.model.compile(optimizer='adam', loss='mse')
        
        
    def fit(self, X, y, epochs=50, verbose=False):
        """
        embedding model, with inputs of "user_id", "item_id"
        
        Param:
            X : [user_history, item_history]
            y : rating_history
        """
        early_stopping = EarlyStopping(monitor='val_loss', patience=2)
        self.model.fit(X, y, epochs=epochs, callbacks=[early_stopping],
                       batch_size=64,  validation_split=0.1,
                       shuffle=True, verbose=2)

    
    def update(self, X, y, epochs=1, verbose=False):
        """
        embedding model, with inputs of "user_id", "item_id"
        
        Param:
            X : [user_history, item_history]
            y : rating_history
        """
        self.model.fit(X, y, epochs=epochs, verbose=verbose)
        
    def predict(self, X):
        """
        perdict a rating of embedding model
        
        Params:
            input_data : [users, itmes]
        Returns:
            input
        """
        pred = self.model.predict(X)
        return float(pred)
    


## SGD Model

In [76]:
class SGDModel(BaseModel):
    
    def __init__(self, nb_users, nb_items):
        self.algo = SVD()
        pass
        
    
    def fit(self, X, y, verbose=True ):
        """
        SGD model, with inputs of "user_id", "item_id"
        """
        self.df = pd.DataFrame({'user':X[0], 'item':X[1], 'rating':y})
        reader = Reader(rating_scale=(1, 5))
        train_spr = Dataset.load_from_df(self.df[['user','item','rating']],reader).build_full_trainset()

        self.algo.fit(train_spr)

    def update(self, X, y):
        self.df = self.df.append({'user':X[0][0], 'item':X[1][0], 'rating':y[0]},ignore_index=True )
        reader = Reader(rating_scale=(1, 5))
        train_spr = Dataset.load_from_df(self.df[['user','item','rating']],reader).build_full_trainset()
        self.algo = SVD()
        self.algo.fit(train_spr)
    
    def predict(self, X):
        """
        perdict a rating of SGD model
        """
        pred = self.algo.predict(uid=X[0][0], iid=X[1][0])[3]
        return pred

## User-based collaborative filtering.

In [77]:
class UserBasedModel():
    '''
    User-based collaborative filtering.
    '''
    
    def __init__(self):
        pass
    
    def _similarity(self, ratings):

        # vecteur contenant pour chaque utilisateur le nombre de notes données
        r_user = (ratings>0).sum(axis=1)  

        # vecteur contenant pour chaque utilisateur la moyenne des notes données
        m_user = np.divide(ratings.sum(axis=1) , r_user, where=r_user!=0)

        # Notes recentrées par la moyenne par utilisateur : chaque ligne i contient le vecteur \bar r_i
        ratings_ctr = ratings.T - ((ratings.T!=0) * m_user)
        ratings_ctr = ratings_ctr.T

        # Matrice de Gram, contenant les produits scalaires
        sim = ratings_ctr.dot(ratings_ctr.T)

        # Renormalisation
        norms = np.array([np.sqrt(np.diagonal(sim))])
        sim = sim / norms / norms.T  
        # (En numpy, diviser une matrice par un vecteur ligne (resp. colonne) 
        # revient à diviser chaque ligne (resp. colonne) terme à terme par les éléments du vecteur)

        return sim

    def _phi(self, x):
        return np.maximum(x,0) 

    def fit(self, X, y, verbose):
        
        self.df = pd.DataFrame({'user_id':X[0], 'item_id':X[1], 'rating':y})
        self._calcule()
       
    
    def _calcule(self):
        '''
        Calculer le model
        '''
        moviemat = pd.pivot_table(self.df,index='user_id',columns='item_id',values='rating')
        
        self.ratings = moviemat.fillna(0).values
        self.user_max, self.item_max = self.ratings.shape
        
        sim = self._similarity(self.ratings)
        numerator = self._phi(sim).dot(self.ratings)
        denominator = self._phi(sim).dot(self.ratings>0)
        self.pred_ratings = np.divide(numerator,denominator,where = denominator!=0)

    def predict(self, X):
        
        user_id, item_id = X[0][0], X[1][0]
        if user_id > self.user_max or item_id > self.item_max:
            raise AttributeError(f'The user_id({user_id}) or the item_id{item_id} is unregistered. User_id ' )
        return self.pred_ratings[user_id, item_id]

    def update(self, X, y):
        
        user_id, item_id,rating = X[0][0],X[1][0], y[0]
        if user_id > self.user_max or item_id > self.item_max:
            raise AttributeError(f'The user_id({user_id}) or the item_id{item_id} is unregistered. User_id ' )
            
        self.ratings[user_id, item_id] == rating
        
        self._calcule()
        

# Test

In [78]:
def test_model(model, n_iter=500, online=True, verbose=2):
    """
    test embedding model's performance
    ---------------------------

    online: 'bool', "True" means after each time of prediction, update the model
                    "False" means don't update modeln_iter: 'int', times of applying "req_predict" function to get test data

    """
    
    data_reset = envs.req_reset(envs='first', verbose=False)
    nb_users, nb_items = data_reset['nb_users'], data_reset['nb_items'] 
    next_X = [[data_reset['next_user']], [data_reset['next_item']]] 
    X = [data_reset['user_history'], data_reset['item_history'] ]
    y = data_reset['rating_history']
    model.fit(X, y, verbose=verbose)
    mse, mae = 0, 0
    
    for i in tqdm(range(n_iter)):
        sleep(0.05)
        prediction = model.predict(next_X)
        test = envs.req_predict(prediction, envs='first', verbose=False)
        next_X = [[test['next_user']], [test['next_item']]]
        next_y = test['rating']
        
        mse += (next_y - prediction) ** 2
        mae += np.abs(next_y - prediction)

        if online:
            model.update(next_X,[next_y])

    print('mse: ',float(mse)/n_iter)
    print('mae: ', float(mae)/n_iter)

### Test for baseline

In [79]:
model = BaseModel(300,100)
test_model(model)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  2.72
mae:  1.3


## Test for EmbeddingModel

### Embedding Size:30, Offline

In [80]:
model = EmbeddingModel(300, 100, embedding_size=30)
test_model(model, online=False, verbose=True)

Train on 9000 samples, validate on 1000 samples
Epoch 1/50
 - 1s - loss: 9.8404 - val_loss: 9.7537
Epoch 2/50
 - 1s - loss: 9.7938 - val_loss: 9.6199
Epoch 3/50
 - 1s - loss: 9.4257 - val_loss: 8.8606
Epoch 4/50
 - 1s - loss: 8.4795 - val_loss: 7.6692
Epoch 5/50
 - 1s - loss: 7.5556 - val_loss: 6.8656
Epoch 6/50
 - 1s - loss: 7.0802 - val_loss: 6.5282
Epoch 7/50
 - 1s - loss: 6.9040 - val_loss: 6.4103
Epoch 8/50
 - 1s - loss: 6.8449 - val_loss: 6.3683
Epoch 9/50
 - 1s - loss: 6.8239 - val_loss: 6.3563
Epoch 10/50
 - 1s - loss: 6.8161 - val_loss: 6.3512
Epoch 11/50
 - 1s - loss: 6.8131 - val_loss: 6.3497
Epoch 12/50
 - 1s - loss: 6.8115 - val_loss: 6.3497
Epoch 13/50
 - 1s - loss: 6.8101 - val_loss: 6.3487
Epoch 14/50
 - 1s - loss: 6.8092 - val_loss: 6.3492
Epoch 15/50
 - 1s - loss: 6.8078 - val_loss: 6.3492


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  6.910711830907748
mae:  2.112194847583771


### Embedding Size:30 - Online

In [81]:
model = EmbeddingModel(300, 100, embedding_size=30)
test_model(model, online=True, verbose=True)

Train on 9000 samples, validate on 1000 samples
Epoch 1/50
 - 1s - loss: 9.6014 - val_loss: 9.2656
Epoch 2/50
 - 1s - loss: 9.5524 - val_loss: 9.1406
Epoch 3/50
 - 1s - loss: 9.1866 - val_loss: 8.4904
Epoch 4/50
 - 1s - loss: 8.2823 - val_loss: 7.4831
Epoch 5/50
 - 1s - loss: 7.3696 - val_loss: 6.7697
Epoch 6/50
 - 1s - loss: 6.8569 - val_loss: 6.4418
Epoch 7/50
 - 1s - loss: 6.6465 - val_loss: 6.3236
Epoch 8/50
 - 1s - loss: 6.5737 - val_loss: 6.2866
Epoch 9/50
 - 1s - loss: 6.5491 - val_loss: 6.2733
Epoch 10/50
 - 1s - loss: 6.5396 - val_loss: 6.2704
Epoch 11/50
 - 1s - loss: 6.5348 - val_loss: 6.2694
Epoch 12/50
 - 1s - loss: 6.5307 - val_loss: 6.2652
Epoch 13/50
 - 1s - loss: 6.5259 - val_loss: 6.2610
Epoch 14/50
 - 1s - loss: 6.5212 - val_loss: 6.2571
Epoch 15/50
 - 1s - loss: 6.5152 - val_loss: 6.2522
Epoch 16/50
 - 1s - loss: 6.5085 - val_loss: 6.2455
Epoch 17/50
 - 1s - loss: 6.5009 - val_loss: 6.2372
Epoch 18/50
 - 1s - loss: 6.4929 - val_loss: 6.2282
Epoch 19/50
 - 1s - loss:

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  6.417668297515005
mae:  1.9562104734182357


### Embedding Size:10 - Online

In [82]:
model = EmbeddingModel(300, 100, 10)
test_model(model, online=False)

Train on 9000 samples, validate on 1000 samples
Epoch 1/50
 - 1s - loss: 10.6560 - val_loss: 10.9013
Epoch 2/50
 - 1s - loss: 10.5954 - val_loss: 10.7573
Epoch 3/50
 - 1s - loss: 10.1208 - val_loss: 9.9885
Epoch 4/50
 - 1s - loss: 9.0004 - val_loss: 8.8743
Epoch 5/50
 - 1s - loss: 7.9610 - val_loss: 8.1357
Epoch 6/50
 - 1s - loss: 7.4350 - val_loss: 7.8320
Epoch 7/50
 - 1s - loss: 7.2466 - val_loss: 7.7235
Epoch 8/50
 - 1s - loss: 7.1884 - val_loss: 7.6854
Epoch 9/50
 - 1s - loss: 7.1708 - val_loss: 7.6718
Epoch 10/50
 - 1s - loss: 7.1655 - val_loss: 7.6680
Epoch 11/50
 - 1s - loss: 7.1630 - val_loss: 7.6658
Epoch 12/50
 - 1s - loss: 7.1620 - val_loss: 7.6657
Epoch 13/50
 - 1s - loss: 7.1603 - val_loss: 7.6652
Epoch 14/50
 - 1s - loss: 7.1590 - val_loss: 7.6647
Epoch 15/50
 - 1s - loss: 7.1578 - val_loss: 7.6634
Epoch 16/50
 - 1s - loss: 7.1556 - val_loss: 7.6621
Epoch 17/50
 - 1s - loss: 7.1535 - val_loss: 7.6611
Epoch 18/50
 - 1s - loss: 7.1507 - val_loss: 7.6585
Epoch 19/50
 - 1s - 

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  7.483281817152181
mae:  2.160544661998749


### Embedding Size:10 - offline

In [83]:
model = EmbeddingModel(300,100,10)
test_model(model, online=True)

Train on 9000 samples, validate on 1000 samples
Epoch 1/50
 - 1s - loss: 10.6323 - val_loss: 10.7320
Epoch 2/50
 - 1s - loss: 10.5668 - val_loss: 10.5836
Epoch 3/50
 - 1s - loss: 10.1415 - val_loss: 9.8941
Epoch 4/50
 - 1s - loss: 9.1504 - val_loss: 8.8612
Epoch 5/50
 - 1s - loss: 8.1764 - val_loss: 8.1246
Epoch 6/50
 - 1s - loss: 7.6295 - val_loss: 7.7908
Epoch 7/50
 - 1s - loss: 7.4078 - val_loss: 7.6657
Epoch 8/50
 - 1s - loss: 7.3303 - val_loss: 7.6243
Epoch 9/50
 - 1s - loss: 7.3043 - val_loss: 7.6081
Epoch 10/50
 - 1s - loss: 7.2948 - val_loss: 7.6030
Epoch 11/50
 - 1s - loss: 7.2910 - val_loss: 7.6003
Epoch 12/50
 - 1s - loss: 7.2880 - val_loss: 7.5974
Epoch 13/50
 - 1s - loss: 7.2854 - val_loss: 7.5957
Epoch 14/50
 - 1s - loss: 7.2824 - val_loss: 7.5940
Epoch 15/50
 - 1s - loss: 7.2793 - val_loss: 7.5911
Epoch 16/50
 - 1s - loss: 7.2754 - val_loss: 7.5885
Epoch 17/50
 - 1s - loss: 7.2711 - val_loss: 7.5856
Epoch 18/50
 - 1s - loss: 7.2663 - val_loss: 7.5819
Epoch 19/50
 - 1s - 

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  7.577700982971179
mae:  2.1727531677484513


## Test for SGDModel

### SGDModel - offline

In [84]:
model = SGDModel(300, 100)
test_model(model, online=False)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  0.41887082411441856
mae:  0.49732112232682224


### SGDModel - online

In [85]:
model = SGDModel(300,100)
test_model(model, online=True)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  0.7260943593962487
mae:  0.6981995127767968


## User-based collaborative filtering.

### Offline

In [86]:
model = UserBasedModel()
test_model(model, online=False)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  1.0076393690954204
mae:  0.8173901456236159


### Online

In [87]:
model = UserBasedModel()
test_model(model, online=True)

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


mse:  0.9594703204118263
mae:  0.7618447292647492


# Evalueation

baseline :
- mse:  2.448
- mae:  1.192

Factorisation de matrice et réseaux de neurones(embedding size: 10) - offline
- mse 0.7845633044878243
- mae 0.6608313537836075

Factorisation de matrice et réseaux de neurones(embedding size: 10) - online
- mse 0.697576296177385
- mae 0.6306659514009952

Factorisation de matrice et réseaux de neurones(embedding size: 30) - offline
- mse:  6.910711830907748
- mae:  2.112194847583771

Factorisation de matrice et réseaux de neurones(embedding size: 30) - online
- mse:  6.417668297515005
- mae:  1.9562104734182357

SVD(n_factor: 100) - offline
- mse:  0.41887082411441856
- mae:  0.49732112232682224


SVD(n_factor: 100) - online
- mse:  0.7260943593962487
- mae:  0.6981995127767968

User-based collaborative filtering - offline
- mse:  1.0076393690954204
- mae:  0.8173901456236159

User-based collaborative filtering - online
- mse:  0.9594703204118263
- mae:  0.7618447292647492
