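### References
- Reference implementation of Neural Collaborative Filtering: https://github.com/hexiangnan/neural_collaborative_filtering
- Paper, "Neural Collaborative Filtering" (He et al., 2017): https://arxiv.org/abs/1708.05031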

In [1]:
# ops
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.sparse as sp


# ML

#import theano.tensor as T
import keras
from keras import backend as K
#https://github.com/keras-team/keras/issues/6278
#from keras import initializations
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras.regularizers import l2
#from keras import Sequential
#from Dataset import Dataset
#from evaluate import evaluate_model
from time import time
import multiprocessing as mp
import sys
import math
import argparse

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
def get_data(route='datasets/ml-latest-small/'):
    """Load the ratings table from a MovieLens-style dataset directory.

    Parameters
    ----------
    route : str, optional
        Directory that contains ``ratings.csv``. A trailing path separator is
        optional. Defaults to the location the notebook was written against.

    Returns
    -------
    pandas.DataFrame
        The contents of ``ratings.csv`` as parsed by ``pd.read_csv``.
    """
    # Local import: the notebook's import cell does not bring `os` into scope.
    import os

    # os.path.join tolerates `route` with or without a trailing separator,
    # unlike plain string concatenation.
    return pd.read_csv(os.path.join(route, 'ratings.csv'))

def data_preprocess(df):
    """Summarise how often each movie appears in the ratings table.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table with ``userId``, ``movieId`` and ``rating`` columns.
        The frame is left untouched; all work happens on a derived copy.

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns ``movieId``, ``view_count``
        (how many rows of ``df`` were counted for that movie) and
        ``percentage`` (that count as a share of all counted rows, 0-100).
        Rows are ordered by descending ``view_count``, ties broken by
        ascending ``movieId``, with a fresh 0..n-1 index.
    """
    # transform('count') returns one value per input row, aligned on df's own
    # index. Assigning the re-indexed grouped frame instead would line values up
    # by position and leave NaN whenever a (user, movie) pair repeats.
    per_pair_count = df.groupby(['userId', 'movieId'])['rating'].transform('count')
    counted = df.assign(view_count=per_pair_count)

    movie_grouped = counted.groupby(['movieId']).agg({'view_count': 'count'}).reset_index()
    grouped_sum = movie_grouped['view_count'].sum()
    movie_grouped['percentage'] = movie_grouped['view_count'].div(grouped_sum) * 100

    # sort_values returns a new frame, so its result has to be the one returned.
    ordered = movie_grouped.sort_values(['view_count', 'movieId'], ascending=[False, True])
    return ordered.reset_index(drop=True)

In [3]:
############ run step by step  ############

In [4]:
df_ratings = get_data()

In [5]:
df_ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [6]:
# Count the distinct ids present in each id column of the ratings table.
num_users, num_items = (
    len(set(df_ratings[id_column])) for id_column in ('userId', 'movieId')
)

In [7]:
num_users

671

In [8]:
def init_normal(shape, name=None, dtype=None):
    """Draw initial weights from a normal distribution (mean 0, stddev 0.01).

    Written so it can be handed to a layer as a custom initializer.

    Parameters
    ----------
    shape : tuple of int
        Shape of the weight tensor to create.
    name : str, optional
        Accepted for compatibility with the old Keras 1 initializer signature;
        it is not used.
    dtype : optional
        Forwarded to the initializer; ``None`` lets the backend pick its default.

    Returns
    -------
    A backend tensor of the requested shape.
    """
    # In keras.initializers `normal` is the RandomNormal class, which takes
    # mean/stddev/seed; the Keras 1 call `normal(shape, scale=..., name=...)`
    # raises a TypeError. Build the initializer, then call it with the shape.
    return initializers.RandomNormal(mean=0.0, stddev=0.01)(shape, dtype=dtype)


In [9]:
######### dev 

In [10]:

import numpy as np
from keras.layers import Embedding, Reshape, Merge
from keras.models import Sequential

class CFModel(Sequential):
    """Collaborative-filtering model: a rating is the dot product of a user
    latent vector and an item latent vector, each looked up from an embedding."""

    def __init__(self, n_users, m_items, k_factors, **kwargs):
        """Build the two embedding branches and join them with a dot-product merge.

        n_users / m_items are the row counts of the user and item embedding
        tables, k_factors is the latent dimension, and any extra keyword
        arguments are forwarded to ``Sequential``.
        """
        # User branch first, then item branch (same construction order as before,
        # so automatically generated layer names are unchanged).
        user_branch = CFModel._latent_branch(n_users, k_factors)
        item_branch = CFModel._latent_branch(m_items, k_factors)

        super(CFModel, self).__init__(**kwargs)

        # Dot product of the two latent vectors gives the predicted rating.
        self.add(Merge([user_branch, item_branch], mode='dot', dot_axes=1))

    @staticmethod
    def _latent_branch(table_rows, k_factors):
        """Return a Sequential that maps a single id to its k_factors-long vector."""
        branch = Sequential()
        branch.add(Embedding(table_rows, k_factors, input_length=1))
        # Drop the length-1 sequence axis so the output is a flat latent vector.
        branch.add(Reshape((k_factors,)))
        return branch

    def rate(self, user_id, item_id):
        """Predict the rating of one user for one item and return it as a scalar."""
        single_pair = [np.array([user_id]), np.array([item_id])]
        predicted = self.predict(single_pair)
        return predicted[0][0]

In [11]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
# Import CF Model Architecture

In [12]:
# ------------- dev  -------------

In [13]:
# Largest raw ids found in the ratings table.
max_userid = df_ratings.userId.max()
max_movieid = df_ratings.movieId.max()
k_factors = 1000

# The raw ids are fed to the embeddings as indices, so the largest id itself
# must be a valid row: Embedding's input_dim has to be (maximum index + 1).

# If the input is a user_id, P returns the latent factor vector for that user.
P = Sequential()
P.add(Embedding(max_userid + 1, k_factors, input_length=1))
P.add(Reshape((k_factors,)))

# If the input is a movie_id, Q returns the latent factor vector for that movie.
Q = Sequential()
Q.add(Embedding(max_movieid + 1, k_factors, input_length=1))
Q.add(Reshape((k_factors,)))


In [14]:
# Dev probe: check that the P and Q branches can be joined by a dot-product Merge.
# The Sequential created here is never bound to a name, so it is discarded afterwards.
Sequential().add(Merge([P, Q], mode='dot', dot_axes=1))

  if __name__ == '__main__':


In [15]:
Sequential

keras.models.Sequential

In [16]:
#Sequential().predict([np.array([200]), np.array([100])])[0][0]

In [17]:
# Largest raw ids found in the ratings table.
max_userid = df_ratings.userId.max()
max_movieid = df_ratings.movieId.max()
K_FACTORS = 1000

# Define model. Raw ids are used directly as embedding indices, so each
# embedding table needs (largest id + 1) rows for the largest id to be valid.
model = CFModel(max_userid + 1, max_movieid + 1, K_FACTORS)
# Compile the model using MSE as the loss function and the AdaMax learning algorithm
model.compile(loss='mse', optimizer='adamax')

# Quick dev run: train on the first 1000 ratings only.
ratings_subset = df_ratings.head(1000)
Users = ratings_subset.userId.values
Movies = ratings_subset.movieId.values
Ratings = ratings_subset.rating.values

# Callbacks for a full run: stop once val_loss has not improved for 2 epochs and
# save weights to weights.h5 whenever val_loss improves.
# They are defined here but NOT passed to the short run below.
callbacks = [EarlyStopping('val_loss', patience=2), ModelCheckpoint('weights.h5', save_best_only=True)]
# Full run (30 epochs, 10% of the data held out for validation):
#history = model.fit([Users, Movies], Ratings, epochs=30, validation_split=.1, verbose=2, callbacks=callbacks)

# Short run: 3 epochs, 10% of the data held out for validation.
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
history = model.fit([Users, Movies], Ratings, epochs=3, validation_split=.1, verbose=2)






  "This may consume a large amount of memory." % num_elements)


Train on 900 samples, validate on 100 samples
Epoch 1/3
 - 442s - loss: 15.1419 - val_loss: 11.7752
Epoch 2/3
 - 192s - loss: 14.5625 - val_loss: 11.7735
Epoch 3/3
 - 193s - loss: 14.0398 - val_loss: 11.7679


In [18]:
history.history

{'loss': [15.141920657687717, 14.562475666469998, 14.039828618367514],
 'val_loss': [11.775247230529786, 11.773504791259766, 11.767888526916504]}

In [19]:
history.history

{'loss': [15.141920657687717, 14.562475666469998, 14.039828618367514],
 'val_loss': [11.775247230529786, 11.773504791259766, 11.767888526916504]}

In [20]:
print (history.history)

{'val_loss': [11.775247230529786, 11.773504791259766, 11.767888526916504], 'loss': [15.141920657687717, 14.562475666469998, 14.039828618367514]}


In [21]:
# ------------- dev  -------------