In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the ratings table, keeping only the three columns used in this notebook
ratings = pd.read_csv('ml-20m/ratings.csv',
                      usecols=['userId', 'movieId', 'rating'])

# Largest user id and movie id that occur in the ratings
max_userid = ratings['userId'].max()
max_movieid = ratings['movieId'].max()

# Load the movies table (movieId, title, genres)
movies = pd.read_csv('ml-20m/movies.csv',
                     usecols=['movieId', 'title', 'genres'])

In [2]:
# Create training set: shuffle all ratings reproducibly (fixed random_state)
shuffled_ratings = ratings.sample(frac=1., random_state=42)

# Take one positional window of the shuffled frame and reuse it for every
# column. The window starts at position 1, so the row at position 0 is not
# included and the arrays hold 99,999 entries each.
train_rows = shuffled_ratings.iloc[1:100000]

# User ids of the selected rows
Users = train_rows['userId'].values
print('Users:', Users, ', shape =', Users.shape)

# Movie ids of the selected rows
Movies = train_rows['movieId'].values
print('Movies:', Movies, ', shape =', Movies.shape)

# Ratings of the selected rows (the regression target)
Ratings = train_rows['rating'].values
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [ 49018  89527 106704 ... 135700  58198  85916] , shape = (99999,)
Movies: [    32 109374   1060 ...   4031   3450   1348] , shape = (99999,)
Ratings: [2.  3.5 3.  ... 1.  3.  5. ] , shape = (99999,)


In [3]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
# Import CF Model Architecture

Using TensorFlow backend.


In [4]:
# Define constants
K_FACTORS = 100 # Length of each user / movie embedding vector (number of latent factors)
TEST_USER = 2000 # Example user id (2000); a later cell reassigns this to 1456 before its first use

In [5]:
# Define model: a matrix-factorisation network whose output is the dot product
# of a learned user vector and a learned movie vector.
from keras.layers import Input
from keras.models import Model
from keras.layers import Embedding, Reshape, dot

# Each sample supplies one id per input.
input_1 = Input(shape=(1,))  # user id
input_2 = Input(shape=(1,))  # movie id

# Embedding's first argument is the vocabulary size, i.e. largest index + 1.
# The training arrays hold the raw userId / movieId values, so each table needs
# (max id + 1) rows, and the movie table must be sized from the movie ids
# rather than from the user ids.
P = Reshape((K_FACTORS,))(Embedding(max_userid + 1, K_FACTORS, input_length=1)(input_1))
Q = Reshape((K_FACTORS,))(Embedding(max_movieid + 1, K_FACTORS, input_length=1)(input_2))

# Predicted rating = <user vector, movie vector>
P_dot_Q = dot([P, Q], axes=1, normalize=False)

model = Model(inputs=[input_1, input_2], outputs=P_dot_Q)
# NOTE(review): 'accuracy' is kept for backward compatibility of the training
# log; it is probably not informative for a real-valued rating target - confirm.
model.compile(loss='MSE', optimizer='adamax', metrics=['accuracy'])


Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# Training callbacks:
#   - EarlyStopping watches 'val_loss' with a patience of 3 epochs
#   - ModelCheckpoint writes 'weights.h5', keeping only the best weights so far
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint(filepath='weights.h5', save_best_only=True)
callbacks = [early_stopping, checkpoint]

# Fit for up to 30 epochs; 10% of the samples are held out for validation
history = model.fit(
    x=[Users, Movies],
    y=Ratings,
    epochs=30,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks,
)

Instructions for updating:
Use tf.cast instead.
Train on 89999 samples, validate on 10000 samples
Epoch 1/30


In [None]:
# Report the lowest validation loss as an RMSE (the loss is an MSE, hence the
# square root), together with the 1-based epoch at which it occurred.
val_losses = history.history['val_loss']
idx = min(range(len(val_losses)), key=val_losses.__getitem__)  # first epoch with the minimum
min_val_loss = val_losses[idx]
print('Minimum RMSE at epoch', format(idx + 1, 'd'), '=', format(math.sqrt(min_val_loss), '.4f'))

In [None]:
def predict_rating(userId, movieId):
    """Return the model's predicted rating for a single (user, movie) pair.

    Parameters
    ----------
    userId : int
        User id exactly as it appears in the ``userId`` column of the ratings.
    movieId : int
        Movie id exactly as it appears in the ``movieId`` column of the ratings.

    Returns
    -------
    The scalar taken from position ``[0][0]`` of ``model.predict``'s output.

    Notes
    -----
    The ids are passed through unchanged. The training arrays contain the raw
    ``userId`` / ``movieId`` values, so shifting them here (e.g. ``id - 1``)
    would look up the embeddings of a different user and movie.
    """
    # model.predict expects one batch per input; build batches of size one.
    user_batch = np.array([userId])
    movie_batch = np.array([movieId])
    return model.predict([user_batch, movie_batch])[0][0]

In [None]:
# Use user 1456 for the cells below (replaces the value 2000 set with the other constants)
TEST_USER = 1456

In [None]:
# Ratings already given by the test user. A single .loc selection plus an
# explicit .copy() yields an independent frame, so adding the 'prediction'
# column below does not assign onto a slice of `ratings`
# (which pandas reports as SettingWithCopyWarning).
user_ratings = ratings.loc[ratings['userId'] == TEST_USER,
                           ['userId', 'movieId', 'rating']].copy()

# Predicted rating for every movie the user rated. Mapping over the movieId
# column keeps the ids as integers (a row-wise apply would hand them over as
# floats because of the 'rating' column) and still yields a Series when the
# user has no ratings.
user_ratings['prediction'] = user_ratings['movieId'].map(
    lambda movie_id: predict_rating(TEST_USER, movie_id))

# 20 highest actual ratings, joined with the movie title and genres
user_ratings.sort_values(by='rating', ascending=False).merge(
    movies,
    on='movieId',
    how='inner',
    suffixes=['_u', '_m']).head(20)

In [None]:
# Candidate movies: each movieId present in `ratings` that the test user has
# not rated. `~mask` replaces the `mask == False` comparison, and .copy() makes
# the result an independent frame so the column assignment below does not
# write to a slice (SettingWithCopyWarning).
already_rated = ratings['movieId'].isin(user_ratings['movieId'])
recommendations = ratings.loc[~already_rated, ['movieId']].drop_duplicates().copy()

# Predicted rating of the test user for every candidate movie
recommendations['prediction'] = recommendations['movieId'].map(
    lambda movie_id: predict_rating(TEST_USER, movie_id))

# 20 highest predicted ratings, joined with the movie title and genres
recommendations.sort_values(by='prediction', ascending=False).merge(
    movies,
    on='movieId',
    how='inner',
    suffixes=['_u', '_m']).head(20)