In [1]:
# rerun the data with 
# https://github.com/khanhnamle1994/movielens
# https://github.com/yennanliu/movie_recommendation

In [2]:
import pandas as pd 
import numpy as np

In [5]:
ls datasets

[1m[34mml-latest[m[m/       [1m[34mml-latest-small[m[m/


In [6]:
ls datasets/ml-latest-small

README.txt   links.csv    movies.csv   ratings.csv  tags.csv


In [17]:
# small rating data 

In [12]:
df_ratings_small = pd.read_csv("datasets/ml-latest-small/ratings.csv")

In [13]:
df_ratings_small.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144


In [16]:
df_ratings_small.userId.max()

671

In [15]:
len(set(df_ratings_small.userId))

671

In [18]:
# big rating data 

In [19]:
df_ratings_big = pd.read_csv("datasets/ml-latest/ratings.csv")

In [20]:
df_ratings_big.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529


In [21]:
df_ratings_big.userId.max()

270896

In [22]:
len(set(df_ratings_big.userId))

270896

In [23]:
#### user data from https://github.com/khanhnamle1994/movielens  ####

In [24]:
cd ~/movielens/

/Users/yennanliu/movielens


In [26]:
ls

CFModel.py
CFModel.pyc
Content_Based_and_Collaborative_Filtering_Models.ipynb
Data_Processing.ipynb
Deep_Learning_Model.ipynb
LICENSE
README.md
SVD_Model.ipynb
[1m[34mdat[m[m/
[1m[34mimages[m[m/
movies.csv
ratings.csv
users.csv
weights.h5


In [27]:
ls dat/

README.md    movies.dat   ratings.dat  users.dat


In [30]:
users = pd.read_csv('users.csv',sep='\t')

In [31]:
users.head(1)

Unnamed: 0.1,Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,0,1,F,1,10,48067,Under 18,K-12 student


In [32]:
users.user_id.max()

6040

In [33]:
len(set(users.user_id))

6040

In [34]:
### help function ###


import numpy as np
from keras.layers import Embedding, Reshape, Merge
from keras.models import Sequential

class CFModel(Sequential):

    # The constructor for the class
    def __init__(self, n_users, m_items, k_factors, **kwargs):
        # P is the embedding layer that creates an User by latent factors matrix.
        # If the intput is a user_id, P returns the latent factor vector for that user.
        P = Sequential()
        P.add(Embedding(n_users, k_factors, input_length=1))
        P.add(Reshape((k_factors,)))

        # Q is the embedding layer that creates a Movie by latent factors matrix.
        # If the input is a movie_id, Q returns the latent factor vector for that movie.
        Q = Sequential()
        Q.add(Embedding(m_items, k_factors, input_length=1))
        Q.add(Reshape((k_factors,)))

        super(CFModel, self).__init__(**kwargs)
        
        # The Merge layer takes the dot product of user and movie latent factor vectors to return the corresponding rating.
        self.add(Merge([P, Q], mode='dot', dot_axes=1))

    # The rate function to predict user's rating of unrated items
    def rate(self, user_id, item_id):
        return self.predict([np.array([user_id]), np.array([item_id])])[0][0]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [38]:
# Create training set

RNG_SEED = 30 


shuffled_ratings = df_ratings_big.sample(frac=1., random_state=RNG_SEED)

# Shuffling users
Users = shuffled_ratings['movieId'].values
#print 'Users:', Users, ', shape =', Users.shape

# Shuffling movies
Movies = shuffled_ratings['movieId'].values
#print 'Movies:', Movies, ', shape =', Movies.shape

# Shuffling ratings
Ratings = shuffled_ratings['rating'].values
#print 'Ratings:', Ratings, ', shape =', Ratings.shape

In [37]:
shuffled_ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
12703093,131860,47,4.0,834520686


In [39]:
Movies

array([   47,  2303,  3255, ...,    11,  2019, 68954])

In [40]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

In [41]:
# Define constants
K_FACTORS = 100 # The number of dimensional embeddings for movies and users
TEST_USER = 2000 # A random test user (user_id = 2000)

In [43]:
max_userid = df_ratings_big['userId'].drop_duplicates().max()
max_movieid = df_ratings_big['movieId'].drop_duplicates().max()

In [45]:
# Define model
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Compile the model using MSE as the loss function and the AdaMax learning algorithm
model.compile(loss='mse', optimizer='adamax')



In [47]:
# Callbacks monitor the validation loss
# Save the model weights each time the validation loss has improved
callbacks = [EarlyStopping('val_loss', patience=2), 
             ModelCheckpoint('weights.h5', save_best_only=True)]

# Use 30 epochs, 90% training data, 10% validation data 
history = model.fit([Users, Movies], Ratings, nb_epoch=30, validation_split=.1, verbose=2, callbacks=callbacks)






Train on 23421860 samples, validate on 2602429 samples
Epoch 1/30


KeyboardInterrupt: 