In [29]:
from knn import load
from tqdm import tqdm_notebook 
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Multiply, Concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K


In [2]:
# Read the movie and rating tables through the project's knn.load helper.
# NOTE(review): the third argument looks like a Unix-timestamp cutoff for the
# ratings — confirm against knn.load before changing it.
df_movies, df_ratings = load(
    'movies.csv',
    'ratings.csv',
    1366677221,
)

Movies:
   movieId                               title
0        1                    Toy Story (1995)
1        2                      Jumanji (1995)
2        3             Grumpier Old Men (1995)
3        4            Waiting to Exhale (1995)
4        5  Father of the Bride Part II (1995)
Ratings
      userId  movieId  rating   timestamp
1097      13      204     5.0  1413902060
1098      13      216     4.0  1413902039
1099      13      362     4.0  1413902063
1100      13      720     4.0  1413902042
1101      13      838     3.5  1413902093
There are 74433 unique users and 53028 unique movies in this data set


In [3]:
# Show the first five rating rows as the cell output.
df_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating
1097,13,204,5.0
1098,13,216,4.0
1099,13,362,4.0
1100,13,720,4.0
1101,13,838,3.5


In [4]:
# Count the distinct users / movies and compare the counts against the largest
# raw IDs. The raw IDs are sparse (max ID >> number of distinct IDs), which is
# why they are re-indexed before being fed to embedding layers.
num_users = df_ratings.userId.nunique()
num_items = df_ratings.movieId.nunique()
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))

user_maxId = df_ratings.userId.max()
item_maxId = df_ratings.movieId.max()
# The previous wording claimed the max user ID "also" equals the user count;
# it does not, so report it the same way as the movie IDs.
print('There are {} distinct users, however, the max of user ID is {}'.format(num_users, user_maxId))
print('There are {} distinct movies, however, the max of movie ID is {}'.format(num_items, item_maxId))
print('In the context of matrix factorization, the current item vector is in unnecessarily high dimensional space')
print('So we need to do some data cleaning to reduce the dimension of item vector back to {}'.format(num_items))

There are 74433 unique users and 53028 unique movies in this data set
There are 74433 distinct users and the max of user ID is also 283228
There are 53028 distinct movies, however, the max of movie ID is 193886
In the context of matrix factorization, the current item vector is in unnecessarily high dimensional space
So we need to do some data cleaning to reduce the dimension of item vector back to 53028


In [12]:
# Map every raw userId / movieId to a dense 0-based index, numbered in order of
# first appearance in df_ratings. Comprehensions keep the counter and loop
# variables local, so nothing (e.g. `user`, `k`) leaks into the kernel
# namespace and gets confused with later variables of the same name.
users = {
    raw_id: index
    for index, raw_id in enumerate(tqdm_notebook(df_ratings.userId.unique()))
}

movies = {
    raw_id: index
    for index, raw_id in enumerate(tqdm_notebook(df_ratings.movieId.unique()))
}

HBox(children=(IntProgress(value=0, max=74433), HTML(value='')))




HBox(children=(IntProgress(value=0, max=53028), HTML(value='')))




In [20]:
# Work on an independent copy so the raw-ID frame df_ratings stays untouched.
df_ratings_f = df_ratings.copy(deep=True)

In [21]:
# Replace the raw IDs with their dense indices. The source column is always
# read from df_ratings (not df_ratings_f), so re-running this cell gives the
# same result.
for column, id_lookup in (('userId', users), ('movieId', movies)):
    df_ratings_f[column] = df_ratings[column].map(id_lookup)

In [23]:
# Show the first five remapped rows with a fresh 0-based row index.
df_ratings_f.iloc[:5].reset_index(drop=True)

Unnamed: 0,userId,movieId,rating
0,0,0,5.0
1,0,1,4.0
2,0,2,4.0
3,0,3,4.0
4,0,4,3.5


In [27]:
# Shuffled 80/20 split of the remapped ratings; the fixed random_state makes
# the split repeatable.
train, test = train_test_split(
    df_ratings_f,
    test_size=0.2,
    random_state=99,
    shuffle=True,
)

In [31]:
# Two scalar inputs per example: one user index and one item (movie) index.
user = Input(shape=(1,))
item = Input(shape=(1,))

# Learned 32-dimensional embedding for each user and each item.
embed_user = Embedding(
    input_dim=num_users + 1,
    output_dim=32,
    embeddings_initializer='uniform',
    name='user_embedding',
    input_length=1,
)(user)
embed_item = Embedding(
    input_dim=num_items + 1,
    output_dim=32,
    embeddings_initializer='uniform',
    name='item_embedding',
    input_length=1,
)(item)

# Drop the length-1 sequence axis: (batch, 1, 32) -> (batch, 32).
user2 = Flatten()(embed_user)
item2 = Flatten()(embed_item)

# Join both embeddings into one feature vector for the dense stack.
combine = Concatenate(axis=-1)([user2, item2])

# Three identically configured hidden layers, followed by a single linear unit
# that outputs the predicted rating.
hidden_kwargs = dict(activation='relu', kernel_initializer='glorot_uniform')
layer1 = Dense(32, **hidden_kwargs)(combine)
layer2 = Dense(32, **hidden_kwargs)(layer1)
layer3 = Dense(32, **hidden_kwargs)(layer2)

out = Dense(1)(layer3)

# Regression set-up: mean-squared-error loss optimised with Adam.
model = Model(inputs=[user, item], outputs=out)
model.compile(optimizer="adam", loss="mean_squared_error")
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 32)        2381888     input_3[0][0]                    
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 32)        1696928     input_4[0][0]                    
____________________________________________________________________________________________

In [33]:
# Train on the training split; the returned History object is the cell output.
model.fit(
    x=[train.userId.values, train.movieId.values],
    y=train.rating.values,
    batch_size=32,
    epochs=8,
    verbose=1,
)

Train on 6432185 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x1a4d83b240>

In [34]:
# Predict a rating for every (user, movie) pair in the held-out split.
test_inputs = [test.userId.values, test.movieId.values]
pred = model.predict(test_inputs)

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are cast to float32 and flattened to 1-D before they are
    subtracted. Without that, a target array of shape (n,) and a prediction
    array of shape (n, 1) -- which is what a model ending in ``Dense(1)``
    produces -- broadcast to an (n, n) matrix: the result is wrong and the
    intermediate needs n*n floats of memory.

    Args:
        y_true: array-like or tensor of target ratings.
        y_pred: array-like or tensor of predicted ratings with the same
            number of elements as ``y_true``.

    Returns:
        A scalar backend tensor holding the RMSE.
    """
    # Align dtype and shape so the subtraction is strictly element-wise.
    y_true = K.flatten(K.cast(y_true, 'float32'))
    y_pred = K.flatten(K.cast(y_pred, 'float32'))
    return K.sqrt(K.mean(K.square(y_true - y_pred)))

rmse(test.rating.values, pred)