In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_recommenders as tr
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
tf.compat.v1.enable_eager_execution()

In [4]:
rating_df = pd.read_csv('mlens/ratings.csv')
movies_df = pd.read_csv('mlens/movies.csv')
intact_r_df = rating_df.copy()
rating_df.shape, movies_df.shape

((100836, 4), (9742, 3))

In [5]:
combined_df = rating_df.merge(movies_df, how='left', on='movieId')
combined_df.shape

(100836, 6)

In [6]:
class RankingModel(keras.Model):
    def __init__(self, userId, movieId, embedding_size):
        super(RankingModel, self).__init__()

        # user model
        input = keras.Input(shape=(), dtype=tf.string)
        x = keras.layers.StringLookup(vocabulary=userId, mask_token=None)(input)
        output = keras.layers.Embedding(input_dim=len(userId)+1, output_dim=embedding_size, name='embedding')(x)
        self.user_model = keras.Model(inputs=input, outputs=output, name='user_model')

        # item model
        input = keras.Input(shape=(), dtype=tf.string)
        x = keras.layers.StringLookup(vocabulary=movieId, mask_token=None)(input)
        output = keras.layers.Embedding(input_dim=len(movieId)+1, output_dim=embedding_size, name='embedding')(x)
        self.item_model = keras.Model(inputs=input, outputs=output, name='item_model')

        # rating model
        user_input = keras.Input(shape=(embedding_size,), name='user_emb')
        item_input = keras.Input(shape=(embedding_size,), name='item_emb')
        x = keras.layers.Concatenate(axis=1)([user_input, item_input])
        x = keras.layers.Dense(256, activation='relu')(x)
        x = keras.layers.Dense(128, activation='relu')(x)
        x = keras.layers.Dropout(0.2)(x)
        x = keras.layers.Dense(64, activation='relu')(x)
        # x = keras.layers.Conv1D(64, 3, activation='relu')(x)
        # x = keras.layers.Conv1D(32, 3, activation='relu')(x)
        # x = keras.layers.Conv1D(16, 3, activation='relu')(x)
        output = keras.layers.Dense(1)(x)
        self.rating_model = keras.Model(inputs={'userId': user_input, 'movieId': item_input}, outputs=output, name='rating_model')

    def call(self, inputs):
        user_emb = self.user_model(inputs['userId'])
        item_emb = self.item_model(inputs['movieId'])
        # concat = tf.concat([user_emb, item_emb], axis=1)
        prediction = self.rating_model({'userId': user_emb, 'movieId': item_emb})

        return prediction

In [7]:
class GMFModel(tr.models.Model):
    def __init__(self, userId, movieId, embedding_size):
        super().__init__()
        self.ranking_model = RankingModel(userId, movieId, embedding_size)
        self.task = tr.tasks.Ranking(loss=keras.losses.MeanSquaredError(), metrics=[keras.metrics.RootMeanSquaredError()])

    def call(self, features):
        return self.ranking_model({'userId': features['userId'], 'movieId': features['movieId']})

    def compute_loss(self, features, training=False):
        label = features.pop('rating')
        return self.task(labels=label, predictions=self.ranking_model(features))

In [8]:
def df_ds(df):
    ds = tf.data.Dataset.from_tensor_slices((dict(df[['userId', 'movieId']]), df['rating']))
    ds = ds.map(lambda x, y: {'userId': x['userId'], 'movieId': x['movieId'], 'rating': y})
    return ds.batch(256)

In [9]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
rating_df = rating_df.drop(columns=['timestamp'])
rating_df[['userId', 'movieId']] = rating_df[['userId', 'movieId']].astype(str)
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
train, test = train_test_split(rating_df, train_size=0.8, random_state=42)
train, test = df_ds(train), df_ds(test)

In [12]:
users = rating_df['userId'].unique()
mids = rating_df['movieId'].unique()
users.shape, mids.shape

((610,), (9724,))

In [13]:
embedding_size = 64
model = GMFModel(users.astype(str), mids.astype(str), embedding_size=embedding_size)
model.compile(optimizer=keras.optimizers.Adagrad(learning_rate=0.01))

In [14]:
model.fit(train, epochs=20, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x249b10efb08>

In [15]:
result = model.evaluate(test, return_dict=True, verbose=1)
result



{'root_mean_squared_error': 0.9114993214607239,
 'loss': 0.7251245379447937,
 'regularization_loss': 0,
 'total_loss': 0.7251245379447937}

In [16]:
item_emb = model.ranking_model.item_model.layers[-1].get_weights()[0]
item_mat = cosine_similarity(item_emb)
item_mat.shape

(9725, 9725)

In [17]:
cleaned_movies_df = movies_df.loc[movies_df['movieId'].astype(str).isin(rating_df['movieId'])]
cleaned_movies_df.shape

(9724, 3)

In [18]:
cleaned_movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [20]:
cleaned_movies_df = cleaned_movies_df.set_index('movieId')

In [21]:
ind2name = {ind:name for ind, name in enumerate(cleaned_movies_df.index)}
name2ind = {v:k for k, v in ind2name.items()}

In [22]:
def recommend(movieId, n, cos_sim, map_name):
    print(movieId)
    top_items = cos_sim[movieId,:].argsort()[-10:][::-1]
    top_items = [map_name[e] for e in top_items]
    return top_items

similars = recommend(name2ind[1], 10, item_mat, ind2name)
movies_df.loc[movies_df['movieId'].isin(similars)]

0


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
32,34,Babe (1995),Children|Drama
2589,3461,Lord of the Flies (1963),Adventure|Drama|Thriller
2656,3554,Love and Basketball (2000),Drama|Romance
3720,5139,"Bad News Bears, The (1976)",Comedy
4648,6947,Master and Commander: The Far Side of the Worl...,Adventure|Drama|War
5827,32234,Julia (1977),Drama
8439,111781,Mission: Impossible - Rogue Nation (2015),Action|Adventure|Thriller
8527,114713,Annabelle (2014),Horror
8960,136840,Da geht noch was! (2013),Comedy|Drama|Romance


# Using Recommenders library

In [23]:
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import rmse, mae, ndcg_at_k, get_top_k_items

In [24]:
ncftrain, ncftest = python_chrono_split(intact_r_df, 0.75, col_user='userId', col_item='movieId', col_timestamp='timestamp')

In [25]:
data = Dataset(train=ncftrain, test=ncftest, col_user='userId', col_item='movieId', col_timestamp='timestamp', seed=42)

In [26]:
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type='NeuMF',
    n_factors=4,
    layer_sizes=[64, 32, 4],
    n_epochs=50,
    batch_size=256,
    learning_rate=0.001,
    verbose=1
)



In [27]:
model.fit(data)

In [28]:
users, items, preds = [], [], []
item = list(ncftrain.movieId.unique())
for user in ncftrain.userId.unique():
    user = [user] * len(item)
    users.extend(user)
    items.extend(item)
    preds.extend(list(model.predict(user, item, is_list=True)))

all_predictions = pd.DataFrame(data={'userId': users, 'movieId': items, 'pred': preds})
merged = pd.merge(ncftrain, all_predictions, on=['userId', 'movieId'], how='outer')
all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

In [29]:
eval_ndcg = ndcg_at_k(ncftest, all_predictions, col_user='userId', col_item='movieId', col_prediction='pred', k=10)
eval_rmse = rmse(ncftest, all_predictions, col_user='userId', col_item='movieId', col_prediction='pred')
print(f'NDCG = {eval_ndcg}, RMSE = {eval_rmse}')

NDCG = 0.08986394510641643, RMSE = 3.3520211909112154


In [30]:
predictions = [[row.userId, row.movieId, model.predict(row.userId, row.movieId)] for (i, row) in ncftest.iterrows()]

In [31]:
preds_ncf = pd.DataFrame(data=predictions, columns=['userId', 'movieId', 'preds'])

In [32]:
preds_ncf['movieId'] = preds_ncf['movieId'].astype(int)

In [33]:
preds_ncf = preds_ncf.sort_values(['userId', 'preds'], ascending=False)
preds_ncf.head()

Unnamed: 0,userId,movieId,preds
25000,610.0,81591,0.98074
25045,610.0,5449,0.96374
25042,610.0,58998,0.85371
25134,610.0,541,0.794818
25139,610.0,3703,0.787794


In [34]:
final_df = preds_ncf.merge(movies_df, on='movieId', how='left')

In [35]:
def recommend_ncf(user):
    return final_df[final_df['userId'] == float(user)].iloc[:5][['title', 'genres']]

recommend_ncf(15)

Unnamed: 0,title,genres
24813,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi
24814,The Hunger Games (2012),Action|Adventure|Drama|Sci-Fi|Thriller
24815,Up (2009),Adventure|Animation|Children|Drama
24816,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
24817,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


# Better recommender net

In [60]:
rating_df = pd.read_csv('mlens/ratings.csv')
movies_df = pd.read_csv('mlens/movies.csv')
rating_df.shape, movies_df.shape

((100836, 4), (9742, 3))

In [61]:
class RecommenderNet(keras.Model):
    def __init__(self, n_users, n_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.n_users = n_users
        self.n_movies = n_movies
        self.embedding_size = embedding_size

        self.user_embedding = keras.layers.Embedding(
            input_dim=n_users, 
            output_dim=embedding_size, 
            embeddings_initializer='he_normal', 
            embeddings_regularizer=keras.regularizers.l2(1e-6)
        )
        self.user_bias = keras.layers.Embedding(n_users, 1)

        self.movie_embedding = keras.layers.Embedding(
            input_dim=n_movies,
            output_dim=embedding_size,
            embeddings_initializer='he_normal',
            embeddings_regularizer=keras.regularizers.l2(1e-6)
        )
        self.movie_bias = keras.layers.Embedding(n_movies, 1)

    def call(self, inputs):
        user_vect = self.user_embedding(inputs[:,0])
        user_bias = self.user_bias(inputs[:,0])
        movie_vect = self.movie_embedding(inputs[:,1])
        movie_bias = self.movie_bias(inputs[:,1])
        dot_prod = tf.tensordot(user_vect, movie_vect, 2)
        tot = dot_prod + user_bias + movie_bias
        return tf.nn.sigmoid(tot)

In [62]:
user_ids = rating_df['userId'].unique().tolist()
user2ind = {x: i for i, x in enumerate(user_ids)}
ind2user = {i: x for i, x in enumerate(user_ids)}

In [63]:
movie_ids = rating_df['movieId'].unique().tolist()
movie2ind = {x: i for i, x in enumerate(movie_ids)}
ind2movie = {i: x for i, x in enumerate(movie_ids)}

In [64]:
rating_df['users'] = rating_df['userId'].map(user2ind)
rating_df['movies'] = rating_df['movieId'].map(movie2ind)

In [65]:
rating_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,users,movies
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,1
2,1,6,4.0,964982224,0,2
3,1,47,5.0,964983815,0,3
4,1,50,5.0,964982931,0,4
5,1,70,3.0,964982400,0,5
6,1,101,5.0,964980868,0,6
7,1,110,4.0,964982176,0,7
8,1,151,5.0,964984041,0,8
9,1,157,5.0,964984100,0,9


In [66]:
n_users = len(user2ind)
n_movies = len(ind2movie)
rating_df['rating'] = rating_df['rating'].values.astype(np.float32)
min_rating = min(rating_df['rating'])
max_rating = max(rating_df['rating'])
print(f'number users: {n_users}, number movies: {n_movies}, min rating: {min_rating}, max rating: {max_rating}')

number users: 610, number movies: 9724, min rating: 0.5, max rating: 5.0


In [67]:
# train and test
x = rating_df[['users', 'movies']].values
y = rating_df['rating'].apply(lambda x: (x-min_rating)/(max_rating-min_rating)).values
train_inds = int(0.9*rating_df.shape[0])
x_train, x_val, y_train, y_val = (x[:train_inds], x[train_inds:], y[:train_inds], y[train_inds:])

In [79]:
rnet = RecommenderNet(n_users, n_movies, 32)
rnet.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(learning_rate=0.0001))

In [80]:
history = rnet.fit(x_train, y_train, batch_size=64, epochs=10, verbose=1, validation_data=(x_val, y_val))

Train on 90752 samples, validate on 10084 samples
Epoch 1/10



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [70]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [81]:
user_id=474
movies_watched = rating_df[rating_df['userId']==user_id]
movies_not_watched = movies_df[~movies_df['movieId'].isin(movies_watched.movieId.values)]['movieId']
movies_not_watched = list(set(movies_not_watched).intersection(set(movie2ind.keys())))
movies_not_watched = [[movie2ind.get(x)] for x in movies_not_watched]
user_encoded = user2ind.get(user_id)
user_movie_arr = np.hstack(([[user_encoded]] * len(movies_not_watched), movies_not_watched))
user_movie_arr.shape

(7616, 2)

In [82]:
ratings = rnet.predict(user_movie_arr).flatten()
top_inds = ratings.argsort()[-10:][::-1]
recommend_movie_ids = [ind2movie.get(movies_not_watched[x][0]) for x in top_inds]
top_movies_user = (movies_watched.sort_values(by='rating', ascending=False).head(5).movieId.values)
movie_names = movies_df[movies_df['movieId'].isin(top_movies_user)]
print(f'Recommendations for user {user_id}')
print('High rated for user: ')
for row in movie_names.itertuples():
    print(row.title, ": ", row.genres)
print('--'*20)
print('Top ten movie recommendations')
recommend_movie_names = movies_df[movies_df['movieId'].isin(recommend_movie_ids)]
for row in recommend_movie_names.itertuples():
    print(row.title, ": ", row.genres)



Recommendations for user 474
High rated for user: 
Enchanted April (1992) :  Drama|Romance
Strictly Ballroom (1992) :  Comedy|Romance
Moonstruck (1987) :  Comedy|Romance
Safety Last! (1923) :  Action|Comedy|Romance
Harry Potter and the Goblet of Fire (2005) :  Adventure|Fantasy|Thriller|IMAX
----------------------------------------
Top ten movie recommendations
Evil Dead II (Dead by Dawn) (1987) :  Action|Comedy|Fantasy|Horror
Snatch (2000) :  Comedy|Crime|Thriller
Bourne Ultimatum, The (2007) :  Action|Crime|Thriller
Into the Wild (2007) :  Action|Adventure|Drama
In Bruges (2008) :  Comedy|Crime|Drama|Thriller
Dark Knight, The (2008) :  Action|Crime|Drama|IMAX
Toy Story 3 (2010) :  Adventure|Animation|Children|Comedy|Fantasy|IMAX
Inception (2010) :  Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
Harry Potter and the Deathly Hallows: Part 2 (2011) :  Action|Adventure|Drama|Fantasy|Mystery|IMAX
The Martian (2015) :  Adventure|Drama|Sci-Fi
