# Collaborative filtering with deep learning

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/users-score-2023.csv')
df.head()

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8


In [2]:
df_ratings = df.copy()
df_ratings = pd.DataFrame(df_ratings).sort_values(by='anime_id')
df_ratings.head()

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
3416062,47744,Shamisen,1,Cowboy Bebop,8
15777187,471696,FAI-TOD,1,Cowboy Bebop,7
14444966,440746,damian87,1,Cowboy Bebop,10
1478505,19682,wadissimo,1,Cowboy Bebop,10
21675460,1105351,StudioG,1,Cowboy Bebop,10


In [76]:
# Filter sparse anime
min_anime_ratings = 1000
filter_anime = (df_ratings['anime_id'].value_counts()>min_anime_ratings)
filter_anime = filter_anime[filter_anime].index.tolist()

# Filter sparse users
min_user_ratings = 200
filter_users = (df_ratings['user_id'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# Actual filtering
df_filtered = df_ratings[(df_ratings['anime_id'].isin(filter_anime)) & (df_ratings['user_id'].isin(filter_users))]
del filter_anime, filter_users, min_anime_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(df_ratings.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filtered.shape))

df_filtered.head()

Shape User-Ratings unfiltered:	(24325191, 5)
Shape User-Ratings filtered:	(12563786, 5)


Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
3416062,47744,Shamisen,1,Cowboy Bebop,8
15777187,471696,FAI-TOD,1,Cowboy Bebop,7
14444966,440746,damian87,1,Cowboy Bebop,10
1478505,19682,wadissimo,1,Cowboy Bebop,10
7597436,296175,KayAni,1,Cowboy Bebop,10


## Create train and test datasets

In [77]:
# Shuffle DataFrame
df_filtered = df_filtered.drop(['Anime Title', 'Username'], axis=1).sample(frac=1)
# Testingsize
n = 100000

# Split train- & testset
df_train = df_filtered[:-n]
df_test = df_filtered[-n:]
df_train.shape, df_test.shape

((12463786, 3), (100000, 3))

In [78]:
df_filtered.head()

Unnamed: 0,user_id,anime_id,rating
2953085,40245,326,7
4489758,68259,6033,10
1017003,12822,2795,4
15890895,473958,2593,9
4275453,64255,40010,9


In [79]:
# Create user and anime-id mapping to convert to numbers
user_id_mapping = {id:i for i, id in enumerate(df_filtered['user_id'].unique())}
anime_id_mapping = {id:i for i, id in enumerate(df_filtered['anime_id'].unique())}


In [80]:

# use dataframe map function to map users & animes to mapped ids based on above mapping
train_user_data = df_train['user_id'].map(user_id_mapping)
train_anime_data = df_train['anime_id'].map(anime_id_mapping)
print(train_user_data.head())
print(train_anime_data.head())

2953085     0
4489758     1
1017003     2
15890895    3
4275453     4
Name: user_id, dtype: int64
2953085     0
4489758     1
1017003     2
15890895    3
4275453     4
Name: anime_id, dtype: int64


In [81]:
# do the same for test data
test_user_data = df_test['user_id'].map(user_id_mapping)
test_anime_data = df_test['anime_id'].map(anime_id_mapping)
print(train_user_data.head())
print(train_anime_data.head())

2953085     0
4489758     1
1017003     2
15890895    3
4275453     4
Name: user_id, dtype: int64
2953085     0
4489758     1
1017003     2
15890895    3
4275453     4
Name: anime_id, dtype: int64


In [82]:
# Get input variable-sizes
users = len(user_id_mapping)
anime = len(anime_id_mapping)
embedding_size = 100

In [83]:
# To create deep learning models
import tensorflow as tf
import keras
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

# use Input() to create tensors for - 'user' and 'anime'
user_id_input = Input(shape=(1,), name='user')
anime_id_input = Input(shape=(1,), name='anime')
print(anime_id_input)

KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.float32, name='anime'), name='anime', description="created by layer 'anime'")


In [84]:
# Create embedding layer for users 
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)

# create embedding layer for animes just like users
anime_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=anime,
                           input_length=1, 
                           name='anime_embedding')(anime_id_input)

In [85]:
# Reshape the embedding layers
user_vector = Reshape([embedding_size])(user_embedding)
anime_vector = Reshape([embedding_size])(anime_embedding)

In [86]:
# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, anime_vector])

In [87]:
# Setup model
model = Model(inputs=[user_id_input, anime_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 anime (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 100)       4173500     ['user[0][0]']                   
                                                                                                  
 anime_embedding (Embedding)    (None, 1, 100)       344800      ['anime[0][0]']                  
                                                                                            

In [88]:
# Fit model
X = [train_user_data, train_anime_data]
y = df_train['rating']

batch_size = 1024
epochs = 5
validation_split = 0.1

model.fit(X, y,
          batch_size=batch_size, 
          epochs=epochs,
          validation_split=validation_split,
          shuffle=True,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1562b01c0>

In [89]:
from sklearn.metrics import mean_squared_error

# Test model by making predictions on test data
y_pred = model.predict([test_user_data, test_anime_data]).ravel()
# clip upper and lower ratings
y_pred = list(map(lambda x: 1.0 if x < 1 else 10.0 if x > 10.0 else x, y_pred))
# get true labels
y_true = df_test['rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With DL Matrix-Factorization: {:.4f} RMSE'.format(rmse))



Testing Result With DL Matrix-Factorization: 1.1485 RMSE


In [90]:
## Let's see how our collaborative model performs by seeing the predicted and actual rating for the given user and anime pair
results_df = pd.DataFrame({
    'User ID': test_user_data.values,
    'Anime ID': test_anime_data.values,
    'Anime Name': [df['Anime Title'].iloc[item] for item in test_anime_data],
    'Predicted Rating': np.round(y_pred, 1),
    'Actual Rating': y_true
})

results_df.head(20)

Unnamed: 0,User ID,Anime ID,Anime Name,Predicted Rating,Actual Rating
0,8874,94,Ichigo 100%,7.6,7
1,14105,1018,Full Metal Panic! The Second Raid,7.4,9
2,10851,24,Chrno Crusade,6.8,7
3,23441,375,Comet Lucifer,8.7,9
4,29653,703,Tennis no Ouji-sama: Zenkoku Taikai-hen - Final,7.2,9
5,9179,1382,iDOLM@STER Xenoglossia,7.8,10
6,32136,2331,Full Metal Panic? Fumoffu,7.2,7
7,13891,1061,Tenkuu no Shiro Laputa,8.5,9
8,17215,3126,Hellsing Ultimate,7.8,10
9,8206,1705,Yoku Wakaru Gendai Mahou: Cruncha Cruncha Cruncha,8.8,9
