In [1]:
# Setup: Run the following command in your terminal
# download xcode from https://apps.apple.com/us/app/xcode/id497799835?mt=12/
# conda create -n recommendation_env python=3.9
# conda activate recommendation_env
# pip3 install Cython numpy scipy
# python3 -m pip install 'tensorflow'
# pip install tf_slim
# pip install recommenders
# git clone https://github.com/recommenders-team/recommenders.git
# python -m ipykernel install --user --name recommendation_env --display-name recommendation_kernel

# Open this notebook
# Select Jupyter kernel recommendation_kernel
# Run the notebook

In [2]:
# See recommenders/examples/02_model_collaborative_filtering/ncf_deep_dive.ipynb on the details of this model

In [37]:
import pandas as pd
import json
import requests
import os
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import csv
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
import time
import tensorflow as tf
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.evaluation.python_evaluation import (
    map, ndcg_at_k, precision_at_k, recall_at_k, rmse
)
from recommenders.utils.constants import SEED as DEFAULT_SEED

In [50]:
df_reviews = pd.read_csv('./data/df_reviews_user_book.csv')
df_reviews_test = pd.read_csv('./data/df_reviews_test_user_book.csv')


In [51]:
df_reviews.head()

Unnamed: 0,user_id,book_id,rating
0,23f9c95a62b976fd69e6d729d819b86d,7851761,3
1,27550af3e095e5b8613de27bc326085b,169879,3
2,48016fdb4dd77989a53859b9b8fbf990,21535271,2
3,7cc8c917bea80c465c93e3f662322217,28187,4
4,d588f290bef8ce20201909db0b261232,422235,3


In [52]:
df_reviews_test.head()

Unnamed: 0,user_id,book_id,rating
0,f4451cfff46ee9f747a9e5d36fe8a404,4769247,4
1,cc49cbd1733e25314462e52bf3c54a49,11428984,4
2,41858f2267fa47871a1e48c433b1bc6a,4450057,4
3,b11fb9f8b16ed8c8021fcc2a193a6016,7635629,2
4,6e9988b97064a91c52d651684ffd0998,15783514,5


In [53]:
df_reviews['userID'] = df_reviews['user_id']
df_reviews['itemID'] = df_reviews['book_id']
df_reviews.head()

Unnamed: 0,user_id,book_id,rating,userID,itemID
0,23f9c95a62b976fd69e6d729d819b86d,7851761,3,23f9c95a62b976fd69e6d729d819b86d,7851761
1,27550af3e095e5b8613de27bc326085b,169879,3,27550af3e095e5b8613de27bc326085b,169879
2,48016fdb4dd77989a53859b9b8fbf990,21535271,2,48016fdb4dd77989a53859b9b8fbf990,21535271
3,7cc8c917bea80c465c93e3f662322217,28187,4,7cc8c917bea80c465c93e3f662322217,28187
4,d588f290bef8ce20201909db0b261232,422235,3,d588f290bef8ce20201909db0b261232,422235


In [54]:
df_reviews_test['userID'] = df_reviews_test['user_id']
df_reviews_test['itemID'] = df_reviews_test['book_id']
df_reviews_test.head()

Unnamed: 0,user_id,book_id,rating,userID,itemID
0,f4451cfff46ee9f747a9e5d36fe8a404,4769247,4,f4451cfff46ee9f747a9e5d36fe8a404,4769247
1,cc49cbd1733e25314462e52bf3c54a49,11428984,4,cc49cbd1733e25314462e52bf3c54a49,11428984
2,41858f2267fa47871a1e48c433b1bc6a,4450057,4,41858f2267fa47871a1e48c433b1bc6a,4450057
3,b11fb9f8b16ed8c8021fcc2a193a6016,7635629,2,b11fb9f8b16ed8c8021fcc2a193a6016,7635629
4,6e9988b97064a91c52d651684ffd0998,15783514,5,6e9988b97064a91c52d651684ffd0998,15783514


In [55]:
# Sample data for faster training
df_reviews_sample = df_reviews.sample(n=100000)

In [56]:
df_reviews_sample.head()

Unnamed: 0,user_id,book_id,rating,userID,itemID
137719,1765bcc8fac303e5e41f7e353b06eaa3,31341,4,1765bcc8fac303e5e41f7e353b06eaa3,31341
76739,b1396adc8f9cc120b829c38e13c51948,1421990,5,b1396adc8f9cc120b829c38e13c51948,1421990
80899,a82d4806aec2c9b3772f076e9acc617c,216445,4,a82d4806aec2c9b3772f076e9acc617c,216445
373262,6a5436e1141d98d08a972f3487aaa47a,111450,3,6a5436e1141d98d08a972f3487aaa47a,111450
110542,52f1a6e704261d811b3b79424e63e306,6477531,3,52f1a6e704261d811b3b79424e63e306,6477531


In [57]:
# Filter out any users or items in the test set that do not appear in the training set.
df_reviews_test = df_reviews_test[df_reviews_test["userID"].isin(df_reviews_sample["userID"].unique())]
df_reviews_test = df_reviews_test[df_reviews_test["itemID"].isin(df_reviews_sample["itemID"].unique())]
len(df_reviews_test)

20132

In [58]:
# Create a mapping of unique user IDs to smaller integers
unique_user_ids = df_reviews_sample['userID'].unique()
user_id_mapping = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}

# Replace the userID column with the mapped values
df_reviews_sample['userID'] = df_reviews_sample['userID'].map(user_id_mapping)
df_reviews_sample.head()

Unnamed: 0,user_id,book_id,rating,userID,itemID
137719,1765bcc8fac303e5e41f7e353b06eaa3,31341,4,0,31341
76739,b1396adc8f9cc120b829c38e13c51948,1421990,5,1,1421990
80899,a82d4806aec2c9b3772f076e9acc617c,216445,4,2,216445
373262,6a5436e1141d98d08a972f3487aaa47a,111450,3,3,111450
110542,52f1a6e704261d811b3b79424e63e306,6477531,3,4,6477531


In [59]:
df_reviews_test['userID'] = df_reviews_test['userID'].map(user_id_mapping)
df_reviews_test.head()

Unnamed: 0,user_id,book_id,rating,userID,itemID
0,f4451cfff46ee9f747a9e5d36fe8a404,4769247,4,140,4769247
2,41858f2267fa47871a1e48c433b1bc6a,4450057,4,153,4450057
3,b11fb9f8b16ed8c8021fcc2a193a6016,7635629,2,93,7635629
4,6e9988b97064a91c52d651684ffd0998,15783514,5,1313,15783514
6,eb6af5ab8681384d1f8e90a1cd2cb543,51428,4,648,51428


In [60]:
# Create a mapping of unique item IDs to smaller integers
unique_item_ids = df_reviews_sample['itemID'].unique()
item_id_mapping = {item_id: idx for idx, item_id in enumerate(unique_item_ids)}

# Replace the userID column with the mapped values
df_reviews_sample['itemID'] = df_reviews_sample['itemID'].map(item_id_mapping)

In [61]:
df_reviews_test['itemID'] = df_reviews_test['itemID'].map(item_id_mapping)

In [62]:
df_reviews_sample

Unnamed: 0,user_id,book_id,rating,userID,itemID
137719,1765bcc8fac303e5e41f7e353b06eaa3,31341,4,0,0
76739,b1396adc8f9cc120b829c38e13c51948,1421990,5,1,1
80899,a82d4806aec2c9b3772f076e9acc617c,216445,4,2,2
373262,6a5436e1141d98d08a972f3487aaa47a,111450,3,3,3
110542,52f1a6e704261d811b3b79424e63e306,6477531,3,4,4
...,...,...,...,...,...
64534,1aa8c93789f2f5b08863115f71a43454,11058546,3,356,21591
316709,c60cc5d05de8bbfaaac2944085df48fe,16207549,5,89,32653
315186,232d2fd0531703ceab65d5b8894713f0,20495415,5,361,10224
36146,14c3098690be02f0c2e1397c51eb505d,28799,5,1386,18606


In [None]:
df_reviews_test

In [63]:
#df_reviews_sample = df_reviews_sample[df_reviews_sample['rating'] != 3]
df_reviews_sample['old_rating'] = df_reviews_sample['rating']
# Step 2: Set rating to 1 where rating >= 3 and 0 where rating <= 3
df_reviews_sample['rating'] = df_reviews_sample['rating'].apply(lambda x: 1 if x > 3 else 0)
df_reviews_sample

Unnamed: 0,user_id,book_id,rating,userID,itemID,old_rating
137719,1765bcc8fac303e5e41f7e353b06eaa3,31341,1,0,0,4
76739,b1396adc8f9cc120b829c38e13c51948,1421990,1,1,1,5
80899,a82d4806aec2c9b3772f076e9acc617c,216445,1,2,2,4
373262,6a5436e1141d98d08a972f3487aaa47a,111450,0,3,3,3
110542,52f1a6e704261d811b3b79424e63e306,6477531,0,4,4,3
...,...,...,...,...,...,...
64534,1aa8c93789f2f5b08863115f71a43454,11058546,0,356,21591,3
316709,c60cc5d05de8bbfaaac2944085df48fe,16207549,1,89,32653,5
315186,232d2fd0531703ceab65d5b8894713f0,20495415,1,361,10224,5
36146,14c3098690be02f0c2e1397c51eb505d,28799,1,1386,18606,5


In [64]:
#df_reviews_sample = df_reviews_sample[df_reviews_sample['rating'] != 3]
df_reviews_test['old_rating'] = df_reviews_test['rating']
# Step 2: Set rating to 1 where rating >= 3 and 0 where rating <= 3
df_reviews_test['rating'] = df_reviews_test['rating'].apply(lambda x: 1 if x > 3 else 0)
df_reviews_test

Unnamed: 0,user_id,book_id,rating,userID,itemID,old_rating
0,f4451cfff46ee9f747a9e5d36fe8a404,4769247,1,140,17000,4
2,41858f2267fa47871a1e48c433b1bc6a,4450057,1,153,2588,4
3,b11fb9f8b16ed8c8021fcc2a193a6016,7635629,0,93,12315,2
4,6e9988b97064a91c52d651684ffd0998,15783514,1,1313,1699,5
6,eb6af5ab8681384d1f8e90a1cd2cb543,51428,1,648,2181,4
...,...,...,...,...,...,...
25291,31fe95662c117bad1f20be225669e451,6612005,1,1430,4189,5
25292,559d834709733a06774261fad3f67ac9,284139,1,1267,711,5
25293,490c082bfc6d9c289da69e560c452fd7,92846,0,1993,4089,2
25294,f36e3273dd6ef8a06c4c36d13444b765,915554,1,1367,1111,5


In [65]:
df_reviews_sample.rating.value_counts()

rating
1    67120
0    32880
Name: count, dtype: int64

In [66]:
df_reviews_test.rating.value_counts()

rating
1    13738
0     6394
Name: count, dtype: int64

In [16]:
df_reviews_sample.to_csv('./thomas_df_review_sample_data.csv', index=False)

In [22]:
df_reviews_test.to_csv('./thomas_df_review_sample_test_data.csv', index=False)

In [28]:
print(len(sorted_df_reviews_sample), " ", len(sorted_df_reviews_test))

100000   21107


In [None]:
# top k items to recommend
TOP_K = 10

# Model parameters
EPOCHS = 100
BATCH_SIZE = 256

SEED = DEFAULT_SEED  # Set None for non-deterministic results


sorted_df_reviews_sample = df_reviews_sample.sort_values(by='userID', ascending=True)
sorted_df_reviews_test = df_reviews_test.sort_values(by='userID', ascending=True)

# Create a test set containing the last interaction for each user as for the leave-one-out evaluation.
leave_one_out_test = df_reviews_test.groupby("userID").last().reset_index()

# Write datasets to csv files.
train_file = "./df_reviews_train.csv"
test_file = "./df_reviews_test.csv"
leave_one_out_test_file = "./df_review_leave_one_out_test.csv"
sorted_df_reviews_sample.to_csv(train_file, index=False)
sorted_df_reviews_test.to_csv(test_file, index=False)
leave_one_out_test.to_csv(leave_one_out_test_file, index=False)


data = NCFDataset(train_file=train_file, test_file=leave_one_out_test_file, seed=SEED, overwrite_test_file_full=True)

In [68]:
model = NCF(n_users=data.n_users,n_items=data.n_items,model_type="NeuMF",n_factors=4,layer_sizes=[16, 8, 4],\
            n_epochs=EPOCHS,batch_size=BATCH_SIZE,learning_rate=1e-3,verbose=10,seed=SEED)
# n_factors (int): Dimension of latent space.
# layer_sizes (list): Number of layers for MLP.

# training the model
model.fit(data)

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [7.00s]: train_loss = 0.172686 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [6.84s]: train_loss = 0.129213 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [7.05s]: train_loss = 0.109841 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [6.76s]: train_loss = 0.097227 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [7.01s]: train_loss = 0.088088 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [6.60s]: train_loss = 0.081226 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [6.76s]: train_loss = 0.077137 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [7.34s]: train_loss = 0.073583 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [6.95s]: train_loss = 0.071300 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [6.59s]: train_loss = 0.069781 


In [30]:
model.save("./ncf_model_with_thomas_data")

In [69]:
def GenPredictions(data) :
  predictions = []
  # Iterate through each row of the DataFrame
  for _, row in data.iterrows():
    user_id = row['userID']
    item_id = row['itemID']
    
    # Get the prediction score from the model
    prediction_score = model.predict(user_id, item_id)
    
    # Store the user_id, item_id, and prediction_score in the list
    predictions.append([user_id, item_id, prediction_score])
  return predictions


In [70]:
test_predictions = GenPredictions(df_reviews_test)
test_prediction_df = pd.DataFrame(test_predictions, columns=['userID', 'itemID', 'prediction'])
eval_rmse = rmse(df_reviews_test, test_prediction_df)
eval_rmse

np.float64(0.6604373677815742)

In [82]:
test_merged = pd.merge(df_reviews_test, test_prediction_df, on=["userID", "itemID"], how="outer")
test_merged[test_merged.rating == 1].describe()

Unnamed: 0,book_id,rating,userID,itemID,old_rating,prediction
count,13738.0,13738.0,13738.0,13738.0,13738.0,13738.0
mean,7196954.0,1.0,890.514413,7048.437691,4.482166,0.356524
std,8030693.0,0.0,572.937361,7876.274048,0.4997,0.393065
min,1.0,1.0,0.0,0.0,4.0,0.0
25%,127455.0,1.0,378.25,1198.0,4.0,2e-06
50%,5149118.0,1.0,840.0,3731.0,4.0,0.137008
75%,12680160.0,1.0,1365.0,10292.75,5.0,0.787103
max,36254740.0,1.0,1998.0,32647.0,5.0,0.999951


In [83]:
test_merged[test_merged.rating == 0].describe()

Unnamed: 0,book_id,rating,userID,itemID,old_rating,prediction
count,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0
mean,7395117.0,0.0,884.22396,7980.031905,2.338755,0.2092096
std,8238454.0,0.0,559.77966,8375.31808,1.057273,0.3286545
min,1.0,0.0,0.0,1.0,0.0,0.0
25%,140082.0,0.0,399.0,1364.0,2.0,4.28418e-10
50%,6006518.0,0.0,839.0,4619.0,3.0,0.001620182
75%,12739960.0,0.0,1322.75,12234.75,3.0,0.348946
max,36369510.0,0.0,1999.0,32618.0,3.0,0.9999658


In [72]:
train_predictions = GenPredictions(df_reviews_sample)
train_prediction_df = pd.DataFrame(train_predictions, columns=['userID', 'itemID', 'prediction'])
train_merged = pd.merge(df_reviews_sample, train_prediction_df, on=["userID", "itemID"], how="outer")
train_merged.head()

Unnamed: 0,user_id,book_id,rating,userID,itemID,old_rating,prediction
0,1765bcc8fac303e5e41f7e353b06eaa3,31341,1,0,0,4,0.848869
1,1765bcc8fac303e5e41f7e353b06eaa3,139417,0,0,114,0,0.229466
2,1765bcc8fac303e5e41f7e353b06eaa3,110694,1,0,126,4,0.882311
3,1765bcc8fac303e5e41f7e353b06eaa3,35231,1,0,161,4,0.974876
4,1765bcc8fac303e5e41f7e353b06eaa3,30246,0,0,165,3,0.0507


In [73]:
train_rmse = rmse(df_reviews_sample, train_prediction_df)
train_rmse

np.float64(0.18784760816599078)

In [36]:
test_merged.to_csv('./thomas_data_test_with_prediction.csv', index=False)
train_merged.to_csv('./thomas_data_train_with_prediction.csv', index=False)

In [60]:
train.userID.describe()

count    74980.000000
mean       864.778848
std        561.536369
min          0.000000
25%        362.000000
50%        818.000000
75%       1320.000000
max       1999.000000
Name: userID, dtype: float64

In [61]:
train.itemID.describe()

count    74980.000000
mean      8088.948200
std       8579.653328
min          0.000000
25%       1423.000000
50%       4532.000000
75%      12536.250000
max      32673.000000
Name: itemID, dtype: float64