<a href="https://colab.research.google.com/github/vincm1/RecSys_Implicit/blob/master/Neural_Collaborative_Filtering_(NCF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow-ranking
!pip install lightfm
!pip install LibRecommender



# Neural Collaborative Filtering (NCF)

This notebook tries to implement an NCF-based recommender system (RecSys) for implicit transaction data covering IT hardware and software purchases. It is based on 6 months of transaction data.

In [2]:
import os
import datetime
import zipfile

import numpy as np
import pandas as pd
import lightfm
import tensorflow as tf
from tensorflow import keras

from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Data Loading

In [3]:
# Read the order export straight out of the zip archive. The context managers make sure
# both the archive and the member stream are closed once the CSV has been parsed.
# NOTE(review): the saved preview shows `Sales` values such as "-1.232,13"; no decimal or
# thousands separator is configured here — confirm before using that column numerically.
with zipfile.ZipFile('/content/drive/MyDrive/RecSys/Orders_Nov22_Jun23.zip') as df_zip:
    with df_zip.open('Bericht 1.csv') as csv_member:
        df = pd.read_csv(csv_member, delimiter=";")

In [6]:
# Preview the first rows of the raw export.
# NOTE(review): the saved output of this cell contains customer names — consider clearing
# or redacting it before sharing the notebook.
df.head()

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
0,1547606,01.11.22,,2022FM11,15515778,NET-S M. CHMIELEWSKI,Export Channel (DE),,9433B9X,INK CARTRIDGE SPS,BLACK 370ML 600 DPI INKJET BULK,1037,Consumables,Ink,Supplies,-1533,-1.0
1,1547615,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CB31510,LENOVO KEYBOARD PACK,FOR TAB P11-DE,641,Input Devices,Keyboards & Keypads,Printers & Peripherals,-10461,-1.0
2,1547616,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CE63791,TP L13 YOGA G3 R7P 5875U 16GB,512GB SSD 13.3 WUXGA W10PDG,11,Computer Systems,Portable/Notebook Computers,System,"-1.232,13",-1.0
3,1547617,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CC36816,THINKBOOK 13S G3 R5 5600U 16GB,512GB SSD 13.3 WUXGA W11P,11,Computer Systems,Portable/Notebook Computers,System,-82473,-1.0
4,1547688,01.11.22,,2022FM11,15865338,DISTRELEC SCHWEIZ AG,Export Channel (DE),,J151410,USB2.0 A TO B CABLE 5M BLACK,M/M 100PCT COPPER CONDUCTOR .,1206,Cables,Usb Cables & Adapters,Printers & Peripherals,-660,-3.0


In [7]:
# Size of the raw data: number of rows, distinct customers and distinct SKUs.
n_rows = df.shape[0]
n_customers = df["BranchCustomerNbr"].nunique(dropna=False)
n_skus = df["Sku"].nunique(dropna=False)
print(n_rows, n_customers, n_skus)

2220299 17697 77401


# Data Preprocessing

In [8]:
# Earliest order date to keep; rows entered before it are treated as backlog invoices.
specific_date = pd.to_datetime('2022-11-01')

# Customer numbers and SKUs are identifiers, so store them as strings.
df["BranchCustomerNbr"] = df["BranchCustomerNbr"].astype(str)
df["Sku"] = df["Sku"].astype(str)
# Parse the order date, which is written as day.month.two-digit-year.
df['Entry Date'] = pd.to_datetime(df['Entry Date'], format='%d.%m.%y')

# Keep only rows with a positive shipped quantity (drops returns and zero-quantity
# orders) that were entered on or after the cut-off date.
is_shipped = df["Qty Shipped"] > 0
is_in_window = df["Entry Date"] >= specific_date
# .copy() makes the filtered result an independent frame, so the column added below
# is not written onto a slice of the unfiltered data.
df = df.loc[is_shipped & is_in_window].copy()

# Every remaining row counts as one observed purchase.
df["purchase"] = 1

In [9]:
# Size after cleaning: number of rows, distinct customers and distinct SKUs.
n_rows = df.shape[0]
n_customers = df["BranchCustomerNbr"].nunique(dropna=False)
n_skus = df["Sku"].nunique(dropna=False)
print(n_rows, n_customers, n_skus)

2028956 13894 75643


In [10]:
def sku_count(df_pl):
  """Return a copy of ``df_pl`` with a per-customer distinct-SKU count.

  Parameters
  ----------
  df_pl : pandas.DataFrame
      Must contain the columns ``BranchCustomerNbr`` and ``Sku``.

  Returns
  -------
  pandas.DataFrame
      A new frame with an added ``sku_count`` column that holds, on every row,
      the number of distinct ``Sku`` values of that row's ``BranchCustomerNbr``.
      The frame passed in is left unmodified.
  """
  # transform() broadcasts the per-customer result back onto every row of that customer.
  distinct_skus = df_pl.groupby('BranchCustomerNbr')['Sku'].transform('nunique')

  # assign() builds a new frame instead of writing into the caller's object.
  return df_pl.assign(sku_count=distinct_skus)

In [11]:
# Attach the per-customer distinct-SKU count as the `sku_count` column.
df = sku_count(df)

In [12]:
# Keep only customers who purchased more than one distinct SKU. The .copy() detaches the
# result from the unfiltered frame, because later cells add columns to it.
df = df[df["sku_count"] > 1].copy()

In [13]:
# Size after dropping single-SKU customers: rows, distinct customers, distinct SKUs.
n_rows = df.shape[0]
n_customers = df["BranchCustomerNbr"].nunique(dropna=False)
n_skus = df["Sku"].nunique(dropna=False)
print(n_rows, n_customers, n_skus)

2025544 11328 75495


In [14]:
# Map customer numbers and SKUs to integer codes (columns bcn_id / sku_id).
id_columns = (("BranchCustomerNbr", "bcn_id"), ("Sku", "sku_id"))
for raw_col, code_col in id_columns:
    # Casting to category gives every distinct value an integer code.
    df[raw_col] = df[raw_col].astype("category")
    df[code_col] = df[raw_col].cat.codes

In [15]:
# Distinct customer and SKU codes present in the filtered data.
users = df["bcn_id"].unique()
skus = df["sku_id"].unique()
print(users.size, skus.size)

11328 75495


In [16]:
# Collapse the order lines to one row per (customer, SKU) pair, summing the shipped
# quantity and the number of purchase rows.
pair_keys = ['bcn_id', 'sku_id']
df_2 = df.groupby(pair_keys, as_index=False)[['Qty Shipped', 'purchase']].sum()
# Constant flag set on every aggregated pair.
# NOTE(review): `purch_bin` is not referenced by any later cell visible in this notebook.
df_2["purch_bin"] = 1

## Train test split

In [17]:
from libreco.data import split_by_num, random_split
from libreco.data import DatasetFeat

Instructions for updating:
non-resource variables are not supported in the long term


In [18]:
# Rename to the user / item / label column names used by the cells below.
# Note that `label` carries the summed purchase count per pair, not a 0/1 flag.
column_map = {"bcn_id": "user", "sku_id": "item", "purchase": "label"}
df_2 = df_2.rename(columns=column_map)

In [19]:
# Split the interactions 80 / 10 / 10; the three parts are bound to train, test and eval
# in that order.
# NOTE(review): the name `eval` shadows Python's built-in eval() from here on.
interaction_cols = ["user", "item", "label"]
split_ratios = [0.8, 0.1, 0.1]
train, test, eval = random_split(df_2[interaction_cols], multi_ratios=split_ratios)

In [20]:
# Inspect the test split (columns: user, item, label).
test

Unnamed: 0,user,item,label
286347,3953,50569,12
533376,10514,35614,2
27969,333,40106,1
226603,3250,71910,1
295022,4120,29777,3
...,...,...,...
443640,6615,8781,1
114175,1765,40455,12
123141,1776,67031,4
10819,108,21407,1


In [21]:
# Convert the three splits into libreco dataset objects. The training set is built first
# and also yields `data_info`, which is passed to the model further down.
# This cell rebinds train / eval / test to the built objects, so re-running it requires
# re-running the split cell first.
# NOTE(review): no feature columns are passed to DatasetFeat, and DatasetPure is imported
# in a later cell but never used — confirm which dataset class is intended for NCF.
# NOTE(review): `eval` shadows Python's built-in eval().
train, data_info = DatasetFeat.build_trainset(train)
eval = DatasetFeat.build_evalset(eval)
test = DatasetFeat.build_testset(test)

In [22]:
# Summary of the training set: number of users, number of items and data density.
data_info

n_users: 11226, n_items: 68327, data density: 0.0565 %

# NCF model with LibRecommender

In [23]:
from libreco.algorithms import NCF
from libreco.evaluation import evaluate
from libreco.data import random_split, DatasetPure

In [24]:
# Metrics reported during training and evaluation. In the saved logs the first three are
# printed as plain values and the last four as @10 ranking metrics.
metrics = [
    "loss", "balanced_accuracy", "roc_auc",
    "precision", "recall", "map", "ndcg",
]

In [25]:
# Hyper-parameters of the NCF ranking model, collected in one place so they are easy to tune.
ncf_params = dict(
    loss_type="cross_entropy",
    embed_size=64,
    n_epochs=10,
    lr=0.01,
    batch_size=256,
    sampler="random",
    num_neg=10,
    dropout_rate=0.5,
    hidden_units=(128, 64, 32, 16),
)
model = NCF("ranking", data_info=data_info, **ncf_params)

In [26]:
# Train the model with negative sampling and report the metrics on the validation split
# after every epoch.
# NOTE(review): in the saved log, eval log_loss (0.1473) and roc_auc (0.9408) are best at
# epoch 2 and get worse afterwards while train_loss keeps falling; the @10 ranking metrics
# improve only slightly up to epoch 9. Consider fewer epochs or stronger regularisation.
model.fit(train,
          neg_sampling=True,
          shuffle=True,
          verbose=2,
          eval_data=eval,
          metrics=metrics)

Instructions for updating:
Colocations handled automatically by placer.


Training start time: [35m2023-08-10 16:02:21[0m


train: 100%|██████████| 18847/18847 [01:54<00:00, 165.09it/s]


Epoch 1 elapsed: 114.167s
	 [32mtrain_loss: 0.2056[0m
random neg item sampling elapsed: 0.263s


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 304.53it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:33<00:00, 189.12it/s]


	 eval log_loss: 0.1566
	 eval balanced_accuracy: 0.7455
	 eval roc_auc: 0.9297
	 eval precision@10: 0.0366
	 eval recall@10: 0.0416
	 eval map@10: 0.0740
	 eval ndcg@10: 0.0990


train: 100%|██████████| 18847/18847 [01:43<00:00, 182.48it/s]


Epoch 2 elapsed: 103.289s
	 [32mtrain_loss: 0.1603[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 373.72it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:33<00:00, 187.93it/s]


	 eval log_loss: 0.1473
	 eval balanced_accuracy: 0.7572
	 eval roc_auc: 0.9408
	 eval precision@10: 0.0426
	 eval recall@10: 0.0586
	 eval map@10: 0.0888
	 eval ndcg@10: 0.1183


train: 100%|██████████| 18847/18847 [01:42<00:00, 184.04it/s]


Epoch 3 elapsed: 102.413s
	 [32mtrain_loss: 0.1321[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 359.59it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:34<00:00, 187.38it/s]


	 eval log_loss: 0.1488
	 eval balanced_accuracy: 0.7614
	 eval roc_auc: 0.9374
	 eval precision@10: 0.0407
	 eval recall@10: 0.0677
	 eval map@10: 0.0860
	 eval ndcg@10: 0.1172


train: 100%|██████████| 18847/18847 [01:41<00:00, 185.48it/s]


Epoch 4 elapsed: 101.617s
	 [32mtrain_loss: 0.1092[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 445.87it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:33<00:00, 190.26it/s]


	 eval log_loss: 0.1586
	 eval balanced_accuracy: 0.7589
	 eval roc_auc: 0.9289
	 eval precision@10: 0.0419
	 eval recall@10: 0.0682
	 eval map@10: 0.0894
	 eval ndcg@10: 0.1204


train: 100%|██████████| 18847/18847 [01:41<00:00, 186.39it/s]


Epoch 5 elapsed: 101.122s
	 [32mtrain_loss: 0.0919[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 417.57it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:33<00:00, 189.52it/s]


	 eval log_loss: 0.1741
	 eval balanced_accuracy: 0.7523
	 eval roc_auc: 0.9257
	 eval precision@10: 0.0419
	 eval recall@10: 0.0730
	 eval map@10: 0.0887
	 eval ndcg@10: 0.1211


train: 100%|██████████| 18847/18847 [01:41<00:00, 185.80it/s]


Epoch 6 elapsed: 101.442s
	 [32mtrain_loss: 0.0807[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 397.78it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:33<00:00, 187.78it/s]


	 eval log_loss: 0.1836
	 eval balanced_accuracy: 0.7446
	 eval roc_auc: 0.9231
	 eval precision@10: 0.0439
	 eval recall@10: 0.0769
	 eval map@10: 0.0926
	 eval ndcg@10: 0.1249


train: 100%|██████████| 18847/18847 [01:41<00:00, 185.77it/s]


Epoch 7 elapsed: 101.459s
	 [32mtrain_loss: 0.0733[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 387.47it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:34<00:00, 185.67it/s]


	 eval log_loss: 0.2090
	 eval balanced_accuracy: 0.7269
	 eval roc_auc: 0.9141
	 eval precision@10: 0.0418
	 eval recall@10: 0.0788
	 eval map@10: 0.0917
	 eval ndcg@10: 0.1238


train: 100%|██████████| 18847/18847 [01:43<00:00, 182.85it/s]


Epoch 8 elapsed: 103.084s
	 [32mtrain_loss: 0.0689[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 456.57it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:34<00:00, 184.20it/s]


	 eval log_loss: 0.2066
	 eval balanced_accuracy: 0.7333
	 eval roc_auc: 0.9122
	 eval precision@10: 0.0443
	 eval recall@10: 0.0816
	 eval map@10: 0.0948
	 eval ndcg@10: 0.1285


train: 100%|██████████| 18847/18847 [01:43<00:00, 182.28it/s]


Epoch 9 elapsed: 103.403s
	 [32mtrain_loss: 0.0652[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 433.09it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:34<00:00, 185.56it/s]


	 eval log_loss: 0.2224
	 eval balanced_accuracy: 0.7223
	 eval roc_auc: 0.9036
	 eval precision@10: 0.0423
	 eval recall@10: 0.0828
	 eval map@10: 0.0955
	 eval ndcg@10: 0.1288


train: 100%|██████████| 18847/18847 [01:43<00:00, 182.90it/s]


Epoch 10 elapsed: 103.053s
	 [32mtrain_loss: 0.0627[0m


eval_pointwise: 100%|██████████| 68/68 [00:00<00:00, 405.86it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:33<00:00, 187.67it/s]


	 eval log_loss: 0.2262
	 eval balanced_accuracy: 0.7270
	 eval roc_auc: 0.9033
	 eval precision@10: 0.0419
	 eval recall@10: 0.0834
	 eval map@10: 0.0916
	 eval ndcg@10: 0.1256


In [27]:
# Evaluate the trained model on the held-out test split (top-10 ranking metrics).
# NOTE(review): the saved `eval_result` output is identical, digit for digit, to
# `eval_result_2` and to the epoch-10 validation metrics, and the execution counts are out
# of order. Verify on a fresh kernel that the displayed numbers really come from the test split.
eval_result = evaluate(model=model,
        data=test,
        neg_sampling=True,
        eval_batch_size=2048,
        k=10,
        metrics=metrics)

random neg item sampling elapsed: 0.274s


eval_pointwise: 100%|██████████| 270/270 [00:00<00:00, 530.33it/s]
eval_listwise: 100%|██████████| 6365/6365 [00:36<00:00, 174.69it/s]


In [31]:
# Same evaluation settings, applied to the validation split.
evaluation_settings = dict(
    neg_sampling=True,
    eval_batch_size=2048,
    k=10,
    metrics=metrics,
)
eval_result_2 = evaluate(model=model, data=eval, **evaluation_settings)

eval_pointwise: 100%|██████████| 270/270 [00:00<00:00, 649.09it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:34<00:00, 187.37it/s]


In [29]:
# Metrics stored by the evaluation on the test split.
# NOTE(review): the saved output equals the validation-split metrics exactly — possibly
# stale state from out-of-order execution; re-run to confirm.
eval_result

{'loss': 0.22616448168659567,
 'balanced_accuracy': 0.7269985262487055,
 'roc_auc': 0.9032840754404702,
 'precision': 0.04186265286923801,
 'recall': 0.08337867745768121,
 'map': 0.09164034518334102,
 'ndcg': 0.12555834430053095}

In [32]:
# Metrics stored by the evaluation on the validation split.
eval_result_2

{'loss': 0.22616448168659567,
 'balanced_accuracy': 0.7269985262487055,
 'roc_auc': 0.9032840754404702,
 'precision': 0.04186265286923801,
 'recall': 0.08337867745768121,
 'map': 0.09164034518334102,
 'ndcg': 0.12555834430053095}

## Recommendations

In [103]:
# Lookup table from sku_id to its descriptive columns. Deduplicating on `sku_id` alone
# guarantees exactly one row per SKU, so the left merges below cannot multiply a
# recommendation when the same SKU occurs with differing description fields.
# The first occurrence of each SKU is kept.
sku_columns = ["sku_id", "Sku", "Product Descr1", "ProductGroupDescription", "ProductGroupMasterDescription"]
sku_list = df[sku_columns].drop_duplicates(subset="sku_id")

In [105]:
from libreco.recommendation import rank_recommendations
from google.colab import files

In [93]:
# Top-N recommendations for one user; the saved output is a dict keyed by the user id
# that holds an array of item ids.
# NOTE(review): presumably `user` is the integer code from the `user` column the model
# was trained on — confirm.
user_id = 3472
num_recommendations = 10
recs = model.recommend_user(user=user_id, n_rec=num_recommendations)
recs

{3472: array([35160, 75202, 75197, 44147, 44154, 68123, 21853, 34933, 45356,
        40939])}

In [106]:
# One row per recommended item, enriched with the SKU description columns.
rec_tab = pd.DataFrame({"sku_id": recs[user_id]})
rec_tab = rec_tab.merge(sku_list, on="sku_id", how="left")
# Save the table as CSV and trigger a browser download from Colab.
csv_name = f'{user_id}_NCF_FREQ_REC.csv'
rec_tab.to_csv(csv_name)
files.download(csv_name)
rec_tab

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,sku_id,Sku,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
0,35160,CC91182,OFFICE HOME AND BUSINESS 2021,Desktop Operating System Software,Softwares
1,75202,Z691861,BACKUP ESSENT ENT 2S BNDL,Back-Up Software,Softwares
2,75197,Z691844,BACKUP ESSENT STD 2S BNDL,Back-Up Software,Softwares
3,44147,CE63778,TP E15 G4 I5-1235U 16GB,Portable/Notebook Computers,Computer Systems
4,44154,CE63783,TP E15 G4 I7-1255U 16GB,Portable/Notebook Computers,Computer Systems
5,68123,S608RP6,LENOVO 65W STAND. AC ADAPTER,Notebook Battery & Ac Adapter,Computer Systems
6,21853,803C8Z8,WIN SRV STD CORE,Server Operating System Software,Softwares
7,34933,CC88035,OFFICE HOME AND BUSINESS 2021,Suites & Integrated Software Packages,Softwares
8,45356,CE79455,TP THUNDERBOLT 4 DOCK G2,"Port Replicator, Docking Station",Accessories
9,40939,CE19919,TC M70Q TINY G2 I5-11400T 16GB,Desktop/Tower Computers,Computer Systems


In [109]:
# Top-N recommendations for a second user.
user_id_2, num_recommendations = 1532, 10
recs_2 = model.recommend_user(n_rec=num_recommendations, user=user_id_2)
recs_2

{1532: array([38317, 50533, 52803, 22310, 27878, 66761, 29763,  4386, 26136,
        30882])}

In [111]:
# One row per recommended item for the second user, enriched with the SKU description columns.
rec_tab_2 = pd.DataFrame({"sku_id": recs_2[user_id_2]})
rec_tab_2 = rec_tab_2.merge(sku_list, on="sku_id", how="left")
# Save the table as CSV and trigger a browser download from Colab.
csv_name_2 = f'{user_id_2}_NCF_FREQ_REC.csv'
rec_tab_2.to_csv(csv_name_2)
files.download(csv_name_2)
rec_tab_2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,sku_id,Sku,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
0,38317,CD58770,FRITZ BOX 7510,Wireless Lan Broadband Routers/Gateways,Communications & Networking
1,50533,CF63417,PH317-55-70S1 I7-11800H 17.3IN,Portable/Notebook Computers,Computer Systems
2,52803,CF90560,SANDISK ULTRA 3D SATA 2.5IN SSD,Solid State Drive (Ssd),Hard Drives & Optical Drives
3,22310,865FP89,ED270RPBIIPX 69CM 27IN VA,Lcd Monitor,Display
4,27878,CA89018,SANDISK PORTABLE SSD 2TB,Mobile Drive,Hard Drives & Optical Drives
5,66761,M222242,ULTRA LUXE 128GB USB 3.1,Usb Storage Media,Memory and Processors
6,29763,CB38194,COMBO TOUCH IPAD PRO 11IN 1-3G,Keyboards & Keypads,Input Devices
7,4386,222A769,UNIVERSAL FOLIO/INTEGRATED KEYB,Keyboards & Keypads,Input Devices
8,26136,CA61749,SANDISK EXTREME PORTABLE SSD,Solid State Drive (Ssd),Hard Drives & Optical Drives
9,30882,CB74725,IP 5 15ALC05 R7-5700U 8GB,Portable/Notebook Computers,Computer Systems
