<a href="https://colab.research.google.com/github/vincm1/RecSys_Implicit/blob/master/Neural_Collaborative_Filtering_(NCF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install LibRecommender



# Neural Collaborative Filtering (NCF)

This notebook implements an NCF-based recommender system (RecSys) for implicit transaction data of IT hardware and software purchases. The data covers orders from November 2022 to June 2023.

In [2]:
import os
import datetime
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Data Loading

In [3]:
# Read the order export straight from the zipped CSV. The context managers close
# both the archive and the member stream as soon as the frame is loaded, so no
# file handle stays open for the lifetime of the kernel.
with zipfile.ZipFile('/content/drive/MyDrive/Colab Notebooks/RecSys/Orders_Nov22_Jun23.zip') as df_zip:
    with df_zip.open('Bericht 1.csv') as report_csv:
        # The export is semicolon-delimited.
        df = pd.read_csv(report_csv, delimiter=";")

In [4]:
# Preview the first rows of the raw export.
df.head()

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
0,1547606,01.11.22,,2022FM11,15515778,NET-S M. CHMIELEWSKI,Export Channel (DE),,9433B9X,INK CARTRIDGE SPS,BLACK 370ML 600 DPI INKJET BULK,1037,Consumables,Ink,Supplies,-1533,-1.0
1,1547615,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CB31510,LENOVO KEYBOARD PACK,FOR TAB P11-DE,641,Input Devices,Keyboards & Keypads,Printers & Peripherals,-10461,-1.0
2,1547616,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CE63791,TP L13 YOGA G3 R7P 5875U 16GB,512GB SSD 13.3 WUXGA W10PDG,11,Computer Systems,Portable/Notebook Computers,System,"-1.232,13",-1.0
3,1547617,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CC36816,THINKBOOK 13S G3 R5 5600U 16GB,512GB SSD 13.3 WUXGA W11P,11,Computer Systems,Portable/Notebook Computers,System,-82473,-1.0
4,1547688,01.11.22,,2022FM11,15865338,DISTRELEC SCHWEIZ AG,Export Channel (DE),,J151410,USB2.0 A TO B CABLE 5M BLACK,M/M 100PCT COPPER CONDUCTOR .,1206,Cables,Usb Cables & Adapters,Printers & Peripherals,-660,-3.0


In [5]:
# Rows, distinct customers and distinct SKUs in the raw export.
print(
    len(df),
    df["BranchCustomerNbr"].nunique(dropna=False),
    df["Sku"].nunique(dropna=False),
)

2220299 17697 77401


# Data Preprocessing

In [6]:
# Normalise both identifier columns to strings.
df["BranchCustomerNbr"] = df["BranchCustomerNbr"].astype(str)
df["Sku"] = df["Sku"].astype(str)
# Parse the order date, stored as day.month.two-digit-year (e.g. "01.11.22").
df['Entry Date'] = pd.to_datetime(df['Entry Date'], format='%d.%m.%y')
# Drop returns (negative "Qty Shipped") and orders with zero shipped quantity.
df = df[df["Qty Shipped"] > 0]
# Drop backlog invoices: keep only rows entered ON OR AFTER the cut-off date.
specific_date = pd.to_datetime('2022-11-01')
# .copy() detaches the filtered result from its parent frame, so the column
# assignment below writes to an independent DataFrame instead of a slice
# (which would otherwise trigger pandas' SettingWithCopyWarning).
df = df[df["Entry Date"] >= specific_date].copy()
# Flag every remaining order line as one purchase event.
df["purchase"] = 1

In [7]:
# Rows, distinct customers and distinct SKUs after removing returns and backlog rows.
print(
    len(df),
    df["BranchCustomerNbr"].nunique(dropna=False),
    df["Sku"].nunique(dropna=False),
)

2028956 13894 75643


In [8]:
def sku_count(df_pl):
  """Return a copy of ``df_pl`` with a per-customer distinct-SKU count.

  Parameters
  ----------
  df_pl : pandas.DataFrame
      Order lines; must contain the columns ``BranchCustomerNbr`` and ``Sku``.

  Returns
  -------
  pandas.DataFrame
      A new frame with the same rows plus a ``sku_count`` column holding, for
      every row, the number of distinct ``Sku`` values of that row's
      ``BranchCustomerNbr``. The input frame is left unmodified, so the
      function is safe to call on a filtered slice and safe to re-run.
  """
  # transform('nunique') broadcasts the group-level count back onto every row.
  distinct_skus = df_pl.groupby('BranchCustomerNbr')['Sku'].transform('nunique')
  # assign() builds a new frame instead of writing into the caller's object.
  return df_pl.assign(sku_count=distinct_skus)

In [9]:
# Attach the per-customer distinct-SKU count to the order lines.
df = sku_count(df)

In [10]:
# Drop customers that purchased only a single distinct SKU.
# .copy() makes the filtered frame independent of its parent, so the id columns
# added in the following cells are written to a real DataFrame, not to a slice.
df = df[df["sku_count"] > 1].copy()

In [11]:
# Rows, distinct customers and distinct SKUs after removing single-SKU customers.
print(
    len(df),
    df["BranchCustomerNbr"].nunique(dropna=False),
    df["Sku"].nunique(dropna=False),
)

2025544 11328 75495


In [12]:
# Encode customers and SKUs as pandas categoricals; the integer category codes
# become the numeric user id (bcn_id) and item id (sku_id).
for source_col, id_col in (("BranchCustomerNbr", "bcn_id"), ("Sku", "sku_id")):
    df[source_col] = df[source_col].astype("category")
    df[id_col] = df[source_col].cat.codes

In [13]:
# Distinct numeric customer and SKU ids that remain after filtering.
users = df["bcn_id"].unique()
skus = df["sku_id"].unique()
print(users.size, skus.size)

11328 75495


In [14]:
# Collapse the order lines to one row per (customer, SKU) pair, summing the
# shipped quantity and the per-line purchase flag.
pair_keys = ['bcn_id', 'sku_id']
df_2 = df.groupby(pair_keys)[['Qty Shipped', 'purchase']].sum().reset_index()
# Constant binary indicator: every aggregated pair was purchased at least once.
df_2["purch_bin"] = 1

## Train test split

In [15]:
from libreco.data import split_by_num, random_split
from libreco.data import DatasetFeat

In [16]:
# Rename to the user / item / label column names used by the split and dataset
# cells below; the label is the summed purchase flag per (customer, SKU) pair.
df_2 = df_2.rename(columns={"bcn_id":"user", "sku_id":"item", "purchase":"label"})

In [17]:
# Random 90% / 5% / 5% split of the (user, item, label) interactions.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the notebook; renaming it would require touching every later cell that uses it.
train, test, eval = random_split(df_2[["user", "item", "label"]], multi_ratios=[0.9,0.05,0.05])

In [18]:
# Build the LibRecommender datasets from the three splits. build_trainset also
# returns `data_info`, which is handed to the model constructor later on.
# Note that train / eval / test are rebound here to the built dataset objects.
train, data_info = DatasetFeat.build_trainset(train)
eval = DatasetFeat.build_evalset(eval)
test = DatasetFeat.build_testset(test)

In [19]:
# Summary of the training set (user count, item count, data density).
data_info

n_users: 11302, n_items: 71957, data density: 0.0600 %

# Model training with LibRecommender

In [20]:
from libreco.algorithms import NCF
from libreco.evaluation import evaluate
from libreco.data import random_split, DatasetPure

Instructions for updating:
non-resource variables are not supported in the long term


In [21]:
# Metrics reported during training and in the final evaluation: three pointwise
# scores followed by four top-k ranking scores.
pointwise_metrics = ["loss", "balanced_accuracy", "roc_auc"]
ranking_metrics = ["precision", "recall", "map", "ndcg"]
metrics = pointwise_metrics + ranking_metrics

In [23]:
# Configure the NCF (Neural Collaborative Filtering) model for the "ranking" task,
# using the `data_info` produced when the training set was built.
# NOTE(review): the hyper-parameters below are hard-coded; confirm their exact
# meaning against the NCF API reference of the installed LibRecommender version.
model = NCF(
    "ranking",
    data_info=data_info,
    loss_type="cross_entropy",
    embed_size=16,
    n_epochs=10,
    lr=0.001,
    batch_size=256,
    sampler="random",
    num_neg=10,
    dropout_rate=0.5,
    hidden_units=(128, 64, 32)
)

In [24]:
# Train the model. Passing eval_data / metrics makes fit() report the listed
# metrics on the evaluation split after every epoch (see the training log below).
model.fit(train,
          neg_sampling=True,
          shuffle=True,
          verbose=2,
          eval_data=eval,
          metrics=metrics)

Instructions for updating:
Colocations handled automatically by placer.


Training start time: [35m2023-08-15 14:20:21[0m


train: 100%|██████████| 21203/21203 [01:39<00:00, 214.17it/s]


Epoch 1 elapsed: 99.006s
	 [32mtrain_loss: 0.2123[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 212.08it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:25<00:00, 188.41it/s]


	 eval log_loss: 0.1788
	 eval balanced_accuracy: 0.6766
	 eval roc_auc: 0.9078
	 eval precision@10: 0.0315
	 eval recall@10: 0.0361
	 eval map@10: 0.0590
	 eval ndcg@10: 0.0776


train: 100%|██████████| 21203/21203 [01:39<00:00, 212.26it/s]


Epoch 2 elapsed: 99.897s
	 [32mtrain_loss: 0.183[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 391.60it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:23<00:00, 199.97it/s]


	 eval log_loss: 0.1625
	 eval balanced_accuracy: 0.7162
	 eval roc_auc: 0.9282
	 eval precision@10: 0.0352
	 eval recall@10: 0.0423
	 eval map@10: 0.0669
	 eval ndcg@10: 0.0891


train: 100%|██████████| 21203/21203 [01:39<00:00, 212.27it/s]


Epoch 3 elapsed: 99.891s
	 [32mtrain_loss: 0.1668[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 380.39it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:24<00:00, 194.73it/s]


	 eval log_loss: 0.1546
	 eval balanced_accuracy: 0.7414
	 eval roc_auc: 0.9358
	 eval precision@10: 0.0351
	 eval recall@10: 0.0481
	 eval map@10: 0.0722
	 eval ndcg@10: 0.0965


train: 100%|██████████| 21203/21203 [01:37<00:00, 216.58it/s]


Epoch 4 elapsed: 97.908s
	 [32mtrain_loss: 0.1563[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 432.30it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:24<00:00, 190.78it/s]


	 eval log_loss: 0.1508
	 eval balanced_accuracy: 0.7464
	 eval roc_auc: 0.9401
	 eval precision@10: 0.0376
	 eval recall@10: 0.0483
	 eval map@10: 0.0719
	 eval ndcg@10: 0.0968


train: 100%|██████████| 21203/21203 [01:39<00:00, 213.26it/s]


Epoch 5 elapsed: 99.433s
	 [32mtrain_loss: 0.1481[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 429.29it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:23<00:00, 200.18it/s]


	 eval log_loss: 0.1485
	 eval balanced_accuracy: 0.7605
	 eval roc_auc: 0.9421
	 eval precision@10: 0.0368
	 eval recall@10: 0.0533
	 eval map@10: 0.0704
	 eval ndcg@10: 0.0970


train: 100%|██████████| 21203/21203 [01:39<00:00, 213.15it/s]


Epoch 6 elapsed: 99.477s
	 [32mtrain_loss: 0.1411[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 445.26it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:24<00:00, 196.58it/s]


	 eval log_loss: 0.1483
	 eval balanced_accuracy: 0.7620
	 eval roc_auc: 0.9420
	 eval precision@10: 0.0399
	 eval recall@10: 0.0584
	 eval map@10: 0.0773
	 eval ndcg@10: 0.1043


train: 100%|██████████| 21203/21203 [01:38<00:00, 214.61it/s]


Epoch 7 elapsed: 98.804s
	 [32mtrain_loss: 0.1349[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 417.61it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:24<00:00, 195.25it/s]


	 eval log_loss: 0.1474
	 eval balanced_accuracy: 0.7666
	 eval roc_auc: 0.9439
	 eval precision@10: 0.0396
	 eval recall@10: 0.0548
	 eval map@10: 0.0776
	 eval ndcg@10: 0.1041


train: 100%|██████████| 21203/21203 [01:39<00:00, 212.97it/s]


Epoch 8 elapsed: 99.564s
	 [32mtrain_loss: 0.13[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 444.33it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:23<00:00, 202.32it/s]


	 eval log_loss: 0.1490
	 eval balanced_accuracy: 0.7645
	 eval roc_auc: 0.9428
	 eval precision@10: 0.0416
	 eval recall@10: 0.0614
	 eval map@10: 0.0801
	 eval ndcg@10: 0.1085


train: 100%|██████████| 21203/21203 [01:39<00:00, 212.88it/s]


Epoch 9 elapsed: 99.605s
	 [32mtrain_loss: 0.125[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 435.70it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:24<00:00, 193.48it/s]


	 eval log_loss: 0.1495
	 eval balanced_accuracy: 0.7694
	 eval roc_auc: 0.9436
	 eval precision@10: 0.0431
	 eval recall@10: 0.0615
	 eval map@10: 0.0808
	 eval ndcg@10: 0.1092


train: 100%|██████████| 21203/21203 [01:37<00:00, 216.88it/s]


Epoch 10 elapsed: 97.770s
	 [32mtrain_loss: 0.121[0m


eval_pointwise: 100%|██████████| 34/34 [00:00<00:00, 372.35it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:24<00:00, 194.86it/s]


	 eval log_loss: 0.1497
	 eval balanced_accuracy: 0.7713
	 eval roc_auc: 0.9425
	 eval precision@10: 0.0404
	 eval recall@10: 0.0607
	 eval map@10: 0.0808
	 eval ndcg@10: 0.1082


In [25]:
# Score the trained model on the held-out test split (k=10 for the ranking metrics).
eval_result = evaluate(
    model=model,
    data=test,
    neg_sampling=True,
    eval_batch_size=2048,
    k=10,
    metrics=metrics,
)

eval_pointwise: 100%|██████████| 136/136 [00:00<00:00, 640.10it/s]
eval_listwise: 100%|██████████| 4697/4697 [00:23<00:00, 202.16it/s]


In [26]:
# Score the trained model on the evaluation split with the same settings as the
# test-split evaluation, so the two result dictionaries are directly comparable.
eval_result_2 = evaluate(
    model=model,
    data=eval,
    neg_sampling=True,
    eval_batch_size=2048,
    k=10,
    metrics=metrics,
)

eval_pointwise: 100%|██████████| 136/136 [00:00<00:00, 576.56it/s]
eval_listwise: 100%|██████████| 4730/4730 [00:24<00:00, 196.61it/s]


In [27]:
# Metrics on the test split.
eval_result

{'loss': 0.1494828754153262,
 'balanced_accuracy': 0.7703235771713731,
 'roc_auc': 0.9424415164068418,
 'precision': 0.04175005322546306,
 'recall': 0.0663707796663832,
 'map': 0.08059820313104193,
 'ndcg': 0.11071078857440515}

In [28]:
# Metrics on the evaluation split.
eval_result_2

{'loss': 0.14970442245751894,
 'balanced_accuracy': 0.7713274932614556,
 'roc_auc': 0.9425378327518891,
 'precision': 0.040359408033826635,
 'recall': 0.06070015149034597,
 'map': 0.0807687729376567,
 'ndcg': 0.10815454766923806}

## Recommendations

In [29]:
# Lookup table that maps each numeric sku_id to its SKU code and descriptions.
# Keep exactly one row per sku_id: de-duplicating on all columns would leave
# several rows for a SKU whose description text varies between order lines, and
# the left-merge of the recommendations on sku_id would then emit duplicate rows.
sku_list = (
    df[["sku_id", "Sku", "Product Descr1", "ProductGroupDescription", "ProductGroupMasterDescription"]]
    .drop_duplicates(subset="sku_id")
)

In [30]:
from libreco.recommendation import rank_recommendations
from google.colab import files

In [31]:
# Two example customers (BranchCustomerNbr values), randomly selected from two
# clusters; see the ALS repo for the clustering.
user_id_bcn = "44508633"
user_id_2_bcn = "44510285"

In [32]:
# Purchase history of the first example customer, for comparison with the
# recommendations generated below.
df.loc[df['BranchCustomerNbr'] == user_id_bcn]

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,...,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped,purchase,sku_count,bcn_id,sku_id
205345,4458274,2023-06-13,13.06.2023 16:23:58,2023FM06,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,F613010,128GB MSATA SSD,...,517,Hard Drives & Optical Drives,Solid State Drive (Ssd),Components,1632,1.0,1,8,5865,64636
283407,4413628,2023-06-01,01.06.2023 16:48:33,2023FM06,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,342C151,4GB 2666MHZ DDR4 NON-ECC,...,913,Memory and Processors,Generic Memory,Components,67500,50.0,1,8,5865,12016
931271,4414530,2023-03-13,13.03.2023 18:17:59,2023FM03,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,CF96638,WD PURPLE 4TB 256MB 3.5IN SATA,...,520,Hard Drives & Optical Drives,Hard Drive Sata,Components,66496,8.0,1,8,5865,53566
1148618,4409909,2023-02-15,16.02.2023 08:41:25,2023FM02,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,T380760,SYSTEM POWER 9 700W CM,...,34,Computer Systems,Power Supplies,System,67490,10.0,1,8,5865,68787
1542910,4418779,2023-01-04,04.01.2023 16:25:26,2023FM01,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,CC03824,8GB DDR4-2666MHZ CL15 SODIMM,...,913,Memory and Processors,Generic Memory,Components,3630,2.0,1,8,5865,33050
1542911,4418779,2023-01-18,04.01.2023 16:25:26,2023FM01,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,CG34643,TX-2202A 54.6CM 21.5IN,...,372,Display,Led Small Format,Printers & Peripherals,"6.159,90",10.0,1,8,5865,57133
1821121,4409513,2022-12-07,07.12.2022 16:38:08,2022FM12,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,CA75922,K243YBMIX 60.5CM (23.8IN) TFT,...,372,Display,Led Small Format,Printers & Peripherals,37564,4.0,1,8,5865,26983
2010661,4418760,2022-11-21,21.11.2022 16:17:27,2022FM12,44508633,GBM GESELLSCHAFT FUER,SMB Channel,MI,T380549,PURE WINGS 2 80MM,...,30,Computer Systems,Computer Cooling Systems & Fans,System,3696,6.0,1,8,5865,68737


In [33]:
# Translate the two BranchCustomerNbr values into their numeric bcn_id codes,
# taking the code from the first matching order line of each customer.
is_first_customer = df['BranchCustomerNbr'] == user_id_bcn
is_second_customer = df['BranchCustomerNbr'] == user_id_2_bcn
user_id = df.loc[is_first_customer, 'bcn_id'].iloc[0]
user_id_2 = df.loc[is_second_customer, 'bcn_id'].iloc[0]
print(user_id, user_id_2, sep="\n")

5865
5990


In [34]:
# Top-N item recommendations for the first example customer. The result is a
# dict keyed by the user id whose value is an array of recommended sku_ids.
num_recommendations = 10
recs = model.recommend_user(user=user_id, n_rec=num_recommendations)
recs
# TODO: unfinished experiment (previously kept here as commented-out code):
# score the recommended items with model.predict(user_id, <item ids>) and
# re-rank them with rank_recommendations.

{5865: array([ 2882, 49003, 26802, 65692, 12009, 47259, 68789, 12660, 68788,
        33046])}

In [35]:
# Turn the recommended sku_ids into a one-column table, enrich it with the SKU
# descriptions, then write it to CSV and download it from Colab.
rec_tab = (
    pd.DataFrame(data=[recs[user_id]])
    .T
    .rename(columns={0: "sku_id"})
    .merge(sku_list, on="sku_id", how="left")
)
csv_name = f'{user_id}_NCF_FREQ_REC.csv'
rec_tab.to_csv(csv_name)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
# Top-N item recommendations for the second example customer.
num_recommendations = 10
recs_2 = model.recommend_user(user=user_id_2, n_rec=num_recommendations)

In [38]:
# Same export as for the first customer: one-column table of recommended sku_ids,
# enriched with the SKU descriptions, written to CSV and downloaded from Colab.
rec_tab_2 = (
    pd.DataFrame(data=[recs_2[user_id_2]])
    .T
    .rename(columns={0: "sku_id"})
    .merge(sku_list, on="sku_id", how="left")
)
csv_name_2 = f'{user_id_2}_NCF_FREQ_REC.csv'
rec_tab_2.to_csv(csv_name_2)
files.download(csv_name_2)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>