<a href="https://colab.research.google.com/github/vincm1/RecSys_Implicit/blob/master/Bayesian_Personalized_Ranking_(BPR).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bayesian Personalized Ranking

This notebook will use the pairwise-ranking algorithm BPR to recommend and rank Top10 items based on the paper of Rendle et al. 2009 (https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf)

Therefore two different RecSys libraries will be used:



*   Implicit BPR(https://benfred.github.io/implicit/ by Ben Frederickson)

In [101]:
import warnings
import zipfile
import time
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import scipy.sparse as sparse

from datetime import datetime, timedelta
warnings.filterwarnings("ignore")

## Data Preprocessing

In [140]:
df_zip = zipfile.ZipFile('/content/drive/MyDrive/RecSys/Orders_Nov22_Jun23.zip')
df = pd.read_csv(df_zip.open('Bericht 1.csv'), delimiter=";")

In [141]:
df.head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
0,1547606,01.11.22,,2022FM11,15515778,NET-S M. CHMIELEWSKI,Export Channel (DE),,9433B9X,INK CARTRIDGE SPS,BLACK 370ML 600 DPI INKJET BULK,1037,Consumables,Ink,Supplies,-1533,-1.0
1,1547615,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CB31510,LENOVO KEYBOARD PACK,FOR TAB P11-DE,641,Input Devices,Keyboards & Keypads,Printers & Peripherals,-10461,-1.0


In [142]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2220299 17697 77401


In [143]:
# converting the customerid to string
df["BranchCustomerNbr"] = df["BranchCustomerNbr"].astype(str)
# converting the skuid to string
df["Sku"] = df["Sku"].astype(str)
# Entry Date to date
df['Entry Date'] = pd.to_datetime(df['Entry Date'], format='%d.%m.%y')
# dropping retours (orders with negative Qty shipped) and zero Qty shipped orders
df = df[df["Qty Shipped"] > 0]
# dropping backlog invoices, Specified date to filter the rows
specific_date = pd.to_datetime('2022-11-01')
# Filter the DataFrame to keep only the rows that are before or equal to the specific date
df = df[df["Entry Date"] >= specific_date]
#insert purchase indication column
df["purchase"] = 1

In [144]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2028956 13894 75643


In [145]:
# add column that represent sku count
def sku_count(df_pl):

  df_pl['sku_count'] = df_pl.groupby('BranchCustomerNbr')['Sku'].transform('nunique')

  return df_pl

In [146]:
df = sku_count(df)

In [147]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2028956 13894 75643


In [148]:
# drop users
df = df[df.sku_count > 1]

In [149]:
# Create a numeric user_id and artist_id column
df['BranchCustomerNbr'] = df['BranchCustomerNbr'].astype("category")
df['Sku'] = df['Sku'].astype("category")
df['bcn_id'] = df['BranchCustomerNbr'].cat.codes
df['sku_id'] = df['Sku'].cat.codes

In [150]:
df.head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,...,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped,purchase,sku_count,bcn_id,sku_id
213,1545306,2022-11-02,,2022FM11,15885514,AXIS SOLUTION (PRIVATE) LIMITED,Export Channel (DE),,CF55877,DT PRINT ZQ320 KIT LABEL SENSOR,...,5805,AIDC/PoS Printers,Mobile Receipt Printer,"Other (incl. AIDC/POS, V7)","10.713,30",41.0,1,20,417,49730
458,4422886,2022-11-03,,2023FM02,44413224,BWG INFORMATIONSYSTEME GMBH,Business Channel,,CF89211,Z-SELECT 2000D REMOVABLE NS,...,5812,AIDC/PoS Printers,Label Printers Supplies,"Other (incl. AIDC/POS, V7)","1.393,00",140.0,1,111,803,52649


In [151]:
users = df.bcn_id.unique()
skus = df.sku_id.unique()
print(len(users), len(skus))

11328 75495


In [152]:
df_2 = df[["bcn_id", "sku_id", "Entry Date", "Qty Shipped", "purchase"]]

In [153]:
grouped_df = df_2.groupby(["bcn_id", "sku_id"]).agg({
          "Entry Date":"max",
          "Qty Shipped":"sum",
          "purchase":"sum"}).reset_index()

In [154]:
# create binary column
grouped_df["purch_bin"] = 1

In [155]:
grouped_df

Unnamed: 0,bcn_id,sku_id,Entry Date,Qty Shipped,purchase,purch_bin
0,0,925,2023-02-03,1.0,1,1
1,0,3417,2023-03-31,1.0,1,1
2,0,3418,2023-06-26,4.0,2,1
3,0,3419,2023-04-17,3.0,1,1
4,0,4184,2023-01-25,4.0,1,1
...,...,...,...,...,...,...
541846,11326,33694,2023-01-11,9.0,1,1
541847,11326,42224,2023-02-24,2.0,1,1
541848,11326,45153,2023-01-11,2.0,1,1
541849,11327,35982,2022-12-14,25.0,1,1


In [156]:
def train_test_split(df, holdout_num):
    """ perform training testing split

    @param df: dataframe
    @param holdhout_num: number of items to be held out per user as testing items

    @return df_train: training data
    @return df_test testing data

    """
    # first sort the data by time
    df = df.sort_values(['bcn_id', 'Entry Date'], ascending=[True, False])

    # perform deep copy to avoid modification on the original dataframe
    df_train = df.copy(deep=True)
    df_test = df.copy(deep=True)

    # get test set
    df_test = df_test.groupby(['bcn_id']).head(holdout_num).reset_index()

    # get train set
    df_train = df_train.merge(
        df_test[['bcn_id', 'sku_id']].assign(remove=1),
        how='left'
    ).query('remove != 1').drop('remove', 1).reset_index(drop=True)

    # Sanity check to make sure we're not duplicating/losing data
    assert len(df) == len(df_train) + len(df_test)

    return df_train, df_test

In [157]:
df_train, df_test = train_test_split(grouped_df[["bcn_id","sku_id", "Entry Date", "purch_bin"]], holdout_num=1)

In [158]:
set(df_test.bcn_id.unique()).issubset(set(df_train.bcn_id.unique()))

True

In [159]:
train_bcn_ids = set(df_train['bcn_id'].unique())

# Filter train DataFrame to include only bcn_ids present in the test set
df_test_filtered = df_test[df_test['bcn_id'].isin(set(df_train['bcn_id'].unique()))]
#df_test_filtered = df_test_filtered[df_test_filtered['sku_id'].isin(set(df_train['sku_id'].unique()))]

In [160]:
set(df_test_filtered.bcn_id.unique()).issubset(set(df_train.bcn_id.unique()))

True

## Negative Sampling

In [161]:
grouped_df_binary = grouped_df[["bcn_id", "sku_id", "purch_bin"]]

In [162]:
len(df_train)

530523

In [163]:
len(df_test_filtered)

11328

In [164]:
len(df_test_filtered) / (len(df_test_filtered) + len(df_train))

0.020906116257052215

In [165]:
def negative_sampling(bcn_ids, sku_ids, items, n_neg):
    """This function creates n_neg negative labels for every positive label

    @param user_ids: list of user ids
    @param movie_ids: list of movie ids
    @param items: unique list of movie ids
    @param n_neg: number of negative labels to sample

    @return df_neg: negative sample dataframe

    """

    neg = []
    ui_pairs = zip(bcn_ids, sku_ids)
    records = set(ui_pairs)

    # for every positive label case
    for (u, i) in records:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = np.random.choice(items)
            # resample if the movie already exists for that user
            while (u, j) in records:
                j = np.random.choice(items)
            neg.append([u, j, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['bcn_id', 'sku_id', 'purch_bin'])

    return df_neg

In [166]:
df_neg = negative_sampling(df_train.bcn_id, df_train.sku_id, grouped_df.sku_id.unique(), 1)

In [167]:
df_train_sam = pd.concat([df_train[["bcn_id","sku_id","purch_bin"]], df_neg], ignore_index=True).sort_values(by="bcn_id", ascending=True)

## Implicit BPR

In [168]:
#!pip install implicit

In [169]:
import implicit
from implicit.gpu.bpr import BayesianPersonalizedRanking as BPR
from implicit.evaluation import leave_k_out_split, precision_at_k, mean_average_precision_at_k, ndcg_at_k, AUC_at_k, train_test_split
from sklearn.model_selection import GridSearchCV

In [170]:
csr_train = sparse.csr_matrix((df_train_sam['purch_bin'], (df_train_sam['bcn_id'], df_train_sam['sku_id'])))
csr_test = sparse.csr_matrix((df_test_filtered['purch_bin'], (df_test_filtered['bcn_id'], df_test_filtered['sku_id'])))

### BINARY Based BPR

In [171]:
model = BPR(factors=200, regularization=0.01, learning_rate=0.01, iterations=20)
model.fit(csr_train)

  0%|          | 0/20 [00:00<?, ?it/s]

In [172]:
auc10 = AUC_at_k(model, csr_train, csr_test, K=10)
prec10 = precision_at_k(model, csr_train, csr_test, K=10)
ndcg10 = ndcg_at_k(model, csr_train, csr_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10}")

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

AUC@10: 0.521077600919676; PREC@10: 0.04228460451977401; NDCG@10: 0.027627283101858788


## Hyperparam tuning

In [173]:
from IPython.utils.sysinfo import num_cpus
from sklearn.model_selection import ParameterGrid

# Grid of hyperparameters to search
param_grid = {
    'factors': [10, 50, 100, 150, 200],
    'iterations': [10, 50, 100, 200],
    'regularization': [0.01, 0.1],
    'learning_rate': [0.01, 0.1]
}

best_auc = -np.inf
best_params = {}

# Iterate through all parameter combinations
for params in ParameterGrid(param_grid):
    model = BPR(factors=params['factors'],
                    iterations=params['iterations'],
                    regularization=params['regularization'],
                    learning_rate=params['learning_rate'])
    model.fit(csr_train, show_progress=False)

    auc10 = AUC_at_k(model, csr_train, csr_test,
                    K=10, num_threads=10, show_progress=False)

    if auc10 > best_auc:
        best_auc = auc10
        best_params = params

# Print the best parameters and AUC
print("Best parameters:", best_params)
print("Best AUC:", best_auc)

Best parameters: {'factors': 200, 'iterations': 200, 'learning_rate': 0.1, 'regularization': 0.01}
Best AUC: 0.5632320776831162


In [174]:
model = BPR(**best_params)
model.fit(csr_train)

  0%|          | 0/200 [00:00<?, ?it/s]

In [175]:
user_factors = model.user_factors
item_factors = model.item_factors
user_factors, item_factors

(Matrix([[-0.34346637  0.32806098 -0.177197   ...  0.21606286 -0.11018754
   -0.591745  ]
  [ 0.26829696 -0.45042846 -0.2405558  ... -0.04287981  0.15635853
   -0.47761747]
  [-0.13479488 -0.04089148 -0.2976222  ... -0.18251424 -0.06504384
   -0.7362829 ]
  ...
  [ 0.0495281  -0.20916565 -0.07627782 ... -0.2287593   0.20837255
    0.32372245]
  [-0.03238364  0.06493343 -0.13956533 ...  0.12140673 -0.1293126
    0.01616186]
  [-0.02331626 -0.07413499 -0.2023835  ...  0.03386206 -0.12746373
    0.28489223]]),
 Matrix([[-0.16582112  0.5825652  -0.08664478 ...  0.01847396 -0.0791307
   -0.3510412 ]
  [ 0.11612154  0.08364809 -0.34448853 ... -0.3081144  -0.20977359
   -0.26884407]
  [-0.21986581 -0.03130478  0.01734632 ...  0.19393004 -0.04652978
   -0.38795105]
  ...
  [ 0.14170998  0.08057016  0.00067038 ...  0.1594337   0.18555018
   -0.02847651]
  [-0.03677226  0.06016164  0.15337013 ... -0.1411368  -0.11841463
   -0.06581392]
  [ 0.11320848  0.08293843  0.03322919 ...  0.07754045  0.26

In [179]:
auc10 = AUC_at_k(model, csr_train, csr_test, K=10)
prec10 = precision_at_k(model, csr_train, csr_test, K=10)
map10 = mean_average_precision_at_k(model, csr_train, csr_test, K=10)
ndcg10 = ndcg_at_k(model, csr_train, csr_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; MAP@10: {map10}; NDCG@10: {ndcg10}")

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

AUC@10: 0.5641592499341598; PREC@10: 0.1284427966101695; MAP@10: 0.06440520328894277; NDCG@10: 0.07952201551873987


# Recommending

In [180]:
sku_list = df[["sku_id", "Product Descr1", "ProductGroupDescription", "ProductGroupMasterDescription"]].drop_duplicates()
sku_list.head(1)

Unnamed: 0,sku_id,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
213,49730,DT PRINT ZQ320 KIT LABEL SENSOR,Mobile Receipt Printer,AIDC/PoS Printers


In [43]:
# randomly selected userids on two clusters, see ALS repo
user_id = 3417
user_id_2 = 1532

In [181]:
from google.colab import files
# Now you can call the recommend function
userid = [user_id]
ids, scores = model.recommend(userid,csr_train[userid], N=10, filter_already_liked_items=True)
ids, scores

rec_tab = pd.DataFrame(data=[ids[0],scores[0]])
rec_tab = rec_tab.T.rename(columns={0:"sku_id", 1:"score", 2:"sku_id", 3:"score"})
rec_tab = rec_tab.merge(sku_list, on="sku_id", how="left")
rec_tab.to_csv(f'{user_id}_BPR_BIN_REC.csv')
files.download(f'{user_id}_BPR_BIN_REC.csv')
rec_tab
# rec_tab.sort_values(by="conf", ascending=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,sku_id,score,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
0,14383.0,3.562218,MAGIC KEYBOARD FOR IPAD 11.0,Keyboards & Keypads,Input Devices
1,41725.0,3.552271,IPAD AIR WI-FI + CELL 256GB,Tablets (Slate Style),Computer Systems
2,29259.0,3.347409,IPAD PRO 13 SMART FOLIO,Tablet Pc Accs,Computer Systems
3,49918.0,3.337677,IPHONE 14 PRO 256GB,Smart Phones,Mobility
4,49892.0,3.218035,IPHONE 14 512GB,Smart Phones,Mobility
5,51570.0,3.20815,IPAD PRO 11 WI-FI + CELL 256GB,Tablets (Slate Style),Computer Systems
6,41720.0,3.153667,IPAD AIR WI-FI 256GB,Tablets (Slate Style),Computer Systems
7,50021.0,3.134775,IPHONE 14 PRO LEATHER CASE,Housings / Covers,Mobility
8,5405.0,3.13078,TM-T88V (042) SERIAL EDG,Receipt Printer,AIDC/PoS Printers
9,41715.0,3.098956,IPAD AIR WI-FI 256GB,Tablets (Slate Style),Computer Systems


In [183]:
# Now you can call the recommend function
userid = [user_id_2]
ids, scores = model.recommend(userid, csr_train[userid], N=10, filter_already_liked_items=True)
ids, scores

rec_tab = pd.DataFrame(data=[ids[0],scores[0]])
rec_tab = rec_tab.T.rename(columns={0:"sku_id", 1:"score", 2:"sku_id", 3:"score"})
rec_tab = rec_tab.merge(sku_list, on="sku_id", how="left")
rec_tab.to_csv(f'{user_id_2}_BPR_BIN_REC.csv')
files.download(f'{user_id_2}_BPR_BIN_REC.csv')
rec_tab
# rec_tab.sort_values(by="conf", ascending=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,sku_id,score,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
0,47928.0,3.150486,G502 X LIGHTSPEED BLACK/CORE,Mouse,Input Devices
1,48291.0,3.10063,LOGITECH BRIO 500 WEBCAM,Webcamera,Multimedia & Games
2,38726.0,3.084955,FRITZ BOX 4060,Wireless Lan Broadband Routers/Gateways,Communications & Networking
3,28001.0,3.002584,FRITZ FON C6 BLACK,Wireless Phones,Telephony Equipment
4,51178.0,2.936845,HP 27-CB1300NG AIO R5-5625U,All-In-One Pc,Computer Systems
5,66758.0,2.883036,ULTRA LUXE 64GB USB 3.1,Usb Storage Media,Memory and Processors
6,66694.0,2.810019,USB STICK 32GB CRUZER BLADE,Usb Storage Media,Memory and Processors
7,21441.0,2.808991,FRITZ POWERLINE 540 WLAN SET,Powerline (Homeplug Enet),Communications & Networking
8,30877.0,2.732996,IP 3 15IML05 I5-10210U 8GB,Portable/Notebook Computers,Computer Systems
9,34863.0,2.702142,BUERO EASY START 2022 POSA,Accounting Software,Softwares


# LibRecommender

In [184]:
#!pip install LibRecommender

In [50]:
from libreco.data import split_by_num, random_split
from libreco.data import DatasetFeat
import tensorflow as tf
from tensorflow import keras

from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [52]:
df_3 = grouped_df.copy()
df_3 = df_3.rename(columns={"bcn_id":"user","sku_id":"item","purch_bin":"label"})

In [53]:
train, test, eval = random_split(df_3[["user", "item", "label"]], multi_ratios=[0.8,0.1,0.1])
train, data_info = DatasetFeat.build_trainset(train)
eval = DatasetFeat.build_evalset(eval)
test = DatasetFeat.build_testset(test)

In [54]:
from libreco.algorithms import BPR as BPR_lib
from libreco.evaluation import evaluate
from libreco.data import random_split, DatasetPure
from sklearn.model_selection import train_test_split

Instructions for updating:
non-resource variables are not supported in the long term


In [55]:
def reset_state(name):
    tf.compat.v1.reset_default_graph()
    print("\n", "=" * 30, name, "=" * 30)

In [56]:
metrics = [
        "loss",
        "balanced_accuracy",
        "roc_auc",
        "precision",
        "recall",
        "map",
        "ndcg",
    ]

In [58]:
model = BPR_lib(
    "ranking",
    data_info=data_info,
    loss_type='bpr',
    embed_size=16,
    n_epochs=10,
    lr=0.001,
    reg=0.001,
    batch_size=256,
    use_tf=True,
    optimizer='sgd',
    num_neg=1,
    sampler='random',
    num_threads=4,
)

In [59]:
model.fit(
        train_data=train,
        neg_sampling=True,
        verbose=1,
        shuffle=True,
        eval_data=eval,
        metrics=metrics,
        k=10,
        eval_batch_size=256,
        eval_user_num=None,
          )

Training start time: [35m2023-08-12 20:30:28[0m


train: 100%|██████████| 1694/1694 [00:03<00:00, 543.77it/s]


Epoch 1 elapsed: 3.119s


train: 100%|██████████| 1694/1694 [00:02<00:00, 586.75it/s]


Epoch 2 elapsed: 2.892s


train: 100%|██████████| 1694/1694 [00:03<00:00, 468.20it/s]


Epoch 3 elapsed: 3.624s


train: 100%|██████████| 1694/1694 [00:04<00:00, 414.04it/s]


Epoch 4 elapsed: 4.102s


train: 100%|██████████| 1694/1694 [00:03<00:00, 455.74it/s]


Epoch 5 elapsed: 3.722s


train: 100%|██████████| 1694/1694 [00:03<00:00, 516.26it/s]


Epoch 6 elapsed: 3.287s


train: 100%|██████████| 1694/1694 [00:02<00:00, 589.78it/s]


Epoch 7 elapsed: 2.877s


train: 100%|██████████| 1694/1694 [00:03<00:00, 494.26it/s]


Epoch 8 elapsed: 3.433s


train: 100%|██████████| 1694/1694 [00:02<00:00, 588.76it/s]


Epoch 9 elapsed: 2.883s


train: 100%|██████████| 1694/1694 [00:02<00:00, 601.89it/s]

Epoch 10 elapsed: 2.822s





In [60]:
eval_result = evaluate(model=model,
        data=test,
        neg_sampling=True,
        eval_batch_size=2568,
        k=10,
        metrics=metrics)
eval_result

eval_pointwise: 100%|██████████| 40/40 [00:00<00:00, 1584.34it/s]
eval_listwise: 100%|██████████| 6365/6365 [00:17<00:00, 359.04it/s]


{'loss': 0.6817061186806096,
 'balanced_accuracy': 0.7218066056506167,
 'roc_auc': 0.7912235992813496,
 'precision': 0.026975648075412414,
 'recall': 0.017411417632752536,
 'map': 0.042833374528461615,
 'ndcg': 0.055880135253159186}

In [61]:
eval_result = evaluate(model=model,
        data=eval,
        neg_sampling=True,
        eval_batch_size=2568,
        k=10,
        metrics=metrics)
eval_result

eval_pointwise: 100%|██████████| 40/40 [00:00<00:00, 1522.35it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:19<00:00, 322.51it/s]


{'loss': 0.6816812007456293,
 'balanced_accuracy': 0.7207639608061818,
 'roc_auc': 0.7900884744421722,
 'precision': 0.027453747256193164,
 'recall': 0.01782588851672822,
 'map': 0.04305681543276952,
 'ndcg': 0.05594847895843843}