<a href="https://colab.research.google.com/github/vincm1/RecSys_Implicit/blob/master/Bayesian_Personalized_Ranking_(BPR).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bayesian Personalized Ranking

This notebook will use the pairwise-ranking algorithm BPR to recommend and rank Top10 items based on the paper of Rendle et al. 2009 (https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf)

Therefore two different RecSys libraries will be used:



*   Implicit BPR(https://benfred.github.io/implicit/ by Ben Frederickson)
*   LightFM BPR(https://making.lyst.com/lightfm/docs/home.html by Maciej Kula)



In [1]:
import warnings
import zipfile
import time
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import scipy.sparse as sparse

from datetime import datetime, timedelta
warnings.filterwarnings("ignore")

## Data Preprocessing

In [2]:
df_zip = zipfile.ZipFile('/content/drive/MyDrive/RecSys/Orders_Nov22_Jun23.zip')
df = pd.read_csv(df_zip.open('Bericht 1.csv'), delimiter=";")

In [3]:
df.head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
0,1547606,01.11.22,,2022FM11,15515778,NET-S M. CHMIELEWSKI,Export Channel (DE),,9433B9X,INK CARTRIDGE SPS,BLACK 370ML 600 DPI INKJET BULK,1037,Consumables,Ink,Supplies,-1533,-1.0
1,1547615,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CB31510,LENOVO KEYBOARD PACK,FOR TAB P11-DE,641,Input Devices,Keyboards & Keypads,Printers & Peripherals,-10461,-1.0


In [4]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2220299 17697 77401


In [5]:
# converting the customerid to string
df["BranchCustomerNbr"] = df["BranchCustomerNbr"].astype(str)
# converting the skuid to string
df["Sku"] = df["Sku"].astype(str)
# Entry Date to date
df['Entry Date'] = pd.to_datetime(df['Entry Date'], format='%d.%m.%y')
# dropping retours (orders with negative Qty shipped) and zero Qty shipped orders
df = df[df["Qty Shipped"] > 0]
# dropping backlog invoices, Specified date to filter the rows
specific_date = pd.to_datetime('2022-11-01')
# Filter the DataFrame to keep only the rows that are before or equal to the specific date
df = df[df["Entry Date"] >= specific_date]
#insert purchase indication column
df["purchase"] = 1

In [6]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2028956 13894 75643


In [7]:
# add column that represent sku count
def sku_count(df_pl):

  df_pl['sku_count'] = df_pl.groupby('BranchCustomerNbr')['Sku'].transform('nunique')

  return df_pl

In [8]:
df = sku_count(df)

In [9]:
# drop customers that only purchased 1 SKU
df = df[df["sku_count"] > 1]

In [10]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2025544 11328 75495


In [11]:
# Create a numeric user_id and artist_id column
df['BranchCustomerNbr'] = df['BranchCustomerNbr'].astype("category")
df['Sku'] = df['Sku'].astype("category")
df['bcn_id'] = df['BranchCustomerNbr'].cat.codes
df['sku_id'] = df['Sku'].cat.codes

In [12]:
df.head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,...,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped,purchase,sku_count,bcn_id,sku_id
213,1545306,2022-11-02,,2022FM11,15885514,AXIS SOLUTION (PRIVATE) LIMITED,Export Channel (DE),,CF55877,DT PRINT ZQ320 KIT LABEL SENSOR,...,5805,AIDC/PoS Printers,Mobile Receipt Printer,"Other (incl. AIDC/POS, V7)","10.713,30",41.0,1,20,417,49730
458,4422886,2022-11-03,,2023FM02,44413224,BWG INFORMATIONSYSTEME GMBH,Business Channel,,CF89211,Z-SELECT 2000D REMOVABLE NS,...,5812,AIDC/PoS Printers,Label Printers Supplies,"Other (incl. AIDC/POS, V7)","1.393,00",140.0,1,111,803,52649


In [21]:
users = df.bcn_id.unique()
skus = df.sku_id.unique()
print(len(users), len(skus))

11328 75495


In [22]:
df_2 = df[["bcn_id", "sku_id", "Entry Date", "Qty Shipped", "purchase"]]

In [23]:
grouped_df = df_2.groupby(["bcn_id", "sku_id"]).agg({
          "Qty Shipped":"sum",
          "purchase":"sum"}).reset_index()

In [24]:
# create binary column
grouped_df["purch_bin"] = 1

In [25]:
grouped_df.head(2)

Unnamed: 0,bcn_id,sku_id,Qty Shipped,purchase,purch_bin
0,0,925,1.0,1,1
1,0,3417,1.0,1,1


## Negative Sampling

In [26]:
grouped_df_2 = grouped_df.copy()

In [30]:
def negative_sampling(df_pl, bcn_ids, sku_ids, items, n_neg):
    """This function creates n_neg negative labels for every positive label

    @param user_ids: list of user ids
    @param sku_ids: list of sku ids
    @param items: unique list of sku ids
    @param n_neg: number of negative labels to sample

    @return df_neg: negative sample dataframe

    """

    neg = []
    ui_pairs = zip(bcn_ids, sku_ids)
    records = set(ui_pairs)

    # for every positive label case
    for (u, i) in records:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = np.random.choice(items)
            # resample if the movie already exists for that user
            while (u, j) in records:
                j = np.random.choice(items)
            neg.append([u, j, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['bcn_id', 'sku_id', 'purchase'])

    #df_train = df_train[['bcn_id', 'sku_id']].assign(purchase=1)
    df_pl = pd.concat([df_pl, df_neg], ignore_index=True)

    return df_pl

In [32]:
neg_grouped_df = negative_sampling(grouped_df_2, users, skus, skus, n_neg=100)

In [35]:
neg_grouped_df = neg_grouped_df.fillna(0)

In [39]:
print(len(grouped_df), len(neg_grouped_df))

541851 1674651


## Implicit BPR

In [40]:
!pip install implicit



In [41]:
import implicit
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR
from implicit.evaluation import leave_k_out_split, precision_at_k, mean_average_precision_at_k, ndcg_at_k, AUC_at_k, train_test_split
from sklearn.model_selection import GridSearchCV

In [42]:
from implicit.gpu.als import AlternatingLeastSquares as gpu_ALS
from implicit.gpu.bpr import BayesianPersonalizedRanking as gpu_BPR

In [45]:
csr_qty_matrix = sparse.csr_matrix((grouped_df['Qty Shipped'], (grouped_df['bcn_id'], grouped_df['sku_id'])))
csr_freq_matrix = sparse.csr_matrix((grouped_df['purchase'], (grouped_df['bcn_id'], grouped_df['sku_id'])))
csr_bin_matrix = sparse.csr_matrix((grouped_df['purch_bin'], (grouped_df['bcn_id'], grouped_df['sku_id'])))

In [44]:
neg_csr_qty_matrix = sparse.csr_matrix((neg_grouped_df['Qty Shipped'], (neg_grouped_df['bcn_id'], neg_grouped_df['sku_id'])))
neg_csr_freq_matrix = sparse.csr_matrix((neg_grouped_df['purchase'], (neg_grouped_df['bcn_id'], neg_grouped_df['sku_id'])))
neg_csr_bin_matrix = sparse.csr_matrix((neg_grouped_df['purch_bin'], (neg_grouped_df['bcn_id'], neg_grouped_df['sku_id'])))

In [90]:
csr_qty_matrix_train, csr_qty_matrix_test = leave_k_out_split(csr_qty_matrix, K=1, random_state=None)
csr_freq_matrix_train, csr_freq_matrix_test = leave_k_out_split(csr_freq_matrix, K=1, random_state=None)
csr_bin_matrix_train, csr_bin_matrix_test = leave_k_out_split(csr_bin_matrix, K=1, random_state=None)

In [126]:
csr_qty_matrix_train_2, csr_qty_matrix_test_2 = train_test_split(csr_qty_matrix, train_percentage=0.9, random_state=None)
csr_freq_matrix_train_2, csr_freq_matrix_test_2 = train_test_split(csr_freq_matrix, train_percentage=0.9,  random_state=None)
csr_bin_matrix_train_2, csr_bin_matrix_test_2 = train_test_split(csr_bin_matrix, train_percentage=0.9, random_state=None)

In [47]:
neg_csr_qty_matrix_train, neg_csr_qty_matrix_test = leave_k_out_split(neg_csr_qty_matrix, K=1, random_state=None)
csr_freq_matrix_train, neg_csr_freq_matrix_test = leave_k_out_split(neg_csr_freq_matrix, K=1, random_state=None)
neg_csr_bin_matrix_train, neg_csr_bin_matrix_test = leave_k_out_split(neg_csr_bin_matrix, K=1, random_state=None)

### QTY Based BPR

In [48]:
model = gpu_BPR(factors=200, regularization=0.01, learning_rate=0.01)
model.fit(csr_qty_matrix_train)

  0%|          | 0/100 [00:00<?, ?it/s]

In [51]:
auc10 = AUC_at_k(model, csr_qty_matrix_train, csr_qty_matrix_test, K=10)
prec10 = precision_at_k(model, csr_qty_matrix_train, csr_qty_matrix_test, K=10)
ndcg10 = ndcg_at_k(model, csr_qty_matrix_train, csr_qty_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10};; ")

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

AUC@10: 0.5316260847166964; PREC@10: 0.06338028169014084; NDCG@10: 0.041402531127243236;; 


In [52]:
model = gpu_BPR(factors=200, regularization=0.01, learning_rate=0.01)
model.fit(neg_csr_qty_matrix_train)

  0%|          | 0/100 [00:00<?, ?it/s]

In [54]:
auc10 = AUC_at_k(model, neg_csr_qty_matrix_train, neg_csr_qty_matrix_test, K=10)
prec10 = precision_at_k(model, neg_csr_qty_matrix_train, neg_csr_qty_matrix_test, K=10)
ndcg10 = ndcg_at_k(model, neg_csr_qty_matrix_train, neg_csr_qty_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10};; ")

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

AUC@10: 0.5142357980444549; PREC@10: 0.028601694915254237; NDCG@10: 0.020536909852123586;; 


### Freq Based BPR

In [96]:
model = gpu_BPR(factors=200, regularization=0.01, learning_rate=0.01)
model.fit(csr_freq_matrix_train)

  0%|          | 0/100 [00:00<?, ?it/s]

In [101]:
auc10 = AUC_at_k(model, csr_freq_matrix_train, csr_freq_matrix_test, K=10)
prec10 = precision_at_k(model, csr_freq_matrix_train, csr_freq_matrix_test, K=10)
ndcg10 = ndcg_at_k(model, csr_freq_matrix_train, csr_freq_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10}")

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

AUC@10: 0.5297617461558763; PREC@10: 0.05965202982601491; NDCG@10: 0.037049842852907494


In [99]:
model = gpu_BPR(factors=200, regularization=0.01, learning_rate=0.01)
model.fit(csr_freq_matrix_train_2)

  0%|          | 0/100 [00:00<?, ?it/s]

In [100]:
auc10 = AUC_at_k(model, csr_freq_matrix_train_2, csr_freq_matrix_test_2, K=10)
prec10 = precision_at_k(model, csr_freq_matrix_train_2, csr_freq_matrix_test_2, K=10)
ndcg10 = ndcg_at_k(model, csr_freq_matrix_train_2, csr_freq_matrix_test_2, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10};; ")

  0%|          | 0/8577 [00:00<?, ?it/s]

  0%|          | 0/8577 [00:00<?, ?it/s]

  0%|          | 0/8577 [00:00<?, ?it/s]

AUC@10: 0.5144105645077919; PREC@10: 0.16040659117894066; NDCG@10: 0.07393155307435943;; 


### BINARY Based BPR

In [102]:
model = gpu_BPR(factors=200, regularization=0.01, learning_rate=0.01, iterations=20)
model.fit(csr_bin_matrix_train)

  0%|          | 0/20 [00:00<?, ?it/s]

In [103]:
auc10 = AUC_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
prec10 = precision_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
ndcg10 = ndcg_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10}")

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

AUC@10: 0.5287777386699383; PREC@10: 0.057684341342170674; NDCG@10: 0.03672581786509389


## Hyperparam tuning

In [113]:
from IPython.utils.sysinfo import num_cpus
from sklearn.model_selection import ParameterGrid

# Grid of hyperparameters to search
param_grid = {
    'factors': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200],
    'iterations': [10, 12, 14, 16, 18, 20],
    'regularization': [0.01, 0.1],
    'learning_rate': [0.01, 0.1]
}

best_auc = -np.inf
best_params = {}

# Iterate through all parameter combinations
for params in ParameterGrid(param_grid):
    model = gpu_BPR(factors=params['factors'],
                    iterations=params['iterations'],
                    regularization=params['regularization'],
                    learning_rate=params['learning_rate'])
    model.fit(csr_freq_matrix_train, show_progress=False)

    auc10 = AUC_at_k(model, csr_freq_matrix_train, csr_freq_matrix_test,
                    K=10, num_threads=10, show_progress=False)

    if auc10 > best_auc:
        best_auc = auc10
        best_params = params

# Print the best parameters and AUC
print("Best parameters:", best_params)
print("Best AUC:", best_auc)

Best parameters: {'factors': 110, 'iterations': 20, 'learning_rate': 0.1, 'regularization': 0.01}
Best AUC: 0.5350952683291531


In [129]:
model = gpu_BPR(**best_params)
model.fit(csr_freq_matrix_train)

  0%|          | 0/20 [00:00<?, ?it/s]

In [130]:
auc10 = AUC_at_k(model, csr_freq_matrix_train, csr_freq_matrix_test, K=10)
prec10 = precision_at_k(model, csr_freq_matrix_train, csr_freq_matrix_test, K=10)
ndcg10 = ndcg_at_k(model, csr_freq_matrix_train, csr_freq_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10}")

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

AUC@10: 0.532868628196506; PREC@10: 0.06586578293289147; NDCG@10: 0.04068855024042962
