<a href="https://colab.research.google.com/github/vincm1/RecSys_Implicit/blob/master/Bayesian_Personalized_Ranking_(BPR).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bayesian Personalized Ranking

This notebook will use the pairwise-ranking algorithm BPR to recommend and rank Top10 items based on the paper of Rendle et al. 2009 (https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf)

Therefore two different RecSys libraries will be used:



*   Implicit BPR(https://benfred.github.io/implicit/ by Ben Frederickson)

In [1]:
import warnings
import zipfile
import time
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import scipy.sparse as sparse

from datetime import datetime, timedelta
warnings.filterwarnings("ignore")

## Data Preprocessing

In [2]:
df_zip = zipfile.ZipFile('/content/drive/MyDrive/RecSys/Orders_Nov22_Jun23.zip')
df = pd.read_csv(df_zip.open('Bericht 1.csv'), delimiter=";")

In [3]:
df.head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
0,1547606,01.11.22,,2022FM11,15515778,NET-S M. CHMIELEWSKI,Export Channel (DE),,9433B9X,INK CARTRIDGE SPS,BLACK 370ML 600 DPI INKJET BULK,1037,Consumables,Ink,Supplies,-1533,-1.0
1,1547615,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CB31510,LENOVO KEYBOARD PACK,FOR TAB P11-DE,641,Input Devices,Keyboards & Keypads,Printers & Peripherals,-10461,-1.0


In [4]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2220299 17697 77401


In [5]:
# converting the customerid to string
df["BranchCustomerNbr"] = df["BranchCustomerNbr"].astype(str)
# converting the skuid to string
df["Sku"] = df["Sku"].astype(str)
# Entry Date to date
df['Entry Date'] = pd.to_datetime(df['Entry Date'], format='%d.%m.%y')
# dropping retours (orders with negative Qty shipped) and zero Qty shipped orders
df = df[df["Qty Shipped"] > 0]
# dropping backlog invoices, Specified date to filter the rows
specific_date = pd.to_datetime('2022-11-01')
# Filter the DataFrame to keep only the rows that are before or equal to the specific date
df = df[df["Entry Date"] >= specific_date]
#insert purchase indication column
df["purchase"] = 1

In [6]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2028956 13894 75643


In [7]:
# add column that represent sku count
def sku_count(df_pl):

  df_pl['sku_count'] = df_pl.groupby('BranchCustomerNbr')['Sku'].transform('nunique')

  return df_pl

In [8]:
df = sku_count(df)

In [9]:
# drop customers that only purchased 1 SKU
df = df[df["sku_count"] > 1]

In [10]:
print(len(df), len(df["BranchCustomerNbr"].unique()), len(df["Sku"].unique()))

2025544 11328 75495


In [11]:
# Create a numeric user_id and artist_id column
df['BranchCustomerNbr'] = df['BranchCustomerNbr'].astype("category")
df['Sku'] = df['Sku'].astype("category")
df['bcn_id'] = df['BranchCustomerNbr'].cat.codes
df['sku_id'] = df['Sku'].cat.codes

In [12]:
df.head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,...,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped,purchase,sku_count,bcn_id,sku_id
213,1545306,2022-11-02,,2022FM11,15885514,AXIS SOLUTION (PRIVATE) LIMITED,Export Channel (DE),,CF55877,DT PRINT ZQ320 KIT LABEL SENSOR,...,5805,AIDC/PoS Printers,Mobile Receipt Printer,"Other (incl. AIDC/POS, V7)","10.713,30",41.0,1,20,417,49730
458,4422886,2022-11-03,,2023FM02,44413224,BWG INFORMATIONSYSTEME GMBH,Business Channel,,CF89211,Z-SELECT 2000D REMOVABLE NS,...,5812,AIDC/PoS Printers,Label Printers Supplies,"Other (incl. AIDC/POS, V7)","1.393,00",140.0,1,111,803,52649


In [13]:
users = df.bcn_id.unique()
skus = df.sku_id.unique()
print(len(users), len(skus))

11328 75495


In [14]:
df_2 = df[["bcn_id", "sku_id", "Entry Date", "Qty Shipped", "purchase"]]

In [15]:
grouped_df = df_2.groupby(["bcn_id", "sku_id"]).agg({
          "Qty Shipped":"sum",
          "purchase":"sum"}).reset_index()

In [16]:
# create binary column
grouped_df["purch_bin"] = 1

In [17]:
grouped_df.head(2)

Unnamed: 0,bcn_id,sku_id,Qty Shipped,purchase,purch_bin
0,0,925,1.0,1,1
1,0,3417,1.0,1,1


## Negative Sampling

In [18]:
grouped_df_2 = grouped_df.copy()

In [19]:
def negative_sampling(df_pl, bcn_ids, sku_ids, items, n_neg):
    """This function creates n_neg negative labels for every positive label

    @param user_ids: list of user ids
    @param sku_ids: list of sku ids
    @param items: unique list of sku ids
    @param n_neg: number of negative labels to sample

    @return df_neg: negative sample dataframe

    """

    neg = []
    ui_pairs = zip(bcn_ids, sku_ids)
    records = set(ui_pairs)

    # for every positive label case
    for (u, i) in records:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = np.random.choice(items)
            # resample if the movie already exists for that user
            while (u, j) in records:
                j = np.random.choice(items)
            neg.append([u, j, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['bcn_id', 'sku_id', 'purchase'])

    #df_train = df_train[['bcn_id', 'sku_id']].assign(purchase=1)
    df_pl = pd.concat([df_pl, df_neg], ignore_index=True)

    return df_pl

In [20]:
neg_grouped_df = negative_sampling(grouped_df_2, users, skus, skus, n_neg=100)

In [21]:
neg_grouped_df = neg_grouped_df.fillna(0)

In [22]:
print(len(grouped_df), len(neg_grouped_df))

541851 1674651


## Implicit BPR

In [23]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.0-cp310-cp310-manylinux2014_x86_64.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.0


In [24]:
import implicit
from implicit.gpu.bpr import BayesianPersonalizedRanking as BPR
from implicit.evaluation import leave_k_out_split, precision_at_k, mean_average_precision_at_k, ndcg_at_k, AUC_at_k, train_test_split
from sklearn.model_selection import GridSearchCV

In [25]:
csr_qty_matrix = sparse.csr_matrix((grouped_df['Qty Shipped'], (grouped_df['bcn_id'], grouped_df['sku_id'])))
csr_freq_matrix = sparse.csr_matrix((grouped_df['purchase'], (grouped_df['bcn_id'], grouped_df['sku_id'])))
csr_bin_matrix = sparse.csr_matrix((grouped_df['purch_bin'], (grouped_df['bcn_id'], grouped_df['sku_id'])))

In [26]:
neg_csr_qty_matrix = sparse.csr_matrix((neg_grouped_df['Qty Shipped'], (neg_grouped_df['bcn_id'], neg_grouped_df['sku_id'])))
neg_csr_freq_matrix = sparse.csr_matrix((neg_grouped_df['purchase'], (neg_grouped_df['bcn_id'], neg_grouped_df['sku_id'])))
neg_csr_bin_matrix = sparse.csr_matrix((neg_grouped_df['purch_bin'], (neg_grouped_df['bcn_id'], neg_grouped_df['sku_id'])))

In [27]:
csr_qty_matrix_train, csr_qty_matrix_test = leave_k_out_split(csr_qty_matrix, K=1, random_state=None)
csr_freq_matrix_train, csr_freq_matrix_test = leave_k_out_split(csr_freq_matrix, K=1, random_state=None)
csr_bin_matrix_train, csr_bin_matrix_test = leave_k_out_split(csr_bin_matrix, K=1, random_state=None)

In [28]:
csr_qty_matrix_train_2, csr_qty_matrix_test_2 = train_test_split(csr_qty_matrix, train_percentage=0.9, random_state=None)
csr_freq_matrix_train_2, csr_freq_matrix_test_2 = train_test_split(csr_freq_matrix, train_percentage=0.9,  random_state=None)
csr_bin_matrix_train_2, csr_bin_matrix_test_2 = train_test_split(csr_bin_matrix, train_percentage=0.9, random_state=None)

In [29]:
neg_csr_qty_matrix_train, neg_csr_qty_matrix_test = leave_k_out_split(neg_csr_qty_matrix, K=1, random_state=None)
neg_csr_freq_matrix_train, neg_csr_freq_matrix_test = leave_k_out_split(neg_csr_freq_matrix, K=1, random_state=None)
neg_csr_bin_matrix_train, neg_csr_bin_matrix_test = leave_k_out_split(neg_csr_bin_matrix, K=1, random_state=None)

### BINARY Based BPR

In [30]:
model = BPR(factors=200, regularization=0.01, learning_rate=0.01, iterations=20)
model.fit(csr_bin_matrix_train)

  0%|          | 0/20 [00:00<?, ?it/s]

In [31]:
auc10 = AUC_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
prec10 = precision_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
ndcg10 = ndcg_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; NDCG@10: {ndcg10}")

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

AUC@10: 0.528000813761812; PREC@10: 0.056130903065451536; NDCG@10: 0.03482581374131517


## Hyperparam tuning

In [32]:
from IPython.utils.sysinfo import num_cpus
from sklearn.model_selection import ParameterGrid

# Grid of hyperparameters to search
param_grid = {
    'factors': [50, 100, 150, 200],
    'iterations': [10, 15, 20],
    'regularization': [0.01, 0.1],
    'learning_rate': [0.01, 0.1]
}

best_auc = -np.inf
best_params = {}

# Iterate through all parameter combinations
for params in ParameterGrid(param_grid):
    model = BPR(factors=params['factors'],
                    iterations=params['iterations'],
                    regularization=params['regularization'],
                    learning_rate=params['learning_rate'])
    model.fit(csr_bin_matrix_train, show_progress=False)

    auc10 = AUC_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test,
                    K=10, num_threads=10, show_progress=False)

    if auc10 > best_auc:
        best_auc = auc10
        best_params = params

# Print the best parameters and AUC
print("Best parameters:", best_params)
print("Best AUC:", best_auc)

Best parameters: {'factors': 100, 'iterations': 20, 'learning_rate': 0.1, 'regularization': 0.01}
Best AUC: 0.5316777397871307


In [33]:
best_auc_2 = -np.inf
best_params_2 = {}

# Iterate through all parameter combinations
for params in ParameterGrid(param_grid):
    model_2 = BPR(factors=params['factors'],
                    iterations=params['iterations'],
                    regularization=params['regularization'],
                    learning_rate=params['learning_rate'])
    model_2.fit(neg_csr_bin_matrix_train, show_progress=False)

    auc10_2 = AUC_at_k(model_2, neg_csr_bin_matrix_train, neg_csr_bin_matrix_test,
                    K=10, num_threads=10, show_progress=False)

    if auc10_2 > best_auc_2:
        best_auc_2 = auc10_2
        best_params_2 = params

# Print the best parameters and AUC
print("Best parameters:", best_params_2)
print("Best AUC:", best_auc_2)

Best parameters: {'factors': 100, 'iterations': 10, 'learning_rate': 0.1, 'regularization': 0.01}
Best AUC: 0.5125582750598785


In [34]:
model = BPR(**best_params)
model.fit(csr_bin_matrix_train)

  0%|          | 0/20 [00:00<?, ?it/s]

In [35]:
auc10 = AUC_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
prec10 = precision_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
map10 = mean_average_precision_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
ndcg10 = ndcg_at_k(model, csr_bin_matrix_train, csr_bin_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; MAP@10: {map10}; NDCG@10: {ndcg10}")

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

AUC@10: 0.5334900762125113; PREC@10: 0.06710853355426678; MAP@10: 0.033484855209689475; NDCG@10: 0.04137292006712586


In [36]:
model_2 = BPR(**best_params_2)
model_2.fit(neg_csr_bin_matrix_train)

  0%|          | 0/10 [00:00<?, ?it/s]

In [37]:
auc10 = AUC_at_k(model_2, neg_csr_bin_matrix_train, neg_csr_bin_matrix_test, K=10)
prec10 = precision_at_k(model_2, neg_csr_bin_matrix_train, neg_csr_bin_matrix_test, K=10)
map10 = mean_average_precision_at_k(model_2, neg_csr_bin_matrix_train, neg_csr_bin_matrix_test, K=10)
ndcg10 = ndcg_at_k(model_2, neg_csr_bin_matrix_train, csr_bin_matrix_test, K=10)
print(f"AUC@10: {auc10}; PREC@10: {prec10}; MAP@10: {map10}; NDCG@10: {ndcg10}")

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/11328 [00:00<?, ?it/s]

  0%|          | 0/9656 [00:00<?, ?it/s]

AUC@10: 0.512867311807139; PREC@10: 0.025865112994350282; MAP@10: 0.013769014550264544; NDCG@10: 0.00032172682183366014


# Recommending

In [38]:
sku_list = df[["sku_id", "Product Descr1", "ProductGroupDescription", "ProductGroupMasterDescription"]].drop_duplicates()
sku_list.head(1)

Unnamed: 0,sku_id,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
213,49730,DT PRINT ZQ320 KIT LABEL SENSOR,Mobile Receipt Printer,AIDC/PoS Printers


In [39]:
# randomly selected userids on two clusters, see ALS repo
user_id = 3417
user_id_2 = 1532

In [40]:
from google.colab import files
# Now you can call the recommend function
userid = [user_id]
ids, scores = model.recommend(userid, csr_bin_matrix[userid], N=10, filter_already_liked_items=True)
ids, scores

rec_tab = pd.DataFrame(data=[ids[0],scores[0]])
rec_tab = rec_tab.T.rename(columns={0:"sku_id", 1:"score", 2:"sku_id", 3:"score"})
rec_tab = rec_tab.merge(sku_list, on="sku_id", how="left")
rec_tab.to_csv(f'{user_id}_BPR_BIN_REC.csv')
files.download(f'{user_id}_BPR_BIN_REC.csv')
rec_tab
# rec_tab.sort_values(by="conf", ascending=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,sku_id,score,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
0,51580.0,2.894855,IPAD PRO 13 WIFI + CELL 256GB,Tablets (Slate Style),Computer Systems
1,3533.0,2.876788,A170 HEADSET,Ip Phones,Communications & Networking
2,3542.0,2.819121,SNOM M70,Ip Phones,Communications & Networking
3,24567.0,2.816234,TANDBERG RDX 4TB CARTRIDGE,Lto (Ultrium),Consumables
4,14383.0,2.805463,MAGIC KEYBOARD FOR IPAD 11.0,Keyboards & Keypads,Input Devices
5,38398.0,2.688366,ASPIRE 517-52-599P 17.3IN,Portable/Notebook Computers,Computer Systems
6,44778.0,2.647599,OPTIPLEX 7400 AIO I7-12700 16GB,All-In-One Pc,Computer Systems
7,14533.0,2.632974,EARPODS,Bluetooth Headsets,Mobility
8,5549.0,2.615489,EPSON TM-J7700 (321) WHITE,Receipt Printer,AIDC/PoS Printers
9,18068.0,2.580905,HL-L2370DN MONO LASER 30PPM A4,"Laser, Led, Solid Ink Monochrome",Printers/AIO/Copiers/Fax


In [41]:
# Now you can call the recommend function
userid = [user_id_2]
ids, scores = model.recommend(userid, csr_bin_matrix[userid], N=10, filter_already_liked_items=True)
ids, scores

rec_tab = pd.DataFrame(data=[ids[0],scores[0]])
rec_tab = rec_tab.T.rename(columns={0:"sku_id", 1:"score", 2:"sku_id", 3:"score"})
rec_tab = rec_tab.merge(sku_list, on="sku_id", how="left")
rec_tab.to_csv(f'{user_id_2}_BPR_BIN_REC.csv')
files.download(f'{user_id_2}_BPR_BIN_REC.csv')
rec_tab
# rec_tab.sort_values(by="conf", ascending=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,sku_id,score,Product Descr1,ProductGroupDescription,ProductGroupMasterDescription
0,38726.0,5.251674,FRITZ BOX 4060,Wireless Lan Broadband Routers/Gateways,Communications & Networking
1,66773.0,5.20785,SANDISK ULTRA USB C FLASH DRIVE,Usb Storage Media,Memory and Processors
2,52752.0,5.057828,NH.QEWEG.00B I5-11400H 15.6IN,Portable/Notebook Computers,Computer Systems
3,66689.0,5.052831,USB STICK CRUZER BLADE 64GB,Usb Storage Media,Memory and Processors
4,48291.0,5.043551,LOGITECH BRIO 500 WEBCAM,Webcamera,Multimedia & Games
5,50723.0,4.907997,MSD 27GQ50F-B 27IN 68.4CM,Lcd Monitor,Display
6,26136.0,4.890251,SANDISK EXTREME PORTABLE SSD,Solid State Drive (Ssd),Hard Drives & Optical Drives
7,28001.0,4.84428,FRITZ FON C6 BLACK,Wireless Phones,Telephony Equipment
8,26837.0,4.821978,SANDISK ULTRA DUAL DRIVE GO-C,Usb Storage Media,Memory and Processors
9,66760.0,4.7967,ULTRA LUXE 32GB USB 3.1,Usb Storage Media,Memory and Processors


# LibRecommender

In [43]:
!pip install LibRecommender

Collecting LibRecommender
  Downloading LibRecommender-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: LibRecommender
Successfully installed LibRecommender-1.2.2


In [44]:
from libreco.data import split_by_num, random_split
from libreco.data import DatasetFeat
import tensorflow as tf
from tensorflow import keras

from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

Instructions for updating:
non-resource variables are not supported in the long term


In [47]:
grouped_df

Unnamed: 0,bcn_id,sku_id,Qty Shipped,purchase,purch_bin
0,0,925,1.0,1,1
1,0,3417,1.0,1,1
2,0,3418,4.0,2,1
3,0,3419,3.0,1,1
4,0,4184,4.0,1,1
...,...,...,...,...,...
541846,11326,33694,9.0,1,1
541847,11326,42224,2.0,1,1
541848,11326,45153,2.0,1,1
541849,11327,35982,25.0,1,1


In [48]:
df_3 = grouped_df.copy()

In [49]:
df_3 = df_3.rename(columns={"bcn_id":"user", "sku_id":"item", "purch_bin":"label"})

In [52]:
train, test, eval = random_split(df_3[["user", "item", "label"]], multi_ratios=[0.8,0.1,0.1])
train, data_info = DatasetFeat.build_trainset(train)
eval = DatasetFeat.build_evalset(eval)
test = DatasetFeat.build_testset(test)

In [53]:
train_2, eval_2 = split_by_num(df_3[["user", "item", "label"]], test_size=1)
train_2, data_info_2 = DatasetFeat.build_trainset(train_2)
eval_2 = DatasetFeat.build_evalset(eval_2)

In [54]:
from libreco.algorithms import BPR as BPR_lib
from libreco.evaluation import evaluate
from libreco.data import random_split, DatasetPure

In [None]:
#  task : {'ranking'}
#         Recommendation task. See :ref:`Task`.
#     data_info : :class:`~libreco.data.DataInfo` object
#         Object that contains useful information for training and inference.
#     loss_type : {'bpr'}
#         Loss for model training.
#     embed_size: int, default: 16
#         Vector size of embeddings.
#     norm_embed : bool, default: False
#         Whether to l2 normalize output embeddings.
#     n_epochs: int, default: 10
#         Number of epochs for training.
#     lr : float, default 0.001
#         Learning rate for training.
#     lr_decay : bool, default: False
#         Whether to use learning rate decay.
#     epsilon : float, default: 1e-5
#         A small constant added to the denominator to improve numerical stability in
#         Adam optimizer.
#         According to the `official comment <https://github.com/tensorflow/tensorflow/blob/v1.15.0/tensorflow/python/training/adam.py#L64>`_,
#         default value of `1e-8` for `epsilon` is generally not good, so here we choose `1e-5`.
#         Users can try tuning this hyperparameter if the training is unstable.
#     reg : float or None, default: None
#         Regularization parameter, must be non-negative or None.
#     batch_size : int, default: 256
#         Batch size for training.
#     sampler : {'random', 'unconsumed', 'popular'}, default: 'random'
#         Negative sampling strategy.

#         - ``'random'`` means random sampling.
#         - ``'unconsumed'`` samples items that the target user did not consume before.
#         - ``'popular'`` has a higher probability to sample popular items as negative samples.

#         .. versionadded:: 1.1.0

#     num_neg : int, default: 1
#         Number of negative samples for each positive sample, only used in `ranking` task.
#     use_tf : bool, default: True
#         Whether to use TensorFlow or Cython version. The TensorFlow version is more
#         accurate, whereas the Cython version is faster.
#     seed : int, default: 42
#         Random seed.
#     lower_upper_bound : tuple or None, default: None
#         Lower and upper score bound for `rating` task.
#     tf_sess_config : dict or None, default: None
#         Optional TensorFlow session config, see `ConfigProto options
#         <https://github.com/tensorflow/tensorflow/blob/v2.10.0/tensorflow/core/protobuf/config.proto#L431>`_.
#     optimizer : {'sgd', 'momentum', 'adam'}, default: 'adam'
#         Optimizer used in Cython version.
#     num_threads : int, default: 1
#         Number of threads used in Cython version.

In [57]:
metrics = [
        "loss",
        "balanced_accuracy",
        "roc_auc",
        "precision",
        "recall",
        "map",
        "ndcg",
    ]

In [56]:
model = BPR_lib(
    "ranking",
    data_info=data_info,
    loss_type='bpr',
    embed_size=16,
    n_epochs=20,
    lr=0.001,
    reg=0.01,
    batch_size=256,
    use_tf=True,
    optimizer='sgd',
)

In [61]:
model.fit(
        train_data=train,
        neg_sampling=True,
        verbose=1,
        shuffle=True,
        eval_data=eval,
        metrics=metrics,
        k=10,
        eval_batch_size=2048,
        eval_user_num=None,
          )

Training start time: [35m2023-08-10 21:07:49[0m


train: 100%|██████████| 1694/1694 [00:03<00:00, 461.40it/s]


Epoch 1 elapsed: 3.676s


train: 100%|██████████| 1694/1694 [00:03<00:00, 521.56it/s]


Epoch 2 elapsed: 3.256s


train: 100%|██████████| 1694/1694 [00:02<00:00, 572.90it/s]


Epoch 3 elapsed: 2.965s


train: 100%|██████████| 1694/1694 [00:02<00:00, 571.77it/s]


Epoch 4 elapsed: 2.969s


train: 100%|██████████| 1694/1694 [00:02<00:00, 568.02it/s]


Epoch 5 elapsed: 2.989s


train: 100%|██████████| 1694/1694 [00:03<00:00, 515.77it/s]


Epoch 6 elapsed: 3.290s


train: 100%|██████████| 1694/1694 [00:02<00:00, 574.10it/s]


Epoch 7 elapsed: 2.956s


train: 100%|██████████| 1694/1694 [00:02<00:00, 577.57it/s]


Epoch 8 elapsed: 2.941s


train: 100%|██████████| 1694/1694 [00:02<00:00, 570.16it/s]


Epoch 9 elapsed: 2.977s


train: 100%|██████████| 1694/1694 [00:03<00:00, 537.89it/s]


Epoch 10 elapsed: 3.156s


train: 100%|██████████| 1694/1694 [00:03<00:00, 545.31it/s]


Epoch 11 elapsed: 3.115s


train: 100%|██████████| 1694/1694 [00:02<00:00, 579.13it/s]


Epoch 12 elapsed: 2.931s


train: 100%|██████████| 1694/1694 [00:02<00:00, 590.84it/s]


Epoch 13 elapsed: 2.873s


train: 100%|██████████| 1694/1694 [00:02<00:00, 576.91it/s]


Epoch 14 elapsed: 2.942s


train: 100%|██████████| 1694/1694 [00:03<00:00, 541.54it/s]


Epoch 15 elapsed: 3.136s


train: 100%|██████████| 1694/1694 [00:02<00:00, 582.53it/s]


Epoch 16 elapsed: 2.915s


train: 100%|██████████| 1694/1694 [00:02<00:00, 584.46it/s]


Epoch 17 elapsed: 2.905s


train: 100%|██████████| 1694/1694 [00:02<00:00, 578.12it/s]


Epoch 18 elapsed: 2.936s


train: 100%|██████████| 1694/1694 [00:03<00:00, 515.55it/s]


Epoch 19 elapsed: 3.292s


train: 100%|██████████| 1694/1694 [00:02<00:00, 577.83it/s]

Epoch 20 elapsed: 2.944s





In [62]:
eval_result = evaluate(model=model,
        data=eval,
        neg_sampling=True,
        eval_batch_size=2048,
        k=10,
        metrics=metrics)
eval_result

random neg item sampling elapsed: 0.049s


eval_pointwise: 100%|██████████| 50/50 [00:00<00:00, 1038.55it/s]
eval_listwise: 100%|██████████| 6378/6378 [00:13<00:00, 489.16it/s]


{'loss': 0.69188733878235,
 'balanced_accuracy': 0.6560682705329404,
 'roc_auc': 0.6995442962403418,
 'precision': 0.02085293195359047,
 'recall': 0.011146551094890196,
 'map': 0.035542659079507065,
 'ndcg': 0.046946642048460906}