<a href="https://colab.research.google.com/github/vincm1/RecSys_Implicit/blob/master/Alternating_Least_Squares_(ALS).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade implicit



In [None]:
import zipfile
import time
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import scipy.sparse as sparse

from datetime import datetime, timedelta


# Loading DataFrames



In [None]:
df_zip = zipfile.ZipFile('/content/drive/MyDrive/RecSys/Orders_Nov22_Jun23.zip')
df = pd.read_csv(df_zip.open('Bericht 1.csv'), delimiter=";")

  df = pd.read_csv(df_zip.open('Bericht 1.csv'), delimiter=";")


In [None]:
df.head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
0,1547606,01.11.22,,2022FM11,15515778,NET-S M. CHMIELEWSKI,Export Channel (DE),,9433B9X,INK CARTRIDGE SPS,BLACK 370ML 600 DPI INKJET BULK,1037,Consumables,Ink,Supplies,-1533,-1.0
1,1547615,01.11.22,,2022FM11,15509465,DIGITAL RIVER IRELAND LIMITED,Export Channel (DE),DIRL,CB31510,LENOVO KEYBOARD PACK,FOR TAB P11-DE,641,Input Devices,Keyboards & Keypads,Printers & Peripherals,-10461,-1.0


In [None]:
len(df)

2220299

In [None]:
len(df["BranchCustomerNbr"].unique())

17697

In [None]:
len(df["Sku"].unique())

77401

# Data Preprocessing

In [None]:
# converting the customerid to string
df["BranchCustomerNbr"] = df["BranchCustomerNbr"].astype(str)
# converting the skuid to string
df["Sku"] = df["Sku"].astype(str)

In [None]:
df['Entry Date'] = pd.to_datetime(df['Entry Date'], format='%d.%m.%y')

In [None]:
# lines with zero shipment qty
len(df[df["Qty Shipped"] == 0])

113860

In [None]:
df[df["Qty Shipped"] == 0].head(2)

Unnamed: 0,Order Nbr,Entry Date,Entry DateTime,FiscalMonth,BranchCustomerNbr,CustomerName,BusinessUnitLevel2,KDGroup,Sku,Product Descr1,Product Descr2,ProductGroup,ProductGroupMasterDescription,ProductGroupDescription,ProductGroup2ndDescription,Sales,Qty Shipped
24204,4434497,2023-01-10,,2023FM01,44475054,EVERIT GMBH,Business Channel,,CD71519,1830 24G 12P CLASS4 POE-STOCK,.,490,Communications & Networking,Lan Switches Managed Layer 2,Networking,0,0.0
35112,4490586,2023-02-03,,2023FM06,44840876,KOERBER SUPPLY CHAIN SOFTWARE GMBH,DC-POS,,V36E390,TC21/TC26 HC WHITE FIVE SLOT,CHARGE ONLY CRADLE FIVE DEVICES,6026,AIDC/PoS Accessories & Supplies,Docking Station,"Other (incl. AIDC/POS, V7)",0,0.0


In [None]:
# dropping retours (orders with negative Qty shipped) and zero Qty shipped orders
df = df[df["Qty Shipped"] > 0]

# dropping backlog invoices, Specified date to filter the rows
specific_date = pd.to_datetime('2022-11-01')
# Filter the DataFrame to keep only the rows that are before or equal to the specific date
df = df[df["Entry Date"] >= specific_date]

#insert purchase indication column
df["purchase"] = 1

In [None]:
len(df)

2028956

In [None]:
len(df["BranchCustomerNbr"].unique())

13894

In [None]:
len(df["Sku"].unique())

75643

In [None]:
df["Order_BCN"] = df["Order Nbr"].astype(str) + "_" + df["BranchCustomerNbr"].astype(str)

In [None]:
num_users = len(df.BranchCustomerNbr.unique())
num_items = len(df.Sku.unique())
num_orders_sku = len(df.Order_BCN.unique())
combos = num_users * num_items

In [None]:
num_orders_sku

817208

In [None]:
combos

1050983842

In [None]:
sparsity = (1 - (num_orders_sku / combos))
sparsity * 100

99.92224352389235

## User and sku unqiue list

In [None]:
# Create a numeric user_id and artist_id column
df['BranchCustomerNbr'] = df['BranchCustomerNbr'].astype("category")
df['Sku'] = df['Sku'].astype("category")
df['bcn_id'] = df['BranchCustomerNbr'].cat.codes
df['sku_id'] = df['Sku'].cat.codes

In [None]:
df_2 = df[["bcn_id","sku_id", "Entry Date", "FiscalMonth", "Qty Shipped", "purchase"]].copy()

In [None]:
df_2.head(2)

Unnamed: 0,bcn_id,sku_id,Entry Date,FiscalMonth,Qty Shipped,purchase
213,497,49835,2022-11-02,2022FM11,41.0,1
458,1000,52759,2022-11-03,2023FM02,140.0,1


In [None]:
grouped_df = df_2.groupby(['bcn_id', 'sku_id']).agg({
    'Qty Shipped': 'sum',
    'purchase': 'sum'
}).reset_index()

In [None]:
# add column that represent sku count
def sku_count(df_pl):

  df_pl['sku_count'] = df_pl.groupby('bcn_id')['sku_id'].transform('nunique')

  return df_pl

In [None]:
grouped_df = sku_count(grouped_df)
grouped_df.head(4)

Unnamed: 0,bcn_id,sku_id,Qty Shipped,purchase,sku_count
0,0,928,1.0,1,99
1,0,3422,1.0,1,99
2,0,3423,4.0,2,99
3,0,3424,3.0,1,99


### Threshold weight (QTY / purchase)



1.   Sku Count >= 2
2.   Sku Count >= 2 & Purchase > 2



In [None]:
data_1 = grouped_df[grouped_df.sku_count >= 2]
data_1.head(2)

Unnamed: 0,bcn_id,sku_id,Qty Shipped,purchase,sku_count
0,0,928,1.0,1,99
1,0,3422,1.0,1,99


In [None]:
data_2 = grouped_df[(grouped_df.sku_count >= 2) & (grouped_df.purchase > 3)]
data_2.head(2)

Unnamed: 0,bcn_id,sku_id,Qty Shipped,purchase,sku_count
27,0,14912,18.0,7,99
79,0,45256,11.0,6,99


In [None]:
data_bin = grouped_df[(grouped_df.sku_count >= 2)]
data_bin["purchase"] = 1
data_bin.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_bin["purchase"] = 1


Unnamed: 0,bcn_id,sku_id,Qty Shipped,purchase,sku_count
0,0,928,1.0,1,99
1,0,3422,1.0,1,99


In [None]:
# change to binary data for BPR
bpr_data = data_2[["bcn_id","sku_id"]]
bpr_data["purchase"] = 1
bpr_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bpr_data["purchase"] = 1


Unnamed: 0,bcn_id,sku_id,purchase
27,0,14912,1
79,0,45256,1


In [None]:
len(bpr_data)

541851

## LOOCV

with K=1 so 1 item per user gets dropped out

In [None]:
def train_test_split(df_pl, holdout_num):
    """ perform training testing split

    @param df: dataframe
    @param holdhout_num: number of items to be held out per user as testing items

    @return df_train: training data
    @return df_test testing data

    """
    # perform deep copy to avoid modification on the original dataframe
    df_train = df_pl.copy(deep=True)
    df_test = df_pl.copy(deep=True)

    # get test set
    df_test = df_test.groupby(['bcn_id']).head(holdout_num).reset_index()

    # get train set
    df_train = df_train.merge(
        df_test[['bcn_id', 'sku_id']].assign(remove=1),
        how='left'
    ).query('remove != 1').drop('remove', 1).reset_index(drop=True)

    # drop index
    df_test = df_test.drop(columns=["index"])

    # sanity check to make sure we're not duplicating/losing data
    assert len(df_pl) == len(df_train) + len(df_test)

    return df_train, df_test

In [None]:
df_qty_train_1, df_qty_test_1 = train_test_split(data_1[["bcn_id", "sku_id", "Qty Shipped"]], 1)
df_frequency_train_1, df_frequency_test_1 = train_test_split(data_1[["bcn_id", "sku_id", "purchase"]], 1)
df_bin_train, df_bin_test = train_test_split(data_bin[["bcn_id", "sku_id", "purchase"]], 1)

  ).query('remove != 1').drop('remove', 1).reset_index(drop=True)
  ).query('remove != 1').drop('remove', 1).reset_index(drop=True)
  ).query('remove != 1').drop('remove', 1).reset_index(drop=True)


In [None]:
df_qty_train_2, df_qty_test_2 = train_test_split(data_2[["bcn_id", "sku_id", "Qty Shipped"]], 1)
df_frequency_train_2, df_frequency_test_2 = train_test_split(data_2[["bcn_id", "sku_id", "purchase"]], 1)

  ).query('remove != 1').drop('remove', 1).reset_index(drop=True)
  ).query('remove != 1').drop('remove', 1).reset_index(drop=True)


In [None]:
bpr_data_train, bpr_data_test = train_test_split(bpr_data, 1)

  ).query('remove != 1').drop('remove', 1).reset_index(drop=True)


In [None]:
bpr_data_train, bpr_data_test

(        bcn_id  sku_id  purchase
 0            0   45256         1
 1            0   45260         1
 2            3   30625         1
 3            3   30628         1
 4            3   42771         1
 ...        ...     ...       ...
 127028   13720   22131         1
 127029   13727   45754         1
 127030   13843   42597         1
 127031   13843   45721         1
 127032   13843   46704         1
 
 [127033 rows x 3 columns],
       bcn_id  sku_id  purchase
 0          0   14912         1
 1          3   25971         1
 2          9    1955         1
 3         12   35101         1
 4         18   14806         1
 ...      ...     ...       ...
 3621   13718   75345         1
 3622   13720   21791         1
 3623   13727   45750         1
 3624   13843    2402         1
 3625   13881   67057         1
 
 [3626 rows x 3 columns])

### Stratifying for same bcns in train test

In [None]:
def sku_bcn_check(train_df, test_df):

  train_bcn_ids = set(train_df.bcn_id.unique())
  test_bcn_ids = set(test_df.bcn_id.unique())

  train_sku_ids = set(df_qty_train_2.sku_id.unique())
  test_sku_ids = set(df_qty_test_2.sku_id.unique())

  return test_bcn_ids.issubset(train_bcn_ids), test_sku_ids.issubset(train_sku_ids)

In [None]:
print(sku_bcn_check(df_qty_train_1, df_qty_test_1), sku_bcn_check(df_qty_train_2, df_qty_test_2), sku_bcn_check(df_frequency_train_1,df_frequency_test_1),
      sku_bcn_check(df_frequency_train_2,df_frequency_test_2), sku_bcn_check(df_bin_train, df_bin_train), sku_bcn_check(bpr_data_train, bpr_data_test))

(True, False) (False, False) (True, False) (False, False) (True, False) (False, False)


In [None]:
def stratify_bcn(train_df, test_df):
  """ remove users from test set, that are not in train
    @params: train and test dfs

    @return stratified_df_test testing data
  """
  # Get the unique 'bcn_id' values from df_train
  trained_bcn_ids = train_df['bcn_id'].unique()
  # Filter out the rows in df_test that have 'bcn_id' values not present in trained_bcn_ids
  stratified_df_test = test_df[test_df['bcn_id'].isin(trained_bcn_ids)]

  return stratified_df_test

In [None]:
def stratify_sku(train_df, test_df):
  """ remove skus from test set, that are not in train
    @params: train and test dfs

    @return stratified_df_test testing data
  """
  # Get the unique 'bcn_id' values from df_train
  trained_sku_ids = train_df['sku_id'].unique()
  # Filter out the rows in df_test that have 'bcn_id' values not present in trained_bcn_ids
  stratified_df_test_sks = test_df[test_df['sku_id'].isin(trained_sku_ids)]

  return stratified_df_test_sks

In [None]:
stratified_df_qty_test_2 = stratify_bcn(df_qty_train_2, df_qty_test_2)
stratified_df_frequency_test_2 = stratify_bcn(df_frequency_train_2, df_frequency_test_2)
stratified_bpr_data_test = stratify_bcn(bpr_data_train, bpr_data_test)

In [None]:
stratified_bpr_data_test = stratify_sku(bpr_data_train, stratified_bpr_data_test)

In [None]:
print(len(bpr_data_train.sku_id.unique()),len(stratified_bpr_data_test.sku_id.unique()))

18094 836


In [None]:
print(sku_bcn_check(df_qty_train_1, df_qty_test_1), sku_bcn_check(df_qty_train_2, stratified_df_qty_test_2), sku_bcn_check(df_frequency_train_1,df_frequency_test_1),
      sku_bcn_check(df_frequency_train_2,stratified_df_frequency_test_2), sku_bcn_check(df_bin_train, df_bin_test), sku_bcn_check(bpr_data_train, stratified_bpr_data_test))

(True, False) (True, False) (True, False) (True, False) (True, False) (True, False)


In [None]:
bpr_data

Unnamed: 0,bcn_id,sku_id,purchase
27,0,14912,1
79,0,45256,1
80,0,45260,1
164,3,25971,1
167,3,30625,1
...,...,...,...
544223,13843,2402,1
544249,13843,42597,1
544255,13843,45721,1
544256,13843,46704,1


### Negative sampling


In [None]:
def negative_sampling(df_train, bcn_ids, sku_ids, items, n_neg):
    """This function creates n_neg negative labels for every positive label

    @param user_ids: list of user ids
    @param sku_ids: list of sku ids
    @param items: unique list of sku ids
    @param n_neg: number of negative labels to sample

    @return df_neg: negative sample dataframe

    """

    neg = []
    ui_pairs = zip(bcn_ids, sku_ids)
    records = set(ui_pairs)

    # for every positive label case
    for (u, i) in records:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = np.random.choice(items)
            # resample if the movie already exists for that user
            while (u, j) in records:
                j = np.random.choice(items)
            neg.append([u, j, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['bcn_id', 'sku_id', 'purchase'])

    #df_train = df_train[['bcn_id', 'sku_id']].assign(purchase=1)
    df_train = pd.concat([df_train, df_neg], ignore_index=True)

    return df_train

In [None]:
def negative_sampling_2(df_train, bcn_ids, sku_ids, items, n_neg):
    """This function creates n_neg negative labels for every positive label

    @param user_ids: list of user ids
    @param sku_ids: list of sku ids
    @param items: unique list of sku ids
    @param n_neg: number of negative labels to sample

    @return df_neg: negative sample dataframe

    """

    neg = []
    ui_pairs = zip(bcn_ids, sku_ids)
    records = set(ui_pairs)

    # for every positive label case
    for (u, i) in records:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = np.random.choice(items)
            # resample if the movie already exists for that user
            while (u, j) in records:
                j = np.random.choice(items)
            neg.append([u, j, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['bcn_id', 'sku_id', 'Qty Shipped'])

    #df_train = df_train[['bcn_id', 'sku_id']].assign(purchase=1)
    df_train = pd.concat([df_train, df_neg], ignore_index=True)

    return df_train

In [None]:
# create final training
neg_df_qty_train_1 = negative_sampling_2(
    df_train = df_qty_train_1,
    bcn_ids = df_qty_train_1.bcn_id.unique(),
    sku_ids = df_qty_train_1.sku_id.unique(),
    items = df.sku_id.unique(),
    n_neg = 10
)
neg_df_qty_train_2 = negative_sampling_2(
    df_train = df_qty_train_2,
    bcn_ids = df_qty_train_2.bcn_id.unique(),
    sku_ids = df_qty_train_2.sku_id.unique(),
    items = df.sku_id.unique(),
    n_neg = 10
)

In [None]:
# create final training
neg_df_frequency_train_1 = negative_sampling(
    df_train = df_frequency_train_1,
    bcn_ids = df_frequency_train_1.bcn_id.unique(),
    sku_ids = df_frequency_train_1.sku_id.unique(),
    items = df.sku_id.unique(),
    n_neg = 10

)
neg_df_frequency_train_2 = negative_sampling(
    df_train = df_frequency_train_2,
    bcn_ids = df_frequency_train_2.bcn_id.unique(),
    sku_ids = df_frequency_train_2.sku_id.unique(),
    items = df.sku_id.unique(),
    n_neg = 10
)

In [None]:
neg_df_bin_train = negative_sampling(
    df_train = df_bin_train,
    bcn_ids = df_bin_train.bcn_id.unique(),
    sku_ids = df_bin_train.sku_id.unique(),
    items = df.sku_id.unique(),
    n_neg = 10
)

## Create sparse user x item and item x user matrices

In [None]:
def create_qty_user_item_csr_matrix(df_pl_train, df_pl_test):
  """ create a sparse data matrix with scipy sparse matrix
      @return sparse user x item matrix
  """
  sparse_user_item_train = sparse.csr_matrix((df_pl_train['Qty Shipped'], (df_pl_train['bcn_id'], df_pl_train['sku_id'])))
  sparse_user_item_test = sparse.csr_matrix((df_pl_test['Qty Shipped'], (df_pl_test['bcn_id'], df_pl_test['sku_id'])))

  return sparse_user_item_train, sparse_user_item_test

In [None]:
def create_user_item_csr_matrix(df_pl_train, df_pl_test):
  """ create a sparse data matrix with scipy sparse matrix
      @return sparse user x item matrix
  """
  sparse_user_item_train = sparse.csr_matrix((df_pl_train['purchase'].astype(float), (df_pl_train['bcn_id'], df_pl_train['sku_id'])))
  sparse_user_item_test = sparse.csr_matrix((df_pl_test['purchase'].astype(float), (df_pl_test['bcn_id'], df_pl_test['sku_id'])))

  return sparse_user_item_train, sparse_user_item_test

In [None]:
def create_item_user_csr_matrix(df_pl):
  """ create a sparse data matrix with scipy sparse matrix
      @return sparse user x item matrix
  """
  sparse_item_user = sparse.csr_matrix((df_pl['purchase'].astype(float), (df_pl['sku_id'], df_pl['bcn_id'])))

  return sparse_item_user

In [None]:
# only positive sparse matrices
sparse_user_item_qty_train_1, sparse_user_item_qty_test_1 = create_qty_user_item_csr_matrix(df_qty_train_1, df_qty_test_1)
sparse_user_item_qty_train_2, sparse_user_item_qty_test_2 = create_qty_user_item_csr_matrix(df_qty_train_2, stratified_df_qty_test_2)

sparse_user_item_frequency_train_1, sparse_user_item_frequency_test_1 = create_user_item_csr_matrix(df_frequency_train_1, df_frequency_test_1)
sparse_user_item_frequency_train_2, sparse_user_item_frequency_test_2 = create_user_item_csr_matrix(df_frequency_train_2, stratified_df_frequency_test_2)

sparse_user_item_bin_train, sparse_user_item_bin_test = create_user_item_csr_matrix(df_bin_train, df_bin_test)

In [None]:
sparse_user_item_frequency_train_1, sparse_user_item_frequency_test_1

(<13894x75643 sparse matrix of type '<class 'numpy.float64'>'
 	with 530523 stored elements in Compressed Sparse Row format>,
 <13894x75631 sparse matrix of type '<class 'numpy.float64'>'
 	with 11328 stored elements in Compressed Sparse Row format>)

In [None]:
# negative sampled sparse matrices
neg_sparse_user_item_qty_train_1, neg_sparse_user_item_qty_test_1 = create_qty_user_item_csr_matrix(neg_df_qty_train_1, df_qty_test_1)
neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2 = create_qty_user_item_csr_matrix(neg_df_qty_train_2, stratified_df_qty_test_2)

neg_sparse_user_item_frequency_train_1, neg_sparse_user_item_frequency_test_1 = create_user_item_csr_matrix(neg_df_frequency_train_1, df_frequency_test_1)
neg_sparse_user_item_frequency_train_2, neg_sparse_user_item_frequency_test_2 = create_user_item_csr_matrix(neg_df_frequency_train_2, stratified_df_frequency_test_2)

##Create unique sku and user lists

In [None]:
skus = df[["sku_id", "Sku" , "Product Descr1", "Product Descr2", "ProductGroupDescription"]].drop_duplicates()

In [None]:
users = df[["bcn_id", "BranchCustomerNbr"]].drop_duplicates()

# **ALS Model**

ALS hyperparameters are:


*   factors: The number of latent factors to compute
*   regularization: The regularization factor to use
*   alpha: The weight to give to positive examples
*   iterations: The number of ALS iterations to use when fitting data





## ALS Model param tuning

In [None]:
import implicit
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import leave_k_out_split, precision_at_k, mean_average_precision_at_k, ndcg_at_k, AUC_at_k
from sklearn.model_selection import GridSearchCV



In [None]:
from IPython.utils.sysinfo import num_cpus
# Define the hyperparameter search space
factors_list = [100, 150, 200]
iterations_list = [10, 20]
regularization_list = [0.01, 0.1]
alpha_list = [5, 20, 40]

best_precision = 0
best_factors = None
best_iterations = None
best_regularization = None
best_alpha = None

# Perform manual grid search
for factors in factors_list:
    for iterations in iterations_list:
        for regularization in regularization_list:
            for alpha in alpha_list:
                # Create the ALS model with the current hyperparameters
                als_model = AlternatingLeastSquares(factors=factors, iterations=iterations,
                                                    regularization=regularization, alpha=alpha)
                als_model.fit(neg_sparse_user_item_qty_train_2, show_progress=False)

                # Evaluate the model on the test set
                precision = precision_at_k(als_model, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10, show_progress=False)

                # Check if current hyperparameters give better precision
                if precision > best_precision:
                    best_precision = precision
                    best_factors = factors
                    best_iterations = iterations
                    best_regularization = regularization
                    best_alpha = alpha

print(f"Best Factors: {best_factors}")
print(f"Best Iterations: {best_iterations}")
print(f"Best Regularization: {best_regularization}")
print(f"Best Alpha: {best_alpha}")
print(f"Precision@10 on Test Set: {best_precision}")

KeyboardInterrupt: ignored

*Qty based model:*



1.   Best Factors: 100
2.   Best Iterations: 20
3.   Best Regularization: 0.01
4.   Best alpha: 5

Precision@10 on Test Set: 0.21248025276461296

In [None]:
from IPython.utils.sysinfo import num_cpus
# Define the hyperparameter search space
factors_list = [100, 150, 200]
iterations_list = [10, 20]
regularization_list = [0.01, 0.1]
alpha_list = [5, 20, 40]

best_precision = 0
best_factors = None
best_iterations = None
best_regularization = None
best_alpha = None

# Perform manual grid search
for factors in factors_list:
    for iterations in iterations_list:
        for regularization in regularization_list:
            for alpha in alpha_list:
                # Create the ALS model with the current hyperparameters
                als_model = AlternatingLeastSquares(factors=factors, iterations=iterations,
                                                    regularization=regularization, alpha=alpha)
                als_model.fit(neg_sparse_user_item_frequency_train_2, show_progress=False)

                # Evaluate the model on the test set
                precision = precision_at_k(als_model, neg_sparse_user_item_frequency_train_2, neg_sparse_user_item_frequency_test_2, K=10, num_threads=10, show_progress=False)

                # Check if current hyperparameters give better precision
                if precision > best_precision:
                    best_precision = precision
                    best_factors = factors
                    best_iterations = iterations
                    best_regularization = regularization
                    best_alpha = alpha

print(f"Best Factors: {best_factors}")
print(f"Best Iterations: {best_iterations}")
print(f"Best Regularization: {best_regularization}")
print(f"Best Alpha: {best_alpha}")
print(f"Precision@10 on Test Set: {best_precision}")

Best Factors: 100
Best Iterations: 10
Best Regularization: 0.01
Best Alpha: 5
Precision@10 on Test Set: 0.24328593996840442


*Frequency model*:


1.   Best Factors: 100
2.   Best Iterations: 10
3.   Best Regularization: 0.01
4.   Best Alpha: 5

Precision@10 on Test Set: 0.24328593996840442

In [None]:
# def validate(df_train, df_test, factors=200, iterations=20, regularization=0.01, alpha=1, show_progress=True):
#     """ Train an ALS model with <<factors>> (embeddings dimension)
#     for <<iterations>> over matrices and validate with MAP@12
#     """
#     model = implicit.als.AlternatingLeastSquares(factors=factors,
#                                                  iterations=iterations,
#                                                  regularization=regularization,
#                                                  alpha = alpha,
#                                                  random_state=42)
#     model.fit(df_train, show_progress=show_progress)

#     # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
#     # TODO: change MAP@12 to a library that allows repeated items in prediction
#     prec10 = precision_at_k(model, df_train, df_test, K=10, show_progress=show_progress, num_threads=4)
#     print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} - Alpha: {alpha:>1} ==> PREC@10: {prec10:6.5f}")
#     return prec10

In [None]:
# %%time
# best_prec10 = 0
# for factors in [50, 100, 150, 200]:
#     for iterations in [5, 10, 20]:
#         for regularization in [0.01, 0.1]:
#           for alpha in [5, 20, 40]:
#             prec10 = validate(sparse_user_item_qty_train_2, sparse_user_item_qty_test_2, factors, iterations, regularization, show_progress=False)
#             if prec10 > best_prec10:
#                 best_prec10 = prec10
#                 best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization, 'alpha':alpha}
#                 print(f"Best MAP@12 found. Updating: {best_params}")

In [None]:
#test if model works on data
%%time
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=10)
model.fit(neg_sparse_user_item_qty_train_2)

  0%|          | 0/10 [00:00<?, ?it/s]

CPU times: user 6.82 s, sys: 11.1 s, total: 17.9 s
Wall time: 13.9 s


### ALS QTY Based

In [None]:
%%time
model_2 = implicit.als.AlternatingLeastSquares(factors=100, alpha=5, regularization=0.01, iterations=10)
model_2.fit(neg_sparse_user_item_qty_train_2)
prec10 = precision_at_k(model_2, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10)
auc_10 = AUC_at_k(model_2, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10)
print(prec10, auc_10)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2532 [00:00<?, ?it/s]

  0%|          | 0/2532 [00:00<?, ?it/s]

0.1966824644549763 0.5982805370127654
CPU times: user 22.1 s, sys: 21.3 s, total: 43.3 s
Wall time: 29.1 s


In [None]:
%%time
model_2 = implicit.als.AlternatingLeastSquares(factors=150, alpha=5, regularization=0.01, iterations=20)
model_2.fit(neg_sparse_user_item_qty_train_2)
prec10 = precision_at_k(model_2, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10)
auc_10 = AUC_at_k(model_2, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10)
print(prec10, auc_10)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/2532 [00:00<?, ?it/s]

  0%|          | 0/2532 [00:00<?, ?it/s]

0.2109004739336493 0.6053901948382557
CPU times: user 1min 4s, sys: 1min 4s, total: 2min 9s
Wall time: 1min 25s


## ALS Best Model Performance

In [None]:
precision_als = precision_at_k(model_2, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10)
precision_als

  0%|          | 0/2532 [00:00<?, ?it/s]

0.2109004739336493

In [None]:
map_precision_als = mean_average_precision_at_k(model_2, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10)
precision_als

  0%|          | 0/2532 [00:00<?, ?it/s]

0.2109004739336493

In [None]:
ndcg_10 = ndcg_at_k(model_2, neg_sparse_user_item_qty_train_2, neg_sparse_user_item_qty_test_2, K=10, num_threads=10)
ndcg_10

  0%|          | 0/2532 [00:00<?, ?it/s]

0.12875319335171892

### Frequency Based ALS

In [None]:
%%time
model_4 = implicit.als.AlternatingLeastSquares(factors = 100, iterations = 10, regularization = 0.01, alpha = 5)
model_4.fit(neg_sparse_user_item_frequency_train_2)
prec10 = precision_at_k(model_4, neg_sparse_user_item_frequency_train_2, neg_sparse_user_item_frequency_test_2, K=10, num_threads=10)
auc_10 = AUC_at_k(model_4, neg_sparse_user_item_frequency_train_2, neg_sparse_user_item_frequency_test_2, K=10, num_threads=10)
print(prec10, auc_10)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2532 [00:00<?, ?it/s]

  0%|          | 0/2532 [00:00<?, ?it/s]

0.23854660347551343 0.6192141582557356
CPU times: user 22.1 s, sys: 20.3 s, total: 42.4 s
Wall time: 27.7 s


## ALS Best Model Performance

In [None]:
precision_als = precision_at_k(model_4, neg_sparse_user_item_frequency_train_2, neg_sparse_user_item_frequency_test_2, K=10, num_threads=10)
precision_als

  0%|          | 0/2532 [00:00<?, ?it/s]

0.23854660347551343

In [None]:
map_precision_als = mean_average_precision_at_k(model_4, neg_sparse_user_item_frequency_train_2, neg_sparse_user_item_frequency_test_2, K=10, num_threads=10)
precision_als

  0%|          | 0/2532 [00:00<?, ?it/s]

0.23854660347551343

In [None]:
nendcg_10 = ndcg_at_k(model_4, neg_sparse_user_item_frequency_train_2, neg_sparse_user_item_frequency_test_2, K=10, num_threads=10)
ndcg_10

  0%|          | 0/2532 [00:00<?, ?it/s]

0.12875319335171892

## ALS Best Model Recommendations

In [None]:
user_skucounts = data_1[["bcn_id", "sku_count"]].drop_duplicates()
users_less_20 = user_skucounts[user_skucounts.sku_count < 20]
users_less_20 = users_less_20["bcn_id"].values
users_more_20 = user_skucounts[user_skucounts.sku_count >= 20]
users_more_20 = users_more_20["bcn_id"].values

In [None]:
 # Assuming you want recommendations for user with ID 10
import random
user_id = random.choice(users_less_20)
user_id_2 = random.choice(users_more_20)
print(user_id)
print(user_id_2)

2611
5709


In [None]:
# Now you can call the recommend function
recommended = model_3.recommend(user_id, user_items[user_id], N=100, filter_already_liked_items=True)
rec_tab = pd.DataFrame(recommended)
rec_tab = rec_tab.T.rename(columns={0:"sku_id", 1:"conf"})
rec_tab = rec_tab.merge(skus, on="sku_id", how="left")
rec_tab.sort_values(by="conf", ascending=False)

NameError: ignored

In [None]:
user_purchases = pd.DataFrame(df[df.bcn_id == user_id][["Product Descr1","Sku", "ProductGroupDescription", "purchase"]].groupby(
                              ["Sku","Product Descr1", "ProductGroupDescription"])["purchase"].sum().sort_values(ascending=False))
user_purchases[0:20]