# Import Libraries

In [1]:
# Pandas & Numpy 
import pandas as pd 
import numpy as np 

# Suprise & Recommenders 
import surprise 
from recommenders.models.surprise.surprise_utils import compute_ranking_predictions, predict
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k

# Reading Data

In [2]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_test.csv", index_col=0)

In [3]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (311229, 9)
Shape of Test: 	 (77842, 9)


# Data Preparation: Value of Purchase

In [81]:
# Change values of purchased to 5
train['purchased'] = 1
test['purchased'] = 1

In [82]:
# Sanity Check with Describe 
train.purchased.describe()

count   311229.00
mean         1.00
std          0.00
min          1.00
25%          1.00
50%          1.00
75%          1.00
max          1.00
Name: purchased, dtype: float64

# Preparation of Train Dataset for Surprise Models



In [93]:
# Reader 
reader = surprise.Reader(rating_scale=(0,1))

# Build Train Set from Custom Dataset
train_set = surprise.Dataset.load_from_df(train[['CustomerID', 'StockCode', 'purchased']], reader=reader).build_full_trainset()

# Baseline Model

In [94]:
# Delete Model first 
try:
    del(basemodel)
except:
    pass

# Create Object for Model
basemodel = surprise.NMF(random_state=0, verbose=False, n_factors=10, n_epochs=50, reg_pu=0.1, reg_qi=0.1)

# Fit Model
basemodel.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fdb58132d60>

In [95]:
# Predict Test Only
test_pred = predict(basemodel, test, usercol='CustomerID', itemcol='StockCode')

In [96]:
# Check Range of predicions
test_pred.prediction.describe()

count   77842.00
mean        0.90
std         0.00
min         0.88
25%         0.90
50%         0.90
75%         0.90
max         0.91
Name: prediction, dtype: float64

In [103]:
# Predict All Pairs of Users & Items that are NOT in the Trainset 
predictions = compute_ranking_predictions(basemodel, train, usercol='CustomerID', itemcol='StockCode', remove_seen=True)

In [88]:
# Write function to filter out top N 
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    
    # Group the dataframe by 'CustomerID', and for each group, sort by 'prediction' in descending order, then take the top N rows
    top_n_per_customer = predictions.groupby('CustomerID', group_keys=False).apply(lambda group: group.sort_values('prediction', ascending=False).head(n))
    
    return top_n_per_customer

In [104]:
# Filter Top 10 
top_10 = filter_top_n(predictions, 10)

In [105]:
# Evaluate Recall at 10 
eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('recall at 10 \t:', eval_recall_10)

recall at 10 	: 0.003002190400353022


# Hyperparameter Tuning

In [127]:
# Define Set of Hyperparameters 
n_factors = [20, 2000]
reg_bu = [0.01]
reg_bi = [0.01]
lr_bu = [0.001]
lr_bi = [0.001]

# Initiate Lists 
factor_nr = []
regulation_user = []
regulation_item = []
lr_user = []
lr_item = []
recall_10 = []

for factors in n_factors:
    for reg_user in reg_bu:
        for reg_item in reg_bi:
            for learning_user in lr_bu:
                for learning_item in lr_bi:
                    try:
                        del(model)
                    except:
                        pass
                    
                    # Define Model 
                    model = surprise.NMF(random_state=0,
                                        verbose=False,
                                        biased=False,
                                        n_factors=factors,
                                        n_epochs=20,
                                        reg_bu=reg_user,
                                        reg_bi=reg_item,
                                        lr_bu=learning_user,
                                        lr_bi=learning_item)
                    
                    # Fit model
                    model.fit(train_set)
                    
                    # Predict All Pairs of Users & Items that are NOT in the Trainset 
                    predictions = compute_ranking_predictions(basemodel, train, usercol='CustomerID', itemcol='StockCode', remove_seen=True)
                    
                    # Filter Top 10 
                    top_10 = filter_top_n(predictions, 10)
                    
                    # Evaluate Recall at 10 
                    eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                                col_rating="purchased", col_prediction="prediction", 
                                                relevancy_method="top_k", k=10)

                    print("recall at for", factors, "factors: \t", eval_recall_10)
                    factor_nr.append(factors)
                    regulation_user.append(reg_user)
                    regulation_item.append(reg_item)
                    lr_user.append(learning_user)
                    lr_item.append(learning_item)
                    recall_10.append(eval_recall_10)


recall at for 20 factors: 	 0.003002190400353022
recall at for 2000 factors: 	 0.003002190400353022


In [123]:
dict_param = {
    "factor": factor_nr,
    "regulation_user": regulation_user,
    "regulation_item": regulation_item,
    "learning_rate_user": lr_user,
    "learning_rate_item": lr_item,
    "recall_10": recall_10
}

In [124]:
df_param = pd.DataFrame(dict_param)
df_param.head()

Unnamed: 0,factor,recall_10
0,20,0.0
1,200,0.0


In [125]:
df_param.describe()

Unnamed: 0,factor,recall_10
count,2.0,2.0
mean,110.0,0.0
std,127.28,0.0
min,20.0,0.0
25%,65.0,0.0
50%,110.0,0.0
75%,155.0,0.0
max,200.0,0.0
