# Loading Libraries

In [1]:
import numpy as np
import pandas as pd

import surprise
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from sklearn.metrics import accuracy_score


# Reading Data

In [2]:
# Load the pre-split interaction tables; the first CSV column holds the saved row index
train, test = (
    pd.read_csv("../../00_Data/rec_sys_binary_train_mixed.csv", index_col=0),
    pd.read_csv("../../00_Data/rec_sys_binary_test_mixed.csv", index_col=0),
)

In [3]:
# Report (rows, columns) of each split
for split_label, split_frame in (("Shape of Train:", train), ("Shape of Test: \t", test)):
    print(split_label, split_frame.shape)

Shape of Train: (193076, 3)
Shape of Test: 	 (48269, 3)


In [4]:
# Peek at the first five training interactions
train.head(n=5)

Unnamed: 0,StockCode,CustomerID,purchased
147307,47518F,16701,1
26104,21498,12758,1
46697,22030,17864,1
111830,22960,13804,1
94927,22725,12560,1


# Preparation of Train Dataset for Surprise Models


In [6]:
# The target is a binary purchase flag, hence a 0..1 rating scale
reader = surprise.Reader(rating_scale=(0, 1))

# Hand Surprise the (CustomerID, StockCode, purchased) triples and turn the
# whole frame into a single trainset (no internal hold-out split)
train_set = (
    surprise.Dataset
    .load_from_df(train.loc[:, ['CustomerID', 'StockCode', 'purchased']], reader=reader)
    .build_full_trainset()
)

# Baseline Model

In [7]:
# Baseline model: SVD with 4 latent factors, 20 epochs and a fixed seed.
# Rebinding `base_model` already releases any model left over from a previous
# run of this cell, so no `del` guarded by a bare `except:` (which would hide
# every kind of error) is needed.
base_model = surprise.SVD(random_state=0, n_factors=4, n_epochs=20, verbose=True)

# Fit on the full training set; the fitted model is the cell's displayed value
base_model.fit(train_set)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fda212ea700>

# Prediction: Test Set Only

In [8]:
# Score every (CustomerID, StockCode) row of the test frame with the baseline model
test_pred = predict(
    base_model,
    test,
    usercol='CustomerID',
    itemcol='StockCode',
)

# Preview the raw (continuous) predictions
test_pred.head(n=5)

Unnamed: 0,CustomerID,StockCode,prediction
0,17865,23072,0.51488
1,17960,22139,0.809761
2,15358,10133,0.910735
3,17787,22188,0.866835
4,17076,84843,0.233209


In [9]:
# Summary statistics of the raw prediction scores
test_pred['prediction'].describe()

count    48269.000000
mean         0.710230
std          0.229926
min          0.000000
25%          0.558595
50%          0.745421
75%          0.901520
max          1.000000
Name: prediction, dtype: float64

In [10]:
# Binarise the scores: strictly below 0.5 becomes 0, everything else becomes 1
test_pred['prediction'] = np.where(test_pred['prediction'].lt(0.5), 0, 1)

In [11]:
# How many rows were assigned to each predicted class
test_pred.loc[:, 'prediction'].value_counts()

1    39062
0     9207
Name: prediction, dtype: int64

In [12]:
# Bring labels and predictions into the same row order: sort both frames by
# (CustomerID, StockCode) and replace the old index with a fresh 0..n-1 one,
# so that they can be compared position by position.
test = test.sort_values(by=['CustomerID', 'StockCode']).reset_index(drop=True)
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode']).reset_index(drop=True)

In [13]:
# Put the test columns in the same order as the prediction frame (user, item, label)
test = test.loc[:, ['CustomerID', 'StockCode', 'purchased']]
# Preview the re-ordered test frame
test.head(n=5)

Unnamed: 0,CustomerID,StockCode,purchased
0,12347,21035,1
1,12347,21041,1
2,12347,21064,1
3,12347,21307,0
4,12347,21719,0


In [14]:
# Preview the binarised predictions, now in the same row order as `test`
test_pred.head(n=5)

Unnamed: 0,CustomerID,StockCode,prediction
0,12347,21035,1
1,12347,21041,1
2,12347,21064,0
3,12347,21307,0
4,12347,21719,1


In [15]:
# Accuracy 
# `accuracy_score` is imported together with the other dependencies in the first cell.
# Both frames were sorted by (CustomerID, StockCode) and re-indexed above, so
# labels and predictions are compared row by row.
accuracy_score(test.purchased, test_pred.prediction)

0.7817232592347055

In [16]:
# Accuracy on positive Test only
test_positive = test[test['purchased']==1]

# Attach the binarised prediction to every purchased row; `_merge` records
# whether a matching prediction row was found.
test_positive_pred = test_positive.merge(test_pred, on=['CustomerID', 'StockCode'], how='left', indicator=True)

# Fail with a clear message instead of scoring rows that have no prediction
assert (test_positive_pred['_merge'] == 'both').all(), "some purchased test rows have no prediction"

# Every true label here is 1, so this accuracy is the share of purchased
# rows that the model also predicts as 1.
accuracy_score(test_positive_pred.purchased, test_positive_pred.prediction)


0.9129415170816445

# Prediction: Top N 

In [17]:
# Score user/item combinations for ranking.
# NOTE(review): `test` is the frame passed in and `remove_seen` is left at its
# default, so this presumably scores every CustomerID x StockCode combination
# found in `test` without excluding pairs from the train set - confirm against
# the recommenders documentation.
predictions = compute_ranking_predictions(
    base_model,
    test,
    usercol='CustomerID',
    itemcol='StockCode',
)

In [18]:
# Preview the first five ranking predictions
predictions.head(n=5)

Unnamed: 0,CustomerID,StockCode,prediction
0,12347,21035,0.846218
1,12347,21041,0.756729
2,12347,21064,0.475415
3,12347,21307,0.43088
4,12347,21719,0.556178


In [19]:
# Keep only the purchased rows as ground truth for the ranking metrics.
# NOTE: this overwrites `test`; every later cell that uses `test` sees the
# purchased rows only.
test = test.loc[test['purchased'].eq(1)]

In [21]:
# Now filter out top N 
# First write a function
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the `n` highest-scored rows of every customer.

    Args:
        predictions: frame with at least the columns 'CustomerID' and
            'prediction'; any further columns are carried along unchanged.
        n: number of rows to keep per customer.

    Returns:
        A frame with the same columns and original index labels as
        `predictions`, ordered by 'CustomerID' ascending and, within each
        customer, by 'prediction' descending. Rows with equal scores keep
        their original relative order.
    """
    # One multi-key sort instead of a Python-level sort per group.
    # `groupby(...).apply(...)` on a frame that still contains the grouping
    # column is deprecated in recent pandas and would lose 'CustomerID'.
    ranked = predictions.sort_values(
        by=['CustomerID', 'prediction'], ascending=[True, False]
    )

    # `head` keeps the first n rows of each group in the order of `ranked`
    return ranked.groupby('CustomerID', sort=False).head(n)

# Keep the 10 and the 20 best-scored items per customer
top_10, top_20 = (filter_top_n(predictions, cutoff) for cutoff in (10, 20))


In [22]:
# Column mapping and cut-off shared by both metrics at k = 10
ranking_kwargs_10 = dict(
    col_user="CustomerID",
    col_item="StockCode",
    col_rating="purchased",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=10,
)

# Precision at 10: ground truth is `test`, recommendations are `top_10`
eval_precision_10 = precision_at_k(test, top_10, **ranking_kwargs_10)
print('precision at 10 \t:', eval_precision_10)

# Recall at 10 on the same inputs
eval_recall_10 = recall_at_k(test, top_10, **ranking_kwargs_10)
print('recall at 10 \t:', eval_recall_10)

precision at 10 	: 0.029352566252935263
recall at 10 	: 0.033121535440447616


In [23]:
# Column mapping and cut-off shared by both metrics at k = 20
ranking_kwargs_20 = dict(
    col_user="CustomerID",
    col_item="StockCode",
    col_rating="purchased",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=20,
)

# Precision at 20: ground truth is `test`, recommendations are `top_20`
eval_precision_20 = precision_at_k(test, top_20, **ranking_kwargs_20)
print('precision at 20 \t:', eval_precision_20)

# Recall at 20 on the same inputs
eval_recall_20 = recall_at_k(test, top_20, **ranking_kwargs_20)
print('recall at 20 \t:', eval_recall_20)

precision at 20 	: 0.02604830593760483
recall at 20 	: 0.05728047680015305


# Hyper-Parameter Tuning

In [25]:
# Define Set of Hyperparameters: 
n_factors = [10, 100, 1000]
reg_all = [0.01, 0.02, 0.03]
lr_det = [0.001, 0.005, 0.01]

# Keyword arguments shared by every precision/recall call in this grid search
metric_kwargs = dict(col_user="CustomerID", col_item="StockCode",
                     col_rating="purchased", col_prediction="prediction",
                     relevancy_method="top_k")

# Initiate Lists: one entry is appended per evaluated combination, so the
# lists stay aligned and can be turned into a results table afterwards
factor_nr = []
regulation_all = []
lr_all = []
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = []

# Full grid: every factor count x regularisation x learning rate
for factors in n_factors:
    for regulation in reg_all:
        for learning in lr_det:
            # A fresh model per combination; rebinding `model` releases the
            # previous one, so no `del` inside a bare `except:` is required.
            model = surprise.SVD(random_state=0, n_factors=factors, n_epochs=20, lr_all=learning, reg_all=regulation)
            # Fit model
            model.fit(train_set)

            # Score user/item combinations taken from `test`.
            # NOTE(review): `remove_seen` is not set, so pairs from the train
            # set are presumably not excluded - confirm.
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

            # Filter Top 10 & 20 
            top_10 = filter_top_n(predictions, 10)
            top_20 = filter_top_n(predictions, 20)

            # Recall at 10 and 20
            eval_recall_10 = recall_at_k(test, top_10, k=10, **metric_kwargs)
            eval_recall_20 = recall_at_k(test, top_20, k=20, **metric_kwargs)

            # Precision at 10 and 20
            eval_precision_10 = precision_at_k(test, top_10, k=10, **metric_kwargs)
            eval_precision_20 = precision_at_k(test, top_20, k=20, **metric_kwargs)

            print("recall at for", factors, "factors: \t", eval_recall_10)

            # Record the combination together with its four metrics
            factor_nr.append(factors)
            regulation_all.append(regulation)
            lr_all.append(learning)
            recall_10.append(eval_recall_10)
            recall_20.append(eval_recall_20)
            precision_10.append(eval_precision_10)
            precision_20.append(eval_precision_20)

recall at for 10 factors: 	 0.03186748542614955
recall at for 10 factors: 	 0.03174882961016662
recall at for 10 factors: 	 0.031248892707325657
recall at for 10 factors: 	 0.032358900563196166
recall at for 10 factors: 	 0.03309569095600246
recall at for 10 factors: 	 0.032294506703396744
recall at for 10 factors: 	 0.03245631225804461
recall at for 10 factors: 	 0.033214458672739915
recall at for 10 factors: 	 0.03306789574394007
recall at for 100 factors: 	 0.016480618240815294
recall at for 100 factors: 	 0.018523786702088926
recall at for 100 factors: 	 0.018766178779844083
recall at for 100 factors: 	 0.01612218837035838
recall at for 100 factors: 	 0.020624527156083126
recall at for 100 factors: 	 0.023449332752796467
recall at for 100 factors: 	 0.016051105126865698
recall at for 100 factors: 	 0.02429361917714946
recall at for 100 factors: 	 0.030377021627008477
recall at for 1000 factors: 	 0.007078983268806366
recall at for 1000 factors: 	 0.007249396053599859
recall at for 

In [26]:
# Collect the per-combination lists of the first grid search column by column
dict_param1 = {
    "factor": factor_nr,
    "regulation_all": regulation_all,
    "learning_rate_all": lr_all,
    "recall_10": recall_10,
    "recall_20": recall_20,
    "precision_10": precision_10,
    "precision_20": precision_20,
}

# One row per evaluated combination
df_param1 = pd.DataFrame.from_dict(dict_param1)

# Best recall@10 first
df_param1.sort_values('recall_10', ascending=False)

Unnamed: 0,factor,regulation_all,learning_rate_all,recall_10,recall_20,precision_10,precision_20
7,10,0.03,0.005,0.033214,0.057316,0.028313,0.025461
4,10,0.02,0.005,0.033096,0.056792,0.028313,0.025109
8,10,0.03,0.01,0.033068,0.056258,0.027172,0.024639
6,10,0.03,0.001,0.032456,0.055622,0.03113,0.026652
3,10,0.02,0.001,0.032359,0.055045,0.030527,0.026182
5,10,0.02,0.01,0.032295,0.055402,0.026568,0.024522
0,10,0.01,0.001,0.031867,0.05442,0.029956,0.025964
1,10,0.01,0.005,0.031749,0.056421,0.027877,0.024908
2,10,0.01,0.01,0.031249,0.052811,0.025696,0.0237
17,100,0.03,0.01,0.030377,0.051135,0.026401,0.023868


# Tuning 2 

# Tuning 3

In [28]:
# Define Set of Hyperparameters: 
# only the number of factors varies here; regularisation and learning rate are fixed
n_factors = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
reg_all = [0.03]
lr_det = [0.005]

# Keyword arguments shared by every precision/recall call in this grid search
metric_kwargs = dict(col_user="CustomerID", col_item="StockCode",
                     col_rating="purchased", col_prediction="prediction",
                     relevancy_method="top_k")

# Initiate Lists: one entry is appended per evaluated combination, so the
# lists stay aligned and can be turned into a results table afterwards
factor_nr = []
regulation_all = []
lr_all = []
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = []

for factors in n_factors:
    for regulation in reg_all:
        for learning in lr_det:
            # A fresh model per combination; rebinding `model` releases the
            # previous one, so no `del` inside a bare `except:` is required.
            model = surprise.SVD(random_state=0, n_factors=factors, n_epochs=20, lr_all=learning, reg_all=regulation)
            # Fit model
            model.fit(train_set)

            # Score user/item combinations taken from `test`.
            # NOTE(review): `remove_seen` is not set, so pairs from the train
            # set are presumably not excluded - confirm.
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

            # Filter Top 10 & 20 
            top_10 = filter_top_n(predictions, 10)
            top_20 = filter_top_n(predictions, 20)

            # Recall at 10 and 20
            eval_recall_10 = recall_at_k(test, top_10, k=10, **metric_kwargs)
            eval_recall_20 = recall_at_k(test, top_20, k=20, **metric_kwargs)

            # Precision at 10 and 20
            eval_precision_10 = precision_at_k(test, top_10, k=10, **metric_kwargs)
            eval_precision_20 = precision_at_k(test, top_20, k=20, **metric_kwargs)

            print("recall at for", factors, "factors: \t", eval_recall_10)

            # Record the combination together with its four metrics
            factor_nr.append(factors)
            regulation_all.append(regulation)
            lr_all.append(learning)
            recall_10.append(eval_recall_10)
            recall_20.append(eval_recall_20)
            precision_10.append(eval_precision_10)
            precision_20.append(eval_precision_20)

recall at for 2 factors: 	 0.034080804336768356
recall at for 3 factors: 	 0.034025537423100743
recall at for 4 factors: 	 0.033185292833132506
recall at for 5 factors: 	 0.03460621243382238
recall at for 6 factors: 	 0.03240642748564997
recall at for 7 factors: 	 0.03264990808817199
recall at for 8 factors: 	 0.034257817812939365
recall at for 9 factors: 	 0.03275858214956856
recall at for 10 factors: 	 0.033214458672739915
recall at for 11 factors: 	 0.032733862766040185
recall at for 12 factors: 	 0.033448360278214954
recall at for 13 factors: 	 0.0331043734004974
recall at for 14 factors: 	 0.03391893494860463
recall at for 15 factors: 	 0.033423999960995765


In [29]:
# Collect the per-combination lists of the third grid search column by column
dict_param3 = {
    "factor": factor_nr,
    "regulation_all": regulation_all,
    "learning_rate_all": lr_all,
    "recall_10": recall_10,
    "recall_20": recall_20,
    "precision_10": precision_10,
    "precision_20": precision_20,
}

# One row per evaluated combination
df_param3 = pd.DataFrame.from_dict(dict_param3)

# Best recall@10 first
df_param3.sort_values('recall_10', ascending=False)

Unnamed: 0,factor,regulation_all,learning_rate_all,recall_10,recall_20,precision_10,precision_20
3,5,0.03,0.005,0.034606,0.058932,0.029722,0.0263
6,8,0.03,0.005,0.034258,0.057188,0.028246,0.025109
0,2,0.03,0.005,0.034081,0.057463,0.02848,0.025679
1,3,0.03,0.005,0.034026,0.057005,0.028648,0.025914
12,14,0.03,0.005,0.033919,0.055842,0.028883,0.025461
10,12,0.03,0.005,0.033448,0.057465,0.028648,0.025512
13,15,0.03,0.005,0.033424,0.056567,0.028514,0.025159
8,10,0.03,0.005,0.033214,0.057316,0.028313,0.025461
2,4,0.03,0.005,0.033185,0.057818,0.028514,0.025864
11,13,0.03,0.005,0.033104,0.056554,0.028715,0.026065


# Best Model: Accuracy on Test (purchased rows only)

In [30]:
# Create Object for Model
# NOTE(review): the Tuning 3 table ranks n_factors=5 highest on recall_10;
# n_factors=2 is kept here unchanged - confirm which setting is intended.
best_model = surprise.SVD(random_state=0, n_factors=2, n_epochs=20, lr_all=0.005, reg_all=0.03)

# Fit the best_model
best_model.fit(train_set)

# Make Predictions
# NOTE: `test` was reduced to purchased == 1 rows in an earlier cell, so the
# score below is the accuracy on purchased rows only.
test_pred = predict(best_model, test, usercol='CustomerID', itemcol='StockCode')

# Convert Predictions with the same rule as for the baseline model
# (strictly below 0.5 -> 0, otherwise 1) so both accuracies are comparable
test_pred['prediction'] = np.where((test_pred['prediction']<0.5), 0,1)

# Sort both frames by (CustomerID, StockCode) so rows line up for the accuracy score
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indices for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.9151129125651418