# Import Libraries

In [1]:
# Pandas & Numpy 
import pandas as pd 
import numpy as np 

# Suprise & Recommenders 
import surprise 
from recommenders.models.surprise.surprise_utils import compute_ranking_predictions, predict
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k

# Reading Data

In [8]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/rec_sys_ratings_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/rec_sys_ratings_test.csv", index_col=0)

In [3]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (393208, 3)
Shape of Test: 	 (98401, 3)


# Preparation of Train Dataset for Surprise Models



In [4]:
# Reader 
reader = surprise.Reader(rating_scale=(-2,2))

# Build Train Set from Custom Dataset
train_set = surprise.Dataset.load_from_df(train[['CustomerID', 'StockCode', 'purchased']], reader=reader).build_full_trainset()

# Baseline Model

In [5]:
# Delete Model first 
try:
    del(basemodel)
except:
    pass

# Create Object for Model
basemodel = surprise.NMF(random_state=0, verbose=False, n_factors=10, n_epochs=50, reg_pu=0.1, reg_qi=0.1)

# Fit Model
basemodel.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fcf1943be80>

# Prediction: Test Set only

In [9]:
# Predict Test Only
test_pred = predict(basemodel, test, usercol='CustomerID', itemcol='StockCode')

In [11]:
# Check Range of predicions
test_pred.prediction.describe()

count    98401.000000
mean         0.000252
std          0.423192
min         -2.000000
25%         -0.028340
50%          0.000667
75%          0.037375
max          2.000000
Name: prediction, dtype: float64

In [12]:
# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']>0),1,0)

# Convert Test
test['purchased'] = np.where((test['purchased']>0),1,0)

In [13]:
# Check Distribution
test_pred['prediction'].value_counts()

1    50421
0    47980
Name: prediction, dtype: int64

In [14]:
# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

In [15]:
# First Reorder columns for test 
test = test[['CustomerID', 'StockCode', 'purchased']]
# Head of Test
test.head()

Unnamed: 0,CustomerID,StockCode,purchased
0,12347,21232,0
1,12347,21754,0
2,12347,21790,0
3,12347,22086,0
4,12347,22149,0


In [16]:
# Head of Test_pred
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,12347,21232,0
1,12347,21754,1
2,12347,21790,0
3,12347,22086,0
4,12347,22149,1


In [17]:
# Import Accuracy 
from sklearn.metrics import accuracy_score

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.4549953760632514

# Prediction: Top N

In [18]:
# Predict All Pairs of Users & Items that are NOT in the Trainset 
predictions = compute_ranking_predictions(basemodel, train, usercol='CustomerID', itemcol='StockCode', remove_seen=True)

In [19]:
# Filter Test for purchased items only
test = test[test['purchased']==1]

In [20]:
# Write function to filter out top N 
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    
    # Group the dataframe by 'CustomerID', and for each group, sort by 'prediction' in descending order, then take the top N rows
    top_n_per_customer = predictions.groupby('CustomerID', group_keys=False).apply(lambda group: group.sort_values('prediction', ascending=False).head(n))
    
    return top_n_per_customer

# Filter Top 10 
top_10 = filter_top_n(predictions, 10)

# Filter Top 20 
top_20 = filter_top_n(predictions, 20)

In [21]:
# Evaluate Precision at 10 
eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('precision at 10 \t:', eval_precision_10)

# Evaluate Recall at 10 
eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('recall at 10 \t:', eval_recall_10)

precision at 10 	: 0.003136593591905564
recall at 10 	: 0.002616319016825246


In [23]:
# Evaluate Precision at 20 
eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('precision at 20 \t:', eval_precision_20)

# Evaluate Recall at 20 
eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('recall at 20 \t:', eval_recall_20)

precision at 20 	: 0.0035413153456998313
recall at 20 	: 0.006203886120830733


# Hyperparameter Tuning

In [26]:
# Define Set of Hyperparameters 
n_factors = [10, 20, 100, 200, 500, 1000]
biases = [False]
reg_all = [0.01]
lr_det = [0.001]

# Initiate Lists 
factor_nr = []
biased = []
regulation = []
learning = []
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = []
 

for factors in n_factors:
    for bias in biases:
        for reg in reg_all:
            for lr in lr_det: 
                try:
                    del(model)
                except:
                    pass
                
                # Define Model 
                model = surprise.NMF(random_state=0,
                                    verbose=False,
                                    biased=bias,
                                    n_factors=factors,
                                    n_epochs=50,
                                    reg_bi=reg,
                                    reg_bu=reg,
                                    lr_bi=lr,
                                    lr_bu=lr)
                
                # Fit model
                model.fit(train_set)
                
                # Predict All Pairs of Users & Items that are NOT in the Trainset 
                predictions = compute_ranking_predictions(basemodel, train, usercol='CustomerID', itemcol='StockCode', remove_seen=True)
                
                # Filter Top 10 & 20 
                top_10 = filter_top_n(predictions, 10)
                top_20 = filter_top_n(predictions, 20)

                # Evaluate Recall at 10 
                eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                            col_rating="purchased", col_prediction="prediction", 
                                            relevancy_method="top_k", k=10)
                # Evaluate Recall at 20 
                eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)

                # Evaluate Precision at 10 
                eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
                # Evaluate Precision at 20 
                eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)

                print("recall at 10 for", factors, "factors: \t", eval_recall_10)
                factor_nr.append(factors)
                biased.append(bias)
                regulation.append(reg)
                learning.append(lr)
                recall_10.append(eval_recall_10)
                recall_20.append(eval_recall_20)
                precision_10.append(eval_precision_10)
                precision_20.append(eval_precision_20)

recall at 10 for 10 factors: 	 0.002616319016825246


KeyboardInterrupt: 

In [25]:
dict_param = {
    "factor": factor_nr,
    "recall_10": recall_10,
    "recall_20": recall_20,
    "precision_10" : precision_10,
    "precision_20" : precision_20
}

df_param1 = pd.DataFrame(dict_param)
df_param1.sort_values(by='recall_10', ascending=False)

Unnamed: 0,factor,recall_10,recall_20,precision_10,precision_20
0,10,0.002616,0.006204,0.003137,0.003541
12,20,0.002616,0.006204,0.003137,0.003541
21,100,0.002616,0.006204,0.003137,0.003541
20,100,0.002616,0.006204,0.003137,0.003541
19,100,0.002616,0.006204,0.003137,0.003541
18,100,0.002616,0.006204,0.003137,0.003541
17,100,0.002616,0.006204,0.003137,0.003541
16,100,0.002616,0.006204,0.003137,0.003541
15,20,0.002616,0.006204,0.003137,0.003541
14,20,0.002616,0.006204,0.003137,0.003541


In [28]:
# Create Object for Model
best_model = surprise.NMF(random_state=0,
                        verbose=False,
                        biased=False,
                        n_factors=10,
                        n_epochs=20)

# Fit the base_model
best_model.fit(train_set)

# Make Predictions
test_pred = predict(best_model, test, usercol='CustomerID', itemcol='StockCode')

# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']>0), 1,0)

# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

0.47087434858135496