# Loading Libraries

In [2]:
import pandas as pd
import numpy as np

import surprise
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k


# Reading Data

In [3]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_test.csv", index_col=0)

In [4]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (206861, 3)
Shape of Test: 	 (51879, 3)


In [5]:
# Check Head of Train
train.head()

Unnamed: 0,StockCode,CustomerID,purchased
248155,3141,1,1
215779,2307,1,1
50745,823,1,1
259024,3352,1,1
106243,1358,1,1


# Preparation of Train Dataset for Surprise Models


In [6]:
# Reader 
reader = surprise.Reader(rating_scale=(0,1))

# Build Train Set from Custom Dataset
train_set = surprise.Dataset.load_from_df(train[['CustomerID', 'StockCode', 'purchased']], reader=reader).build_full_trainset()

# Baseline Model

In [8]:
# base_model
try:
    del(base_model)
except:
    pass

# Create Object for base_model 
base_model = surprise.SVDpp(random_state=0, n_factors=200, n_epochs=20, verbose=True)

# Fit the base_model
base_model.fit(train_set)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f8992688820>

# Prediction: Test Set Only

In [9]:
# Make Predictions
test_pred = predict(base_model, test, usercol='CustomerID', itemcol='StockCode')

# Show first five rows
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1,1249,0.931707
1,1,396,0.888109
2,1,2601,1.0
3,1,912,1.0
4,1,1032,0.949045


In [10]:
# Check Statistics of Prediction
test_pred.prediction.describe()

count    51892.000000
mean         0.974526
std          0.042014
min          0.598758
25%          0.962804
50%          0.999690
75%          1.000000
max          1.000000
Name: prediction, dtype: float64

In [11]:
# Convert Predictions
test_pred['prediction'] = np.where((test_pred['prediction']>0.5), 1,0)

In [12]:
# Check Distribution
test_pred['prediction'].value_counts()

1    51892
Name: prediction, dtype: int64

In [29]:
# Sort Index of both Datasets to use Accuracy Score 
test = test.sort_values(by=['CustomerID', 'StockCode'])
test_pred = test_pred.sort_values(by=['CustomerID', 'StockCode'])

# Reset indeces for both DataFrames
test = test.reset_index(drop=True)
test_pred = test_pred.reset_index(drop=True)

In [33]:
# First Reorder columns for test 
test = test[['CustomerID', 'StockCode', 'purchased']]
# Head of Test
test.head()

Unnamed: 0,CustomerID,StockCode,purchased
0,1,87,1
1,1,208,1
2,1,275,1
3,1,333,1
4,1,396,1


In [34]:
# Head of Test_pred
test_pred.head()

Unnamed: 0,CustomerID,StockCode,prediction
0,1,87,1
1,1,208,1
2,1,275,1
3,1,333,1
4,1,396,1


In [36]:
# Import Accuracy 
from sklearn.metrics import accuracy_score

# Accuracy 
accuracy_score(test.purchased, test_pred.prediction)

1.0

# Prediction: Top N 

In [38]:
# Predict all pairs that are not in the train set
predictions = compute_ranking_predictions(base_model, train, usercol='CustomerID', itemcol='StockCode', remove_seen=True)

In [39]:
# Check first five rows
predictions.head()

Unnamed: 0,CustomerID,StockCode,prediction
207084,1,1916,1.0
207085,1,754,0.826573
207086,1,1797,0.965186
207087,1,3137,1.0
207088,1,920,0.888303


In [40]:
# Now filter out top N 
# First write a function
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    
    # Group the dataframe by 'CustomerID', and for each group, sort by 'prediction' in descending order, then take the top N rows
    top_n_per_customer = predictions.groupby('CustomerID', group_keys=False).apply(lambda group: group.sort_values('prediction', ascending=False).head(n))
    
    return top_n_per_customer

# Filter Top 10 
top_10 = filter_top_n(predictions, 10)

# Filter Top 20 
top_20 = filter_top_n(predictions, 20)


In [41]:
# Evaluate Precision at 10 
eval_precision_10 = precision_at_k(test, top_10, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('precision at 10 \t:', eval_precision_10)

# Evaluate Recall at 10 
eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
print('recall at 10 \t:', eval_recall_10)

precision at 10 	: 0.0051462621885157095
recall at 10 	: 0.0035153447699063064


In [42]:
# Evaluate Precision at 20 
eval_precision_20 = precision_at_k(test, top_20, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('precision at 20 \t:', eval_precision_20)

# Evaluate Recall at 20 
eval_recall_20 = recall_at_k(test, top_20,col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
print('recall at 20 \t:', eval_recall_20)

precision at 20 	: 0.00545774647887324
recall at 20 	: 0.007222494403110586


# Hyper-Parameter Tuning

In [7]:
# Define Function for Top N
def filter_top_n(predictions: pd.DataFrame, n: int) -> pd.DataFrame:
    
    # Group the dataframe by 'CustomerID', and for each group, sort by 'prediction' in descending order, then take the top N rows
    top_n_per_customer = predictions.groupby('CustomerID', group_keys=False).apply(lambda group: group.sort_values('prediction', ascending=False).head(n))
    
    return top_n_per_customer

In [10]:
# Define Set of Hyperparameters: 
n_factors = [10, 100, 1000]
lr_rates = [0.001, 0.007, 0.01]
reg_terms = [0.01, 0.02]

# Initiate List for Metrics
factors = []
recall_k = []
learning_rates = []
regulation_terms = []


# Start for loop
for factor in n_factors:
    for lr in lr_rates:
        for reg in reg_terms:
            # First Delete Model Everytime
            try:
                del(model)
            except:
                pass

            # Define Model with Parameters 
            model = surprise.SVDpp(random_state=0,
                                    n_factors=factor,
                                    n_epochs=5,
                                    lr_all=lr,
                                    reg_all=reg)

            # Fit model
            model.fit(train_set)

            # Predict all pairs that are not in the train set
            predictions = compute_ranking_predictions(model, test, usercol='CustomerID', itemcol='StockCode')

            # Filter Top 10 
            top_10 = filter_top_n(predictions, 10)

            # Evaluate Recall at 10 
            eval_recall_10 = recall_at_k(test, top_10,col_user="CustomerID", col_item="StockCode",
                                                col_rating="purchased", col_prediction="prediction", 
                                                relevancy_method="top_k", k=10)

            print("recall at for", factor, "factors: \t", eval_recall_10)

            # Append Lists
            factors.append(factor)
            recall_k.append(eval_recall_10)
            learning_rates.append(lr)
            regulation_terms.append(reg) 

recall at for 10 factors: 	 0.0028158996422393066
recall at for 10 factors: 	 0.0027228156618936817
recall at for 10 factors: 	 0.00302434526259574
recall at for 10 factors: 	 0.0024397430205545137
recall at for 10 factors: 	 0.0028601158440778288
recall at for 10 factors: 	 0.002349815967424364
recall at for 100 factors: 	 0.0032783145317850405
recall at for 100 factors: 	 0.002807545554463402
recall at for 100 factors: 	 0.0027855866776388574
recall at for 100 factors: 	 0.002989214341941537
recall at for 100 factors: 	 0.0025662649860623077
recall at for 100 factors: 	 0.002331084085043991
recall at for 1000 factors: 	 0.00306062897367376
recall at for 1000 factors: 	 0.002735119667401047
recall at for 1000 factors: 	 0.0023959079063993333
recall at for 1000 factors: 	 0.002791422838101298
recall at for 1000 factors: 	 0.0024806064395272716
recall at for 1000 factors: 	 0.0024978563636800797


In [13]:
dict_param = {
    "factor": factors,
    "regulation_term": regulation_terms,
    "learning_rate": learning_rates,
    "recall_10": recall_k
}

In [15]:
df_param = pd.DataFrame(dict_param)
df_param

Unnamed: 0,factor,regulation_term,learning_rate,recall_10
0,10,0.01,0.001,0.002816
1,10,0.02,0.001,0.002723
2,10,0.01,0.007,0.003024
3,10,0.02,0.007,0.00244
4,10,0.01,0.01,0.00286
5,10,0.02,0.01,0.00235
6,100,0.01,0.001,0.003278
7,100,0.02,0.001,0.002808
8,100,0.01,0.007,0.002786
9,100,0.02,0.007,0.002989


In [16]:
df_param.to_csv('svdpp_binary_hpt_f10to1000_lr0001to01_reg001to002.csv')