# Loading Libraries

In [117]:
import pandas as pd
import numpy as np

import logging
import tensorflow as tf


#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
)


# Reading Data

In [118]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_test.csv", index_col=0)

In [119]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (207084, 3)
Shape of Test: 	 (51892, 3)


In [120]:
# Check Number of Unique Items and User in Train & Test 
print("Unique Users in Train:", train.CustomerID.nunique())
print("Unique Users in Test:", test.CustomerID.nunique())
print("Unique Items in Train:", train.StockCode.nunique())
print("Unique Items in Test:", test.StockCode.nunique())

Unique Users in Train: 3692
Unique Users in Test: 3692
Unique Items in Train: 2716
Unique Items in Test: 2716


# Preparation of Datasets for Model

In [121]:
# create a pivot table from the dataframe
train_matrix = pd.pivot_table(train, values='purchased', index='CustomerID', columns='StockCode')

# replace non-zero values with 1 and missing values with 0
train_matrix[train_matrix > 0] = 5
train_matrix = train_matrix.fillna(1)

# Show Matrix
train_matrix.head()

StockCode,0,1,2,6,7,8,9,11,12,13,...,3548,3552,3594,3622,3623,3625,3626,3677,3681,3683
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0


In [122]:
# create a pivot table from the dataframe
test_matrix = pd.pivot_table(test, values='purchased', index='CustomerID', columns='StockCode', aggfunc=np.sum)

# replace non-zero values with 1 and missing values with 0
test_matrix[test_matrix > 0] = 5
test_matrix = test_matrix.fillna(1)

# Show Matrix
test_matrix.head()

StockCode,0,1,2,6,7,8,9,11,12,13,...,3548,3552,3594,3622,3623,3625,3626,3677,3681,3683
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [123]:
# Sanity Check: Print Shape of Train & Test Matrix 
print(train_matrix.shape)
print(test_matrix.shape)

(3692, 2716)
(3692, 2716)


# Baseline Model

In [124]:
try:
    del(model)
except:
    pass

#First we initialize the model class
model = RBM(
    possible_ratings=np.setdiff1d(np.unique(train_matrix), np.array([0])),
    visible_units=train_matrix.shape[1],
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42
)

In [272]:
np.setdiff1d(np.unique(train_matrix), np.array([0]))

array([1., 5.])

In [125]:
#Model Fit
with Timer() as train_time:
    model.fit(train_matrix)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 40.00 seconds for training.


# Prediction: Test Only

In [223]:
# Make Predictions for entire Test Matrix
predicted_matrix = model.predict(test_matrix)

In [235]:
test_pred.predictions.value_counts()

5.0    9380321
1.0     647151
Name: predictions, dtype: int64

In [224]:
# Convert back to matrix
predicted_matrix = pd.DataFrame(data=predicted_matrix, index=test_matrix.index, columns=test_matrix.columns)

# Convert back to dataframe as table for each row
test_pred  = pd.melt(predicted_matrix.reset_index(), id_vars=['CustomerID'], value_vars=predicted_matrix.columns, var_name='StockCode', value_name='predictions')

# Convert all values in test to 5 
test['purchased'] = 5.0

# Merge test_pred with test
test_prediction_assessment = test.merge(test_pred, on=['StockCode', 'CustomerID'], how='left')

In [225]:
# Check first five rows 
test_prediction_assessment.head()

Unnamed: 0,StockCode,CustomerID,purchased,predictions
0,1249,1,5.0,5.0
1,396,1,5.0,5.0
2,2601,1,5.0,5.0
3,912,1,5.0,5.0
4,1032,1,5.0,5.0


In [226]:
# Check Distribution of Prediction Values
test_prediction_assessment.predictions.value_counts()

5.0    49693
1.0     2199
Name: predictions, dtype: int64

In [227]:
# Import Accuracy 
from sklearn.metrics import accuracy_score

# Accuracy 
accuracy_score(test_prediction_assessment.purchased, test_prediction_assessment.predictions)

0.9576235257843213

# Prediction: Top N

## Data Preparation for TOP N

In [261]:
# Import Affinity Matrix
from recommenders.datasets.sparse import AffinityMatrix

# Convert train Purchased to 5
train['purchased'] = 5.0

#to use standard names across the analysis 
header = {
        "col_user": "CustomerID",
        "col_item": "StockCode",
        "col_rating": "purchased",
    }

#instantiate the sparse matrix generation  
am_train = AffinityMatrix(df = train, col_user='CustomerID', col_item='StockCode', col_rating='purchased')
am_test = AffinityMatrix(df = test, col_user='CustomerID', col_item='StockCode', col_rating='purchased')

#obtain the sparse matrix 
Xtr, _, _ = am_train.gen_affinity_matrix()
Xtst, _, _ = am_test.gen_affinity_matrix()

## Training for TOP N

In [273]:
# Delete First Model 
try:
    del(model)
except:
    pass

# First we initialize the model class
model = RBM(
    possible_ratings=[1., 5.],
    visible_units=Xtr.shape[1],
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42)

#Model Fit
with Timer() as train_time:
    model.fit(Xtr)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 64.05 seconds for training.


In [274]:
# Model prediction on the test set Xtst.
with Timer() as prediction_time:
    top_k_20 =  model.recommend_k_items(Xtst, top_k=20, remove_seen=True)

print("Took {:.2f} seconds for prediction.".format(prediction_time.interval))

Took 1.95 seconds for prediction.


In [275]:
# Map Back Xtst & Top_k 
df_top_k_20 = am_test.map_back_sparse(top_k_20, kind = 'prediction')
df_test = am_test.map_back_sparse(Xtst, kind = 'ratings')

In [276]:
eval_recall = recall_at_k(df_test, df_top_k_20, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
eval_recall

0.42329911525583824