In [1]:
import numpy as np
import pandas as pd
import random

from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

In [2]:
# Single seed shared by every random number generator used in this notebook,
# so that repeated "Restart & Run All" executions start from the same state.
my_seed = 0

# Seed NumPy's global generator and Python's built-in `random` module.
np.random.seed(my_seed)
random.seed(my_seed)

In [3]:
# Training ratings table: each row holds a User_ID, a Product_ID and the
# already-normalized purchase value.
User_Prod_NP = pd.read_csv('./Files_Folder/User_Prod_NormPurc.csv')

# Summary statistics for each Product_ID found in the training data.
train_Prod_sta = pd.read_csv('./Files_Folder/train_User_Prod_stat.csv')

# Summary statistics for "new" products, i.e. Product_IDs that appear in the
# test data set but not in the training data set.
New_Prod_LUT = pd.read_csv('./Files_Folder/New_Product_Details.csv')

In [4]:
# Quick sanity check: show the first two rows of the training ratings table.
User_Prod_NP.head(n=2)

Unnamed: 0,User_ID,Product_ID,Normalized_Purchase
0,1000001,P00069042,3.067944
1,1000001,P00248942,3.857687


In [5]:
# (rows, columns) of the training ratings table.
User_Prod_NP.shape

(495062, 3)

In [6]:
# First two rows of the per-product statistics. In the saved output the
# columns are Product_IDs and the rows are statistics (count, mean, ...).
train_Prod_sta.head(n=2)

Unnamed: 0.1,Unnamed: 0,P00000142,P00000242,P00000342,P00000442,P00000542,P00000642,P00000742,P00000842,P00000942,...,P0098942,P0099042,P0099142,P0099242,P0099342,P0099442,P0099642,P0099742,P0099842,P0099942
0,count,1042.0,344.0,220.0,80.0,135.0,455.0,213.0,31.0,48.0,...,30.0,135.0,6.0,235.0,394.0,181.0,13.0,115.0,100.0,10.0
1,mean,11127.769674,10488.671512,5353.263636,4778.95,5412.511111,14921.810989,6051.098592,10004.322581,10611.416667,...,6917.866667,6188.933333,6018.666667,6778.174468,6986.515228,14298.375691,6439.230769,7929.956522,7269.55,5148.4


In [7]:
# First two rows of the look-up table for products unseen during training.
New_Prod_LUT.head(n=2)

Unnamed: 0,Product_ID,mean,50%,std,count,min,max,25%,75%
0,P00301942,6101.874236,6886.0,1960.053653,57290.0,1713.0,8907.0,5211.0,7139.0
1,P00300742,6101.874236,6886.0,1960.053653,57290.0,1713.0,8907.0,5211.0,7139.0


In [8]:
# Candidate recommenders to compare, all left at their default settings
# (no constructor arguments are passed).

# Matrix-factorization models.
algo_SVD, algo_SVDpp, algo_NMF = SVD(), SVDpp(), NMF()

# Nearest-neighbour models.
algo_KNNBasic, algo_KNNWithZScore = KNNBasic(), KNNWithZScore()

In [9]:
# Surprise still requires a Reader when loading from a DataFrame, but only
# its rating_scale parameter is needed (per the Surprise documentation).
# NOTE(review): this assumes Normalized_Purchase lies within [1, 5] -- confirm
# against the data, since the scale is hard-coded here rather than derived.
reader = Reader(
    rating_scale=(1, 5),
)

In [10]:
# Surprise expects exactly three columns in this order: user id, item id,
# rating (per the Surprise documentation). Here the "rating" is the
# normalized purchase value.
data = Dataset.load_from_df(
    df=User_Prod_NP[['User_ID', 'Product_ID', 'Normalized_Purchase']],
    reader=reader,
)

In [11]:
# 3-fold cross-validation of SVD on the loaded dataset. verbose=True prints
# the per-fold table; the returned dict is shown as the cell output.
cross_validate(
    algo=algo_SVD,
    data=data,
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9135  0.9111  0.9107  0.9118  0.0012  
MAE (testset)     0.7219  0.7201  0.7184  0.7201  0.0014  
Fit time          18.70   18.73   18.64   18.69   0.04    
Test time         1.28    1.28    1.33    1.30    0.03    


{'test_rmse': array([0.91348968, 0.91110248, 0.91074862]),
 'test_mae': array([0.72189201, 0.72011006, 0.71842158]),
 'fit_time': (18.703083276748657, 18.7264187335968, 18.63799262046814),
 'test_time': (1.2772295475006104, 1.2790248394012451, 1.3330228328704834)}

In [12]:
# 3-fold cross-validation of NMF on the same dataset, for comparison.
cross_validate(
    algo=algo_NMF,
    data=data,
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9339  0.9373  0.9338  0.9350  0.0016  
MAE (testset)     0.7347  0.7368  0.7344  0.7353  0.0011  
Fit time          19.36   19.46   19.75   19.52   0.16    
Test time         1.31    1.23    1.19    1.24    0.05    


{'test_rmse': array([0.93391361, 0.93732433, 0.93381975]),
 'test_mae': array([0.73471345, 0.73678018, 0.73440258]),
 'fit_time': (19.358744859695435, 19.459253787994385, 19.747405529022217),
 'test_time': (1.306314468383789, 1.2260515689849854, 1.1918790340423584)}

In [13]:
# 3-fold cross-validation of KNNBasic on the same dataset. In the saved run
# the test phase (~44 s per fold) takes longer than fitting (~10 s per fold).
cross_validate(
    algo=algo_KNNBasic,
    data=data,
    cv=3,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9542  0.9481  0.9462  0.9495  0.0034  
MAE (testset)     0.7539  0.7489  0.7475  0.7501  0.0028  
Fit time          10.33   10.31   10.26   10.30   0.03    
Test time         44.14   44.55   44.29   44.32   0.17    


{'test_rmse': array([0.95417657, 0.94806799, 0.94618105]),
 'test_mae': array([0.75392277, 0.74888526, 0.7475225 ]),
 'fit_time': (10.328772783279419, 10.306680679321289, 10.261749505996704),
 'test_time': (44.13553810119629, 44.55254912376404, 44.28618812561035)}

In [14]:
# 3-fold cross-validation of KNNWithZScore on the same dataset.
cross_validate(
    algo=algo_KNNWithZScore,
    data=data,
    cv=3,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithZScore on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9326  0.9340  0.9309  0.9325  0.0013  
MAE (testset)     0.7342  0.7360  0.7329  0.7344  0.0013  
Fit time          10.70   10.75   10.71   10.72   0.02    
Test time         51.19   47.74   47.96   48.96   1.58    


{'test_rmse': array([0.93260649, 0.93402983, 0.93085368]),
 'test_mae': array([0.73418965, 0.73603964, 0.73291   ]),
 'fit_time': (10.69895887374878, 10.754914045333862, 10.71305513381958),
 'test_time': (51.190239667892456, 47.73562026023865, 47.9622700214386)}

In [15]:
# 3-fold cross-validation of SVDpp on the same dataset.
# Slow cell: the saved run shows roughly 560-575 s of fit time per fold.
cross_validate(
    algo=algo_SVDpp,
    data=data,
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8924  0.8986  0.8950  0.8953  0.0026  
MAE (testset)     0.7013  0.7068  0.7033  0.7038  0.0023  
Fit time          573.41  569.10  559.91  567.47  5.63    
Test time         20.46   21.74   21.00   21.07   0.53    


{'test_rmse': array([0.89236132, 0.89859065, 0.8949642 ]),
 'test_mae': array([0.70125509, 0.70676811, 0.70325833]),
 'fit_time': (573.412454366684, 569.1013038158417, 559.9104628562927),
 'test_time': (20.459402084350586, 21.74066185951233, 21.00440502166748)}