In [1]:
import numpy as np
import pandas as pd
import random

from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise.model_selection import GridSearchCV

In [2]:
# One shared seed for both global random generators (NumPy's and the stdlib's),
# so the two are initialised from the same value.
my_seed = 0
np.random.seed(my_seed)
random.seed(my_seed)

In [12]:
# Train data set with Normalized Purchase
User_Prod_NP = pd.read_csv('./Files_Folder/User_Prod_NormPurc.csv')

# Statistics of each Product_ID in train data sets
train_Prod_sta = pd.read_csv('./Files_Folder/train_User_Prod_stat.csv') 

# Statistics of new Product_IDs; those are Prodcut_IDs in test data set but not in train data set
New_Prod_LUT = pd.read_csv('./Files_Folder/New_Product_Details.csv')

# Test Data Set
test_data_sub = pd.read_csv('./Files_Folder/test_data_sub.csv')

# Hold Out Data Set
hold_data_sub = pd.read_csv('./Files_Folder/hold_data_sub.csv')

# Train Data Set
train_data = pd.read_csv('./Files_Folder/train_data.csv')

In [4]:
# SVD recommender with fixed hyper-parameters (epochs, learning rate, regularisation).
algo_SVD = SVD(n_epochs=130, lr_all=0.005, reg_all=0.1)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(User_Prod_NP[['User_ID', 'Product_ID', 'Normalized_Purchase']], reader)

# 3-fold cross-validation: reports RMSE / MAE and timings per fold.
cv_results = cross_validate(algo_SVD, data, cv=3, verbose=True)

# cross_validate only ever fits the estimator on the training part of a fold,
# so after it returns algo_SVD has not seen the whole data set. The later cells
# call algo_SVD.predict(...), so refit on every available rating first.
algo_SVD.fit(data.build_full_trainset())

# Keep the cross-validation summary as the cell's displayed value.
cv_results

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8849  0.8834  0.8820  0.8834  0.0012  
MAE (testset)     0.6989  0.6983  0.6958  0.6977  0.0013  
Fit time          129.04  130.33  124.48  127.95  2.51    
Test time         1.34    1.35    1.37    1.35    0.01    


{'test_rmse': array([0.88494019, 0.88342689, 0.88197835]),
 'test_mae': array([0.69894886, 0.69825194, 0.69584483]),
 'fit_time': (129.03629350662231, 130.3284363746643, 124.47913932800293),
 'test_time': (1.3436517715454102, 1.3451571464538574, 1.3657407760620117)}

In [5]:
# Product_IDs listed in the new-product lookup table, as a plain Python list.
new_prod = New_Prod_LUT['Product_ID'].tolist()

In [6]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns of the normalized-purchase training frame.
User_Prod_NP.describe()

Unnamed: 0,User_ID,Normalized_Purchase
count,495062.0,495062.0
mean,1003027.0,3.538755
std,1727.099,1.082961
min,1000001.0,1.0
25%,1001514.0,2.955803
50%,1003075.0,3.887335
75%,1004473.0,4.035739
max,1006040.0,5.0


In [7]:
# Distinct Product_IDs that occur in the test data set, and the rows they select.
k = list(test_data_sub.Product_ID.unique())
print ('# of unique Product IDs in test data set = ', len(k))

rows_with_test_ids = test_data_sub.loc[test_data_sub['Product_ID'].isin(k)]
print ('Shape of test data set using unique Product ID as filter = ', rows_with_test_ids.shape)

# Distinct Product_IDs in the new-product lookup table, and the test rows that use them.
k1 = list(New_Prod_LUT.Product_ID.unique())
print ('# of unique Product IDs in New Product (not present in Big Matrix) = ', len(k1))

rows_with_new_ids = test_data_sub.loc[test_data_sub['Product_ID'].isin(k1)]
print ('Asociated Shape of the dataframe = ', rows_with_new_ids.shape)

# of unique Product IDs in test data set =  2819
Shape of test data set using unique Product ID as filter =  (27503, 11)
# of unique Product IDs in New Product (not present in Big Matrix) =  16
Asociated Shape of the dataframe =  (8, 11)


In [8]:
# Test rows whose Product_ID is listed in the new-product lookup table;
# these are also written out to CSV.
test_newid_mask = test_data_sub['Product_ID'].isin(New_Prod_LUT['Product_ID'].unique())
test_data_sub_newid = test_data_sub.loc[test_newid_mask]
print ('test_data_sub_newid_shape = ', test_data_sub_newid.shape)
test_data_sub_newid.to_csv('./Files_Folder/test_data_sub_newid.csv')

# Remaining test rows: everything except the new-product rows selected above.
test_data_sub_notnewid = test_data_sub.drop(index=test_data_sub_newid.index)
print ('test_data_sub_notnewid_shape = ', test_data_sub_notnewid.shape)

test_data_sub_newid_shape =  (8, 11)
test_data_sub_notnewid_shape =  (27495, 11)


In [9]:
# Hold-out rows whose Product_ID is listed in the new-product lookup table;
# these are also written out to CSV.
hold_newid_mask = hold_data_sub['Product_ID'].isin(New_Prod_LUT['Product_ID'].unique())
hold_data_sub_newid = hold_data_sub.loc[hold_newid_mask]
print ('hold_data_sub_newid_shape = ', hold_data_sub_newid.shape)
hold_data_sub_newid.to_csv('./Files_Folder/hold_data_sub_newid.csv')

# Remaining hold-out rows: everything except the new-product rows selected above.
hold_data_sub_notnewid = hold_data_sub.drop(index=hold_data_sub_newid.index)
print ('hold_data_sub_notnewid_shape = ', hold_data_sub_notnewid.shape)

hold_data_sub_newid_shape =  (11, 11)
hold_data_sub_notnewid_shape =  (27492, 11)


In [13]:
# Use the SVD model to predict the purchase amount of every row of the train
# data set. The result is written to CSV and is used to determine the training error.

# Bounds of the normalized rating scale the model estimates on.
dmin, dmax = 1, 5

# Per-product lower / upper bound used to map an estimate back to a purchase
# amount. Rows 3 and 7 of the statistics frame are looked up once here instead
# of once per training row.
# NOTE(review): presumably these are the 'min' and 'max' rows of a saved
# describe() table - confirm against the file that produced train_Prod_sta.
rmin_by_product = train_Prod_sta.loc[3].to_dict()
rmax_by_product = train_Prod_sta.loc[7].to_dict()

# Iterate over the column values positionally; this does not depend on the
# frame having a default 0..n-1 index the way train_data.User_ID[i] does.
train_user_list = train_data.User_ID.tolist()
train_prod_list = train_data.Product_ID.tolist()
train_price = []
for user_id, product_id in zip(train_user_list, train_prod_list):
    p1 = algo_SVD.predict(user_id, product_id).est
    rmin = rmin_by_product[product_id]
    rmax = rmax_by_product[product_id]
    # Linear map from [dmin, dmax] onto [rmin, rmax].
    p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
    train_price.append(p)

final_train_tally = {'User_ID':train_user_list, 'Product_ID':train_prod_list, 'Purchase': train_price}
final_train_tally_df = pd.DataFrame(final_train_tally)
final_train_tally_df.to_csv('./Files_Folder/Predicted_train_Result_SVD.csv', index=False)

In [14]:
# Predicting test data set.
# New products are predicted using the median purchase of similar products based on their product categorizations

# Bounds of the normalized rating scale the model estimates on.
dmin, dmax = 1, 5

# Fallback value per new product: the '50%' column of the lookup table, taking
# the first row for each Product_ID (same row the per-row filter + .iloc[0] picked).
new_prod_median = (
    New_Prod_LUT.drop_duplicates(subset='Product_ID')
    .set_index('Product_ID')['50%']
    .to_dict()
)

# Per-product lower / upper bound used to map an estimate back to a purchase
# amount; rows 3 and 7 are looked up once instead of once per test row.
# NOTE(review): presumably these are the 'min' and 'max' rows of a saved
# describe() table - confirm against the file that produced train_Prod_sta.
rmin_by_product = train_Prod_sta.loc[3].to_dict()
rmax_by_product = train_Prod_sta.loc[7].to_dict()

# Iterate over the column values positionally; this does not depend on the
# frame having a default 0..n-1 index the way test_data_sub.User_ID[i] does.
test_user_list = test_data_sub.User_ID.tolist()
test_prod_list = test_data_sub.Product_ID.tolist()
price = []
for user_id, product_id in zip(test_user_list, test_prod_list):
    if product_id in new_prod_median:
        # Product listed in the new-product table: use the table value directly.
        p = new_prod_median[product_id]
    else:
        p1 = algo_SVD.predict(user_id, product_id).est
        rmin = rmin_by_product[product_id]
        rmax = rmax_by_product[product_id]
        # Linear map from [dmin, dmax] onto [rmin, rmax].
        p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
    price.append(p)

final_tally = {'User_ID':test_user_list, 'Product_ID':test_prod_list, 'Purchase': price}
final_tally_df = pd.DataFrame(final_tally)
final_tally_df.to_csv('./Files_Folder/Predicted_test_SVD_median.csv', index=False)

In [15]:
# Predicting test data set.
# New products are predicted using the mean purchase of similar products based on their product categorizations

# Bounds of the normalized rating scale the model estimates on.
dmin, dmax = 1, 5

# Fallback value per new product: the 'mean' column of the lookup table, taking
# the first row for each Product_ID (same row the per-row filter + .iloc[0] picked).
new_prod_mean = (
    New_Prod_LUT.drop_duplicates(subset='Product_ID')
    .set_index('Product_ID')['mean']
    .to_dict()
)

# Per-product lower / upper bound used to map an estimate back to a purchase
# amount; rows 3 and 7 are looked up once instead of once per test row.
# NOTE(review): presumably these are the 'min' and 'max' rows of a saved
# describe() table - confirm against the file that produced train_Prod_sta.
rmin_by_product = train_Prod_sta.loc[3].to_dict()
rmax_by_product = train_Prod_sta.loc[7].to_dict()

# Iterate over the column values positionally; this does not depend on the
# frame having a default 0..n-1 index the way test_data_sub.User_ID[i] does.
test_user_list = test_data_sub.User_ID.tolist()
test_prod_list = test_data_sub.Product_ID.tolist()
price = []
for user_id, product_id in zip(test_user_list, test_prod_list):
    if product_id in new_prod_mean:
        # Product listed in the new-product table: use the table value directly.
        p = new_prod_mean[product_id]
    else:
        p1 = algo_SVD.predict(user_id, product_id).est
        rmin = rmin_by_product[product_id]
        rmax = rmax_by_product[product_id]
        # Linear map from [dmin, dmax] onto [rmin, rmax].
        p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
    price.append(p)

final_tally = {'User_ID':test_user_list, 'Product_ID':test_prod_list, 'Purchase': price}
final_tally_df = pd.DataFrame(final_tally)
final_tally_df.to_csv('./Files_Folder/Predicted_test_SVD_mean.csv', index=False)