In [1]:
import numpy as np
import pandas as pd
import random

from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise.model_selection import GridSearchCV

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

In [2]:
# Train data set with Normalized Purchase
User_Prod_NP = pd.read_csv('./Files_Folder/User_Prod_NormPurc.csv')

# Statistics of each Product_ID in train data sets
train_Prod_sta = pd.read_csv('./Files_Folder/train_User_Prod_stat.csv') 

# Statistics of new Product_IDs; those are Prodcut_IDs in test data set but not in train data set
New_Prod_LUT = pd.read_csv('./Files_Folder/New_Product_Details.csv')

# Test Data Set
test_data_sub = pd.read_csv('./Files_Folder/test_data_sub.csv')

# Hold Out Data Set
hold_data_sub = pd.read_csv('./Files_Folder/hold_data_sub.csv')

# Train Data Set
train_data = pd.read_csv('./Files_Folder/train_data.csv')

test_data = pd.read_csv('./Files_Folder/test_data.csv')

In [3]:
new_prod = list(New_Prod_LUT.Product_ID)

cv_mean = []
cv_std = []
rmse_train = []
rmse_test = []

for j in range(20):
    
    j1 = 0.01*(j+1)
    algo_SVD = SVD(n_epochs=130, lr_all = 0.005, reg_all = j1)

    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(User_Prod_NP[['User_ID', 'Product_ID', 'Normalized_Purchase']], reader)

    # We can now use this dataset as we please, e.g. calling cross_validate
    crva = cross_validate(algo_SVD, data, cv=3, verbose = True)

    cv_mean.append(crva['test_rmse'].mean())

    cv_std.append(crva['test_rmse'].std())
    
    # Using our model to predict the prices of the train data set
    # This is used to determine the training error
    train_user_list = []
    train_prod_list = []
    train_price = []
    for i in range(train_data.shape[0]):
        p1 = algo_SVD.predict(train_data.User_ID[i],train_data.Product_ID[i]).est
        rmin = train_Prod_sta.loc[3, train_data.Product_ID[i]]
        rmax = train_Prod_sta.loc[7, train_data.Product_ID[i]]
        dmin = 1
        dmax = 5
        p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
        train_price.append(p)
        train_user_list.append(train_data.User_ID[i])
        train_prod_list.append(train_data.Product_ID[i])
    final_train_tally = {'User_ID':train_user_list, 'Product_ID':train_prod_list, 'Purchase': train_price}
    final_train_tally_df = pd.DataFrame(final_train_tally)
    
    
    test_user_list = []
    test_prod_list = []
    price = []
    for i in range(test_data_sub.shape[0]):
        if test_data_sub.Product_ID[i] in new_prod:
            p = New_Prod_LUT[New_Prod_LUT.Product_ID == test_data_sub.Product_ID[i]]['mean'].iloc[0]      
        else:
            p1 = algo_SVD.predict(test_data_sub.User_ID[i],test_data_sub.Product_ID[i]).est
            rmin = train_Prod_sta.loc[3, test_data_sub.Product_ID[i]]
            rmax = train_Prod_sta.loc[7, test_data_sub.Product_ID[i]]
            dmin = 1
            dmax = 5
            p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
        price.append(p)
        test_user_list.append(test_data_sub.User_ID[i])
        test_prod_list.append(test_data_sub.Product_ID[i])
    final_test_tally = {'User_ID':test_user_list, 'Product_ID':test_prod_list, 'Purchase': price}
    final_test_tally_df = pd.DataFrame(final_test_tally)
    
    final_train_tally_df ['Actual_Purchase'] = train_data['Purchase']
    final_train_tally_df ['Error'] = final_train_tally_df ['Purchase'] - final_train_tally_df ['Actual_Purchase']
    final_train_tally_df ['Error_sq'] = final_train_tally_df ['Error']*final_train_tally_df ['Error']
    rmse_train.append(np.sqrt(final_train_tally_df ['Error_sq'].sum()/final_train_tally_df.shape[0]))    
    file_train = './Files_Folder/rmse/reg/Predicted_train_Result_SVD_mean_' + str(j+1) + '.csv'
    final_train_tally_df.to_csv(file_train, index=False) 
    
    final_test_tally_df ['Actual_Purchase'] = test_data['Purchase']
    final_test_tally_df ['Error'] = final_test_tally_df ['Purchase'] - final_test_tally_df ['Actual_Purchase']
    final_test_tally_df ['Error_sq'] = final_test_tally_df ['Error']*final_test_tally_df ['Error']
    rmse_test.append(np.sqrt(final_test_tally_df ['Error_sq'].sum()/final_test_tally_df.shape[0]))
    file_test = './Files_Folder/rmse/reg/Predicted_test_Result_SVD_mean_' + str(j+1) + '.csv'
    final_test_tally_df.to_csv(file_test, index=False)
    
reg = [0.01*(k+1) for k in range(20)]
colate = {'reg. param': reg, 'cv_mean': cv_mean, 'cv_std': cv_std, 'rmse_train': rmse_train, 'rmse_test':rmse_test}
colate_df = pd.DataFrame(colate)
colate_df.to_csv('./Files_Folder/rmse/reg/Result_With_Varying_reg_param_SVD.csv', index = False)
colate_df

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0180  1.0167  1.0178  1.0175  0.0006  
MAE (testset)     0.7991  0.7985  0.7984  0.7986  0.0003  
Fit time          128.03  134.32  124.92  129.09  3.91    
Test time         1.61    2.07    1.34    1.67    0.30    
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9669  0.9686  0.9666  0.9674  0.0009  
MAE (testset)     0.7598  0.7597  0.7600  0.7598  0.0001  
Fit time          122.73  122.03  114.95  119.91  3.51    
Test time         1.66    1.27    1.28    1.41    0.18    
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9395  0.9411  0.9356  0.9387  0.0023  
MAE (testset)     0.7378  0.7391  0.7360  0.7376  0.0013  
Fit time          120.69  129.27  136.90  128.95  6.62    
Test time   

Unnamed: 0,reg. param,cv_mean,cv_std,rmse_train,rmse_test
0,0.01,1.017462,0.000578,1660.280819,2788.836406
1,0.02,0.967358,0.000884,1622.972598,2653.109449
2,0.03,0.938714,0.002311,1626.851822,2555.318079
3,0.04,0.920071,0.001633,1661.32295,2500.197736
4,0.05,0.907569,0.000935,1713.606497,2469.402978
5,0.06,0.897807,0.001227,1767.309345,2441.919414
6,0.07,0.89174,0.000981,1833.579605,2433.884039
7,0.08,0.887697,0.0003,1903.695353,2417.115409
8,0.09,0.885317,0.001654,1972.827554,2410.585068
9,0.1,0.88408,0.001202,2046.983844,2411.671066


In [4]:
new_prod = list(New_Prod_LUT.Product_ID)

cv_mean = []
cv_std = []
rmse_train = []
rmse_test = []

for j in range(10, 201, 10):
    
    algo_SVD = SVD(n_epochs=j, lr_all = 0.005, reg_all = 0.1)

    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(User_Prod_NP[['User_ID', 'Product_ID', 'Normalized_Purchase']], reader)

    # We can now use this dataset as we please, e.g. calling cross_validate
    crva = cross_validate(algo_SVD, data, cv=3, verbose = True)

    cv_mean.append(crva['test_rmse'].mean())

    cv_std.append(crva['test_rmse'].std())
    
    # Using our model to predict the prices of the train data set
    # This is used to determine the training error
    train_user_list = []
    train_prod_list = []
    train_price = []
    for i in range(train_data.shape[0]):
        p1 = algo_SVD.predict(train_data.User_ID[i],train_data.Product_ID[i]).est
        rmin = train_Prod_sta.loc[3, train_data.Product_ID[i]]
        rmax = train_Prod_sta.loc[7, train_data.Product_ID[i]]
        dmin = 1
        dmax = 5
        p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
        train_price.append(p)
        train_user_list.append(train_data.User_ID[i])
        train_prod_list.append(train_data.Product_ID[i])
    final_train_tally = {'User_ID':train_user_list, 'Product_ID':train_prod_list, 'Purchase': train_price}
    final_train_tally_df = pd.DataFrame(final_train_tally)
    
    
    test_user_list = []
    test_prod_list = []
    price = []
    for i in range(test_data_sub.shape[0]):
        if test_data_sub.Product_ID[i] in new_prod:
            p = New_Prod_LUT[New_Prod_LUT.Product_ID == test_data_sub.Product_ID[i]]['mean'].iloc[0]      
        else:
            p1 = algo_SVD.predict(test_data_sub.User_ID[i],test_data_sub.Product_ID[i]).est
            rmin = train_Prod_sta.loc[3, test_data_sub.Product_ID[i]]
            rmax = train_Prod_sta.loc[7, test_data_sub.Product_ID[i]]
            dmin = 1
            dmax = 5
            p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
        price.append(p)
        test_user_list.append(test_data_sub.User_ID[i])
        test_prod_list.append(test_data_sub.Product_ID[i])
    final_test_tally = {'User_ID':test_user_list, 'Product_ID':test_prod_list, 'Purchase': price}
    final_test_tally_df = pd.DataFrame(final_test_tally)
    
    final_train_tally_df ['Actual_Purchase'] = train_data['Purchase']
    final_train_tally_df ['Error'] = final_train_tally_df ['Purchase'] - final_train_tally_df ['Actual_Purchase']
    final_train_tally_df ['Error_sq'] = final_train_tally_df ['Error']*final_train_tally_df ['Error']
    rmse_train.append(np.sqrt(final_train_tally_df ['Error_sq'].sum()/final_train_tally_df.shape[0]))    
    file_train = './Files_Folder/rmse/epoch/Predicted_train_Result_SVD_mean_' + str(j) + '.csv'
    final_train_tally_df.to_csv(file_train, index=False) 
    
    final_test_tally_df ['Actual_Purchase'] = test_data['Purchase']
    final_test_tally_df ['Error'] = final_test_tally_df ['Purchase'] - final_test_tally_df ['Actual_Purchase']
    final_test_tally_df ['Error_sq'] = final_test_tally_df ['Error']*final_test_tally_df ['Error']
    rmse_test.append(np.sqrt(final_test_tally_df ['Error_sq'].sum()/final_test_tally_df.shape[0]))
    file_test = './Files_Folder/rmse/epoch/Predicted_test_Result_SVD_mean_' + str(j) + '.csv'
    final_test_tally_df.to_csv(file_test, index=False)
    
epo = [k for k in range(10, 201, 10)]
colate = {'epoch': epo, 'cv_mean': cv_mean, 'cv_std': cv_std, 'rmse_train': rmse_train, 'rmse_test':rmse_test}
colate_df = pd.DataFrame(colate)
colate_df.to_csv('./Files_Folder/rmse/epoch/Result_With_Varying_reg_param_SVD.csv', index = False)
colate_df

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9076  0.9073  0.9093  0.9081  0.0009  
MAE (testset)     0.7212  0.7211  0.7230  0.7218  0.0008  
Fit time          9.06    9.30    9.07    9.14    0.11    
Test time         1.24    1.26    1.30    1.27    0.03    
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9070  0.9049  0.9053  0.9057  0.0009  
MAE (testset)     0.7198  0.7180  0.7186  0.7188  0.0007  
Fit time          18.17   18.25   18.15   18.19   0.04    
Test time         1.31    1.30    1.42    1.34    0.05    
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9028  0.9004  0.9021  0.9018  0.0010  
MAE (testset)     0.7158  0.7142  0.7151  0.7150  0.0007  
Fit time          27.23   27.12   27.31   27.22   0.08    
Test time   

Unnamed: 0,epoch,cv_mean,cv_std,rmse_train,rmse_test
0,10,0.908081,0.000886,2413.87294,2478.954587
1,20,0.905715,0.000897,2383.923312,2474.120482
2,30,0.901754,0.001,2350.395389,2464.674544
3,40,0.895558,0.000381,2299.167667,2441.266705
4,50,0.891669,0.002285,2250.955035,2431.812837
5,60,0.888575,0.000584,2205.718258,2416.109123
6,70,0.88601,0.000504,2170.263377,2414.98745
7,80,0.884995,0.000195,2136.985884,2409.071824
8,90,0.884023,0.002157,2108.939041,2407.23582
9,100,0.88371,0.002591,2082.002722,2404.90574
