In [1]:
import numpy as np
import pandas as pd
import random

from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise.model_selection import GridSearchCV

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

# Train data set with Normalized Purchase
User_Prod_NP = pd.read_csv('./Files_Folder/User_Prod_NormPurc.csv')

# Statistics of each Product_ID in train data sets
train_Prod_sta = pd.read_csv('./Files_Folder/train_User_Prod_stat.csv') 

# Statistics of new Product_IDs; those are Prodcut_IDs in test data set but not in train data set
New_Prod_LUT = pd.read_csv('./Files_Folder/New_Product_Details.csv')

# Hold Data With Purchase
hold_data = pd.read_csv('./Files_Folder/hold_data.csv')

# Hold Out Data Set
hold_data_sub = pd.read_csv('./Files_Folder/hold_data_sub.csv')

# Train Data Set
train_data = pd.read_csv('./Files_Folder/train_data.csv')

algo_SVD = SVD(n_epochs=130, lr_all = 0.005, reg_all = 0.1)

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(User_Prod_NP[['User_ID', 'Product_ID', 'Normalized_Purchase']], reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(algo_SVD, data, cv=3, verbose = True)

new_prod = list(New_Prod_LUT.Product_ID)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8849  0.8834  0.8820  0.8834  0.0012  
MAE (testset)     0.6989  0.6983  0.6958  0.6977  0.0013  
Fit time          127.77  127.55  126.95  127.42  0.34    
Test time         1.48    1.41    1.30    1.40    0.08    


In [2]:
# Checking out the hold data set and computing the rmse
hold_user_list = []
hold_prod_list = []
price = []
for i in range(hold_data_sub.shape[0]):
    if hold_data_sub.Product_ID[i] in new_prod:
        p = New_Prod_LUT[New_Prod_LUT.Product_ID == hold_data_sub.Product_ID[i]]['mean'].iloc[0]      
    else:
        p1 = algo_SVD.predict(hold_data_sub.User_ID[i],hold_data_sub.Product_ID[i]).est
        rmin = train_Prod_sta.loc[3, hold_data_sub.Product_ID[i]]
        rmax = train_Prod_sta.loc[7, hold_data_sub.Product_ID[i]]
        dmin = 1
        dmax = 5
        p = (p1 - dmin)*(rmax - rmin)/(dmax - dmin) + rmin
    price.append(p)
    hold_user_list.append(hold_data_sub.User_ID[i])
    hold_prod_list.append(hold_data_sub.Product_ID[i])
final_tally = {'User_ID':hold_user_list, 'Product_ID':hold_prod_list, 'Purchase': price}
final_tally_df = pd.DataFrame(final_tally)
final_tally_df.to_csv('./Files_Folder/Predicted_hold_SVD_mean.csv', index=False)

In [3]:
final_tally_df['Actual_Purchase'] = hold_data['Purchase']
final_tally_df['Error'] = final_tally_df['Purchase'] - final_tally_df['Actual_Purchase']
final_tally_df['Error_sq'] = final_tally_df['Error']*final_tally_df['Error']
rmse_hold_data_pred = np.sqrt(final_tally_df.Error_sq.sum()/final_tally_df.shape[0])
print (rmse_hold_data_pred)

2414.796149517017
