In [1]:
import numpy as np
import pandas as pd
import random

from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise.model_selection import GridSearchCV

# One shared seed for every random number generator used in this notebook,
# so that repeated runs draw the same random numbers.
my_seed = 0
np.random.seed(my_seed)  # NumPy's legacy global generator
random.seed(my_seed)     # Python's built-in `random` module

In [2]:
# Load the train and test splits; both files are read with pandas' default
# settings, in the order train -> test.
train_data, test_data = map(
    pd.read_csv,
    (
        './Files_Folder/train_data.csv',
        './Files_Folder/test_data.csv',
    ),
)

In [3]:
# Show (rows, columns) of the train split, then the test split, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(495062, 12)
(27503, 12)


In [4]:
# Load the previously saved prediction files: one for the training rows and
# two for the test rows (the "mean" and "median" variants).
train_data_pred, test_data_mean_pred, test_data_median_pred = map(
    pd.read_csv,
    (
        './Files_Folder/Predicted_train_Result_SVD.csv',
        './Files_Folder/Predicted_test_SVD_mean.csv',
        './Files_Folder/Predicted_test_SVD_median.csv',
    ),
)

In [5]:
# Show (rows, columns) of each prediction frame, one per line, in the order
# train -> test (mean) -> test (median).
print(
    train_data_pred.shape,
    test_data_mean_pred.shape,
    test_data_median_pred.shape,
    sep='\n',
)

(495062, 3)
(27503, 3)
(27503, 3)
(27495, 3)
(8, 3)
(8, 3)


In [6]:
# Inspect the column names and dtypes of the training predictions as loaded,
# before the columns are renamed.
train_data_pred.dtypes

User_ID         int64
Product_ID     object
Purchase      float64
dtype: object

In [7]:
# Give all three prediction frames the same column names; the third column
# holds the predicted value, so it is labelled 'Predicted_Purchase'.
prediction_columns = ['User_ID', 'Product_ID', 'Predicted_Purchase']
for pred_frame in (train_data_pred, test_data_mean_pred, test_data_median_pred):
    pred_frame.columns = prediction_columns

In [9]:
# Attach the observed purchase amount to every prediction frame.
# Column assignment aligns on the index, so rows are paired by index label.
# NOTE(review): this presumes each prediction file keeps the same row order
# as the data file it was generated from - confirm.
for pred_frame, actual_frame in (
    (train_data_pred, train_data),
    (test_data_mean_pred, test_data),
    (test_data_median_pred, test_data),
):
    pred_frame['Actual_Purchase'] = actual_frame['Purchase']

In [10]:
# Signed prediction error per row: predicted minus actual purchase.
for pred_frame in (train_data_pred, test_data_mean_pred, test_data_median_pred):
    predicted = pred_frame['Predicted_Purchase']
    actual = pred_frame['Actual_Purchase']
    pred_frame['Error'] = predicted - actual

In [11]:
# Squared error per row, used for the RMSE computation.
for pred_frame in (train_data_pred, test_data_mean_pred, test_data_median_pred):
    residuals = pred_frame['Error']
    pred_frame['Error_sq'] = residuals * residuals

In [12]:
# Root-mean-squared error of each prediction set:
# sqrt(sum of squared errors / number of rows in the frame).
rmse_train_data_pred, rmse_test_data_mean_pred, rmse_test_data_median_pred = (
    np.sqrt(pred_frame['Error_sq'].sum() / len(pred_frame))
    for pred_frame in (train_data_pred, test_data_mean_pred, test_data_median_pred)
)

In [13]:
# Report the three RMSE values, one per line:
# train, test (mean variant), test (median variant).
print(
    rmse_train_data_pred,
    rmse_test_data_mean_pred,
    rmse_test_data_median_pred,
    sep='\n',
)

2043.858948138422
2408.484797052712
2408.714176669016


In [14]:
# Shape of the test (mean variant) frame after the Actual_Purchase, Error and
# Error_sq columns were added.
test_data_mean_pred.shape

(27503, 6)

In [15]:
# `test_data_newid` was never defined in any cell of this notebook (it only
# existed as leftover kernel state), so a fresh Restart & Run All stopped here
# with a NameError. It is rebuilt below so the cell is self-contained.
#
# NOTE(review): reconstruction - presumably the "new id" rows are the test
# rows whose User_ID or Product_ID never occurs in the training data.
# The previously saved output (27495 of 27503 rows kept) implies 8 such rows;
# verify that count after re-running and confirm the intended definition.
is_new_user = ~test_data_mean_pred['User_ID'].isin(train_data_pred['User_ID'])
is_new_product = ~test_data_mean_pred['Product_ID'].isin(train_data_pred['Product_ID'])
test_data_newid = test_data_mean_pred[is_new_user | is_new_product]

# Remove those rows (matched by index label) and show the remaining shape.
test_data_mean_pred_2 = test_data_mean_pred.drop(test_data_newid.index)
test_data_mean_pred_2.shape

(27495, 6)

In [16]:
# RMSE of the test (mean variant) predictions restricted to the rows kept in
# test_data_mean_pred_2: sqrt(sum of squared errors / number of rows).
np.sqrt(test_data_mean_pred_2['Error_sq'].sum() / len(test_data_mean_pred_2))

2408.406716042237

In [17]:
# Summary statistics of the training columns that describe() includes by
# default, for comparing the RMSE values against the spread of Purchase.
train_data.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,495062.0,495062.0,495062.0,495062.0,338906.0,150075.0,495062.0
mean,1003027.0,8.076748,0.40971,5.401764,9.844337,12.671438,9265.688885
std,1727.099,6.523106,0.491781,3.935342,5.087848,4.124031,5023.84293
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001514.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003075.0,7.0,0.0,5.0,9.0,14.0,8048.0
75%,1004473.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0
