In [1]:
import numpy as np
import pandas as pd
import random

from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise.model_selection import GridSearchCV

In [2]:
# Pin the random seeds so that repeated runs of the notebook give the same
# results. Both Python's `random` module and NumPy's global generator are
# seeded with the same value.
my_seed = 0
np.random.seed(my_seed)
random.seed(my_seed)

In [3]:
# Load the complete data set. The train / test / hold-out partitions used for
# the analysis are all drawn from this single frame.
All_Data = pd.read_csv('./Files_Folder/All_Data_Set.csv')

# Preview the first two rows.
All_Data.head(n=2)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200


In [4]:
# Split the full data set into three disjoint parts:
#   * hold-out : 5% of all rows, sampled first
#   * test     : as many rows as the hold-out, sampled from what remains
#   * train    : every row that is left after the two samples
# Both samples use the notebook-wide `my_seed`, so the split is reproducible
# and follows the seed chosen at the top of the notebook.

# Rows are removed by index label below; that is only correct when every
# label identifies exactly one row.
assert All_Data.index.is_unique, 'All_Data index must be unique for a label-based split'

hold_data = All_Data.sample(frac=0.05, random_state=my_seed)
train_test_set = All_Data.drop(hold_data.index)
test_data = train_test_set.sample(n=hold_data.shape[0], random_state=my_seed)
train_data = train_test_set.drop(test_data.index)

# Sanity checks: the three parts must not overlap and must cover every row.
assert hold_data.index.intersection(test_data.index).empty
assert train_data.index.intersection(test_data.index).empty
assert train_data.index.intersection(hold_data.index).empty
assert len(train_data) + len(test_data) + len(hold_data) == len(All_Data)

print('Shape of All Data = ', All_Data.shape)
print('Shape of Train Data = ', train_data.shape)
print('Shape of Test Data = ', test_data.shape)
print('Shape of Hold-Out Data = ', hold_data.shape)

Shape of All Data =  (550068, 12)
Shape of Train Data =  (495062, 12)
Shape of Test Data =  (27503, 12)
Shape of Hold-Out Data =  (27503, 12)


In [5]:
# Show the first two rows of the full data set again for reference.
All_Data.head(n=2)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200


In [6]:
# Preview the first five rows of the training partition.
train_data.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [7]:
# Preview the first five rows of the test partition.
test_data.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
347990,1005599,P00165842,M,36-45,7,A,1,0,14,,,11288
172664,1002727,P00345842,M,26-35,11,C,0,0,2,8.0,14.0,3599
238751,1000855,P00302042,F,18-25,2,A,2,1,5,,,3676
184142,1004392,P00199442,M,18-25,4,C,1,1,5,9.0,,1753
109411,1004831,P00031042,F,26-35,1,C,1,0,8,,,9904


In [8]:
# Preview the first five rows of the hold-out partition.
hold_data.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
41001,1000306,P00093542,M,18-25,0,C,3,0,1,6.0,,15228
16151,1002478,P0097142,M,51-55,1,A,0,0,12,,,1088
507262,1000129,P00129842,M,26-35,11,C,2,0,6,8.0,15.0,16426
368698,1002816,P00025442,M,26-35,7,C,4+,1,1,2.0,9.0,12042
353782,1000514,P00220442,M,26-35,2,A,1,1,5,14.0,,8699


In [9]:
# Build copies of the test and hold-out partitions without the 'Purchase'
# column, then report the shape of every partition.
test_data_sub = test_data.drop(columns=['Purchase'])
hold_data_sub = hold_data.drop(columns=['Purchase'])

print('Shape of All Data = ', All_Data.shape)
print('Shape of Train Data = ', train_data.shape)
print('Shape of Test Data (No Purchase column) = ', test_data_sub.shape)
print('Shape of Hold-Out Data (No Purchase column) = ', hold_data_sub.shape)

Shape of All Data =  (550068, 12)
Shape of Train Data =  (495062, 12)
Shape of Test Data (No Purchase column) =  (27503, 11)
Shape of Hold-Out Data (No Purchase column) =  (27503, 11)


In [10]:
# Preview the test partition without the 'Purchase' column.
test_data_sub.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
347990,1005599,P00165842,M,36-45,7,A,1,0,14,,
172664,1002727,P00345842,M,26-35,11,C,0,0,2,8.0,14.0
238751,1000855,P00302042,F,18-25,2,A,2,1,5,,
184142,1004392,P00199442,M,18-25,4,C,1,1,5,9.0,
109411,1004831,P00031042,F,26-35,1,C,1,0,8,,


In [11]:
# Preview the hold-out partition without the 'Purchase' column.
hold_data_sub.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
41001,1000306,P00093542,M,18-25,0,C,3,0,1,6.0,
16151,1002478,P0097142,M,51-55,1,A,0,0,12,,
507262,1000129,P00129842,M,26-35,11,C,2,0,6,8.0,15.0
368698,1002816,P00025442,M,26-35,7,C,4+,1,1,2.0,9.0
353782,1000514,P00220442,M,26-35,2,A,1,1,5,14.0,


In [12]:
# Persist every partition as its own CSV file (row index not written).

# Training data, used for model development.
train_data.to_csv('./Files_Folder/train_data.csv', index=False)

# Test and hold-out partitions including 'Purchase': the test file is used
# for model optimization, the hold-out file holds the final reference answers.
test_data.to_csv('./Files_Folder/test_data.csv', index=False)
hold_data.to_csv('./Files_Folder/hold_data.csv', index=False)

# The same two partitions with the 'Purchase' column removed.
test_data_sub.to_csv('./Files_Folder/test_data_sub.csv', index=False)
hold_data_sub.to_csv('./Files_Folder/hold_data_sub.csv', index=False)

In [13]:
# One needs to determine the Product_IDs that are present in the test data set but not present in the train data set