## Importing The Required Libraries

In [2]:
import seaborn as sns # plotting graphs
import matplotlib.pylab as plt # plotting graphs
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Show up to 30 columns when a DataFrame is displayed. The fully qualified key is
# required: the bare pattern 'max_columns' also matches 'styler.render.max_columns'
# on recent pandas versions, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 30)
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/sample.csv


## Importing The Datasets

In [4]:
# Load the labelled training set and the unlabelled test set from the competition files.
train_data = pd.read_csv("/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv")
test_data = pd.read_csv("/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv")

### Exploratory Data Analysis

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe()

Unnamed: 0,HomePage,HomePage_Duration,LandingPage,LandingPage_Duration,ProductDescriptionPage,ProductDescriptionPage_Duration,GoogleMetric:Bounce Rates,GoogleMetric:Exit Rates,GoogleMetric:Page Values,SeasonalPurchase,OS,SearchEngine,Zone,Type of Traffic,WeekendPurchase
count,14578.0,14581.0,14578.0,14596.0,14608.0,14564.0,14580.0,14602.0,14599.0,14581.0,14597.0,14609.0,14614.0,14588.0,14610.0
mean,2.25024,79.300762,0.490739,33.455943,31.559488,1184.346084,0.023366,0.044664,4.81262,0.064083,2.122422,2.356629,3.155673,4.090143,0.234155
std,3.288042,179.374699,1.252376,140.146256,44.897089,2009.496307,0.050011,0.049912,16.887366,0.202583,0.914404,1.721823,2.405155,4.040147,0.423484
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,7.0,173.1875,0.0,0.014502,0.0,0.0,2.0,2.0,1.0,2.0,0.0
50%,1.0,5.0,0.0,0.0,17.5,584.333333,0.003478,0.026406,0.0,0.0,2.0,2.0,3.0,2.0,0.0
75%,3.0,91.0,0.0,0.0,38.0,1434.255128,0.018182,0.05,0.0,0.0,3.0,2.0,4.0,4.0,0.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0,1.0


#### Determining the type of problem by using the target variable

In [6]:
# The target takes only two values (False / True), so this is a binary
# classification problem: predict whether a customer will make a purchase.
# In a real-world project the target is not handed over ready-made; it has to be
# derived from the nature of the problem being solved.
train_data['Made_Purchase'].unique()

array([False,  True])

#### Observing the columns and their missing values

In [7]:
# Count the missing entries per column.
# Every feature is missing roughly 1% of its values; the target has none missing.
train_data.isna().sum()

HomePage                           153
HomePage_Duration                  150
LandingPage                        153
LandingPage_Duration               135
ProductDescriptionPage             123
ProductDescriptionPage_Duration    167
GoogleMetric:Bounce Rates          151
GoogleMetric:Exit Rates            129
GoogleMetric:Page Values           132
SeasonalPurchase                   150
Month_SeasonalPurchase             144
OS                                 134
SearchEngine                       122
Zone                               117
Type of Traffic                    143
CustomerType                       144
Gender                             145
Cookies Setting                    144
Education                          136
Marital Status                     130
WeekendPurchase                    121
Made_Purchase                        0
dtype: int64

#### Splitting the data and label columns

In [8]:
# Feature matrix: every column except the target.
X = train_data.drop(columns=['Made_Purchase'])
# Target as a flat 1-D array, the shape scikit-learn estimators expect for y.
y = train_data['Made_Purchase'].to_numpy()

#### One-hot encoding the categorical columns

In [10]:
# One-hot encode the categorical features, dropping the first level of each one
# to avoid perfectly collinear dummy columns.
# drop_first expects a boolean; the original string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have enabled it as well).
# NOTE: with the default dummy_na=False, a row whose category is missing gets 0 in
# every dummy column of that feature, so after encoding it is indistinguishable
# from the dropped first level and is no longer NaN.
X1 = pd.get_dummies(
    X,
    columns=[
        'Month_SeasonalPurchase',
        'CustomerType',
        'Gender',
        'Cookies Setting',
        'Education',
        'Marital Status',
        'WeekendPurchase',
    ],
    drop_first=True,
)

In [11]:
# Wrap the encoded features in a DataFrame and display them for inspection.
# NOTE(review): X1 comes from pd.get_dummies, which already returns a DataFrame,
# so this wrap looks redundant — confirm before removing.
X1 = pd.DataFrame(X1)
X1

Unnamed: 0,HomePage,HomePage_Duration,LandingPage,LandingPage_Duration,ProductDescriptionPage,ProductDescriptionPage_Duration,GoogleMetric:Bounce Rates,GoogleMetric:Exit Rates,GoogleMetric:Page Values,SeasonalPurchase,OS,SearchEngine,Zone,Type of Traffic,Month_SeasonalPurchase_Dec,...,Month_SeasonalPurchase_Nov,Month_SeasonalPurchase_Oct,Month_SeasonalPurchase_Sep,CustomerType_Other,CustomerType_Returning_Visitor,Gender_Male,Gender_Not Specified,Cookies Setting_Deny,Cookies Setting_Required,Education_Graduate,Education_Not Specified,Education_Others,Marital Status_Other,Marital Status_Single,WeekendPurchase_1.0
0,0.0,0.000000,0.0,0.0,1.0,0.000000,0.200000,0.200000,0.0,0.0,4.0,1.0,9.0,3.0,0,...,0,0,0,0,1,0,1,1,0,0,1,0,1,0,0
1,0.0,0.000000,0.0,0.0,2.0,2.666667,0.050000,0.140000,0.0,0.0,3.0,2.0,2.0,4.0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0
2,0.0,0.000000,0.0,0.0,10.0,627.500000,0.020000,0.050000,0.0,0.0,3.0,3.0,1.0,4.0,0,...,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
3,0.0,0.000000,0.0,0.0,1.0,0.000000,0.200000,0.200000,0.0,0.4,2.0,4.0,3.0,3.0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0
4,1.0,0.000000,0.0,0.0,0.0,0.000000,0.200000,0.200000,0.0,0.0,1.0,2.0,1.0,5.0,0,...,0,0,0,0,1,1,0,1,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14726,1.0,4.000000,0.0,0.0,39.0,983.138889,0.015385,0.017599,0.0,0.0,3.0,2.0,6.0,3.0,0,...,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0
14727,8.0,117.023809,2.0,57.0,11.0,252.892857,0.000000,0.011078,0.0,0.0,2.0,2.0,2.0,4.0,0,...,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
14728,2.0,75.600000,2.0,652.8,10.0,1143.666667,0.000000,0.023333,0.0,0.0,2.0,2.0,4.0,2.0,0,...,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0
14729,0.0,0.000000,0.0,0.0,6.0,1057.000000,0.000000,0.033333,0.0,0.0,2.0,4.0,4.0,1.0,0,...,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0


#### Imputing the ~1% missing values with the median

In [12]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the median of its column. The result is a NumPy array,
# so X1 no longer carries column names after this cell.
# NOTE(review): the imputer is fitted before the train/validation split below, so
# the medians are computed from all rows, including the ones later held out.
imp_median = SimpleImputer(strategy='median', missing_values=np.nan)
X1 = imp_median.fit(X1).transform(X1)

#### Splitting the data into train and test

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X1_train, X1_test, y_train, y_test = train_test_split(
    X1,
    y,
    random_state=42,
    test_size=0.2,
)

#### Training LogisticRegression model on the Dataset

In [14]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression baseline and report its accuracy on the held-out split.
lr = LogisticRegression(max_iter=1000, solver="lbfgs", random_state=0)
lr.fit(X1_train, y_train)
lr.score(X1_test, y_test)

0.6823888700373261

#### Training AdaBoostClassifier

In [15]:
from sklearn.ensemble import AdaBoostClassifier

# Fit an AdaBoost ensemble and report its accuracy on the held-out split.
abc = AdaBoostClassifier(
    random_state=0,
    learning_rate=0.4,
    n_estimators=160,
)
abc.fit(X1_train, y_train)
abc1 = abc  # fit() returns the estimator itself; abc1 is the name the prediction cell uses
abc1.score(X1_test, y_test)

0.6901934170342722

#### Training HistGradientBoostingClassifier

In [16]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Fit a histogram-based gradient-boosting model and report its accuracy on the
# held-out split.
hgbc = HistGradientBoostingClassifier(
    random_state=0,
    learning_rate=0.07,
    max_iter=150,
    max_leaf_nodes=26,
)
hgbc.fit(X1_train, y_train)
hgbc1 = hgbc  # fit() returns the estimator itself; hgbc1 is the name the prediction cell uses
hgbc1.score(X1_test, y_test)

0.6935866983372921

# Test Data

In [17]:
# X2 is bound to the same DataFrame object as test_data (no copy is made here).
X2 = test_data
# Display the raw, not-yet-encoded test features for inspection.
X2

Unnamed: 0,HomePage,HomePage_Duration,LandingPage,LandingPage_Duration,ProductDescriptionPage,ProductDescriptionPage_Duration,GoogleMetric:Bounce Rates,GoogleMetric:Exit Rates,GoogleMetric:Page Values,SeasonalPurchase,Month_SeasonalPurchase,OS,SearchEngine,Zone,Type of Traffic,CustomerType,Gender,Cookies Setting,Education,Marital Status,WeekendPurchase
0,0.0,0.000000,0.0,0.0,1.0,0.000000,0.200000,0.200000,0.0,0.0,Feb,1.0,1.0,1.0,1.0,Returning_Visitor,Not Specified,Deny,Not Specified,Single,0.0
1,0.0,0.000000,0.0,0.0,2.0,64.000000,0.000000,0.100000,0.0,0.0,Feb,2.0,2.0,1.0,2.0,Returning_Visitor,Not Specified,ALL,Graduate,Married,0.0
2,0.0,0.000000,0.0,0.0,19.0,154.216667,0.015789,0.024561,0.0,0.0,Feb,2.0,2.0,1.0,3.0,Returning_Visitor,Female,Required,Diploma,Other,0.0
3,0.0,0.000000,0.0,0.0,2.0,37.000000,0.000000,0.100000,0.0,0.8,Feb,2.0,2.0,2.0,3.0,Returning_Visitor,Not Specified,Required,Graduate,Other,0.0
4,0.0,0.000000,0.0,0.0,16.0,407.750000,0.018750,0.025833,0.0,0.4,Feb,1.0,1.0,4.0,3.0,Returning_Visitor,Female,Deny,Others,Other,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6594,0.0,0.000000,0.0,0.0,7.0,208.000000,0.000000,0.028571,0.0,0.0,Feb,4.0,1.0,1.0,5.0,Returning_Visitor,Not Specified,Required,Graduate,Other,1.0
6595,0.0,0.000000,3.0,44.0,179.0,1738.472529,0.000027,0.025998,0.0,0.0,Aug,2.0,4.0,9.0,11.0,Returning_Visitor,Male,Required,Not Specified,Other,0.0
6596,5.0,99.166667,1.0,27.0,33.0,,0.002778,0.009127,0.0,0.6,May,8.0,5.0,1.0,2.0,Returning_Visitor,Female,Required,Graduate,Married,0.0
6597,0.0,0.000000,0.0,0.0,3.0,9.000000,0.066667,0.133333,0.0,0.0,May,2.0,2.0,2.0,3.0,Returning_Visitor,Not Specified,Deny,Diploma,Other,1.0


#### One-hot encoding the test data

In [18]:
# One-hot encode the test set with the same column list and settings used for the
# training features, so both go through the same encoding step.
# drop_first expects a boolean; the original string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have enabled it as well).
# NOTE: the dummy columns produced here depend on the categories present in the
# test data; they match the training columns only if both sets contain the same
# categories.
X2 = pd.get_dummies(
    X2,
    columns=[
        'Month_SeasonalPurchase',
        'CustomerType',
        'Gender',
        'Cookies Setting',
        'Education',
        'Marital Status',
        'WeekendPurchase',
    ],
    drop_first=True,
)

#### Imputing the ~1% missing values with median

In [19]:
# Fill the missing test values with the medians learned from the TRAINING features.
# The original cell fitted a brand-new imputer on the test set, so test rows were
# filled with test-set medians instead of the values the models were trained with.
# imp_median is the imputer fitted in the training imputation cell; it is reused
# here and only transform() is called.
#
# Re-indexing to the imputer's fitted column list makes the test matrix have exactly
# the training columns, in the training order: a dummy column missing from the test
# set is added as all zeros, and a column never seen in training is dropped.
# (feature_names_in_ is available because the imputer was fitted on a DataFrame.)
X2 = X2.reindex(columns=imp_median.feature_names_in_, fill_value=0)
X2 = imp_median.transform(X2)

#### Using the trained models on the test data

In [20]:
# Predict the test set with each trained model, in this order:
#   final_out  - LogisticRegression
#   final_ada  - AdaBoostClassifier
#   final_hgbc - HistGradientBoostingClassifier
final_out, final_ada, final_hgbc = (
    model.predict(X2) for model in (lr, abc1, hgbc1)
)

#### Transforming the output into Pandas Dataframe

In [23]:
# Wrap each prediction array in a single-column DataFrame named after the target.
final_out, final_ada, final_hgbc = (
    pd.DataFrame(predictions, columns=['Made_Purchase'])
    for predictions in (final_out, final_ada, final_hgbc)
)

#### Naming the index "id"

In [24]:
# Label the row index of every prediction frame as 'id'.
for frame in (final_out, final_ada, final_hgbc):
    frame.index.name = 'id'

In [23]:
# final_out.to_csv("submission.csv")

In [24]:
# final_ada.to_csv("submission.csv")

In [25]:
# Write the HistGradientBoostingClassifier predictions to the submission file.
# to_csv writes the index by default, so it ends up as the first column of the CSV.
final_hgbc.to_csv("submission.csv")