In [1]:
import numpy as np
import pandas as pd


In [2]:
data = pd.read_csv('kpmg_encoded.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20495 entries, 0 to 20494
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   past_3_years_bike_related_purchases  20495 non-null  int64  
 1   owns_car                             20495 non-null  int64  
 2   tenure                               20495 non-null  float64
 3   property_valuation                   20495 non-null  float64
 4   online_order                         20495 non-null  float64
 5   order_status                         20495 non-null  int64  
 6   list_price                           20495 non-null  float64
 7   standard_cost                        20495 non-null  float64
 8   age                                  20495 non-null  float64
 9   F                                    20495 non-null  int64  
 10  M                                    20495 non-null  int64  
 11  Affluent Customer           

In [4]:
#selecting the dependent and independent variables

x = data.loc[:, data.columns != 'order_status']
y = data.iloc[:,5].values

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =0.3, random_state = 42)

In [6]:
#FEATURE SCALING

from sklearn.preprocessing import StandardScaler
stanscaler = StandardScaler()
x_train = stanscaler.fit_transform(x_train)
x_test = stanscaler.transform(x_test)

### SMOTE Oversampling


In [7]:
#class label count after oversampling

print('Before Oversampling, count of labels "1": {}'.format(sum(y_train == 1)))
print('Before Oversampling, count of labels "0": {}'.format(sum(y_train == 0)))

Before Oversampling, count of labels "1": 14221
Before Oversampling, count of labels "0": 125


In [8]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [9]:

print('After Oversampling, count of labels "1": {}'.format(sum(y_train_res == 1)))
print('After Oversampling, count of labels "0": {}'.format(sum(y_train_res == 0)))

After Oversampling, count of labels "1": 14221
After Oversampling, count of labels "0": 14221


### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state = 0)
lr.fit(x_train_res, y_train_res)

LogisticRegression(random_state=0)

In [11]:
predlr_y = lr.predict(x_test)
predlr_y

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [12]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predlr_y))

              precision    recall  f1-score   support

           0       0.01      0.33      0.01        54
           1       0.99      0.57      0.73      6095

    accuracy                           0.57      6149
   macro avg       0.50      0.45      0.37      6149
weighted avg       0.98      0.57      0.72      6149



### Bagging Classifier

In [13]:
from sklearn.ensemble import BaggingClassifier

bg = BaggingClassifier()
bg.fit(x_train_res, y_train_res)

BaggingClassifier()

In [14]:
predbg_y = bg.predict(x_test)
predbg_y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [15]:
print(classification_report(y_test, predbg_y))

              precision    recall  f1-score   support

           0       0.02      0.02      0.02        54
           1       0.99      0.99      0.99      6095

    accuracy                           0.98      6149
   macro avg       0.51      0.51      0.51      6149
weighted avg       0.98      0.98      0.98      6149



### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 850)
rfc.fit(x_train_res, y_train_res.ravel())

RandomForestClassifier(n_estimators=850)

In [17]:
predrfc_y = rfc.predict(x_test)
predrfc_y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [18]:
print(classification_report(y_test, predrfc_y))

              precision    recall  f1-score   support

           0       0.17      0.02      0.03        54
           1       0.99      1.00      1.00      6095

    accuracy                           0.99      6149
   macro avg       0.58      0.51      0.51      6149
weighted avg       0.98      0.99      0.99      6149



### Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train_res, y_train_res)

GaussianNB()

In [20]:
prednb_y = nb.predict(x_test)
prednb_y

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [21]:
print(classification_report(y_test, prednb_y))

              precision    recall  f1-score   support

           0       0.01      0.56      0.02        54
           1       0.99      0.52      0.68      6095

    accuracy                           0.52      6149
   macro avg       0.50      0.54      0.35      6149
weighted avg       0.98      0.52      0.68      6149



### Support Vector Machine Algorithm

In [22]:
from sklearn.svm import SVC

svc = SVC(kernel = 'linear', random_state = 0)
svc.fit(x_train_res, y_train_res)

SVC(kernel='linear', random_state=0)

In [23]:
predsvc_y = svc.predict(x_test)

In [24]:
print(classification_report(y_test, predsvc_y))

              precision    recall  f1-score   support

           0       0.01      0.43      0.01        54
           1       0.99      0.51      0.67      6095

    accuracy                           0.51      6149
   macro avg       0.50      0.47      0.34      6149
weighted avg       0.98      0.51      0.67      6149

