In [1]:
import numpy as np
import pandas as pd


In [2]:
data = pd.read_csv('kpmg_encoded.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20495 entries, 0 to 20494
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   past_3_years_bike_related_purchases  20495 non-null  int64  
 1   owns_car                             20495 non-null  int64  
 2   tenure                               20495 non-null  float64
 3   property_valuation                   20495 non-null  float64
 4   order_status                         20495 non-null  int64  
 5   list_price                           20495 non-null  float64
 6   standard_cost                        20495 non-null  float64
 7   age                                  20495 non-null  float64
 8   Female                               20495 non-null  int64  
 9   Male                                 20495 non-null  int64  
 10  Affluent Customer                    20495 non-null  int64  
 11  High Net Worth              

In [4]:
#selecting the dependent and independent variables

x = data.loc[:, data.columns != 'order_status']
y = data.iloc[:,4].values

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =0.3, random_state = 42)

In [6]:
#FEATURE SCALING

from sklearn.preprocessing import StandardScaler
stanscaler = StandardScaler()
x_train = stanscaler.fit_transform(x_train)
x_test = stanscaler.transform(x_test)

### SMOTE Oversampling


In [7]:
#class label count after oversampling

print('Before Oversampling, count of labels "1": {}'.format(sum(y_train == 1)))
print('Before Oversampling, count of labels "0": {}'.format(sum(y_train == 0)))

Before Oversampling, count of labels "1": 14221
Before Oversampling, count of labels "0": 125


In [8]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [9]:

print('After Oversampling, count of labels "1": {}'.format(sum(y_train_res == 1)))
print('After Oversampling, count of labels "0": {}'.format(sum(y_train_res == 0)))

After Oversampling, count of labels "1": 14221
After Oversampling, count of labels "0": 14221


### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state = 0)
lr.fit(x_train_res, y_train_res)

LogisticRegression(random_state=0)

In [11]:
predlr_y = lr.predict(x_test)
predlr_y

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [12]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predlr_y))

              precision    recall  f1-score   support

           0       0.01      0.54      0.02        54
           1       0.99      0.57      0.72      6095

    accuracy                           0.57      6149
   macro avg       0.50      0.55      0.37      6149
weighted avg       0.98      0.57      0.72      6149



### Bagging Classifier

In [13]:
from sklearn.ensemble import BaggingClassifier

bg = BaggingClassifier()
bg.fit(x_train_res, y_train_res)

BaggingClassifier()

In [14]:
predbg_y = bg.predict(x_test)
predbg_y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [15]:
print(classification_report(y_test, predbg_y))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        54
           1       0.99      0.99      0.99      6095

    accuracy                           0.99      6149
   macro avg       0.50      0.50      0.50      6149
weighted avg       0.98      0.99      0.98      6149



### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 850)
rfc.fit(x_train_res, y_train_res.ravel())

RandomForestClassifier(n_estimators=850)

In [17]:
predrfc_y = rfc.predict(x_test)
predrfc_y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [18]:
print(classification_report(y_test, predrfc_y))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        54
           1       0.99      1.00      0.99      6095

    accuracy                           0.99      6149
   macro avg       0.50      0.50      0.50      6149
weighted avg       0.98      0.99      0.99      6149



### Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train_res, y_train_res)

GaussianNB()

In [20]:
prednb_y = nb.predict(x_test)
prednb_y

array([0, 1, 1, ..., 1, 1, 1], dtype=int64)

In [21]:
print(classification_report(y_test, prednb_y))

              precision    recall  f1-score   support

           0       0.01      0.56      0.02        54
           1       0.99      0.58      0.73      6095

    accuracy                           0.58      6149
   macro avg       0.50      0.57      0.38      6149
weighted avg       0.98      0.58      0.72      6149



### Support Vector Machine Algorithm

In [22]:
from sklearn.svm import SVC

svc = SVC(kernel = 'linear', random_state = 0)
svc.fit(x_train_res, y_train_res)

SVC(kernel='linear', random_state=0)

In [23]:
predsvc_y = svc.predict(x_test)

In [24]:
print(classification_report(y_test, predsvc_y))

              precision    recall  f1-score   support

           0       0.01      0.59      0.02        54
           1       0.99      0.52      0.68      6095

    accuracy                           0.52      6149
   macro avg       0.50      0.56      0.35      6149
weighted avg       0.98      0.52      0.68      6149



### USing BalancedBaggingClassifier to handle the imbalanced dataset

In [25]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bbc_dt = BalancedBaggingClassifier(base_estimator = DecisionTreeClassifier(),
                               sampling_strategy = 'auto',
                               replacement = False,
                               random_state = 0)

#fit the model
bbc_dt.fit(x_train, y_train)

BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                          random_state=0)

In [26]:
predbbc_dt = bbc_dt.predict(x_test)
print(classification_report(y_test, predbbc_dt))

              precision    recall  f1-score   support

           0       0.01      0.50      0.02        54
           1       0.99      0.62      0.76      6095

    accuracy                           0.62      6149
   macro avg       0.50      0.56      0.39      6149
weighted avg       0.98      0.62      0.75      6149



In [27]:
bbc_rf = BalancedBaggingClassifier(base_estimator = RandomForestClassifier(),
                               sampling_strategy = 'auto',
                               replacement = False,
                               random_state = 42)

#fit the model
bbc_rf.fit(x_train, y_train)

predbbc_rf = bbc_rf.predict(x_test)
print(classification_report(y_test, predbbc_rf))

              precision    recall  f1-score   support

           0       0.01      0.24      0.02        54
           1       0.99      0.80      0.88      6095

    accuracy                           0.79      6149
   macro avg       0.50      0.52      0.45      6149
weighted avg       0.98      0.79      0.88      6149



## Using The New Customers dataset

In [28]:
#importing the dataset

newdata = pd.read_csv('newcustomerslistencoded.csv')

In [29]:
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   past_3_years_bike_related_purchases  1000 non-null   int64  
 1   owns_car                             1000 non-null   int64  
 2   tenure                               1000 non-null   int64  
 3   property_valuation                   1000 non-null   int64  
 4   Rank                                 1000 non-null   int64  
 5   Value                                1000 non-null   float64
 6   age                                  1000 non-null   float64
 7   Affluent Customer                    1000 non-null   int64  
 8   High Net Worth                       1000 non-null   int64  
 9   Mass Customer                        1000 non-null   int64  
 10  Female                               1000 non-null   int64  
 11  Male                           

In [30]:
#using Random Forest Classifier
rfc.predict(newdata)

array([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [31]:
#using balancedbagging and decision tree

bbc_dt.predict(newdata)

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [34]:
#using balancedbagging and random forest

status = bbc_rf.predict(newdata)
status.to_csv('predictions.csv', index = False)

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

From the above predictions, we would be going forward with balancedbagging classifier based on random forest. as it gives us an accuracy of 79% which is good enough.