In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from matplotlib.colors import ListedColormap
from imblearn import under_sampling, over_sampling

In [None]:
df = pd.read_excel('E_Commerce_Dataset.xlsx', sheet_name='E Comm')
df = df[['Tenure', 'PreferredLoginDevice', 'CityTier', 'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
          'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore', 'MaritalStatus', 'NumberOfAddress', 'Complain', 'OrderAmountHikeFromlastYear',
          'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'CashbackAmount', 'Churn']]
df.head()

Unnamed: 0,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,Churn
0,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,159.93,1
1,,Phone,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,120.9,1
2,,Phone,1,30.0,Debit Card,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120.28,1
3,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134.07,1
4,0.0,Phone,1,12.0,CC,Male,,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,129.6,1


In [None]:
#imputation

#except hours spend on app, the missing values will be impute with median and hours spend on app using mean
df['Tenure'] = df['Tenure'].fillna(df['Tenure'].median())
df['WarehouseToHome'] = df['WarehouseToHome'].fillna(df['WarehouseToHome'].median())
df['HourSpendOnApp'] = df['HourSpendOnApp'].fillna(df['HourSpendOnApp'].mean())
df['DaySinceLastOrder'] = df['DaySinceLastOrder'].fillna(df['DaySinceLastOrder'].median())
df['OrderCount'] = df['OrderCount'].fillna(df['OrderCount'].median())
df['CouponUsed'] = df['CouponUsed'].fillna(df['CouponUsed'].median())
df['OrderAmountHikeFromlastYear'] = df['OrderAmountHikeFromlastYear'].fillna(df['OrderAmountHikeFromlastYear'].median())

In [None]:
#data Imbalance handling
df['Churn'].value_counts()

0    4682
1     948
Name: Churn, dtype: int64

In [None]:
# split dataset
X = df.iloc[:, :18].values
y = df.iloc[:, 18].values

le = LabelEncoder()
X[:, 5] = le.fit_transform(X[:, 5])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 4] = le.fit_transform(X[:, 4])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 10] = le.fit_transform(X[:, 10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
#logistic regression

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

logreg = LogisticRegression(random_state = 0)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

cf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
accuracy_score(y_test, y_pred)

0.872113676731794

In [None]:
cf_matrix

array([[900,  36],
       [108,  82]])

In [None]:
# imbalance evaluate using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
logreg_2 = LogisticRegression(random_state = 0)
logreg_2.fit(X_res, y_res)

y_pred = logreg_2.predict(X_test)

cf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
cf_matrix

array([[719, 217],
       [ 37, 153]])

In [None]:
accuracy_score(y_test, y_pred)

0.7744227353463587

In [None]:
# SVM

X = df.iloc[:, :18].values
y = df.iloc[:, 18].values

le = LabelEncoder()
X[:, 5] = le.fit_transform(X[:, 5])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 4] = le.fit_transform(X[:, 4])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 10] = le.fit_transform(X[:, 10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = SVC()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[922,  14],
       [ 84, 106]])

In [None]:
accuracy_score(y_test, y_pred)

0.9129662522202486

In [None]:
# imbalance evaluate using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

svm_2 = SVC(random_state = 0)
svm_2.fit(X_res, y_res)

y_pred = svm_2.predict(X_test)

cf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
cf_matrix

array([[850,  86],
       [ 18, 172]])

In [None]:
accuracy_score(y_test, y_pred)

0.9076376554174067

In [None]:
# KNN

X = df.iloc[:, :18].values
y = df.iloc[:, 18].values

le = LabelEncoder()
X[:, 5] = le.fit_transform(X[:, 5])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 4] = le.fit_transform(X[:, 4])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 10] = le.fit_transform(X[:, 10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[927,  14],
       [ 67, 118]])

In [None]:
accuracy_score(y_test, y_pred)

0.9280639431616341

In [None]:
# imbalance evaluate using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

knn_2 = KNeighborsClassifier()
knn_2.fit(X_res, y_res)

y_pred = knn_2.predict(X_test)

cf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
cf_matrix

array([[839, 102],
       [ 11, 174]])

In [None]:
accuracy_score(y_test, y_pred)

0.8996447602131439

In [None]:
#Decision Tree

X = df.iloc[:, :18].values
y = df.iloc[:, 18].values

le = LabelEncoder()
X[:, 5] = le.fit_transform(X[:, 5])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 4] = le.fit_transform(X[:, 4])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 10] = le.fit_transform(X[:, 10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[921,  20],
       [ 19, 166]])

In [None]:
accuracy_score(y_test, y_pred)

0.9653641207815276

In [None]:
# imbalance evaluate using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

dt_2 = DecisionTreeClassifier()
dt_2.fit(X_res, y_res)

y_pred = dt_2.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[907,  34],
       [ 27, 158]])

In [None]:
accuracy_score(y_test, y_pred)

0.9458259325044405

In [None]:
#Naive Bayes

X = df.iloc[:, :18].values
y = df.iloc[:, 18].values

le = LabelEncoder()
X[:, 5] = le.fit_transform(X[:, 5])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 4] = le.fit_transform(X[:, 4])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 10] = le.fit_transform(X[:, 10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[858,  83],
       [ 66, 119]])

In [None]:
accuracy_score(y_test, y_pred)

0.8676731793960923

In [None]:
# imbalance evaluate using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

nb_2 = GaussianNB()
nb_2.fit(X_res, y_res)

y_pred = nb_2.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[615, 326],
       [ 43, 142]])

In [None]:
accuracy_score(y_test, y_pred)

0.672291296625222

In [None]:
#Random Forest

X = df.iloc[:, :18].values
y = df.iloc[:, 18].values

le = LabelEncoder()
X[:, 5] = le.fit_transform(X[:, 5])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 4] = le.fit_transform(X[:, 4])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 10] = le.fit_transform(X[:, 10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[938,   3],
       [ 18, 167]])

In [None]:
accuracy_score(y_test, y_pred)

0.9813499111900533

In [None]:
# imbalance evaluate using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

rf_2 = RandomForestClassifier()
rf_2.fit(X_res, y_res)

y_pred = rf_2.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[933,   8],
       [ 18, 167]])

In [None]:
accuracy_score(y_test, y_pred)

0.9769094138543517

In [None]:
# XGBoost

X = df.iloc[:, :18].values
y = df.iloc[:, 18].values

le = LabelEncoder()
X[:, 5] = le.fit_transform(X[:, 5])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 4] = le.fit_transform(X[:, 4])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 10] = le.fit_transform(X[:, 10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = XGBClassifier()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[926,  15],
       [ 71, 114]])

In [None]:
accuracy_score(y_test, y_pred)

0.9236234458259325

In [None]:
# imbalance evaluate using SMOTE

from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X_train, y_train)

xg_2 = XGBClassifier()
xg_2.fit(X_res, y_res)

y_pred = xg_2.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[892,  49],
       [ 53, 132]])

In [None]:
accuracy_score(y_test, y_pred)

0.9094138543516874

In [None]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'criterion': ['entropy'], 'max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
              {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'criterion': ['gini'], 'max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]
grid_search = GridSearchCV(estimator = rf_2,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_res, y_res)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 96.34 %
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 60}


Best Model : Random Forest