#### Preprocessing

In [1]:
# Importing modules
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the customer churn CSV and discard the CustomerID column in the same
# expression, so re-running this cell always rebuilds df from the file.
df = pd.read_csv('customer-churn-data.csv').drop(columns=['CustomerID'])
df.head()

Unnamed: 0,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,62,Other,45.15,5892.58,5,22,453.8,2,0,3,129,True,Responded,True
1,65,Male,79.51,9025.47,13,77,22.9,2,2,3,227,False,Responded,False
2,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True
3,21,Other,79.63,9110.3,3,33,411.83,5,3,5,226,True,Ignored,True
4,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False


In [3]:
# Distinct Gender labels, followed by the number of rows carrying each label
gender_values = df['Gender']
gender_values.unique(), gender_values.value_counts()

(array(['Other', 'Male', 'Female'], dtype=object),
 Gender
 Female    342
 Male      334
 Other     324
 Name: count, dtype: int64)

In [4]:
# Distinct PromotionResponse labels, followed by the number of rows carrying each label
promotion_values = df['PromotionResponse']
promotion_values.unique(), promotion_values.value_counts()

(array(['Responded', 'Ignored', 'Unsubscribed'], dtype=object),
 PromotionResponse
 Unsubscribed    361
 Responded       338
 Ignored         301
 Name: count, dtype: int64)

In [5]:
# Distinct EmailOptIn flags, followed by the number of rows carrying each flag
opt_in_values = df['EmailOptIn']
opt_in_values.unique(), opt_in_values.value_counts()

(array([ True, False]),
 EmailOptIn
 True     529
 False    471
 Name: count, dtype: int64)

In [6]:
# Build a label-encoded copy of the data: each categorical column is replaced
# by integer codes, using a fresh LabelEncoder fitted on that column alone.
df_le = df.copy()
for column in ['Gender', 'PromotionResponse', 'EmailOptIn']:
    df_le[column] = LabelEncoder().fit_transform(df[column])
df_le.head()

Unnamed: 0,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,62,2,45.15,5892.58,5,22,453.8,2,0,3,129,1,1,True
1,65,1,79.51,9025.47,13,77,22.9,2,2,3,227,0,1,False
2,18,1,29.19,618.83,13,71,50.53,5,2,2,283,0,1,True
3,21,2,79.63,9110.3,3,33,411.83,5,3,5,226,1,0,True
4,21,2,77.66,5390.88,15,43,101.19,3,0,5,242,0,2,False


In [7]:
# One-hot encode the categorical columns and rebuild the modelling frame.
# The result holds the indicator columns first, followed by every remaining
# column in its original order (so Churn stays the last column, which the
# train/test split relies on when it slices off the target).
categorical_cols = ['Gender', 'PromotionResponse', 'EmailOptIn']
temp = df.copy()
ohe = OneHotEncoder(sparse_output=False)
X_ohe = pd.DataFrame(
    data=ohe.fit_transform(temp.loc[:, categorical_cols]),
    columns=ohe.get_feature_names_out(),
    # Reuse the source index so the column-wise concat below aligns row by row
    # even if df does not carry the default 0..n-1 index.
    index=temp.index,
)
# Pick the pass-through columns by name instead of hard-coded positions, so a
# change in the CSV's column layout cannot silently select the wrong features.
df_ohe = pd.concat([X_ohe, temp.drop(columns=categorical_cols)], axis=1)
df_ohe.head()

Unnamed: 0,Gender_Female,Gender_Male,Gender_Other,PromotionResponse_Ignored,PromotionResponse_Responded,PromotionResponse_Unsubscribed,EmailOptIn_False,EmailOptIn_True,Age,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,Churn
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,62,45.15,5892.58,5,22,453.8,2,0,3,129,True
1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,65,79.51,9025.47,13,77,22.9,2,2,3,227,False
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,18,29.19,618.83,13,71,50.53,5,2,2,283,True
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,21,79.63,9110.3,3,33,411.83,5,3,5,226,True
4,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,21,77.66,5390.88,15,43,101.19,3,0,5,242,False


In [8]:
# Hold out 30% of the rows for testing, once per encoding.
# In both frames every column except the last is a feature and the last
# column (Churn) is the target; the same seed is used for both splits.
features_le, target_le = df_le.iloc[:, :-1], df_le.iloc[:, -1]
X_train_le, X_test_le, y_train_le, y_test_le = train_test_split(
    features_le, target_le, test_size=0.3, random_state=680)

features_ohe, target_ohe = df_ohe.iloc[:, :-1], df_ohe.iloc[:, -1]
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(
    features_ohe, target_ohe, test_size=0.3, random_state=680)

#### AdaBoost Classifier

##### Label Encoded Data

In [9]:
# Grid-search an AdaBoost classifier on the label-encoded split (5-fold CV),
# then score the refitted best model on the held-out test rows.
# Confusion matrix cells: [0,0] = TN, [0,1] = FP, [1,0] = FN, [1,1] = TP

clf1_le = AdaBoostClassifier(random_state=680)
params = {
    # Candidate base learners to boost
    'estimator': [
        DecisionTreeClassifier(max_depth=1),
        RandomForestClassifier(max_depth=1),
        GaussianNB(),
    ],
    # n_estimators is not part of the search; the estimator default applies
    'learning_rate': [0.5, 1, 1.5],
    'algorithm': ['SAMME', 'SAMME.R'],
}

# fit() hands back the fitted search object, so construction and fitting chain
newclf1_le = GridSearchCV(clf1_le, params, cv=5, n_jobs=-1).fit(X_train_le, y_train_le)
y_pred_le = newclf1_le.predict(X_test_le)
newconfmat_le = confusion_matrix(y_test_le, y_pred_le)
print(newclf1_le.best_params_,'\n')
print(f'Confusion Matrix:\n{newconfmat_le}\n')

# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP)
(tn, fp), (fn, tp) = newconfmat_le
accuracy1_le = round((tp + tn) / (tp + tn + fp + fn), 3)
precision1_le = round(tp / (tp + fp), 3)
recall1_le = round(tp / (tp + fn), 3)
print(f'Overall accuracy: {accuracy1_le}\nPrecision : {precision1_le}\nRecall : {recall1_le}')

{'algorithm': 'SAMME.R', 'estimator': GaussianNB(), 'learning_rate': 0.5} 

Confusion Matrix:
[[ 57  79]
 [ 58 106]]

Overall accuracy: 0.543
Precision : 0.573
Recall : 0.646


##### One Hot Encoded Data

In [10]:
# Grid-search an AdaBoost classifier on the one-hot-encoded split (5-fold CV),
# then score the refitted best model on the held-out test rows.
# Confusion matrix cells: [0,0] = TN, [0,1] = FP, [1,0] = FN, [1,1] = TP

clf1_ohe = AdaBoostClassifier(random_state=680)
params = {
    # Candidate base learners to boost
    'estimator': [
        DecisionTreeClassifier(max_depth=1),
        RandomForestClassifier(max_depth=1),
        GaussianNB(),
    ],
    # n_estimators is not part of the search; the estimator default applies
    'learning_rate': [0.5, 1, 1.5],
    'algorithm': ['SAMME', 'SAMME.R'],
}

# fit() hands back the fitted search object, so construction and fitting chain
newclf1_ohe = GridSearchCV(clf1_ohe, params, cv=5, n_jobs=-1).fit(X_train_ohe, y_train_ohe)
y_pred_ohe = newclf1_ohe.predict(X_test_ohe)
newconfmat_ohe = confusion_matrix(y_test_ohe, y_pred_ohe)
print(newclf1_ohe.best_params_,'\n')
print(f'Confusion Matrix:\n{newconfmat_ohe}\n')

# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP)
(tn, fp), (fn, tp) = newconfmat_ohe
accuracy1_ohe = round((tp + tn) / (tp + tn + fp + fn), 3)
precision1_ohe = round(tp / (tp + fp), 3)
recall1_ohe = round(tp / (tp + fn), 3)
print(f'Overall accuracy: {accuracy1_ohe}\nPrecision : {precision1_ohe}\nRecall : {recall1_ohe}')

{'algorithm': 'SAMME.R', 'estimator': GaussianNB(), 'learning_rate': 1} 

Confusion Matrix:
[[69 67]
 [80 84]]

Overall accuracy: 0.51
Precision : 0.556
Recall : 0.512


#### Random Forest Classifier

##### Label Encoded Data

In [11]:
# Grid-search a random forest on the label-encoded split (5-fold CV),
# then score the refitted best model on the held-out test rows.
# Confusion matrix cells: [0,0] = TN, [0,1] = FP, [1,0] = FN, [1,1] = TP

clf2_le = RandomForestClassifier(random_state=680)
params = {
    'n_estimators': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None],
}

# fit() hands back the fitted search object, so construction and fitting chain
newclf2_le = GridSearchCV(clf2_le, params, cv=5, n_jobs=-1).fit(X_train_le, y_train_le)
y_pred_le = newclf2_le.predict(X_test_le)
newconfmat_le = confusion_matrix(y_test_le, y_pred_le)
print(newclf2_le.best_params_,'\n')
print(f'Confusion Matrix:\n{newconfmat_le}\n')

# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP)
(tn, fp), (fn, tp) = newconfmat_le
accuracy2_le = round((tp + tn) / (tp + tn + fp + fn), 3)
precision2_le = round(tp / (tp + fp), 3)
recall2_le = round(tp / (tp + fn), 3)
print(f'Overall accuracy: {accuracy2_le}\nPrecision : {precision2_le}\nRecall : {recall2_le}')

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50} 

Confusion Matrix:
[[72 64]
 [78 86]]

Overall accuracy: 0.527
Precision : 0.573
Recall : 0.524


##### One Hot Encoded Data

In [12]:
# Grid-search a random forest on the one-hot-encoded split (5-fold CV),
# then score the refitted best model on the held-out test rows.
# Confusion matrix cells: [0,0] = TN, [0,1] = FP, [1,0] = FN, [1,1] = TP

clf2_ohe = RandomForestClassifier(random_state=680)
params = {
    'n_estimators': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None],
}

# fit() hands back the fitted search object, so construction and fitting chain
newclf2_ohe = GridSearchCV(clf2_ohe, params, cv=5, n_jobs=-1).fit(X_train_ohe, y_train_ohe)
y_pred_ohe = newclf2_ohe.predict(X_test_ohe)
newconfmat_ohe = confusion_matrix(y_test_ohe, y_pred_ohe)
print(newclf2_ohe.best_params_,'\n')
print(f'Confusion Matrix:\n{newconfmat_ohe}\n')

# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP)
(tn, fp), (fn, tp) = newconfmat_ohe
accuracy2_ohe = round((tp + tn) / (tp + tn + fp + fn), 3)
precision2_ohe = round(tp / (tp + fp), 3)
recall2_ohe = round(tp / (tp + fn), 3)
print(f'Overall accuracy: {accuracy2_ohe}\nPrecision : {precision2_ohe}\nRecall : {recall2_ohe}')

{'criterion': 'entropy', 'max_features': None, 'n_estimators': 50} 

Confusion Matrix:
[[66 70]
 [72 92]]

Overall accuracy: 0.527
Precision : 0.568
Recall : 0.561


In [13]:
# One summary row per (model, encoding) pair:
# model name, encoding, test accuracy, precision, recall
adaboost_le_row = ['AdaBoostClassifier', 'Label Encoding', accuracy1_le, precision1_le, recall1_le]
adaboost_ohe_row = ['AdaBoostClassifier', 'One Hot Encoding', accuracy1_ohe, precision1_ohe, recall1_ohe]
forest_le_row = ['RandomForestClassifier', 'Label Encoding', accuracy2_le, precision2_le, recall2_le]
forest_ohe_row = ['RandomForestClassifier', 'One Hot Encoding', accuracy2_ohe, precision2_ohe, recall2_ohe]

data = [adaboost_le_row, adaboost_ohe_row, forest_le_row, forest_ohe_row]

In [14]:
# Assemble the collected metric rows into a comparison table and display it
report_columns = ['Model', 'Encoding', 'Overall accuracy', 'Precision', 'Recall']
report = pd.DataFrame(data, columns=report_columns)
report

Unnamed: 0,Model,Encoding,Overall accuracy,Precision,Recall
0,AdaBoostClassifier,Label Encoding,0.543,0.573,0.646
1,AdaBoostClassifier,One Hot Encoding,0.51,0.556,0.512
2,RandomForestClassifier,Label Encoding,0.527,0.573,0.524
3,RandomForestClassifier,One Hot Encoding,0.527,0.568,0.561
