#### Preprocessing

In [1]:
# Importing modules
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the customer churn dataset and discard the CustomerID column.
# The drop is chained onto the read instead of using inplace=True, so `df`
# is produced by one expression and never exists in a half-processed state.
# `columns=` already identifies the axis, so no separate `axis=1` is passed.
df = pd.read_csv('customer-churn-data.csv').drop(columns=['CustomerID'])
df.head()

Unnamed: 0,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,62,Other,45.15,5892.58,5,22,453.8,2,0,3,129,True,Responded,True
1,65,Male,79.51,9025.47,13,77,22.9,2,2,3,227,False,Responded,False
2,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True
3,21,Other,79.63,9110.3,3,33,411.83,5,3,5,226,True,Ignored,True
4,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False


In [3]:
# Distinct Gender labels together with how often each one occurs
gender = df['Gender']
gender.unique(), gender.value_counts()

(array(['Other', 'Male', 'Female'], dtype=object),
 Gender
 Female    342
 Male      334
 Other     324
 Name: count, dtype: int64)

In [4]:
# Distinct PromotionResponse labels together with how often each one occurs
promotion_response = df['PromotionResponse']
promotion_response.unique(), promotion_response.value_counts()

(array(['Responded', 'Ignored', 'Unsubscribed'], dtype=object),
 PromotionResponse
 Unsubscribed    361
 Responded       338
 Ignored         301
 Name: count, dtype: int64)

In [5]:
# Distinct EmailOptIn values together with how often each one occurs
email_opt_in = df['EmailOptIn']
email_opt_in.unique(), email_opt_in.value_counts()

(array([ True, False]),
 EmailOptIn
 True     529
 False    471
 Name: count, dtype: int64)

In [6]:
# One-hot encode the categorical columns and rebuild the modelling frame as
# [encoded columns, remaining feature columns, Churn].
categorical_cols = ['Gender', 'PromotionResponse', 'EmailOptIn']
target_col = 'Churn'

temp = df.copy()
ohe = OneHotEncoder(sparse_output=False)
X_ohe = pd.DataFrame(
    data=ohe.fit_transform(temp[categorical_cols]),
    columns=ohe.get_feature_names_out(),
    index=temp.index,  # share temp's row labels so the concat below aligns row-for-row
)

# Pick the pass-through columns by name instead of by hard-coded position, so a
# reordered or extended CSV cannot silently shift which columns are selected.
passthrough_cols = [
    col for col in temp.columns if col not in categorical_cols + [target_col]
]

# The target is appended last on purpose: the train/test split cell takes the
# final column of df_ohe as the label.
df_ohe = pd.concat([X_ohe, temp[passthrough_cols], temp[[target_col]]], axis=1)
df_ohe.head()

Unnamed: 0,Gender_Female,Gender_Male,Gender_Other,PromotionResponse_Ignored,PromotionResponse_Responded,PromotionResponse_Unsubscribed,EmailOptIn_False,EmailOptIn_True,Age,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,Churn
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,62,45.15,5892.58,5,22,453.8,2,0,3,129,True
1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,65,79.51,9025.47,13,77,22.9,2,2,3,227,False
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,18,29.19,618.83,13,71,50.53,5,2,2,283,True
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,21,79.63,9110.3,3,33,411.83,5,3,5,226,True
4,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,21,77.66,5390.88,15,43,101.19,3,0,5,242,False


In [7]:
# Hold out 30% of the rows for testing (seeded for reproducibility).
# Every column of df_ohe except the last is a feature; the last one is the label.
features = df_ohe.iloc[:, :-1]
target = df_ohe.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

#### AdaBoost Classifier

In [8]:
# Tune an AdaBoost classifier with a recall-scored, 5-fold grid search.
# confusion_matrix layout (rows = actual, columns = predicted): [[TN, FP], [FN, TP]]
# NOTE(review): selecting on recall alone can favour a model that labels almost
# every row positive; read the precision and accuracy printed below before
# trusting the winning configuration.

clf1 = AdaBoostClassifier(random_state=0)
params = {
    'estimator': [
        DecisionTreeClassifier(max_depth=1),
        RandomForestClassifier(max_depth=1),
        GaussianNB(),
    ],
    'learning_rate': [0.5, 1, 1.5, 2, 2.5],
}

newclf1 = GridSearchCV(
    estimator=clf1,
    param_grid=params,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
newclf1.fit(X_train, y_train)

# Evaluate the refit best model on the held-out rows
y_pred = newclf1.predict(X_test)
confmat = confusion_matrix(y_test, y_pred)
print(f'Best Parameters: {newclf1.best_params_}\n')
print(f'Confusion Matrix:\n{confmat}\n')

# Row 0 holds the actual negatives, row 1 the actual positives
(tn, fp), (fn, tp) = confmat
precision1 = round(tp / (tp + fp), 3)
recall1 = round(tp / (tp + fn), 3)
accuracy1 = round((tp + tn) / (tp + tn + fp + fn), 3)
print(f'Overall accuracy: {accuracy1}\nPrecision : {precision1}\nRecall : {recall1}')

Best Parameters: {'estimator': DecisionTreeClassifier(max_depth=1), 'learning_rate': 2.5}

Confusion Matrix:
[[  1 139]
 [  3 157]]

Overall accuracy: 0.527
Precision : 0.53
Recall : 0.981


#### Random Forest Classifier

In [12]:
# Tune a random forest with a recall-scored, 5-fold grid search and report
# accuracy, precision and recall on the held-out test rows.
# confusion_matrix layout (rows = actual, columns = predicted): [[TN, FP], [FN, TP]]

clf2 = RandomForestClassifier(n_jobs=-1, random_state = 0)
params = {
    'n_estimators':[50,100,200,500],
    # 'log_loss' is intentionally not searched: scikit-learn documents it as the
    # same Shannon information-gain criterion as 'entropy', so listing both only
    # refits identical forests and adds a third more work to the search.
    'criterion':['gini', 'entropy'],
    # NOTE(review): with 18 feature columns (the width of df_ohe minus Churn)
    # int(sqrt(18)) and int(log2(18)) are both 4, so these two options appear to
    # build the same forests here - confirm against X_train.shape[1].
    'max_features':['sqrt', 'log2']
}
newclf2 = GridSearchCV(clf2, params, cv=5,scoring='recall', n_jobs=-1)
newclf2.fit(X_train, y_train)

# Evaluate the refit best model on the held-out rows
y_pred = newclf2.predict(X_test)
confmat = confusion_matrix(y_test, y_pred)
print(f'Best Parameters: {newclf2.best_params_}\n')
print(f'Confusion Matrix:\n{confmat}\n')

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
tn, fp, fn, tp = confmat.ravel()
precision2 = round(tp/(tp+fp),3)
recall2 = round(tp/(tp+fn),3)
accuracy2= round((tp+tn)/(tp+tn+fp+fn),3)
print(f'Overall accuracy: {accuracy2}\nPrecision : {precision2}\nRecall : {recall2}')

Best Parameters: {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 500}

Confusion Matrix:
[[57 83]
 [67 93]]

Overall accuracy: 0.5
Precision : 0.528
Recall : 0.581


In [13]:
# One report row per tuned model: its description followed by the test-set
# accuracy, precision and recall computed in the cells above.
tuned_models = [
    (newclf1, accuracy1, precision1, recall1),
    (newclf2, accuracy2, precision2, recall2),
]
data = [
    [str(search.best_estimator_), accuracy, precision, recall]
    for search, accuracy, precision, recall in tuned_models
]

In [14]:
# Side-by-side comparison table of the tuned models
report_columns = ['Model', 'Overall accuracy', 'Precision', 'Recall']
report = pd.DataFrame(data=data, columns=report_columns)
report

Unnamed: 0,Model,Overall accuracy,Precision,Recall
0,AdaBoostClassifier(estimator=DecisionTreeClass...,0.527,0.53,0.981
1,"RandomForestClassifier(criterion='entropy', n_...",0.5,0.528,0.581
