#### Preprocessing

In [17]:
# Importing modules
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [18]:
# Reading the dataset to dataframe
df = pd.read_csv('customer-churn-data.csv')
df.head()

Unnamed: 0,CustomerID,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,1,62,Other,45.15,5892.58,5,22,453.8,2,0,3,129,True,Responded,True
1,2,65,Male,79.51,9025.47,13,77,22.9,2,2,3,227,False,Responded,False
2,3,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True
3,4,21,Other,79.63,9110.3,3,33,411.83,5,3,5,226,True,Ignored,True
4,5,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False


In [19]:
df['Gender'].unique(), df['Gender'].value_counts()

(array(['Other', 'Male', 'Female'], dtype=object),
 Gender
 Female    342
 Male      334
 Other     324
 Name: count, dtype: int64)

In [20]:
df['PromotionResponse'].unique(), df['PromotionResponse'].value_counts()

(array(['Responded', 'Ignored', 'Unsubscribed'], dtype=object),
 PromotionResponse
 Unsubscribed    361
 Responded       338
 Ignored         301
 Name: count, dtype: int64)

In [21]:
df['EmailOptIn'].unique(), df['EmailOptIn'].value_counts()

(array([ True, False]),
 EmailOptIn
 True     529
 False    471
 Name: count, dtype: int64)

In [22]:
# Encoding the values to these columns
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])
df['PromotionResponse'] = LabelEncoder().fit_transform(df['PromotionResponse'])
df['EmailOptIn'] = LabelEncoder().fit_transform(df['EmailOptIn'])

In [23]:
# Dropping CustomerID
df.drop(columns = ['CustomerID'], axis = 1, inplace = True)

In [24]:
# Transformed dataframe
df.head()

Unnamed: 0,Age,Gender,AnnualIncome,TotalSpend,YearsAsCustomer,NumOfPurchases,AvgTransactionAmount,NumOfReturns,NumOfSupportQueries,SatisfactionScore,LastPurchaseDaysAgo,EmailOptIn,PromotionResponse,Churn
0,62,2,45.15,5892.58,5,22,453.8,2,0,3,129,1,1,True
1,65,1,79.51,9025.47,13,77,22.9,2,2,3,227,0,1,False
2,18,1,29.19,618.83,13,71,50.53,5,2,2,283,0,1,True
3,21,2,79.63,9110.3,3,33,411.83,5,3,5,226,1,0,True
4,21,2,77.66,5390.88,15,43,101.19,3,0,5,242,0,2,False


In [25]:
# Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:,:-1], df.iloc[:,-1], test_size=0.3, random_state = 680)

#### Ada Boost Classifier

In [26]:
# GridSearch Adaboost Model
# TN is {0,0}, FN is {1,0}, TP {1,1} and FP {0,1}

clf1 = AdaBoostClassifier(random_state = 680)
params = {
    'n_estimators':[50,100,200,500],
    'learning_rate':[0.1,0.5,1,1.5,2],
    'algorithm':['SAMME', 'SAMME.R']
}
newclf1 = GridSearchCV(clf1, params, cv=5, n_jobs=-1)
newclf1.fit(X_train, y_train)
y_pred = newclf1.predict(X_test)
newconfmat = confusion_matrix(y_test, y_pred)
print(newclf1.best_params_,'\n')
print(f'Confusion Matrix:\n{newconfmat}\n')

tn, fp, fn, tp = newconfmat.ravel()
precision1 = round(tp/(tp+fp),3)
recall1 = round(tp/(tp+fn),3)
accuracy1= round((tp+tn)/(tp+tn+fp+fn),3)
print(f'Overall accuracy: {accuracy1}\nPrecision : {precision1}\nRecall : {recall1}')

{'algorithm': 'SAMME.R', 'learning_rate': 1.5, 'n_estimators': 500} 

Confusion Matrix:
[[69 67]
 [75 89]]

Overall accuracy: 0.527
Precision : 0.571
Recall : 0.543


#### Random Forest Classifier

In [27]:
# GridSearch Random Forest Model
# TN is {0,0}, FN is {1,0}, TP {1,1} and FP {0,1}

clf2 = RandomForestClassifier(random_state = 680)
params = {
    'n_estimators':[50,100,200,500],
    'criterion':['gini', 'entropy', 'log_loss'],
    'max_features':['sqrt', 'log2', None]
}
newclf2 = GridSearchCV(clf2, params, cv=5, n_jobs=-1)
newclf2.fit(X_train, y_train)
y_pred = newclf2.predict(X_test)
newconfmat = confusion_matrix(y_test, y_pred)
print(newclf2.best_params_,'\n')
print(f'Confusion Matrix:\n{newconfmat}\n')

tn, fp, fn, tp = newconfmat.ravel()
precision2 = round(tp/(tp+fp),3)
recall2 = round(tp/(tp+fn),3)
accuracy2= round((tp+tn)/(tp+tn+fp+fn),3)
print(f'Overall accuracy: {accuracy2}\nPrecision : {precision2}\nRecall : {recall2}')

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50} 

Confusion Matrix:
[[72 64]
 [78 86]]

Overall accuracy: 0.527
Precision : 0.573
Recall : 0.524


In [29]:
data = [['DecisionTreeClassifier', newclf1.best_params_, accuracy1, precision1, recall1],
        ['RandomForestClassifier', newclf2.best_params_, accuracy2, precision2, recall2]]

In [30]:
report = pd.DataFrame(data, columns= ['Model', 'Parameters', 'Overall accuracy', 'Precision', 'Recall'])
report

Unnamed: 0,Model,Parameters,Overall accuracy,Precision,Recall
0,DecisionTreeClassifier,"{'algorithm': 'SAMME.R', 'learning_rate': 1.5,...",0.527,0.571,0.543
1,RandomForestClassifier,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.527,0.573,0.524
