### Libraries

In [1]:
# Data Wrangling
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Engineering
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ( AdaBoostClassifier,
                                GradientBoostingClassifier,
                                BaggingClassifier,
                                RandomForestClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Metric scores
from sklearn.metrics import (f1_score,
                            accuracy_score,
                            recall_score,
                            precision_score,
                            confusion_matrix,
                            roc_auc_score,
                            plot_confusion_matrix,
)


# To impute missing values and handle modeling
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# To supress warnings
import warnings

warnings.filterwarnings("ignore")

### Import Data

In [2]:
# read in the data
bank_churn= pd.read_csv('/Users/tinapham/Desktop/lighthouse-data-notes/ProJect/LHL_FinalProject/Data/BankChurners.csv')


#### Data Dictionary
![Alt text](data_dictionary.png)

### Data Preprocessing

####                    Drop Irrelavant column

In [3]:
# Drop ['CLIENTNUM']
bank_churn= bank_churn.drop(columns='CLIENTNUM')
# Drop the last 2 columns in the DataFrame
bank_churn = bank_churn.iloc[:, :-2]
# Checking the DataFrame
bank_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  object 
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Op

In [5]:
# Filter out data categorical data
category=bank_churn.select_dtypes(exclude=np.number)
# Drop categorical feature
numerical= bank_churn.drop(columns=category)

### Data Cleaning

- Unknown data in object datatype but does not contribute to any missing data in numerical data. The unknown category will be kept as is.
- Outlier observe from distribution plot in EDA:
    - ['Total_Amt_Chng_Q4_Q1']>=2.5
    - ['Total_Ct_Chng_Q4_Q1']>=2.7
    - ['Total_Trans_Ct']>=135
- To be remove in data preprocessing prior to modeling

In [4]:
# Create a model data frame and removing outlier
model_df= bank_churn.copy()
model_df= model_df[model_df['Total_Amt_Chng_Q4_Q1']<=2.5]
model_df= model_df[model_df['Total_Ct_Chng_Q4_Q1']<=2.7]
model_df= model_df[model_df['Total_Trans_Ct']<=135]

print('Number of data for modeling:',model_df.shape[0])

Number of data for modeling: 10115


### Data Preprocessig

In [5]:
# Set target and encoding y
y= model_df['Attrition_Flag'].replace(to_replace={'Attrited Customer': 1, 'Existing Customer': 0})
# Drop target variable and with no significant different in EDA
drop_column= ['Attrition_Flag','Months_on_book', 'Dependent_count']
X = model_df.drop(columns=drop_column)

In [6]:
# set dummy variables to categorical data
X = pd.get_dummies(
    data=X,
    columns=[
        "Gender",
        "Education_Level",  # has missing values
        "Marital_Status",  # has missing val
        "Income_Category",
        "Card_Category",
    ],
    drop_first=True,
)

### Modeling

#### Split data
- With imbalance in class from target value, with 16.1% attrited customer, K-fold stratified validiation will be included to consider for class imbalance
- Split data into 3 set:
    - test set
    - train set
    - validation set

In [7]:
# Split data into training and test set
X_training, X_test, y_training, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Split training dataset into training and validation
X_train, X_val, y_train, y_val = train_test_split(X_training, y_training, test_size=0.25, random_state=42, stratify=y_training)


In [31]:
datasets = {
    'Train Data': y_train,
    'Validation Data': y_val,
    'Test Data': y_test
}

for dataset_name, data in datasets.items():
    value_counts = data.value_counts(normalize=True)
    print(f"Value counts for {dataset_name}:\n{value_counts}\n{'='*40}\n")

Value counts for Train Data:
0    0.839018
1    0.160982
Name: Attrition_Flag, dtype: float64

Value counts for Validation Data:
0    0.839348
1    0.160652
Name: Attrition_Flag, dtype: float64

Value counts for Test Data:
0    0.839348
1    0.160652
Name: Attrition_Flag, dtype: float64



In [None]:
# Standardize Data
scaler= StandardScaler()
X_train= scaler.fit_transform(X_train)
X_val= scaler.transform(X_val)

### Model Selection

In [45]:
# Define models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('SVM', SVC()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Naive Bayes Classifier', GaussianNB()))
models.append(('Decision Tree Classifier', DecisionTreeClassifier()))
models.append(('Random Forest', RandomForestClassifier()))
models.append(('XGBoost', GradientBoostingClassifier()))

# Model Evaluation 
acc_results = []
auc_results = []
recall_results= []
#cross_val_score_train=[]
names = []

# Dataframe of results
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD', 'Accuracy Mean', 'Accuracy STD', 'Recall Mean', 'Recall STD']
model_results = pd.DataFrame(columns=col)
i = 0
# K-fold validation of model
for name, model in models:
    kfold = StratifiedKFold(n_splits=10)
    # accuracy:
    cv_acc_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
    # recall:
    cv_recall_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'recall')
    # roc_auc:
    cv_auc_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    model_results.loc[i] = [name, round(cv_auc_results.mean()*100, 2), round(cv_auc_results.std()*100, 2),
                            round(cv_acc_results.mean()*100, 2), round(cv_acc_results.std()*100, 2),
                            round(cv_recall_results.mean()*100, 2),round(cv_recall_results.std()*100, 2) ]
    i += 1
    
model_results.sort_values(by=['ROC AUC Mean'], ascending=False)


Unnamed: 0,Algorithm,ROC AUC Mean,ROC AUC STD,Accuracy Mean,Accuracy STD,Recall Mean,Recall STD
6,XGBoost,98.77,0.36,96.29,0.42,82.8,1.35
5,Random Forest,98.66,0.37,95.21,0.62,76.25,4.61
0,Logistic Regression,90.38,1.84,88.85,1.3,47.7,4.08
4,Decision Tree Classifier,88.63,1.72,93.84,0.91,80.96,4.55
3,Naive Bayes Classifier,87.68,1.56,89.72,1.06,61.41,2.21
2,KNN,87.56,1.67,88.73,1.37,54.76,4.94
1,SVM,81.2,2.12,83.9,0.07,0.0,0.0


- XGboost is the best model

### Model Training

In [None]:
#=========================================================================
# XGBoost regression: 
# Parameters: 
# n_estimators  "Number of gradient boosted trees. Equivalent to number 
#                of boosting rounds."
# learning_rate "Boosting learning rate (also known as “eta”)"
# max_depth     "Maximum depth of a tree. Increasing this value will make 
#                the model more complex and more likely to overfit." 
#=========================================================================
classifier=xgb.XGBClassifier(eval_metric='accuracy')

#=========================================================================
# exhaustively search for the optimal hyperparameters
#=========================================================================
from sklearn.model_selection import GridSearchCV
# set up our search grid
param_grid = {"max_depth":    [4, 5],
              "n_estimators": [500, 600, 700],
              "learning_rate": [0.01, 0.015]}

In [None]:
search = GridSearchCV(classifier, param_grid, cv=5).fit(X_train, y_train)

print("The best hyperparameters are ",search.best_params_)