In [40]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 419 kB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.3.3


In [16]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

### LOAD DATA

In [45]:
bank = pd.read_csv('https://github.com/tianqi72/BankChurners/blob/main/BankChurners.csv')

# Remove two columns that leaks information
bank = bank.drop(columns=bank.columns[-2:], axis=1)
# Remove id column
bank = bank.drop(columns='CLIENTNUM', axis=1)

y = bank['Attrition_Flag']
X = bank.drop('Attrition_Flag', axis=1)
y = y.values.ravel()

bank.tail()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
10122,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,40,3,2,3,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462
10123,Attrited Customer,41,M,2,Unknown,Divorced,$40K - $60K,Blue,25,4,2,3,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511
10124,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,36,5,3,4,5409.0,0,5409.0,0.819,10291,60,0.818,0.0
10125,Attrited Customer,30,M,2,Graduate,Unknown,$40K - $60K,Blue,36,4,3,3,5281.0,0,5281.0,0.535,8395,62,0.722,0.0
10126,Attrited Customer,43,F,2,Graduate,Married,Less than $40K,Silver,25,6,2,4,10388.0,1961,8427.0,0.703,10294,61,0.649,0.189


### Encode the targets

In [46]:
le = LabelEncoder()
y = le.fit_transform(y)

### Split dataset into training and test set

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Preoprocessing

In [49]:
# Impute missing values
# Normalize numerical variables
con_pipe = Pipeline([('scaler', StandardScaler()),
                     ('imputer', SimpleImputer(strategy='median', add_indicator=True))])
# Encode categorical variables
cat_pipe = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore')),
                     ('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True))])

categorical_columns = (X.dtypes == object)
preprocessing = ColumnTransformer([('categorical', cat_pipe,  categorical_columns),
                                   ('continuous',  con_pipe, ~categorical_columns)])

# Resampling
smote = SMOTE()

### Hyperparameter tuning

In [50]:
rf_pipe = make_pipeline(preprocessing,
                        smote,
                        RandomForestClassifier(n_jobs=-1))

knn_pipe = make_pipeline(preprocessing,
                         smote,
                         KNeighborsClassifier(n_jobs=-1))

svc_pipe = make_pipeline(preprocessing,
                         smote,
                         SVC())

xgb_pipe = make_pipeline(preprocessing,
                         smote,
                         XGBClassifier())

rf_hyper = dict(smote__sampling_strategy=[*np.arange(0.1, 0.5, 0.1), 'auto'],
                smote__k_neighbors=range(2, 20),
                randomforestclassifier__n_estimators=[
                    *range(5, 100, 5), *range(100, 500, 100)],
                randomforestclassifier__max_depth=[*range(2, 30), None],
                randomforestclassifier__min_samples_split=range(1, 10),
                randomforestclassifier__min_samples_leaf=range(1, 10),
                randomforestclassifier__max_features=['auto', 'sqrt', 'log2'],
                randomforestclassifier__class_weight=[None, 'balanced'])

knn_hyper = dict(smote__sampling_strategy=[*np.arange(0.1, 0.5, 0.1), 'auto'],
                 smote__k_neighbors=range(2, 20),
                 kneighborsclassifier__n_neighbors=range(2, 20),
                 kneighborsclassifier__leaf_size=range(5, 51, 5),
                 kneighborsclassifier__weights=['uniform', 'distance'])

svc_hyper = dict(smote__sampling_strategy=[*np.arange(0.1, 0.5, 0.1), 'auto'],
                 smote__k_neighbors=range(2, 20),
                 svc__C=[0.1, 0.3, 0.5, 1, 10, 50, 100],
                 svc__kernel=['linear', 'poly',
                              'rbf', 'sigmoid', 'precomputed'],
                 svc__gamma=['scale', 'auto'],
                 svc__degree=range(2, 5),
                 svc__class_weight=[None, 'balanced'])

xgb_hyper = dict(smote__sampling_strategy=[*np.arange(0.1, 0.5, 0.1), 'auto'],
                 smote__k_neighbors=range(2, 20),
                 xgbclassifier__booster=['gbtree', 'gblinear', 'dart'],
                 xgbclassifier__eta=np.arange(0, 1, 0.1),
                 xgbclassifier__gamma=range(0, 10),
                 xgbclassifier__max_depth=range(2, 30),
                 xgbclassifier__subsample=np.arange(0.2, 1, 0.1),
                 xgbclassifier__sampling_method=['uniform', 'gradient_based'])

algorithms = [rf_pipe, knn_pipe, svc_pipe, xgb_pipe]
hyper = [rf_hyper, knn_hyper, svc_hyper, xgb_hyper]

for a, h in zip(algorithms, hyper):
    clf_rand_cv = RandomizedSearchCV(estimator=a,
                                     param_distributions=h,
                                     n_iter=50,
                                     cv=10,
                                     scoring='f1',
                                     n_jobs=-1,
                                     verbose=False)
    clf_rand_cv.fit(X_train, y_train)
    print(clf_rand_cv.best_params_)

{'smote__sampling_strategy': 0.4, 'smote__k_neighbors': 17, 'randomforestclassifier__n_estimators': 400, 'randomforestclassifier__min_samples_split': 6, 'randomforestclassifier__min_samples_leaf': 2, 'randomforestclassifier__max_features': 'auto', 'randomforestclassifier__max_depth': 21, 'randomforestclassifier__class_weight': 'balanced'}
{'smote__sampling_strategy': 0.2, 'smote__k_neighbors': 15, 'kneighborsclassifier__weights': 'uniform', 'kneighborsclassifier__n_neighbors': 12, 'kneighborsclassifier__leaf_size': 40}
{'svc__kernel': 'rbf', 'svc__gamma': 'scale', 'svc__degree': 3, 'svc__class_weight': None, 'svc__C': 1, 'smote__sampling_strategy': 0.2, 'smote__k_neighbors': 15}
{'xgbclassifier__subsample': 0.6000000000000001, 'xgbclassifier__sampling_method': 'uniform', 'xgbclassifier__max_depth': 10, 'xgbclassifier__gamma': 3, 'xgbclassifier__eta': 0.2, 'xgbclassifier__booster': 'dart', 'smote__sampling_strategy': 0.2, 'smote__k_neighbors': 3}


### Model selection

In [55]:
rf_pipe = make_pipeline(preprocessing,
                        SMOTE(sampling_strategy=0.3, k_neighbors=19),
                        RandomForestClassifier(n_estimators=55,
                                               min_samples_split=3,
                                               min_samples_leaf=1,
                                               max_features='sqrt',
                                               max_depth=26,
                                               class_weight=None,
                                               n_jobs=-1))

knn_pipe = make_pipeline(preprocessing,
                         SMOTE(sampling_strategy=0.3, k_neighbors=10),
                         KNeighborsClassifier(weights='distance',
                                              n_neighbors=15,
                                              leaf_size=25,
                                              n_jobs=-1))

svc_pipe = make_pipeline(preprocessing,
                         SMOTE(sampling_strategy=0.3, k_neighbors=8),
                         SVC(kernel='poly',
                             gamma='scale',
                             degree=4,
                             C=0.3))

xgb_pipe = make_pipeline(preprocessing,
                         SMOTE(sampling_strategy=0.2, k_neighbors=3),
                         XGBClassifier(subsample=0.6,
                                       sampling_method='uniform',
                                       max_depth=10,
                                       gamma=3,
                                       eta=0.2,
                                       booster='gbtree'))

algorithms = [rf_pipe, knn_pipe, svc_pipe, xgb_pipe]
name = ['RF', 'KNN', 'SVC', 'XG']

for pipe, name in zip(algorithms, name):
    print(name, round(np.mean(cross_val_score(pipe, X_train,
                                              y_train, cv=5, scoring='f1_weighted', n_jobs=-1)), 4))

RF 0.9531
KNN 0.9155
SVC 0.916
XG 0.9704


### Final model evaluation

In [54]:
xgb_pipe = make_pipeline(preprocessing,
                         SMOTE(sampling_strategy=0.2, k_neighbors=3),
                         XGBClassifier(subsample=0.6,
                                       sampling_method='uniform',
                                       max_depth=10,
                                       gamma=3,
                                       eta=0.2,
                                       booster='gbtree'))
xgb_pipe.fit(X_train, y_train)
y_pred = xgb_pipe.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 387   48]
 [  31 2066]]
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       435
           1       0.98      0.99      0.98      2097

    accuracy                           0.97      2532
   macro avg       0.95      0.94      0.94      2532
weighted avg       0.97      0.97      0.97      2532

