##### Preprocessing

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Load the raw data and drop the columns that are not used further on:
# the two precomputed Naive-Bayes classifier columns and the CLIENTNUM column.
df = pd.read_csv('credit_card_churn.csv')
df = df.drop(
    columns=[
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
        'CLIENTNUM',
    ]
)
cols = df.columns

# Split the columns by dtype through the public pandas API instead of the
# private `_get_numeric_data()` helper ('bool' is included because the private
# helper also counts boolean columns as numeric).
num_data = list(df.select_dtypes(include=['number', 'bool']).columns)
# Preserve the DataFrame's column order; a set difference would yield a
# different order on every kernel start because string hashing is randomised.
cat_data = [col for col in cols if col not in num_data]

# Columns stored with a numeric dtype that this notebook handles as categorical.
for col in (
    'Dependent_count',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
):
    num_data.remove(col)
    cat_data.append(col)

# Independent copies, so editing the plotting lists later cannot silently
# change the lists above (plain assignment would only alias them).
num_data_plot = list(num_data)
cat_data_plot = list(cat_data)

# Row filters: a row is kept only when it satisfies every condition below.
keep_rows = (
    (df['Card_Category'] != 'Platinum')
    & (df['Total_Ct_Chng_Q4_Q1'] <= 1.6)
    & (df['Months_Inactive_12_mon'] >= 1)
    & (df['Months_Inactive_12_mon'] < 6)
    & (df['Customer_Age'] <= 66)
    & (df['Total_Amt_Chng_Q4_Q1'] <= 1.6)
)
df = df.loc[keep_rows]

# Column transforms: log10 for two columns, Box-Cox for Avg_Open_To_Buy and a
# square root for Avg_Utilization_Ratio.
for log_col in ('Credit_Limit', 'Total_Trans_Amt'):
    df[log_col] = np.log10(df[log_col])

# `fitted_lambda` is the Box-Cox lambda that scipy estimated for this column.
fitted_data, fitted_lambda = stats.boxcox(df['Avg_Open_To_Buy'])
df['Avg_Open_To_Buy'] = fitted_data

df['Avg_Utilization_Ratio'] = df['Avg_Utilization_Ratio'] ** 0.5

# Optional exploratory plots (disabled):
#sns.histplot(df, x= 'Avg_Open_To_Buy', hue='Attrition_Flag', kde=True)
#for i in cols:
#    sns.histplot(df, x= i, hue='Attrition_Flag', kde=True)
#    plt.show()
#    df[i].value_counts()

# Target column (still the raw string labels at this point).
y = df['Attrition_Flag']

# Earlier feature sets that were tried (disabled, kept for reference):
#x = df.drop(['Attrition_Flag', 'Customer_Age', 'Gender', 'Marital_Status', 'Card_Category', 'Months_on_book'], axis=1)
#x = df.drop(['Attrition_Flag'], axis=1)
#x = df.drop(['Attrition_Flag', 'Avg_Utilization_Ratio', 'Dependent_count', 'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Contacts_Count_12_mon', 'Credit_Limit', 'Avg_Open_To_Buy', 'Customer_Age', 'Gender', 'Total_Revolving_Bal', 'Total_Amt_Chng_Q4_Q1'], axis=1)

# Feature matrix: the nine columns currently used by the models.
x = df[
    [
        'Total_Trans_Ct',
        'Total_Ct_Chng_Q4_Q1',
        'Total_Revolving_Bal',
        'Avg_Utilization_Ratio',
        'Total_Trans_Amt',
        'Contacts_Count_12_mon',
        'Months_Inactive_12_mon',
        'Total_Relationship_Count',
        'Total_Amt_Chng_Q4_Q1',
    ]
]

# One-hot encode any non-numeric column in the selection.
x = pd.get_dummies(x)

# Encode the target as integers: 1 = existing customer, 0 = attrited customer.
# An explicit map + astype is used instead of `Series.replace`, which relies on
# implicit object -> int downcasting (deprecated in pandas 2.2); without the
# downcast `y` would stay an object column.
LABEL_CODES = {'Existing Customer': 1, 'Attrited Customer': 0}
unknown_labels = set(y.unique()) - set(LABEL_CODES)
if unknown_labels:
    # Fail here with a clear message rather than later inside scikit-learn.
    raise ValueError(f"Unexpected Attrition_Flag values: {sorted(map(str, unknown_labels))}")
y = y.map(LABEL_CODES).astype('int64')

In [2]:
from sklearn.preprocessing import MinMaxScaler

# Convert features and target to NumPy arrays and rescale every feature to
# the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# splits are created further down, so the held-out folds contribute to the
# min/max used for scaling. Consider fitting the scaler inside a Pipeline on
# the training folds only.
scaler = MinMaxScaler()
x = scaler.fit_transform(x.to_numpy())
y = y.to_numpy()

##### Train model with k-fold cross-validation and hyperparameter tuning

In [3]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hyper-parameter grid searched inside every outer fold.
parameters = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ('newton-cg', 'liblinear', 'lbfgs', 'sag', 'saga'),
}

# Outer 10-fold split; shuffled with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
kf.get_n_splits(x)  # return value is not used

# Per-fold metrics, one list per class label and metric.
precision_0, recall_0, f1_0 = [], [], []
precision_1, recall_1, f1_1 = [], [], []

# Maps each class key of the classification report to the lists collecting it.
metric_lists = {
    '0': {'precision': precision_0, 'recall': recall_0, 'f1-score': f1_0},
    '1': {'precision': precision_1, 'recall': recall_1, 'f1-score': f1_1},
}

for train_index, test_index in kf.split(x):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    # Inner grid search (GridSearchCV's default CV) on the training part only.
    # NOTE(review): scoring='f1' scores label 1, which is "Existing Customer"
    # in this notebook - confirm that this is the class meant to be optimised.
    logisReg = LogisticRegression()
    clf = GridSearchCV(
        estimator=logisReg,
        param_grid=parameters,
        scoring='f1',
        n_jobs=-1,
        verbose=5,
        return_train_score=True,
    )
    clf.fit(x_train, y_train)

    # Evaluate the refitted best model on the held-out outer fold.
    y_pred = clf.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    for label, lists in metric_lists.items():
        for metric, values in lists.items():
            values.append(report[label][metric])

# Mean of every metric over the outer folds, rounded to two decimals.
print('_______________precision__________recall__________f1-score')
print(''.join([
    '_______0_________',
    str(round(np.mean(precision_0), 2)),
    '______________',
    str(round(np.mean(recall_0), 2)),
    '_____________',
    str(round(np.mean(f1_0), 2)),
]))
print(''.join([
    '_______1_________',
    str(round(np.mean(precision_1), 2)),
    '______________',
    str(round(np.mean(recall_1), 2)),
    '_____________',
    str(round(np.mean(f1_1), 2)),
]))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
_______________precision__________recall__________f1-score
_______0_________0.78______________0.63_____________0.7
_______1_________0.93______________0.97_____________0.95


##### Grid search over feature-selection parameters

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Candidate sizes for the reduced feature space. The upper bound is taken from
# the feature matrix itself: asking PCA for more components than there are
# features makes the fit fail, and SelectKBest cannot select more than
# n_features columns either (the previous hard-coded range went up to 24).
N_FEATURES_OPTIONS = list(range(1, x.shape[1] + 1))
C_OPTIONS = [0.01, 0.1, 1, 10, 100, 1000]
# Solvers tried for the logistic regression; shared by both sub-grids.
SOLVER_OPTIONS = ('newton-cg', 'liblinear', 'sag', 'saga')

# Dimensionality-reduction / feature-selection candidates.
pca = PCA()
k_chi2 = SelectKBest(chi2)
k_mutal = SelectKBest(mutual_info_classif)
k_anova = SelectKBest()  # default score function (f_classif, ANOVA F-value)
logis_reg = LogisticRegression()

# Two-step pipeline; the "reduce_dim" placeholder is filled in by the grid.
pipe = Pipeline(
    [
        ("reduce_dim", "passthrough"),
        ("classify", logis_reg),
    ]
)

# Two sub-grids are needed because PCA and SelectKBest name their size
# parameter differently (`n_components` vs `k`).
param_grid = [
    {
        "reduce_dim": [pca],
        "reduce_dim__n_components": N_FEATURES_OPTIONS,
        "classify__C": C_OPTIONS,
        "classify__solver": SOLVER_OPTIONS,
    },
    {
        "reduce_dim": [k_chi2, k_mutal, k_anova],
        "reduce_dim__k": N_FEATURES_OPTIONS,
        "classify__C": C_OPTIONS,
        "classify__solver": SOLVER_OPTIONS,
    },
]
# Legend labels, in the same order as the reducers appear in `param_grid`.
reducer_labels = ["PCA", "KBest(chi2)", "KBest(Mutual)", "KBest(Anova)"]

def _reducer_label(reducer):
    """Return the legend label for one ``reduce_dim`` candidate of the grid."""
    if isinstance(reducer, PCA):
        return "PCA"
    if reducer.score_func is chi2:
        return "KBest(chi2)"
    if reducer.score_func is mutual_info_classif:
        return "KBest(Mutual)"
    return "KBest(Anova)"


# Outer 10-fold split; shuffled with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Per-fold metrics, one list per class label and metric.
precision_0 = []
recall_0 = []
f1_0 = []

precision_1 = []
recall_1 = []
f1_1 = []

COLORS = "bgrcmyk"

for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Inner grid search over reducer, reduced size, C and solver.
    grid = GridSearchCV(pipe, n_jobs=-1, param_grid=param_grid, scoring='f1')
    grid.fit(x_train, y_train)

    # Evaluate the refitted best pipeline on the held-out outer fold.
    y_pred = grid.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    precision_0.append(report['0']['precision'])
    recall_0.append(report['0']['recall'])
    f1_0.append(report['0']['f1-score'])

    precision_1.append(report['1']['precision'])
    recall_1.append(report['1']['recall'])
    f1_1.append(report['1']['f1-score'])

    # Best inner-CV score per (reducer, reduced size), maximised over C and
    # solver. The candidates are grouped by their actual parameters rather
    # than by reshaping the flat score array: the grid has two sub-grids and a
    # solver dimension, so a plain reshape assigns scores to the wrong reducer.
    candidates = grid.cv_results_["params"]
    cv_scores = pd.DataFrame(
        {
            "reducer": [_reducer_label(p["reduce_dim"]) for p in candidates],
            "n_features": [
                p.get("reduce_dim__n_components", p.get("reduce_dim__k"))
                for p in candidates
            ],
            "score": grid.cv_results_["mean_test_score"],
        }
    )
    # Rows follow `reducer_labels`, columns follow `N_FEATURES_OPTIONS`.
    mean_scores = (
        cv_scores.groupby(["reducer", "n_features"])["score"]
        .max()
        .unstack("n_features")
        .reindex(index=reducer_labels, columns=N_FEATURES_OPTIONS)
        .to_numpy()
    )
    bar_offsets = np.arange(len(N_FEATURES_OPTIONS)) * (len(reducer_labels) + 1) + 0.5

    # One grouped bar chart per outer fold.
    plt.figure()
    for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
        plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])

    plt.title("Comparing feature reduction techniques")
    plt.xlabel("Reduced number of features")
    plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
    # The search is scored with F1, not accuracy.
    plt.ylabel("Mean cross-validated F1 score")
    plt.ylim((0, 1))
    plt.legend(loc="upper left")
    plt.grid()

    plt.show()

# Mean of every metric over the outer folds, rounded to two decimals.
print('_______________precision__________recall__________f1-score')
print('_______0_________'+ str(round(np.mean(precision_0), 2)) + '______________' + str(round(np.mean(recall_0), 2)) + '_____________' + str(round(np.mean(f1_0), 2)))
print('_______1_________'+ str(round(np.mean(precision_1), 2)) + '______________' + str(round(np.mean(recall_1), 2)) + '_____________' + str(round(np.mean(f1_1), 2)))

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Inspect the search object left over from the final outer fold: its best
# parameters, then its predictions on that fold's held-out rows.
print(grid.best_params_)

y_pred = grid.predict(x_test)
fold_report = classification_report(y_test, y_pred)
print(fold_report)

ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)
plt.show()

In [None]:
# Show the ten best-ranked candidates as a table instead of dumping the raw
# cv_results_ dict, which holds one array entry per candidate for every key.
pd.DataFrame(grid.cv_results_).sort_values("rank_test_score").head(10)

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Decision scores on the held-out rows of the last outer fold.
# NOTE(review): this scores `clf` (the plain logistic-regression search from
# the earlier cell), not `grid` from the feature-selection cell - confirm
# which model the ROC curve is meant to describe.
y_score = clf.decision_function(x_test)

# ROC curve and the area under it. (The former dict initialisations of
# fpr/tpr/roc_auc were dead code: all three were overwritten immediately.)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()