In [317]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Plot defaults: seaborn "darkgrid" theme with an (11, 8) default figure size.
sns.set_theme(style="darkgrid", rc={"figure.figsize": (11, 8)})
# Show up to 30 columns when pandas renders a wide frame.
pd.options.display.max_columns = 30

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### credit card default data

In [318]:
# Load the workbook from the "data" folder under the current working directory.
# os.path.join builds the path with the platform's separator instead of
# concatenating hand-written "//" fragments.
data_path = os.path.join(os.getcwd(), "data", "default-of-credit-card-clients.xls")

# skiprows=1 skips the first spreadsheet row before the header row is read.
# The "ID" column is dropped and the target column is renamed to "default".
default = (
    pd.read_excel(data_path, skiprows=1)
    .drop("ID", axis=1)
    .rename(columns={"default payment next month": "default"})
)

# Recode SEX: code 2 becomes 0, every other code is left unchanged.
default["SEX"] = default["SEX"].where(default["SEX"] != 2, 0)

# Recode MARRIAGE: code 2 becomes 0, code 3 becomes 2, anything else is unchanged.
marriage_codes = default["MARRIAGE"].to_numpy()
default["MARRIAGE"] = np.select(
    [marriage_codes == 2, marriage_codes == 3],
    [0, 2],
    default=marriage_codes,
)

In [319]:
# Store the coded columns SEX, EDUCATION and MARRIAGE as pandas categoricals.
cols = ["SEX", "EDUCATION", "MARRIAGE"]
default = default.astype(dict.fromkeys(cols, "category"))

### last 6 months default count

In [320]:
# For every row, count how many of the six PAY_* status columns hold a value
# strictly greater than zero. Selecting the columns by name (rather than by
# position) keeps the feature correct if the column order ever changes, and a
# boolean sum gives the count directly without reshaping the frame.
pay_status_cols = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
last_def_count = (
    default[pay_status_cols]
    .gt(0)
    .sum(axis=1)
    .rename("last_def_count")
)

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail when re-run. Insert right after PAY_6 the first time and simply
# refresh the existing column on later runs.
if "last_def_count" in default.columns:
    default["last_def_count"] = last_def_count
else:
    default.insert(default.columns.get_loc("PAY_6") + 1,
                   "last_def_count",
                   last_def_count)

### weighted payment history

In [321]:
def weighted_pmt_hist(df):
    """Return the weighted sum of the six PAY_* values.

    Computes PAY_0 + PAY_2/2 + PAY_3/3 + PAY_4/4 + PAY_5/5 + PAY_6/6.

    Parameters
    ----------
    df : mapping-like
        Anything that can be indexed by the six PAY_* keys and whose values
        support ``+`` and ``/`` (e.g. a row Series, a DataFrame, or a dict of
        numbers).

    Returns
    -------
    The weighted sum, in whatever type the arithmetic on ``df``'s values
    produces (a scalar for a single row, a Series for a DataFrame).
    """
    # PAY_0 carries weight 1; each later column is divided by its position.
    later_terms = (("PAY_2", 2), ("PAY_3", 3), ("PAY_4", 4), ("PAY_5", 5), ("PAY_6", 6))
    total = df["PAY_0"]
    for column, divisor in later_terms:
        total = total + df[column] / divisor
    return total
    
# weighted_pmt_hist only indexes columns and does arithmetic on them, so it can
# be applied to the whole frame in one vectorised call instead of being invoked
# once per row through DataFrame.apply(axis=1).
default["weighted_pmt_hist"] = weighted_pmt_hist(default)

### bill_amt and pay_amt mean ignoring zeroes

In [322]:
bill = ["BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6"]
pmt = ["PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6"]


def _mean_ignoring_zeros(frame, columns):
    """Row-wise mean of ``columns`` in ``frame`` that skips zero entries.

    Zeros are turned into NaN so that ``mean`` leaves them out; rows in which
    every selected value is zero yield NaN from ``mean`` and are reported as 0.
    Only the requested columns are masked, so unrelated columns of ``frame``
    are never touched.
    """
    subset = frame[columns]
    return subset.where(subset != 0).mean(axis=1).fillna(0)


default["avg_bill_amt"] = _mean_ignoring_zeros(default, bill)
default["avg_pmt_amt"] = _mean_ignoring_zeros(default, pmt)

### train test split

In [323]:
# Target vector: the "default" column.
y = default["default"]

# Feature matrix: every other column, with get_dummies one-hot encoding the
# columns it treats as categorical and dropping the first level of each.
features = default.drop(columns="default")
X = pd.get_dummies(features, drop_first=True)

In [324]:
# Split features and target in a single call so both are partitioned with the
# same row selection by construction, instead of relying on two separate calls
# whose arguments must be kept identical by hand.
# 75% of the rows go to training; the split is stratified on y and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=3/4,
    random_state=123,
    stratify=y,
)

## GLM

In [325]:
# Logistic regression without a penalty term, scored by 10-fold
# cross-validated ROC AUC on the training split.
# NOTE(review): the string "none" for `penalty` is believed to be rejected by
# newer scikit-learn releases (which expect penalty=None) — confirm against the
# version this notebook is pinned to before changing it.
glm = LogisticRegression(penalty="none", max_iter=10000)

cv_options = dict(cv=10, scoring="roc_auc", return_train_score=True)
scores = cross_validate(glm, X_train, y_train, **cv_options)

# One row per fold: fit/score times plus test and train AUC.
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.361082,0.003,0.639122,0.652431
1,0.508107,0.005004,0.649082,0.650927
2,0.460104,0.003,0.643007,0.651711
3,0.383081,0.003001,0.657868,0.64926
4,0.335077,0.002999,0.65533,0.647646
5,0.339077,0.003,0.636362,0.65162
6,0.84219,0.002999,0.723266,0.702972
7,0.903203,0.002999,0.676781,0.710017
8,0.369083,0.003,0.663906,0.648744
9,0.389082,0.002999,0.636034,0.652551


In [326]:
# Recursive feature elimination with 10-fold CV: removes one feature per step
# (down to a minimum of one) and scores each candidate set by ROC AUC.
rfecv_options = dict(step=1, min_features_to_select=1, cv=10, scoring="roc_auc")
selector = RFECV(estimator=glm, **rfecv_options)
selector.fit(X_train, y_train)

In [327]:
# Summarise the RFECV outcome per feature: whether it was kept (`support_`)
# and its elimination rank (`ranking_`), indexed by the column names of X.
selected_features_df = (
    pd.DataFrame(index=X.columns)
    .assign(selected=selector.support_, ranking=selector.ranking_)
)
selected_features_df

Unnamed: 0,selected,ranking
LIMIT_BAL,False,16
AGE,False,3
PAY_0,True,1
PAY_2,True,1
PAY_3,True,1
PAY_4,True,1
PAY_5,False,2
PAY_6,True,1
last_def_count,True,1
BILL_AMT1,False,11


In [328]:
# Names of the features flagged as selected in selected_features_df.
kept_rows = selected_features_df.loc[selected_features_df["selected"]]
ult_features = kept_rows.index.to_list()

# Restrict both splits to the selected features (overwrites X_train / X_test).
X_train, X_test = X_train[ult_features], X_test[ult_features]

In [329]:
# Re-run the same 10-fold ROC AUC cross-validation with a fresh unpenalised
# logistic regression, now on whatever columns X_train currently holds.
# NOTE(review): penalty="none" may need to become penalty=None on newer
# scikit-learn releases — confirm against the installed version.
glm = LogisticRegression(penalty="none", max_iter=10000)

scores = cross_validate(
    estimator=glm,
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=10,
    return_train_score=True,
)

# One row per fold: fit/score times plus test and train AUC.
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.100023,0.003004,0.754266,0.740767
1,0.092021,0.003001,0.717724,0.744489
2,0.104024,0.003001,0.730979,0.743907
3,0.132031,0.003005,0.740292,0.742009
4,0.127029,0.003,0.768346,0.739101
5,0.137031,0.002,0.73849,0.742745
6,0.115027,0.003,0.746424,0.741629
7,0.08802,0.001999,0.736121,0.742343
8,0.122027,0.003,0.721434,0.743544
9,0.123027,0.002,0.749253,0.740064


In [330]:
# Fit the logistic regression on the training split, then store the predicted
# probabilities for the test split, one column per predict_proba output column.
# No index is passed, so the frame gets a fresh 0..n-1 index rather than
# X_test's row labels.
glm.fit(X_train, y_train)
glm_prediction = pd.DataFrame(
    glm.predict_proba(X_test),
    columns=["glm_pred_0", "glm_pred_1"],
)

## knn classifier

In [331]:
# Grid search over the even neighbour counts 2..48 for a k-NN classifier,
# scored by 5-fold ROC AUC and refitted on the best setting.
knn_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={"n_neighbors": range(2, 50, 2)},
    scoring="roc_auc",
    cv=5,
    n_jobs=6,
    refit=True,
    verbose=1,
)

# Standardise the features, then hand them to the grid search.
# NOTE(review): because the scaler is a pipeline step *before* the grid search,
# it is fitted on all of X_train before the inner CV folds are formed, so the
# validation folds share scaling statistics with the training folds. Wrapping a
# scaler+classifier pipeline inside GridSearchCV would avoid that — confirm
# whether later cells depend on `knn` being a Pipeline before restructuring.
pipeline_knn = Pipeline(steps=[
    ("std_scaler", StandardScaler()),
    ("classifier", knn_search),
])

knn = pipeline_knn.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [332]:
# Predicted probabilities of the fitted k-NN pipeline on the test split, one
# column per predict_proba output column. The frame gets a fresh 0..n-1 index
# because no index is passed.
knn_proba = knn.predict_proba(X_test)
knn_prediction = pd.DataFrame(knn_proba, columns=["knn_pred_0", "knn_pred_1"])

### decision tree classifier