In [2]:
import pandas as pd
from path import Path
from collections import Counter

df = pd.read_csv('Resources/cc_default.csv')
df.head()
#in_balance_limit: maximum balance limit on a card
# sex: 1=female, 0=male
# education: 1=graduate school, 2=university, 3=highschool, 4=others
# marriage 1=married, 0=single
# age = age of credit card holder
# default_next_month ; 1=yes, 0=no (target variable)

Unnamed: 0,ID,ln_balance_limit,sex,education,marriage,age,default_next_month
0,1,9.903488,1,2,0,24,1
1,2,11.695247,1,2,1,26,1
2,3,11.407565,1,2,1,34,0
3,4,10.819778,1,2,0,37,0
4,5,10.819778,0,2,0,57,0


In [4]:
# define features and target
x_cols = [i for i in df.columns if i not in ('ID', 'default_next_month')]
X = df[x_cols]
y = df['default_next_month']

In [5]:
# split training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 1)

In [6]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state = 1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 4968, 1: 4968})

In [8]:
# train the data for LogisticRegreesion model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [9]:
# we make predictions and generate a confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[3732, 2100],
       [ 740,  928]])

In [12]:
# balanced_accuracy_score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)


0.5981363057701987

In [13]:
# finally we print the classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))
# the results are unimpressive, especial prediciting defaults

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.64      0.56      0.72      0.60      0.36      5832
          1       0.31      0.56      0.64      0.40      0.60      0.35      1668

avg / total       0.72      0.62      0.57      0.65      0.60      0.36      7500



# Cluster Centroid Undersampling

In [15]:
# instantiate the resampling module and use it to resample the data
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [16]:
# Then instantiate and train a logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state = 1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [17]:
# generate the metrics
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.49      0.64      0.61      0.56      0.30      5832
          1       0.26      0.64      0.49      0.37      0.56      0.31      1668

avg / total       0.70      0.52      0.60      0.56      0.56      0.31      7500



#### these results are worse than those from random undersampling. This underscores an important point: while resampling can attempt to address imbalance, it does not guarantee better results