In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, f1_score, make_scorer, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
from collections import Counter
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

In [2]:
#read in data and look at it
df = pd.read_csv(r'..\data\AER_credit_card_data.csv')

df.head()


Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [3]:
card_frequency = Counter(df['card'])
owner_frequency = Counter(df['owner'])
selfemp_frequency = Counter(df['selfemp'])
majorcards_frequency = Counter(df['majorcards'])
dep_frequency = Counter(df['dependents'])
reports_frequency = Counter(df['reports'])

print(card_frequency)
print(owner_frequency)
print(selfemp_frequency)
print(majorcards_frequency)
print(dep_frequency)
print(reports_frequency)

Counter({'yes': 1023, 'no': 296})
Counter({'no': 738, 'yes': 581})
Counter({'no': 1228, 'yes': 91})
Counter({1: 1078, 0: 241})
Counter({0: 659, 1: 267, 2: 218, 3: 115, 4: 44, 5: 9, 6: 7})
Counter({0: 1060, 1: 137, 2: 50, 3: 24, 4: 17, 5: 11, 7: 6, 6: 5, 11: 4, 9: 2, 12: 1, 14: 1, 10: 1})


In [3]:
df.describe()

Unnamed: 0,reports,age,income,share,expenditure,dependents,months,majorcards,active
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,0.456406,33.213103,3.365376,0.068732,185.057071,0.993935,55.267627,0.817286,6.996967
std,1.345267,10.142783,1.693902,0.094656,272.218917,1.247745,66.271746,0.386579,6.305812
min,0.0,0.166667,0.21,0.000109,0.0,0.0,0.0,0.0,0.0
25%,0.0,25.41667,2.24375,0.002316,4.583333,0.0,12.0,1.0,2.0
50%,0.0,31.25,2.9,0.038827,101.2983,1.0,30.0,1.0,6.0
75%,0.0,39.41667,4.0,0.093617,249.0358,2.0,72.0,1.0,11.0
max,14.0,83.5,13.5,0.90632,3099.505,6.0,540.0,1.0,46.0


In [4]:
#encode cat features
cat_feat = [
    'card',
    'owner',
    'selfemp'
]

label_encoders = {}

for column in cat_feat:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,1,0,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,0,0,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,1,0,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,0,0,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,1,0,2,64,1,5


In [5]:
df.dtypes

card             int32
reports          int64
age            float64
income         float64
share          float64
expenditure    float64
owner            int32
selfemp          int32
dependents       int64
months           int64
majorcards       int64
active           int64
dtype: object

In [8]:
df.to_csv(r'..\credit_card_experiments\encoded_dataset.csv', index=False)

In [9]:
X = df.drop('card', axis=1)
y = df['card']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Space
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1, 1]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,5)]),
    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),     
    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]), 
    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'scale_pos_weight' : hp.choice('scale_pos_weight', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10])
}

In [13]:
# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [14]:
# optimize f1-score for the minority class (0)
scorer = make_scorer(f1_score, pos_label = 0)

In [15]:
# Objective function
def objective(params):
    
    xgboost = xgb.XGBClassifier(seed=0, **params)
    score = cross_val_score(estimator=xgboost, 
                            X=X_train_scaled, 
                            y=y_train, 
                            cv=kfold, 
                            scoring=scorer, 
                            n_jobs=-1).mean()
    # Loss is negative score
    loss = - score
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [16]:
# Optimize
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 48, trials = Trials())

  0%|          | 0/48 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 48/48 [00:25<00:00,  1.87trial/s, best loss: -0.9672468056617753]


In [17]:
#first model trained for f1
# Train model using the best parameters
xgboost_1 = xgb.XGBClassifier(seed=0, 
                           colsample_bytree=space_eval(space, best)['colsample_bytree'], 
                           gamma=space_eval(space, best)['gamma'], 
                           learning_rate=space_eval(space, best)['learning_rate'], 
                           max_depth=space_eval(space, best)['max_depth'], 
                           reg_alpha=space_eval(space, best)['reg_alpha'],
                           reg_lambda=space_eval(space, best)['reg_lambda'],
                           scale_pos_weight=space_eval(space, best)['scale_pos_weight']
                           ).fit(X_train_scaled,y_train)
# Make prediction using the best model
bayesian_opt_predict = xgboost_1.predict(X_test_scaled)
# Get predicted probabilities
bayesian_opt_predict_prob = xgboost_1.predict_proba(X_test_scaled)[:,1]
# Get performance metrics
acc = accuracy_score(y_test, bayesian_opt_predict)
recall = recall_score(y_test, bayesian_opt_predict, pos_label=0)
conf = confusion_matrix(y_test, bayesian_opt_predict)


print(acc)
print(recall)
print(conf)

0.9772727272727273
1.0
[[ 62   0]
 [  6 196]]
