In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, f1_score, make_scorer, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
from collections import Counter
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from imblearn.over_sampling import SMOTENC

# MAR - Missing at random

## Im going to only take missings out of two different important columns

In [8]:
df = pd.read_csv(r'..\credit_card_experiments\encoded_dataset.csv')

In [9]:
# Define the target feature
target_feature = 'card'

# Define the percentage of values to make missing
missing_percentage = 0.6

# Create a mask to introduce missing values, excluding the target feature
for column in ['income', 'age']:
    mask = np.random.rand(df.shape[0]) < missing_percentage
    df.loc[mask, column] = np.nan

# Splitting
X = df.drop('card', axis=1)
y = df['card']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#standardizing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
print(X_train_scaled.shape)

(1055, 11)


In [11]:
# Space
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1, 1]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,5)]),
    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),     
    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]), 
    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'scale_pos_weight' : hp.choice('scale_pos_weight', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10])
}

# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# optimize f1-score for the minority class (0)
scorer = make_scorer(f1_score, pos_label = 0)

# Objective function
def objective(params):
    
    xgboost = xgb.XGBClassifier(seed=0, **params)
    score = cross_val_score(estimator=xgboost, 
                            X=X_train_scaled, 
                            y=y_train, 
                            cv=kfold, 
                            scoring=scorer, 
                            n_jobs=-1).mean()
    # Loss is negative score
    loss = - score
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [15]:
# Optimize
best_1 = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 48, trials = Trials())
best_2 = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 64, trials = Trials())

100%|██████████| 48/48 [00:04<00:00,  9.87trial/s, best loss: -0.9672468056617753]
100%|██████████| 64/64 [00:06<00:00, 10.26trial/s, best loss: -0.9672468056617753]


In [16]:
# Train model using the best parameters
xgboost_1 = xgb.XGBClassifier(seed=0, 
                           colsample_bytree=space_eval(space, best_1)['colsample_bytree'], 
                           gamma=space_eval(space, best_1)['gamma'], 
                           learning_rate=space_eval(space, best_1)['learning_rate'], 
                           max_depth=space_eval(space, best_1)['max_depth'], 
                           reg_alpha=space_eval(space, best_1)['reg_alpha'],
                           reg_lambda=space_eval(space, best_1)['reg_lambda'],
                           scale_pos_weight=space_eval(space, best_1)['scale_pos_weight']
                           ).fit(X_train_scaled,y_train)
# Make prediction using the best model
bayesian_opt_predict = xgboost_1.predict(X_test_scaled)
# Get predicted probabilities
bayesian_opt_predict_prob = xgboost_1.predict_proba(X_test_scaled)[:,1]
# Get performance metrics
acc = accuracy_score(y_test, bayesian_opt_predict)
recall = recall_score(y_test, bayesian_opt_predict, pos_label=0)
conf = confusion_matrix(y_test, bayesian_opt_predict)


print(acc)
print(recall)
print(conf)
print()

xgboost_2 = xgb.XGBClassifier(seed=0, 
                           colsample_bytree=space_eval(space, best_2)['colsample_bytree'], 
                           gamma=space_eval(space, best_2)['gamma'], 
                           learning_rate=space_eval(space, best_2)['learning_rate'], 
                           max_depth=space_eval(space, best_2)['max_depth'], 
                           reg_alpha=space_eval(space, best_2)['reg_alpha'],
                           reg_lambda=space_eval(space, best_2)['reg_lambda'],
                           scale_pos_weight=space_eval(space, best_2)['scale_pos_weight']
                           ).fit(X_train_scaled,y_train)
# Make prediction using the best model
bayesian_opt_predict = xgboost_1.predict(X_test_scaled)
# Get predicted probabilities
bayesian_opt_predict_prob = xgboost_1.predict_proba(X_test_scaled)[:,1]
# Get performance metrics
acc = accuracy_score(y_test, bayesian_opt_predict)
recall = recall_score(y_test, bayesian_opt_predict, pos_label=0)
conf = confusion_matrix(y_test, bayesian_opt_predict)


print(acc)
print(recall)
print(conf)

0.9734848484848485
0.9838709677419355
[[ 61   1]
 [  6 196]]

0.9734848484848485
0.9838709677419355
[[ 61   1]
 [  6 196]]


In [19]:
# dropping the columns all together

X_train_scaled_removed = np.delete(X_train_scaled, [1,2], axis=1)
# Space
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001,0.001, 0.01, 0.1, 1]),
    'max_depth' : hp.choice('max_depth', range(3,21,3)),
    'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,5)]),
    'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),     
    'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]), 
    'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'scale_pos_weight' : hp.choice('scale_pos_weight', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10])
}

# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# optimize f1-score for the minority class (0)
scorer = make_scorer(f1_score, pos_label = 0)

# Objective function
def objective(params):
    
    xgboost = xgb.XGBClassifier(seed=0, **params)
    score = cross_val_score(estimator=xgboost, 
                            X=X_train_scaled_removed, 
                            y=y_train, 
                            cv=kfold, 
                            scoring=scorer, 
                            n_jobs=-1).mean()
    # Loss is negative score
    loss = - score
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [20]:
# Optimize
best_3 = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 32, trials = Trials())
best_4 = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 64, trials = Trials())

100%|██████████| 32/32 [00:22<00:00,  1.45trial/s, best loss: -0.9672468056617753]
100%|██████████| 64/64 [00:05<00:00, 12.06trial/s, best loss: -0.9672468056617753]


In [21]:
# Train model using the best parameters
xgboost_2 = xgb.XGBClassifier(seed=0, 
                           colsample_bytree=space_eval(space, best_3)['colsample_bytree'], 
                           gamma=space_eval(space, best_3)['gamma'], 
                           learning_rate=space_eval(space, best_3)['learning_rate'], 
                           max_depth=space_eval(space, best_3)['max_depth'], 
                           reg_alpha=space_eval(space, best_3)['reg_alpha'],
                           reg_lambda=space_eval(space, best_3)['reg_lambda'],
                           scale_pos_weight=space_eval(space, best_3)['scale_pos_weight']
                           ).fit(X_train_scaled_removed,y_train)
# Make prediction using the best model
bayesian_opt_predict = xgboost_1.predict(X_test_scaled)
# Get predicted probabilities
bayesian_opt_predict_prob = xgboost_1.predict_proba(X_test_scaled)[:,1]
# Get performance metrics
acc = accuracy_score(y_test, bayesian_opt_predict)
recall = recall_score(y_test, bayesian_opt_predict, pos_label=0)
conf = confusion_matrix(y_test, bayesian_opt_predict)


print(acc)
print(recall)
print(conf)

0.9734848484848485
0.9838709677419355
[[ 61   1]
 [  6 196]]
