In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, f1_score, make_scorer, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
from collections import Counter
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from imblearn.over_sampling import SMOTENC

# MCAR - Missing completely at random
## 70% of values are taken out at random

In [2]:
# Load the pre-encoded credit-card dataset.
# Forward slashes resolve on Windows, Linux and macOS alike, whereas the
# previous backslash-separated literal only resolved on Windows.
df = pd.read_csv('../credit_card_experiments/encoded_dataset.csv')

df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,1,0,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,0,0,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,1,0,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,0,0,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,1,0,2,64,1,5


In [4]:
# Define the target feature (it is never masked, so every row keeps its label)
target_feature = 'card'

# Probability with which each value of a non-target column is blanked out
missing_percentage = 0.7

# Seeded generator: without it the mask -- and every metric computed further
# down -- changes on each run. Because the generator is re-created here, running
# this cell again reproduces the same mask instead of removing even more values.
mask_seed = 42
rng = np.random.default_rng(mask_seed)

# Draw an independent row mask for every feature column; a value is dropped
# purely by chance, regardless of the data (missing completely at random).
for column in df.columns:
    if column != target_feature:
        mask = rng.random(df.shape[0]) < missing_percentage
        df.loc[mask, column] = np.nan

# Separate the label from the (now partly missing) features and hold out 20%
y = df['card']
X = df.drop(columns=['card'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Standardize: statistics are learned on the training rows only and then
# applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Search space: every hyperparameter is a discrete grid sampled via hp.choice
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001, 0.001, 0.01, 0.1, 1]),
    'max_depth': hp.choice('max_depth', range(3, 21, 3)),
    'gamma': hp.choice('gamma', [step / 10.0 for step in range(5)]),
    'colsample_bytree': hp.choice('colsample_bytree', [step / 10.0 for step in range(3, 10)]),
    'reg_alpha': hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'reg_lambda': hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'scale_pos_weight': hp.choice(
        'scale_pos_weight',
        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10],
    ),
}

# 3-fold stratified cross-validation with a fixed shuffle
kfold = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

# Score = F1 of the minority class (label 0)
scorer = make_scorer(f1_score, pos_label=0)

# Objective function
def objective(params):
    """Hyperopt objective: negated mean cross-validated score of one candidate.

    Args:
        params: dict of XGBClassifier keyword arguments sampled from `space`.

    Returns:
        dict with 'loss' (minus the mean of `scorer` over the `kfold` splits of
        X_train_scaled / y_train), the evaluated 'params', and 'status'.
    """
    candidate = xgb.XGBClassifier(seed=0, **params)
    fold_scores = cross_val_score(
        candidate,
        X_train_scaled,
        y_train,
        scoring=scorer,
        cv=kfold,
        n_jobs=-1,
    )
    # hyperopt minimizes, so the score is negated to turn it into a loss
    return {'loss': -fold_scores.mean(), 'params': params, 'status': STATUS_OK}

In [6]:
# Run 48 TPE-guided evaluations of the objective; `best` holds the selected
# hp.choice indices (decode them with space_eval)
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=48,
    trials=Trials(),
)

  0%|          | 0/48 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 48/48 [00:23<00:00,  2.04trial/s, best loss: -0.40651464385603364]


In [7]:
# First model: tuned for the F1 score of class 0.
# Decode the hp.choice indices in `best` into real hyperparameter values once,
# instead of re-evaluating the search space for every single argument.
best_params = space_eval(space, best)

# Train the model with the best parameters (the keys of `space` are exactly
# the XGBClassifier keyword arguments being tuned)
xgboost_1 = xgb.XGBClassifier(seed=0, **best_params).fit(X_train_scaled, y_train)

# Predict labels and class-1 probabilities for the held-out rows
bayesian_opt_predict = xgboost_1.predict(X_test_scaled)
bayesian_opt_predict_prob = xgboost_1.predict_proba(X_test_scaled)[:, 1]

# Performance metrics; recall is reported for the minority class (label 0)
acc = accuracy_score(y_test, bayesian_opt_predict)
recall = recall_score(y_test, bayesian_opt_predict, pos_label=0)
conf = confusion_matrix(y_test, bayesian_opt_predict)


print(acc)
print(recall)
print(conf)

0.42803030303030304
0.9032258064516129
[[ 56   6]
 [145  57]]


1. Create flag for missing values
1. Impute missing values 
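1. Generate synthetic samples for the minority class until the classes are balanced, using SMOTE-NC (the SMOTE variant for mixed categorical/continuous features)
1. Use the flag columns to set the imputed values back to missing (NaN)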
1. Delete flag columns

In [8]:
# Names of the per-feature missingness indicator columns
flag_col = []

# Work on a copy so `df` itself keeps its NaNs
df_with_flags = df.copy()

print(df_with_flags.columns)

# Add one 0/1 indicator per feature: 1 where the value is missing (NaN)
for column in df.columns:
    if column == 'card':
        # The target gets no indicator, so there is nothing to drop afterwards
        continue
    flag_column = f'{column}_missing_flag'
    flag_col.append(flag_column)
    df_with_flags[flag_column] = df[column].isna().astype(int)

print(flag_col)

Index(['card', 'reports', 'age', 'income', 'share', 'expenditure', 'owner',
       'selfemp', 'dependents', 'months', 'majorcards', 'active'],
      dtype='object')
['reports_missing_flag', 'age_missing_flag', 'income_missing_flag', 'share_missing_flag', 'expenditure_missing_flag', 'owner_missing_flag', 'selfemp_missing_flag', 'dependents_missing_flag', 'months_missing_flag', 'majorcards_missing_flag', 'active_missing_flag']


In [9]:
# Categorical and continuous features are listed separately because they are
# imputed with different strategies
cat_features = ['owner', 'selfemp', 'dependents', 'reports', 'majorcards']
cont_features = ['age', 'income', 'share', 'expenditure', 'months', 'active']

features = cat_features + cont_features

# NOTE(review): both imputers are fitted on the whole frame, i.e. before the
# train/test split that follows, so held-out rows contribute to the fill values.

# Categorical features: fill with each column's most frequent value.
# SimpleImputer learns its statistic column by column, so a single fit over the
# group yields the same fill values as fitting one imputer per column.
cat_imputer = SimpleImputer(strategy='most_frequent')
df_with_flags[cat_features] = cat_imputer.fit_transform(df_with_flags[cat_features])

# Continuous features: fill with each column's mean
cont_imputer = SimpleImputer(strategy='mean')
df_with_flags[cont_features] = cont_imputer.fit_transform(df_with_flags[cont_features])


# Sanity check: no column should report remaining NaNs
df_with_flags.isna().any()

card                        False
reports                     False
age                         False
income                      False
share                       False
expenditure                 False
owner                       False
selfemp                     False
dependents                  False
months                      False
majorcards                  False
active                      False
reports_missing_flag        False
age_missing_flag            False
income_missing_flag         False
share_missing_flag          False
expenditure_missing_flag    False
owner_missing_flag          False
selfemp_missing_flag        False
dependents_missing_flag     False
months_missing_flag         False
majorcards_missing_flag     False
active_missing_flag         False
dtype: bool

In [10]:
# Separate the label from the imputed features + flags and hold out 20%
y = df_with_flags['card']
X = df_with_flags.drop(columns=['card'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [11]:
# Oversample the minority class of the training split with SMOTENC
print(f'Original dataset samples per class {Counter(y_train)}')

# SMOTENC needs the positional indexes of the categorical columns; the 0/1
# missingness flags are treated as categorical as well
cat_features_idx = []
for position, name in enumerate(X.columns):
    if name in cat_features or name in flag_col:
        cat_features_idx.append(position)
        print(name)

print(cat_features_idx)

sm = SMOTENC(categorical_features=cat_features_idx, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print(f'Resampled dataset samples per class {Counter(y_train_res)}')

Original dataset samples per class Counter({1: 821, 0: 234})
reports
owner
selfemp
dependents
majorcards
reports_missing_flag
age_missing_flag
income_missing_flag
share_missing_flag
expenditure_missing_flag
owner_missing_flag
selfemp_missing_flag
dependents_missing_flag
months_missing_flag
majorcards_missing_flag
active_missing_flag
[0, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
Resampled dataset samples per class Counter({1: 821, 0: 821})


In [12]:
# Standardize the feature columns; the 0/1 flag columns are passed through
# unscaled
scaler = StandardScaler()

# Statistics come from the resampled training rows and are reused for the test rows
X_train_scaled_features = scaler.fit_transform(X_train_res[features])
X_test_scaled_features = scaler.transform(X_test[features])

# Flag columns as plain NumPy arrays
X_train_flags = X_train_res[flag_col].to_numpy()
X_test_flags = X_test[flag_col].to_numpy()

# Final matrices: scaled features (in `features` order) followed by the flags
X_train_scaled = np.hstack((X_train_scaled_features, X_train_flags))
X_test_scaled = np.hstack((X_test_scaled_features, X_test_flags))

In [13]:
# Experiment: does keeping both the imputed values and the flag columns help
# the model?

# Search space: every hyperparameter is a discrete grid sampled via hp.choice
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001, 0.001, 0.01, 0.1, 1]),
    'max_depth': hp.choice('max_depth', range(3, 21, 3)),
    'gamma': hp.choice('gamma', [step / 10.0 for step in range(5)]),
    'colsample_bytree': hp.choice('colsample_bytree', [step / 10.0 for step in range(3, 10)]),
    'reg_alpha': hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'reg_lambda': hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'scale_pos_weight': hp.choice(
        'scale_pos_weight',
        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10],
    ),
}

# 3-fold stratified cross-validation with a fixed shuffle
kfold = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

# Score = F1 of the minority class (label 0)
scorer = make_scorer(f1_score, pos_label=0)

# Objective function
def objective(params):
    """Hyperopt objective for the model that keeps imputed values and flags.

    Args:
        params: dict of XGBClassifier keyword arguments sampled from `space`.

    Returns:
        dict with 'loss' (minus the mean of `scorer` over the `kfold` splits of
        X_train_scaled / y_train_res), the evaluated 'params', and 'status'.
    """
    # NOTE(review): the folds are cut from the already-resampled training set,
    # so validation folds include synthetic rows; the cross-validated score is
    # presumably optimistic compared with the untouched test split.
    candidate = xgb.XGBClassifier(seed=0, **params)
    fold_scores = cross_val_score(
        candidate,
        X_train_scaled,
        y_train_res,
        scoring=scorer,
        cv=kfold,
        n_jobs=-1,
    )
    # hyperopt minimizes, so the score is negated to turn it into a loss
    return {'loss': -fold_scores.mean(), 'params': params, 'status': STATUS_OK}

# Run 48 TPE-guided evaluations of the objective; `best_2` holds the selected
# hp.choice indices (decode them with space_eval)
best_2 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=48,
    trials=Trials(),
)

  0%|          | 0/48 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 48/48 [00:14<00:00,  3.39trial/s, best loss: -0.7191184147919875]


In [14]:
# Second model: imputed + SMOTENC-resampled features, missingness flags kept.
# Decode the hp.choice indices in `best_2` into real hyperparameter values
# once, instead of re-evaluating the search space for every single argument.
best_params_2 = space_eval(space, best_2)

# The keys of `space` are exactly the XGBClassifier keyword arguments being tuned
xgboost_with_flags = xgb.XGBClassifier(seed=0, **best_params_2).fit(X_train_scaled, y_train_res)

# Predict labels and class-1 probabilities for the held-out rows
bayesian_opt_predict = xgboost_with_flags.predict(X_test_scaled)
bayesian_opt_predict_prob = xgboost_with_flags.predict_proba(X_test_scaled)[:, 1]

# Performance metrics; recall is reported for the minority class (label 0)
acc = accuracy_score(y_test, bayesian_opt_predict)
recall = recall_score(y_test, bayesian_opt_predict, pos_label=0)
conf = confusion_matrix(y_test, bayesian_opt_predict)


print(acc)
print(recall)
print(conf)

0.4621212121212121
0.8870967741935484
[[ 55   7]
 [135  67]]


In [15]:
# Undo the imputation: wherever a flag says the value was originally missing,
# put NaN back into the corresponding feature column (train and test alike).

flag_suffix = '_missing_flag'
flag_columns = [col for col in X_train_res.columns if col.endswith(flag_suffix)]

# The loop variable deliberately is NOT called `flag_col`: that name holds the
# list of flag columns used by earlier cells, and rebinding it to a single
# string here would break those cells when they are run again.
for flag_name in flag_columns:
    feature_col = flag_name[:-len(flag_suffix)]
    X_train_res.loc[X_train_res[flag_name] == 1, feature_col] = np.nan
    X_test.loc[X_test[flag_name] == 1, feature_col] = np.nan


In [16]:
# Remove the indicator columns from both splits, in place
for frame in (X_train_res, X_test):
    frame.drop(columns=flag_columns, inplace=True)

In [17]:
# Standardize all remaining columns: statistics are learned on the resampled
# training rows only and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train_res)

X_train_res_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


In [18]:
# Search space: every hyperparameter is a discrete grid sampled via hp.choice
space = {
    'learning_rate': hp.choice('learning_rate', [0.0001, 0.001, 0.01, 0.1, 1]),
    'max_depth': hp.choice('max_depth', range(3, 21, 3)),
    'gamma': hp.choice('gamma', [step / 10.0 for step in range(5)]),
    'colsample_bytree': hp.choice('colsample_bytree', [step / 10.0 for step in range(3, 10)]),
    'reg_alpha': hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'reg_lambda': hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
    'scale_pos_weight': hp.choice(
        'scale_pos_weight',
        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10],
    ),
}

# 3-fold stratified cross-validation with a fixed shuffle
kfold = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

# Score = F1 of the minority class (label 0)
scorer = make_scorer(f1_score, pos_label=0)

# Objective function
def objective(params):
    """Hyperopt objective for the model trained with the flag columns removed.

    Args:
        params: dict of XGBClassifier keyword arguments sampled from `space`.

    Returns:
        dict with 'loss' (minus the mean of `scorer` over the `kfold` splits of
        X_train_res_scaled / y_train_res), the evaluated 'params', and 'status'.
    """
    candidate = xgb.XGBClassifier(seed=0, **params)
    fold_scores = cross_val_score(
        candidate,
        X_train_res_scaled,
        y_train_res,
        scoring=scorer,
        cv=kfold,
        n_jobs=-1,
    )
    # hyperopt minimizes, so the score is negated to turn it into a loss
    return {'loss': -fold_scores.mean(), 'params': params, 'status': STATUS_OK}

In [19]:
# Run 48 TPE-guided evaluations of the objective; `best_3` holds the selected
# hp.choice indices (decode them with space_eval)
best_3 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=48,
    trials=Trials(),
)

  0%|          | 0/48 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 48/48 [00:07<00:00,  6.43trial/s, best loss: -0.7210730786319258]


In [20]:
# Third model: resampled training data with the flagged values set back to NaN
# and the flag columns removed.
# Decode the hp.choice indices in `best_3` into real hyperparameter values
# once, instead of re-evaluating the search space for every single argument.
best_params_3 = space_eval(space, best_3)

# NOTE: this rebinds `xgboost_1`, so the first model fitted under that name is
# no longer reachable after this cell has run.
xgboost_1 = xgb.XGBClassifier(seed=0, **best_params_3).fit(X_train_res_scaled, y_train_res)

# Predict labels and class-1 probabilities for the held-out rows
bayesian_opt_predict = xgboost_1.predict(X_test_scaled)
bayesian_opt_predict_prob = xgboost_1.predict_proba(X_test_scaled)[:, 1]

# Performance metrics; recall is reported for the minority class (label 0)
acc = accuracy_score(y_test, bayesian_opt_predict)
recall = recall_score(y_test, bayesian_opt_predict, pos_label=0)
conf = confusion_matrix(y_test, bayesian_opt_predict)


print(acc)
print(recall)
print(conf)

0.4393939393939394
0.8709677419354839
[[ 54   8]
 [140  62]]
