<a href="https://colab.research.google.com/github/zSoftwareRepository/MDS-561-46-22SU/blob/main/Xgboost_Model_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold

import xgboost
import pickle

seed = 23
np.random.seed(seed)

from google.colab import drive

In [2]:
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
training_data = pd.read_csv('drive/MyDrive/data/creditcard.csv')

In [6]:
train_target = pd.DataFrame()
train_target['class'] = training_data.loc[:,'Class'].copy()
train_target.index = training_data.index

In [7]:
folds = pd.DataFrame()
folds['class'] = training_data.loc[:,'Class']
folds['fold']   = -1
folds.index = training_data.index

In [8]:
n_folds = 10

kf = KFold(n_splits=n_folds,shuffle=True,random_state=seed)

for n, (train_index, val_index) in enumerate(kf.split(training_data)):
    folds.loc[val_index, 'fold'] = int(n)

folds.loc[:,'fold'] = folds.loc[:,'fold'].astype(int)

In [9]:
val_idx = folds[folds['fold'] == 0].index 
holdout = training_data.iloc[val_idx].reset_index(drop=True).copy()
class_holdout = train_target.iloc[val_idx].reset_index(drop=True).copy()

In [10]:
trn_idx = folds[folds['fold'] != 0].index
training_data = training_data.iloc[trn_idx].reset_index(drop=True).copy()

In [11]:
train_target = pd.DataFrame()
train_target['class'] = training_data.loc[:,'Class'].copy()
train_target.index = training_data.index

In [12]:
folds = pd.DataFrame()
folds['class'] = training_data.loc[:,'Class']
folds['fold']   = -1
folds.index = training_data.index

In [13]:
n_folds = 10

kf = KFold(n_splits=n_folds,shuffle=True,random_state=seed)

for n, (train_index, val_index) in enumerate(kf.split(training_data)):
    folds.loc[val_index, 'fold'] = int(n)

folds.loc[:,'fold'] = folds.loc[:,'fold'].astype(int)

In [14]:
data = training_data.copy()
data.drop(['Class','Time'],axis=1,inplace=True)

In [17]:
scale_pos_weight = train_target.loc[train_target['class'] == 0,'class'].count() / \
                   train_target.loc[train_target['class'] == 1,'class'].count()

models    = []

param = {
    'max_depth': 4,
    'eta':0.01,
    'objective':'binary:logistic',
    'eval_metric':'logloss',
    'nthread': -1,
    'seed' : seed,
    'subsample' : 0.9,
    'colsample_bytree' : 0.8,
    'tree_method' : 'gpu_hist',
    'lambda' : 3,
    'scale_pos_weight' : scale_pos_weight
}

In [18]:
for fold_num in range(n_folds):
        
    trn_idx = folds[folds['fold'] != fold_num].index
    val_idx = folds[folds['fold'] == fold_num].index    
        
    X_train = data.iloc[trn_idx].reset_index(drop=True).copy()
    Y_train = train_target.iloc[trn_idx].reset_index(drop=True).copy() 
        
    X_val   = data.iloc[val_idx].reset_index(drop=True).copy()
    Y_val   = train_target.iloc[val_idx].reset_index(drop=True).copy() 
        
    dtrain = xgboost.DMatrix(X_train,Y_train)
    dtest  = xgboost.DMatrix(X_val,Y_val)
        
    model = xgboost.train(params=param,
                          dtrain=dtrain,
                          num_boost_round= 5000,
                          early_stopping_rounds=100,
                          evals=[(dtrain, 'train'), (dtest, 'test')],
                          verbose_eval=100)
    models.append(model)

[0]	train-logloss:0.685034	test-logloss:0.685072
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.263469	test-logloss:0.264365
[200]	train-logloss:0.136629	test-logloss:0.138124
[300]	train-logloss:0.087683	test-logloss:0.089413
[400]	train-logloss:0.062579	test-logloss:0.064385
[500]	train-logloss:0.047964	test-logloss:0.049663
[600]	train-logloss:0.037922	test-logloss:0.039625
[700]	train-logloss:0.029949	test-logloss:0.03165
[800]	train-logloss:0.024444	test-logloss:0.02618
[900]	train-logloss:0.020046	test-logloss:0.021786
[1000]	train-logloss:0.016629	test-logloss:0.018358
[1100]	train-logloss:0.013788	test-logloss:0.015543
[1200]	train-logloss:0.011721	test-logloss:0.013513
[1300]	train-logloss:0.00989	test-logloss:0.011671
[1400]	train-logloss:0.008554	test-logloss:0.010358
[1500]	train-logloss:0.007321	test-logloss:0.009168
[1600]	train-logloss:0.006273	tes

In [19]:
holdout.drop(['Class','Time'],axis=1,inplace=True)

In [20]:
predictions = 0
print("Generating predictions")
for model in models:
    predictions += model.predict(xgboost.DMatrix(holdout))

predictions /= len(models)

Generating predictions


In [21]:
predictions = np.round(predictions,0)

In [22]:
# make predictions for test data and evaluate
accuracy = accuracy_score(class_holdout['class'], predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.97%


In [23]:
# confusion m,atrix
tn, fp, fn, tp = confusion_matrix(class_holdout['class'], predictions).ravel()
print(tn, fp, fn, tp)

28428 6 2 45


In [24]:
sensitivity = tp/(tp+fn)
print("Sensitivity: %.2f%%" % (sensitivity * 100.0))
specificity = tn/(tn+fp)
print("Specificity: %.2f%%" % (specificity * 100.0))

Sensitivity: 95.74%
Specificity: 99.98%


In [25]:
confusion_matrix(class_holdout['class'], predictions)

array([[28428,     6],
       [    2,    45]])