In [7]:
from config import CONFIG
import pandas as pd
import numpy as np
from pathlib import Path
from logzero import setup_logger

# trainning
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import recall_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [10]:
logger = setup_logger(name='01_train',logfile=CONFIG.reports / 'logs'/ '01_train.log' )

In [2]:
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=CONFIG.random_state)



In [3]:
# load data

dataX = pd.read_csv(CONFIG.data_path / 'interim' / 'train_data.csv')
dataX.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,154640.0,-0.006179,0.428347,0.107722,-0.5343,0.428184,-0.490463,0.682737,-0.0014,-0.167845,...,-0.315014,-0.788776,0.068315,-0.595862,-0.949021,0.29583,0.584499,0.25375,-0.324723,0
1,139525.0,0.906804,-0.111816,-1.453647,0.841653,0.445183,-0.763003,0.743077,-0.324415,-0.289703,...,0.38282,0.730425,-0.371941,-0.564845,0.944516,-0.88689,-0.186377,-0.171943,0.246484,0
2,69778.0,-0.553119,-2.6891,-0.923019,0.331786,-1.504413,-0.575062,1.294521,-0.593775,-1.173047,...,0.471684,-1.974414,-1.756821,0.835349,-0.268925,1.804527,-0.785323,0.690229,4.854238,0
3,48473.0,-0.264895,0.620774,-0.405357,-0.551577,1.792919,2.50329,0.037274,0.665005,-0.2935,...,-0.26289,-0.976089,0.233278,1.559025,-1.511704,0.082939,0.026766,0.77044,-0.335278,0
4,129350.0,-0.326964,0.128487,0.186869,-1.262066,1.895409,3.02106,-0.160778,0.7846,0.431836,...,0.228638,1.046186,-0.514704,1.26802,-0.161567,1.271054,0.183406,-0.101017,-0.335278,0


## Model Training

In [5]:
# Split Features and target data
X, y = dataX.iloc[:, :-1], dataX.iloc[:, -1]
print(X.shape, y.shape)

(199364, 30) (199364,)


In [13]:
# penalty = 'l2'
# C = 1.0
# class_weight = 'balanced'
# random_state = 2018
# solver = 'liblinear'
logReg = LogisticRegression(n_jobs=-1, class_weight='balanced')
# take recall as metric as we want to get all of the fraud cases possible
scorer = make_scorer(recall_score)
scores = cross_val_score(logReg, X, y, scoring=scorer, cv=k_fold, n_jobs=-1)
logger.info('Train LogisticRegression:')
logger.info(f'CV recall: {np.mean(scores): .3f} +/- {np.std(scores): .3f}')
logger.info('Done!')

[I 201108 18:27:29 <ipython-input-13-eaa576db51ad>:10] Train LogisticRegression:
[I 201108 18:27:29 <ipython-input-13-eaa576db51ad>:11] CV recall:  0.921 +/-  0.020
[I 201108 18:27:29 <ipython-input-13-eaa576db51ad>:12] Done!


In [14]:
# n_estimators = 10
# max_features = 'auto'
# max_depth = None
# min_samples_split = 2
# min_samples_leaf = 1
# min_weight_fraction_leaf = 0.0
# max_leaf_nodes = None
# bootstrap = True
# oob_score = False
# n_jobs = -1
# random_state = 2018
# class_weight = 'balanced'

RFC = RandomForestClassifier(n_jobs=-1, class_weight='balanced')

scores = cross_val_score(RFC, X, y, scoring=scorer, cv=k_fold, n_jobs=-1)
logger.info('Train RandomForest:')
logger.info(f'CV recall: {np.mean(scores): .3f} +/- {np.std(scores): .3f}')
logger.info('Done!')

[I 201108 18:29:04 <ipython-input-14-73d9ce1b9e20>:17] Train RandomForest:
[I 201108 18:29:04 <ipython-input-14-73d9ce1b9e20>:18] CV recall:  0.759 +/-  0.017
[I 201108 18:29:04 <ipython-input-14-73d9ce1b9e20>:19] Done!


In [15]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_jobs=-1, scale_pos_weight=578)

scores = cross_val_score(lgbm, X, y, cv=k_fold, scoring=scorer, n_jobs=-1)
logger.info('Train LGBM:')
logger.info(f'CV recall: {np.mean(scores): .3f} +/- {np.std(scores): .3f}')
logger.info('Done!')

[I 201108 18:29:29 <ipython-input-15-9a2a74c73630>:6] Train LGBM:
[I 201108 18:29:29 <ipython-input-15-9a2a74c73630>:7] CV recall:  0.860 +/-  0.033
[I 201108 18:29:29 <ipython-input-15-9a2a74c73630>:8] Done!


In [16]:
from xgboost import XGBClassifier

xgbm = XGBClassifier(n_jobs=-1, scale_pos_weight=578)
scores = cross_val_score(xgbm, X, y, cv=k_fold, scoring=scorer, n_jobs=-1)
logger.info('Train XGBM:')
logger.info(f'CV recall: {np.mean(scores): .3f} +/- {np.std(scores): .3f}')
logger.info('Done!')


[I 201108 18:31:42 <ipython-input-16-ef9700ec8b02>:5] Train XGBM:
[I 201108 18:31:42 <ipython-input-16-ef9700ec8b02>:6] CV recall:  0.814 +/-  0.022
[I 201108 18:31:42 <ipython-input-16-ef9700ec8b02>:7] Done!


### Tuning

In [44]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Logistic Regression Tuning

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 20),
    'class_weight': ['balanced']
}

LRgs = GridSearchCV(logReg, param_grid=param_grid, cv=k_fold, n_jobs=-1, scoring=scorer, refit=True)
best_lr = gs.fit(X, y)
# rs = RandomizedSearchCV(logReg, param_distributions=param_grid, cv=k_fold, n_jobs=-1, scoring=scorer)
# rs.fit(X, y)

In [34]:
# LightGbm Tuning
rs.best_params_

{'penalty': 'l2', 'class_weight': 'balanced', 'C': 3792.690190732246}

In [37]:
gs.best_params_

{'C': 1.623776739188721, 'class_weight': 'balanced', 'penalty': 'l2'}

In [38]:
gs.best_score_

0.9243393009377664

In [35]:
rs.best_score_

0.9243393009377664

In [41]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'scale_pos_weight': [100, 200, 300, 400, 500, 600]}


fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}


lgbm_gs = RandomizedSearchCV(
    lgbm, param_distributions=param_test, 
    n_iter=100, scoring=scorer, cv=k_fold, refit=True, n_jobs=-1,

)
lgbm_gs.fit(X, y)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
                   estimator=LGBMClassifier(scale_pos_weight=578), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8a12466880>,
                                        'min_child_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8a11936e20>,
                                        'min_ch...
                                                             0.1, 1, 10.0,
                                                             100.0, 1000.0,
                                                             10000.0],
                                        'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8a1193d100>,
                                        'reg_alpha': [0, 0.1, 1, 2, 5, 7, 10,
                                                      50, 100],
   

In [42]:
lgbm_gs.best_score_

0.8777919863597614

In [46]:
testX = pd.read_csv(CONFIG.data_path / 'interim' / 'test_data.csv')
testX.shape
X_test, y_test = testX.iloc[:, :-1], testX.iloc[:, -1]
X_test.shape, y_test.shape

((85443, 30), (85443,))

In [47]:
gs.score(X_test, y_test)

0.8648648648648649

In [48]:
lgbm_gs.score(X_test, y_test)

0.8175675675675675

### Evaluating

In [55]:
from sklearn.metrics import confusion_matrix

y_pred = gs.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

[[82869  2426]
 [   20   128]]


In [51]:
y_pred = lgbm_gs.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

[[82800  2495]
 [   27   121]]


In [54]:
from sklearn.metrics import recall_score, precision_score

print(f'Recall: {recall_score(y_true=y_test, y_pred=y_pred)}')
print(f'Precision:{precision_score(y_true=y_test, y_pred=y_pred)}')

Recall: 0.8175675675675675
Precision:0.04625382262996942


In [56]:
print(f'Recall: {recall_score(y_true=y_test, y_pred=y_pred)}')
print(f'Precision:{precision_score(y_true=y_test, y_pred=y_pred)}')

Recall: 0.8648648648648649
Precision:0.050117462803445575
