In [3]:
import numpy as np
import pandas as pd
import os

import gc
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import space_eval
import time
import math
from hyperopt.pyll.base import scope
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
import pprint
pp = pprint.PrettyPrinter(indent=4)
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import KFold

In [4]:
data_dir= "data/"
df = pd.read_csv(data_dir + "/" + "creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
input_cols = ["V" + str(x) for x in range(1,29)] + ["Amount"]
X = df[input_cols]
y = df["Class"]
y.value_counts()

0    284315
1       492
Name: Class, dtype: int64

* As we can see that the dataset is heavily imbalanced as there are very samples with target class value 1 than 0.
* We will balance dataset with SMOTE, which will oversample the samples that have minority class as output value by introducing new synthetic samples that have slightly different values of input variables from each other.


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=7)

In [7]:
# Balance dataset with SMOTE
sm = SMOTE(random_state=7)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)
X_train_bal = pd.DataFrame(X_train_bal, columns=input_cols)
y_train_bal = pd.Series(y_train_bal)

Next let's find out the best hyperparameters for LightGBM classifier model. I am using Hyperopt library where objective function calculates the negative f1 score as value to be minimized while searching for the optimal values of hyperparameters using Tree-structured Parzen Estimator (TPE) algorithm to explore hyperparameter space. Finally it will find out the best number of iterations with reduced learning rate for gradient boosting algorithm to be used for training on entire training dataset, before evaluating its performance against test dataset.

In [8]:
number_of_evals = 300
def find_best_params_for_lgb(X, y):
    evaluated_point_scores = {}
    
    def objective(params):
        garbage=gc.collect()
        if (str(params) in evaluated_point_scores):
            return evaluated_point_scores[str(params)]
        else:          
            kf = KFold(n_splits=2, random_state=7)
            scores = []
            for train_index, test_index in kf.split(X.values):                
                X_train, X_val = X.values[train_index], X.values[test_index]
                y_train, y_val = y.values.ravel()[train_index], y.values.ravel()[test_index]
            
                train_data = lgb.Dataset(X_train, 
                                label=y_train,
                                feature_name=list(X.columns),
                                )
                
                validation_data = lgb.Dataset(X_val, 
                                label=y_val,
                                feature_name=list(X.columns),
                                )
                
                evals_result = {}
                bst = lgb.train(params, train_data, 
                                valid_sets=[train_data, validation_data], 
                                valid_names=['train', 'val'], 
                                evals_result=evals_result, 
                                num_boost_round=10000,
                                early_stopping_rounds=100,
                                verbose_eval=None,
                               )

                y_val_preds = np.where(bst.predict(X_val) > 0.5, 1, 0)
                score = f1_score(y_val, y_val_preds)
                scores.append(score)
                
#             print("Evaluating params:")
#             pp.pprint(params)
            socre=np.mean(scores).item(0)
#             print("f1: " + str(score))
            evaluated_point_scores[str(params)] = -score
            return -score
    param_space = {
            'objective': hp.choice("objective", ["binary"]),        
            "max_depth": scope.int(hp.quniform("max_depth", 50, 60, 1)),
            "learning_rate": hp.choice("learning_rate", [0.2]),
            "num_leaves": scope.int(hp.quniform("num_leaves", 32, 1024, 10)),   
            "max_bin": scope.int(hp.quniform("max_bin", 50, 250, 10)),
            "bagging_fraction": hp.quniform('bagging_fraction', 0.70, 1.0, 0.05),
            "feature_fraction": hp.uniform("feature_fraction", 0.90, 1.0),
            "bagging_freq": hp.choice("bagging_freq", [1]),
            "lambda_l1": hp.quniform('lambda_l1', 1, 10, 1),        
            "lambda_l2": hp.quniform('lambda_l2', 1, 100, 5),
            "loss_function": hp.choice("loss_function", ["binary_error"]), 
            "eval_metric": hp.choice("eval_metric", ["binary_error"]),
            "metric": hp.choice("metric", ["binary_error"]),
            "random_state": hp.choice("random_state", [7]),
            "verbose": hp.choice("verbose", [None])
        }

    best_params = space_eval(
        param_space, 
        fmin(objective, 
             param_space, 
             algo=hyperopt.tpe.suggest,
             max_evals=number_of_evals))    
    
    
    # Finding best number of iterations with learning rate 0.1
    best_params["learning_rate"] = 0.1

    kf = KFold(n_splits=5)

    num_iterations_array = []
    for train_index, test_index in kf.split(X.values):                
        X_train, X_val = X.values[train_index], X.values[test_index]
        y_train, y_val = y.values.ravel()[train_index], y.values.ravel()[test_index]

        train_data = lgb.Dataset(X_train, 
                        label=y_train,
                        feature_name=list(X.columns),
                        )

        validation_data = lgb.Dataset(X_val, 
                        label=y_val,
                        feature_name=list(X.columns),
                        )

        evals_result = {}
        bst = lgb.train(best_params, train_data, 
                        valid_sets=[train_data, validation_data], 
                        valid_names=['train', 'val'], 
                        evals_result=evals_result, 
                        num_boost_round=10000,
                        early_stopping_rounds=100,
                        verbose_eval=None,
                       )

        num_iterations_array.append(bst.best_iteration)        

    best_params["num_iterations"] = int(np.mean(num_iterations_array).item(0))        
    print ("Best Hyperparameters found:")
    pp.pprint(best_params)
    return best_params

In [9]:
best_params = find_best_params_for_lgb(X=X_train_bal, y=y_train_bal)

  0%|          | 0/300 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 300/300 [8:57:30<00:00, 107.50s/trial, best loss: -0.9374631489297982]  
Best Hyperparameters found:
{   'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'eval_metric': 'binary_error',
    'feature_fraction': 0.948181844012349,
    'lambda_l1': 1.0,
    'lambda_l2': 55.0,
    'learning_rate': 0.1,
    'loss_function': 'binary_error',
    'max_bin': 60,
    'max_depth': 54,
    'metric': 'binary_error',
    'num_iterations': 540,
    'num_leaves': 40,
    'objective': 'binary',
    'random_state': 7,
    'verbose': None}


In [10]:
train_data = lgb.Dataset(X_train_bal.values, 
                            label=y_train_bal.values.ravel(),
                            feature_name=list(X_train_bal.columns),
                        )
bst = lgb.train(best_params, train_data)
y_probs = bst.predict(X_test)





Calculating AUC ROC score


In [11]:
test_score = roc_auc_score(y_test, y_probs)
test_score

0.990055221413246

Calculating F1-Score with sample representing a fraudulant transaction considered as positive sample

In [12]:
y_preds = np.where(y_probs > 0.5, 1, 0)
f1 = f1_score(y_test, y_preds)
f1

0.7945205479452055

The performance of the model can be further improved by exploring the Hyperparameter space at more granuarlity level. This can be achieved by evaluating more combinations of hyperparameter values. This will take more execution time to explore the hyperparameter space to find the optimal parameters.

Bayesian Optimization technique can also be used to narrow down search space of Hyperparams.