# MODEL SELECTION & HYPERPARAMETER TUNING

## Based on the most recent executions, we obtain the results.csv file and interpet the metrics

In [None]:
# Mount GDrive

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os # Importing os for specifying file paths

main_gdrive = '/content/drive/MyDrive/fraud_detection_benchmarking/'

# Specify execution number and model name

exe_n = 6

executions = [exe for exe in os.listdir(main_gdrive) if 'exe_' in exe]
executions_n = [int(exe.split('_')[1]) for exe in executions]

exe_folder = executions[executions_n.index(exe_n)]

results_path = os.path.join(main_gdrive, exe_folder, 'results.csv')

In [None]:
# Reading the data

import pandas as pd
import numpy as np

results = pd.read_csv(results_path)

results

Unnamed: 0,timestamp,model,pr_auc_mean,pr_auc_std,roc_auc_mean,roc_auc_std,recall_mean,recall_std,precision_mean,precision_std,fit_time_total_min,score_time_total_min,total_time_min
0,05:34 PM,LightGBM,0.929257,0.015397,0.999062,0.001207,0.944794,0.017542,0.524081,0.012646,1.995425,0.456379,2.451804
1,05:35 PM,DecisionTree,0.657509,0.008571,0.983726,0.007794,0.969952,0.01577,0.090394,0.021741,0.698735,0.043555,0.742289
2,05:38 PM,XGBoost,0.920644,0.017011,0.998705,0.001353,0.984569,0.010727,0.247211,0.007739,2.56888,0.266117,2.834997
3,05:55 PM,RandomForest,0.873934,0.01672,0.975035,0.007106,0.760564,0.012063,0.980197,0.009932,16.555204,0.285229,16.840432
4,07:01 PM,LogisticRegression,0.640735,0.047965,0.992815,0.001087,0.948037,0.021137,0.031782,0.000495,65.478509,0.055089,65.533598
5,07:02 PM,GaussianNB,0.041252,0.003607,0.917566,0.010361,0.582818,0.02149,0.024551,0.001745,0.187607,0.072202,0.259808
6,08:27 PM,KNN,0.542059,0.042731,0.855896,0.012873,0.407436,0.02672,0.875736,0.047902,1.00547,84.123022,85.128492


### The following are the top 3 models based on average PR-AUC

In [None]:
top3_pr_auc = results.sort_values(by = 'pr_auc_mean', ascending = False)['model'][:3]
top3_pr_auc

Unnamed: 0,model
0,LightGBM
2,XGBoost
3,RandomForest


### ...based on recall mean,

In [None]:
top3_recall = results.sort_values(by = 'recall_mean', ascending = False)['model'][:3]
top3_recall

Unnamed: 0,model
2,XGBoost
1,DecisionTree
4,LogisticRegression


### ...based on precision mean,

In [None]:
top3_precision = results.sort_values(by = 'precision_mean', ascending = False)['model'][:3]
top3_precision

Unnamed: 0,model
3,RandomForest
6,KNN
0,LightGBM


### We should also calculate the F-1 score, which is the harmonic mean of the recall and precision scores:

In [None]:
results['f1_score'] = 2 / ( 1 / results.precision_mean + 1 / results.recall_mean)
results[['model', 'f1_score']]

Unnamed: 0,model,f1_score
0,LightGBM,0.674188
1,DecisionTree,0.165375
2,XGBoost,0.395195
3,RandomForest,0.856525
4,LogisticRegression,0.061502
5,GaussianNB,0.047117
6,KNN,0.556131


### And so, the top 3 models based on the F-1 score are:

In [None]:
top3_f1 = results.sort_values(by = 'f1_score', ascending = False)['model'][:3]
top3_f1

Unnamed: 0,model
3,RandomForest
0,LightGBM
6,KNN


### Collating the results:

In [None]:
top3_all = [var for var in globals() if 'top3' in var]
top3_all

results_collated = globals()[top3_all[0]].reset_index(drop = True)

for df_name in top3_all[1:]:
  df = globals()[df_name]

  if type(df) == pd.Series:
    df.reset_index(drop = True, inplace = True)
    results_collated = pd.concat([results_collated, df], axis = 1)

results_collated.columns = top3_all[:4]
results_collated

Unnamed: 0,top3_pr_auc,top3_recall,top3_precision,top3_f1
0,LightGBM,XGBoost,RandomForest,RandomForest
1,XGBoost,DecisionTree,KNN,LightGBM
2,RandomForest,LogisticRegression,LightGBM,KNN


From the above, we can see that:
- LightGBM has the highest PR-AUC score, as well as makes to top 3 models ranked by the F1 and precision scores,
- XGBoost has done pretty well in recall and PR-AUC, but has a terrible precision score (~25%),
- the Random Forest model does well in PR-AUC, precision, and F1 scores just like LightBGM, also making it a top contender,
- Logistic Regression performs well on recall, but is not top 3 in any of the other metrics. It also has a pretty terrible precision of ~3%,
- KNN performs pretty well as well, but has a bad recall (~41%), and finally
- Decision Trees have a good recall, but a very low precision score (~9%).

Here's how many times each model made it to a ranking:


In [None]:
pd.Series((results_collated.values).reshape(-1)).value_counts()

Unnamed: 0,count
LightGBM,3
RandomForest,3
XGBoost,2
KNN,2
DecisionTree,1
LogisticRegression,1


We can also take a look at the training times of these models:

In [None]:
top_3_models = list(np.unique(results_collated.values)) #Extracting the unique list of model names mentioned in the top 3 of the above metrics

results.loc[

          results['model'].isin(top_3_models), # For indexing the rows, we simply filter for model name in the list of top 3 model names

          ['model'] + list(results.columns[-3:]) # For indexing the columns, we append to a list containing the column 'model' another list which simply has the
                                                 # the last 3 columns which have the data on time taken. In total, there will be 4 columns in the resulting dataframe

          ].sort_values(
                        by = 'total_time_min', # Finally, we sort the models by total time taken
                        ascending = False
                      )

Unnamed: 0,model,score_time_total_min,total_time_min,f1_score
6,KNN,84.123022,85.128492,0.556131
4,LogisticRegression,0.055089,65.533598,0.061502
3,RandomForest,0.285229,16.840432,0.856525
2,XGBoost,0.266117,2.834997,0.395195
0,LightGBM,0.456379,2.451804,0.674188
1,DecisionTree,0.043555,0.742289,0.165375


We also need to be mindful of the computation costs in terms of both time and money, so we have to balance performance with efficiency:

- We can see that XGboost, LightGBM, and Decision Trees hardly take any time to train and evaluate,
- Random Forests take a modest amount of time to fit, but can be evaluated farily quickly,
- But KNN and Logistic Regression take too much time to fit and evaluate. Logitistic regression takes over an hour to fit, whereas KNN takes even more time to evaluate

LightGBM is thus a very easy pick. It has a good balance between precision and recall, and is also vey efficient.

Another pick is the Random Forest classifier. It provides the best precision along with a decent recall, and doesn't take too long to train and evaluate.

The final pick is between XGBoost and KNN. The following is are the results for the two:

In [None]:
results[results['model'].isin(['KNN', 'XGBoost'])]

Unnamed: 0,timestamp,model,pr_auc_mean,pr_auc_std,roc_auc_mean,roc_auc_std,recall_mean,recall_std,precision_mean,precision_std,fit_time_total_min,score_time_total_min,total_time_min,f1_score
2,05:38 PM,XGBoost,0.920644,0.017011,0.998705,0.001353,0.984569,0.010727,0.247211,0.007739,2.56888,0.266117,2.834997,0.395195
6,08:27 PM,KNN,0.542059,0.042731,0.855896,0.012873,0.407436,0.02672,0.875736,0.047902,1.00547,84.123022,85.128492,0.556131


XGBoost offers a higher recall, while KNN offers a higher precision. Overall, the KNN is more balanced due to its higher F-1 score, but it is computationally the most expensive model to evaluate and had the highest total time of all models. Since we already have the Random Forest that performs well on precision, we can go with XGBoost in this case since it is also the model with the highest recall.

So the three shortlisted models are:

1. LightGBM
2. Random Forests
3. XGBoost

## HYPERPARAMETER TUNING

We will now implement hyperparameter tuning of all 3 models using sklearn's GridSearch, Randomized Search, and another library called Optuna.

In [None]:
# Importing hyperparameter tuning classes

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


In [None]:
# Importing scoring metrics
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Importing shortlisted models

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Importing libraries for preprocessing

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Importing the tuning function

from sklearn.model_selection import RandomizedSearchCV

### Bringing in the data

In [None]:
datasets = {}

for d in ['X', 'y']:
  for type in ['validation', 'test']:
    df = d + '_' + type
    filepath = os.path.join(main_gdrive, exe_folder, f'{d}_{type}.csv')
    datasets[df] = pd.read_csv(filepath)

dtypes = {
            # Numerical columns
            'amount': 'float32',
            'oldbalanceOrg': 'float32',
            'newbalanceOrig': 'float32',
            'oldbalanceDest': 'float32',
            'newbalanceDest': 'float32',
            "hour_of_day": "category",

            # Alphanumeric columns
            'type': 'category'

        }

for df in ['X_validation', 'X_test']:
  datasets[df] = datasets[df].astype(dtypes)

for df in ['y_validation', 'y_test']:
  datasets[df] = datasets[df].astype('int8')

X_validation, y_validation = datasets['X_validation'], datasets['y_validation']
X_test, y_test = datasets['X_test'], datasets['y_test']

In [None]:
# Creating two new features as sin and cos of hour_of_day to introduce periodicity that will be captured by XGBoost and LightGBM

X_validation_xg_lgbm = X_validation.assign(
                    sin_hour = np.sin(X_validation['hour_of_day'].astype('int') * 2 * np.pi / 24),
                    cos_hour = np.cos(X_validation['hour_of_day'].astype('int') * 2 * np.pi / 24)
                )

X_test_xg_lgbm = X_test.assign(
                    sin_hour = np.sin(X_test['hour_of_day'].astype('int') * 2 * np.pi / 24),
                    cos_hour = np.cos(X_test['hour_of_day'].astype('int') * 2 * np.pi / 24)
                )

In [None]:
# One-hot encoding the hour_of_day and type columns for RandomForests

X_validation_rf = pd.concat([ # Concatenating 3 dataframes
                              X_validation, # 1: the original dataframe
                              pd.get_dummies(X_validation['hour_of_day'])*1, # 2: df with one-hot encoded values of hour_of_day
                              pd.get_dummies(X_validation['type'])*1 # 3: df with one-hot encoded values of type
                            ],
                            axis = 1
                            ).drop(
                                  columns = ['type', 'hour_of_day'] # Dropping these columns since they are not needed anymore
                            )

X_test_rf = pd.concat([
                              X_test,
                              pd.get_dummies(X_test['hour_of_day'])*1,
                              pd.get_dummies(X_test['type'])*1
                            ],
                            axis = 1
                            ).drop(
                                  columns = ['type', 'hour_of_day']
                            )

# Converting all feature names to string type instead of int as a requirement by sklearn

X_validation_rf.columns = X_validation_rf.columns.astype('str')
X_test_rf.columns = X_test_rf.columns.astype('str')

# Randomised Search

In [None]:
# Helper function to keep Colab session from becoming inactive

def keep_session_active():
  while True:
    pass

In [None]:
from scipy.stats import loguniform, randint, uniform # For creating distributions to draw parameter values from

In [None]:
from sklearn.model_selection import train_test_split

X_tiny, _, y_tiny, _ = train_test_split(X_validation_rf, y_validation, train_size = 0.005, random_state = 99)

In [None]:
X_tiny

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,0,1,2,3,4,...,19,20,21,22,23,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
1771770,1644.489990,166143.71875,164499.234375,0.000000e+00,0.000000e+00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2622104,44094.300781,0.00000,0.000000,4.154657e+05,4.595600e+05,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1909891,13863.790039,0.00000,0.000000,0.000000e+00,0.000000e+00,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1549677,8184.439941,21153.00000,12968.559570,0.000000e+00,0.000000e+00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2344815,229875.312500,21685.00000,0.000000,4.241275e+06,4.471150e+06,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806273,175010.640625,49566.00000,0.000000,3.178936e+05,5.477349e+05,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1858757,932938.375000,46228.00000,0.000000,4.541510e+04,9.783534e+05,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
843474,226637.031250,0.00000,0.000000,7.124325e+06,7.350962e+06,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1287881,42197.351562,149117.00000,106919.648438,0.000000e+00,0.000000e+00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
models = {
    'LightGBM': LGBMClassifier(
    device="gpu",
    gpu_platform_id=0,
    gpu_device_id=0,
    objective = "binary",
    boosting_type = "gbdt",
    n_estimators = 3000,      # fixed, rely on early stopping later
    random_state = 99,
    n_jobs = -1
    ),

    'XGBoost': XGBClassifier(
    enable_categorical = True,
    tree_method="hist",   # <-- histogram algorithm (CPU or GPU)
    device="cuda",        # <-- THIS enables GPU
    objective = 'binary:logistic',
    random_state = 99,
    n_jobs = -1
                                  ),

    'RandomForest': RandomForestClassifier(
        n_estimators = 100,
        random_state = 99,
        n_jobs = 1
    )
}

model_params = {

    'LightGBM': {
              "learning_rate": loguniform(0.01, 0.2),
              "n_estimators": randint(100, 2000),
              "num_leaves": randint(31, 256),
              "reg_alpha": loguniform(1e-5, 10),
              "reg_lambda": loguniform(1e-5, 10),
              # "max_depth": [-1, 5, 10, 20],
              # "min_data_in_leaf": randint(50, 500),
              # "feature_fraction": loguniform(0.6, 1.0),
              # "bagging_fraction": loguniform(0.6, 1.0),
              # "scale_pos_weight": loguniform(96, 1536)

    },

    'XGBoost': {
            "learning_rate": loguniform(0.01, 0.2),
            "max_depth": randint(3, 15),
            "n_estimators": randint(100, 2000),
            # "reg_lambda": loguniform(0.001, 10),       # L2 regularization
            # "subsample": loguniform(0.6, 0.4)
            # "colsample_bytree": loguniform(0.6, 1.0)
    },

    'RandomForest': {
            "n_estimators": randint(200, 1200),
            "max_depth": [None, 5, 10, 20, 30],
            "max_features": ["sqrt", "log2", 0.25, 0.5, 0.8],
            "min_samples_split": randint(2, 50),
            "min_samples_leaf": randint(1, 50)
    }

}


model_randomsearch_specs = {

    'XGBoost': lambda: RandomizedSearchCV(

                                  models[model],
                                  param_distributions = model_params[model],
                                  n_iter = 100,
                                  scoring = "average_precision",
                                  cv = 3,
                                  random_state = 99,
                                  n_jobs = 1,
                                  verbose = 2,
                                  error_score = 'raise',

                                 ).fit(
                                        X_validation_xg_lgbm, y_validation,
                                        categorical_feature=["type", "hour_of_day"],
                                        # error_score = 'raise'
                                 ),

    'LightGBM': lambda: RandomizedSearchCV(

                                  models[model],
                                  param_distributions = model_params[model],
                                  n_iter = 100,
                                  scoring = "average_precision",
                                  cv = 3,
                                  random_state = 99,
                                  n_jobs = 1,
                                  verbose = 2,
                                  error_score = 'raise',

                                 ).fit(
                                        X_validation_xg_lgbm, y_validation,
                                        # categorical_feature=["type", "hour_of_day"],
                                        # error_score = 'raise'
                                 ),

    'RandomForest': lambda: RandomizedSearchCV(

                                  models[model],
                                  param_distributions = model_params[model],
                                  n_iter = 5,
                                  scoring = "average_precision",
                                  cv = 3,
                                  random_state = 99,
                                  n_jobs = 1,
                                  verbose = 2,
                                  error_score = 'raise',

                                 ).fit(
                                        X_validation_rf, y_validation,
                                        # categorical_feature=["type", "hour_of_day"],
                                        # error_score = 'raise'
                                 )
}

In [None]:
def remove_models(model_names = None):
  if isinstance(model_names, str):
    model_names = [model_names]
  for model in model_names:
    try:
        del models[model]
    except KeyError as e:
      pass

In [None]:
remove_models([
              # 'XGBoost' ,
              # 'LightGBM',
              'RandomForest'
              ])

In [None]:
models_tuned = {}

for model in models:

  try:

    print(f'\n\n{'~'*20} Now training: {model.upper()} {'~'*20}\n\n')

    models_tuned[model] = model_randomsearch_specs[model]()

  except Exception as e:
    print(f'An error occurred for {model}:\n{e}')



~~~~~~~~~~~~~~~~~~~~ Now training: LIGHTGBM ~~~~~~~~~~~~~~~~~~~~


Fitting 3 folds for each of 100 candidates, totalling 300 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.034122 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.07492979945210515, n_estimators=1309, num_leaves=199, reg_alpha=0.8973689620835453, reg_lambda=1.544110320669616e-05; total time= 3.9min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.035420 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.07492979945210515, n_estimators=1309, num_leaves=199, reg_alpha=0.8973689620835453, reg_lambda=1.544110320669616e-05; total time= 4.1min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2338, number of negative: 1800404
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.035279 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646471
[LightGBM] [Info] Start training from score -6.646471
[CV] END learning_rate=0.07492979945210515, n_estimators=1309, num_leaves=199, reg_alpha=0.8973689620835453, reg_lambda=1.544110320669616e-05; total time= 4.0min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.034524 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.11253749408678979, n_estimators=1509, num_leaves=182, reg_alpha=0.0006105693399947266, reg_lambda=1.906225711292883e-05; total time= 1.8min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.028601 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.11253749408678979, n_estimators=1509, num_leaves=182, reg_alpha=0.0006105693399947266, reg_lambda=1.906225711292883e-05; total time= 1.8min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2338, number of negative: 1800404
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.031171 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646471
[LightGBM] [Info] Start training from score -6.646471
[CV] END learning_rate=0.11253749408678979, n_estimators=1509, num_leaves=182, reg_alpha=0.0006105693399947266, reg_lambda=1.906225711292883e-05; total time= 1.7min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.026889 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.19446254321780423, n_estimators=1345, num_leaves=90, reg_alpha=0.415679080905549, reg_lambda=0.30241449674836457; total time= 2.7min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.032055 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.19446254321780423, n_estimators=1345, num_leaves=90, reg_alpha=0.415679080905549, reg_lambda=0.30241449674836457; total time= 2.6min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2338, number of negative: 1800404
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.029200 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646471
[LightGBM] [Info] Start training from score -6.646471
[CV] END learning_rate=0.19446254321780423, n_estimators=1345, num_leaves=90, reg_alpha=0.415679080905549, reg_lambda=0.30241449674836457; total time= 2.5min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.052583 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.030978428141936014, n_estimators=752, num_leaves=199, reg_alpha=3.7470574570083373, reg_lambda=0.0023589800392543195; total time= 1.6min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.050850 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042
[CV] END learning_rate=0.030978428141936014, n_estimators=752, num_leaves=199, reg_alpha=3.7470574570083373, reg_lambda=0.0023589800392543195; total time= 1.7min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2338, number of negative: 1800404
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.030955 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646471
[LightGBM] [Info] Start training from score -6.646471
[CV] END learning_rate=0.030978428141936014, n_estimators=752, num_leaves=199, reg_alpha=3.7470574570083373, reg_lambda=0.0023589800392543195; total time= 1.8min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 2339, number of negative: 1800403
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1344
[LightGBM] [Info] Number of data points in the train set: 1802742, number of used features: 9
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (20.63 MB) transferred to GPU in 0.030599 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001297 -> initscore=-6.646042
[LightGBM] [Info] Start training from score -6.646042


In [None]:
y_preds = models_tuned['XGBoost'].predict(X_test)

In [None]:
X_validation.dtypes

Unnamed: 0,0
type,category
amount,float32
oldbalanceOrg,float32
newbalanceOrig,float32
oldbalanceDest,float32
newbalanceDest,float32
hour_of_day,category


In [None]:
keep_session_active()

In [None]:
lgbm_tuned = LGBMClassifier(
    # device="gpu",
    # gpu_platform_id=0,
    # gpu_device_id=0,
    objective = "binary",
    boosting_type = "gbdt",
    random_state = 42,
    n_jobs = -1,
    learning_rate = 0.010987849279294894,
    n_estimators = 1321
)

In [None]:
lgbm_tuned.fit(X_test, y_test)
y_preds = lgbm_tuned.predict(X_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score, average_precision_score

In [None]:
precision = precision_score(y_test, y_preds)
recall = recall_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)
cm = confusion_matrix(y_test, y_preds)

In [None]:
balanced_accuracy_score(y_test, y_preds)

np.float64(0.9177746039395049)

In [None]:
average_precision_score(y_test, y_preds)

np.float64(0.8060864602196495)

In [None]:
results = [
            f'Precision: {precision*100:.2f}%',
            f'Recall: {recall*100:.2f}%',
            f'F1: {f1*100:.2f}%',
            f'Confusion matrix: \n{cm}'
          ]

result_str = '\n'.join(results)

print(result_str)

Precision: 96.44%
Recall: 83.56%
F1: 89.54%
Confusion matrix: 
[[2700534     107]
 [    571    2902]]


In [None]:
import os
from pathlib import Path
import pickle

tuned_model_dir = os.path.join(main_gdrive, exe_folder, '0_model_pickle_dump')
Path(tuned_model_dir).mkdir(parents = True, exist_ok = True)

In [None]:
pickle_file = os.path.join(tuned_model_dir, 'model.pkl')

with open(pickle_file, 'wb') as f:
  pickle.dump(lgbm_tuned, f)

In [None]:
exe_folder