# Baseline Model 

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score

In [3]:
from tqdm import tqdm

from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

from IPython.display import Markdown, display
def md(arg):
    display(Markdown(arg))

# from pandas_profiling import ProfileReport
# #report = ProfileReport(#DataFrame here#, minimal=True)
# #report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import json
# def open_file_json(path,mode='r',var=None):
#     if mode == 'w':
#         with open(path,'w') as f:
#             json.dump(var, f)
#     if mode == 'r':
#         with open(path,'r') as f:
#             return json.load(f)

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])

# import json
# from glob import glob
# from typing import NewType


# DictsPathType = NewType("DictsPath", str)


# def open_file_json(path):
#     with open(path, "r") as f:
#         return json.load(f)

# class LoadDicts:
#     def __init__(self, dict_path: DictsPathType = "./data"):
#         Dicts_glob = glob(f"{dict_path}/*.json")
#         self.List = []
#         self.Dict = {}
#         for path_json in Dicts_glob:
#             name = path_json.split("/")[-1].replace(".json", "")
#             self.List.append(name)
#             self.Dict[name] = open_file_json(path_json)
#             setattr(self, name, self.Dict[name])


In [4]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.25.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 5.11.0-7620-generic
Machine     : x86_64
Processor   : 
CPU cores   : 4
Architecture: 64bit

Git hash: ed3aba7692c8a9756f12bca72c13a990d76a3939

Git repo: https://github.com/ysraell/creditcardfraud.git

Git branch: main

numpy : 1.19.5
pandas: 1.3.1

CPU	: Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz
Mem:           15G
Swap:         4.0G


# Initial search

In [5]:
#
N_fraud_test = 200
N_truth_test = int(2e4)
N_truth_train = int(2e5)

#
split_seeds = [13, 17, 47, 53]

# random_state used by RandomForestClassifier
random_state = 42

# Number of trees in random forest
n_estimators = [200, 400, 800]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Minimum number of samples required to split a node
min_samples_split = [2, 8]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 4]

# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
search_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(search_grid)

{'n_estimators': [200, 400, 800], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 8], 'min_samples_leaf': [1, 4], 'bootstrap': [True]}


In [26]:
target_col = 'Class'
ds_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
glob_paths = glob('/work/data/creditcard*.csv')

total_exps = len(glob_paths)*len(split_seeds)*len(ParameterGrid(search_grid))
print(total_exps)

288


In [7]:

with tqdm(total=total_exps) as progress_bar:

    def RunGrid(df_train, df_test, random_state):
        out = []
        for params in ParameterGrid(search_grid):
            params['random_state'] = random_state
            params['n_jobs'] = 8
            rf = RandomForestClassifier(**params)
            rf.fit(df_train[ds_cols].to_numpy(), df_train[target_col].to_numpy())
            probs = rf.predict_proba(df_test[ds_cols].to_numpy())
            exp = {
                'probs' : probs,
                'rf_classes': rf.classes_,
                'params': params
            }
            out.append(exp)
            progress_bar.update(1)
        return out


    Results = {}
    for ds_path in glob_paths:
        df = pd.read_csv(ds_path)
        df = df[ds_cols+[target_col]]
        df_fraud = df.query('Class == 1').reset_index(drop=True).copy()
        df_truth = df.query('Class == 0').reset_index(drop=True).copy()
        del df
        set_exp = {}
        for seed in split_seeds:
            df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=N_fraud_test, random_state=seed)
            df_truth_train, df_truth_test = train_test_split(df_truth, train_size=N_truth_train, test_size=N_truth_test, random_state=seed)
            df_train = pd.concat([df_fraud_train, df_truth_train]).reset_index(drop=True)
            df_test = pd.concat([df_fraud_test, df_truth_test]).reset_index(drop=True)
            out = RunGrid(df_train, df_test, random_state)
            set_exp[seed] = {
                'target_test': df_test[target_col].to_numpy(),
                'exps': out
            }
        Results[ds_path] = set_exp

100%|██████████| 288/288 [12:17:57<00:00, 153.74s/it]  


In [42]:
cols_results = ['ds_path', 'seed']
cols_param = ['bootstrap', 'max_features', 'min_samples_leaf', 'min_samples_split', 'n_estimators', 'random_state']
cols_metrics = ['Fraud_True_Sum','Truth_False_Sum', 'Fraud_False_Sum', 'F1_M', 'AUC_ROC_M', 'TP_0', 'TP_1']
cols = cols_results+cols_param+cols_metrics

In [9]:
', '.join(cols_metrics)

'Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_W, AUC_ROC_W'

In [10]:
''.join([ f'param[\'{col}\'], ' for col in cols_param])

"param['bootstrap'], param['max_features'], param['min_samples_leaf'], param['min_samples_split'], param['n_estimators'], param['random_state'], "

In [11]:
data = []
for ds_path,sets_exp in Results.items():
    for seed,set_exp in sets_exp.items():
        target_test = set_exp['target_test']
        for exp in set_exp['exps']:
            df_exp = pd.DataFrame(exp['probs'], columns=exp['rf_classes'])
            df_exp['pred'] = df_exp[[0, 1]].apply(lambda x: exp['rf_classes'][np.argmax(x)], axis=1)
            df_exp['target'] = target_test
            Fraud_True_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)][1].sum()/sum(df_exp.target == 1)
            Truth_False_Sum = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 1)][0].sum()/sum(df_exp.target == 1)
            Fraud_False_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 0)][1].sum()/sum(df_exp.target == 0)
            F1_M = f1_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            AUC_ROC_M = roc_auc_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            TP_0 = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 0)].shape[0]/sum(df_exp.target == 0)
            TP_1 = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)].shape[0]/sum(df_exp.target == 1)
            param = exp['params']
            data.append([
                ds_path, seed,
                param['bootstrap'], param['max_features'], param['min_samples_leaf'],
                param['min_samples_split'], param['n_estimators'], param['random_state'],
                Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_M, AUC_ROC_M, TP_0, TP_1
            ])

In [12]:
df_Results = pd.DataFrame(data, columns=cols)
#df_Results.to_csv('/work/data/Results_creditcard_Init.csv', index=False)

In [7]:
df_Results = pd.read_csv('/work/data/Results_creditcard_Init.csv')

In [11]:
map_ds_path = {
    '/work/data/creditcard_trans_float.csv': 'Float',
    '/work/data/creditcard.csv': 'Original',
    '/work/data/creditcard_trans_int.csv': 'Integer'
}

In [13]:
df_Results['DS'] = df_Results.ds_path.apply(lambda x: map_ds_path[x])

In [19]:
for metric in cols_metrics:
    md(f'# {metric}')
    display(df_Results.sort_values(metric, ascending=False).head(20)[['DS', 'seed']+cols_param[:-1]+cols_metrics])
    

# Fraud_True_Sum

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
228,Integer,17,True,sqrt,1,2,200,0.70035,0.196325,0.000109,0.997542,0.889925
216,Integer,17,True,auto,1,2,200,0.70035,0.196325,0.000109,0.997542,0.889925
217,Integer,17,True,auto,1,2,400,0.6991,0.19735,0.00011,0.997542,0.889925
229,Integer,17,True,sqrt,1,2,400,0.6991,0.19735,0.00011,0.997542,0.889925
37,Float,17,True,sqrt,1,2,400,0.699037,0.197313,0.00011,0.997542,0.889925
25,Float,17,True,auto,1,2,400,0.699037,0.197313,0.00011,0.997542,0.889925
121,Original,17,True,auto,1,2,400,0.6989,0.1974,0.000109,0.997542,0.889925
133,Original,17,True,sqrt,1,2,400,0.6989,0.1974,0.000109,0.997542,0.889925
230,Integer,17,True,sqrt,1,2,800,0.698,0.197581,0.00011,0.997542,0.889925
218,Integer,17,True,auto,1,2,800,0.698,0.197581,0.00011,0.997542,0.889925


# Truth_False_Sum

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
261,Integer,47,True,sqrt,4,8,200,0.629357,0.226146,0.000161,0.997106,0.874875
249,Integer,47,True,auto,4,8,200,0.629357,0.226146,0.000161,0.997106,0.874875
258,Integer,47,True,sqrt,4,2,200,0.629357,0.226146,0.000161,0.997106,0.874875
150,Original,47,True,auto,4,2,200,0.629357,0.226146,0.000161,0.997106,0.874875
165,Original,47,True,sqrt,4,8,200,0.629357,0.226146,0.000161,0.997106,0.874875
153,Original,47,True,auto,4,8,200,0.629357,0.226146,0.000161,0.997106,0.874875
162,Original,47,True,sqrt,4,2,200,0.629357,0.226146,0.000161,0.997106,0.874875
246,Integer,47,True,auto,4,2,200,0.629357,0.226146,0.000161,0.997106,0.874875
57,Float,47,True,auto,4,8,200,0.629586,0.226125,0.000161,0.997106,0.874875
66,Float,47,True,sqrt,4,2,200,0.629586,0.226125,0.000161,0.997106,0.874875


# Fraud_False_Sum

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
64,Float,47,True,sqrt,1,8,400,0.642623,0.223501,0.000167,0.997106,0.874875
52,Float,47,True,auto,1,8,400,0.642623,0.223501,0.000167,0.997106,0.874875
244,Integer,47,True,auto,1,8,400,0.642436,0.223515,0.000167,0.997106,0.874875
256,Integer,47,True,sqrt,1,8,400,0.642436,0.223515,0.000167,0.997106,0.874875
160,Original,47,True,sqrt,1,8,400,0.642436,0.223515,0.000167,0.997106,0.874875
148,Original,47,True,auto,1,8,400,0.642436,0.223515,0.000167,0.997106,0.874875
65,Float,47,True,sqrt,1,8,800,0.643943,0.222702,0.000164,0.997106,0.874875
53,Float,47,True,auto,1,8,800,0.643943,0.222702,0.000164,0.997106,0.874875
245,Integer,47,True,auto,1,8,800,0.643792,0.22274,0.000164,0.997106,0.874875
161,Original,47,True,sqrt,1,8,800,0.643792,0.22274,0.000164,0.997106,0.874875


# F1_W

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
266,Integer,53,True,auto,1,2,800,0.670319,0.202844,0.0,0.997688,0.89
278,Integer,53,True,sqrt,1,2,800,0.670319,0.202844,0.0,0.997688,0.89
86,Float,53,True,sqrt,1,2,800,0.670625,0.202756,0.0,0.997688,0.89
74,Float,53,True,auto,1,2,800,0.670625,0.202756,0.0,0.997688,0.89
170,Original,53,True,auto,1,2,800,0.667919,0.205275,0.0,0.997632,0.8875
182,Original,53,True,sqrt,1,2,800,0.667919,0.205275,0.0,0.997632,0.8875
87,Float,53,True,sqrt,1,8,200,0.659752,0.205351,2.6e-05,0.997583,0.887475
279,Integer,53,True,sqrt,1,8,200,0.659652,0.205451,2.6e-05,0.997583,0.887475
171,Original,53,True,auto,1,8,200,0.659652,0.205451,2.6e-05,0.997583,0.887475
267,Integer,53,True,auto,1,8,200,0.659652,0.205451,2.6e-05,0.997583,0.887475


# AUC_ROC_W

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
266,Integer,53,True,auto,1,2,800,0.670319,0.202844,0.0,0.997688,0.89
74,Float,53,True,auto,1,2,800,0.670625,0.202756,0.0,0.997688,0.89
278,Integer,53,True,sqrt,1,2,800,0.670319,0.202844,0.0,0.997688,0.89
86,Float,53,True,sqrt,1,2,800,0.670625,0.202756,0.0,0.997688,0.89
229,Integer,17,True,sqrt,1,2,400,0.6991,0.19735,0.00011,0.997542,0.889925
228,Integer,17,True,sqrt,1,2,200,0.70035,0.196325,0.000109,0.997542,0.889925
217,Integer,17,True,auto,1,2,400,0.6991,0.19735,0.00011,0.997542,0.889925
218,Integer,17,True,auto,1,2,800,0.698,0.197581,0.00011,0.997542,0.889925
220,Integer,17,True,auto,1,8,400,0.692122,0.197221,0.000111,0.997542,0.889925
26,Float,17,True,auto,1,2,800,0.697856,0.197581,0.000111,0.997542,0.889925


In [39]:
for col in cols_param[:-1]:
    md(f'# {col}')
    display(df_Results[['DS', col]+cols_metrics].groupby(['DS', col]).mean())

# bootstrap

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
DS,bootstrap,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Float,True,0.660165,0.210346,8.5e-05,0.997385,0.882122
Integer,True,0.660063,0.210382,8.4e-05,0.997386,0.882123
Original,True,0.660021,0.210432,8.4e-05,0.997385,0.882071


# max_features

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
DS,max_features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Float,auto,0.660165,0.210346,8.5e-05,0.997385,0.882122
Float,sqrt,0.660165,0.210346,8.5e-05,0.997385,0.882122
Integer,auto,0.660063,0.210382,8.4e-05,0.997386,0.882123
Integer,sqrt,0.660063,0.210382,8.4e-05,0.997386,0.882123
Original,auto,0.660021,0.210432,8.4e-05,0.997385,0.882071
Original,sqrt,0.660021,0.210432,8.4e-05,0.997385,0.882071


# min_samples_leaf

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
DS,min_samples_leaf,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Float,1,0.669899,0.208542,7.2e-05,0.99743,0.883072
Float,4,0.650432,0.212151,9.7e-05,0.997341,0.881173
Integer,1,0.669831,0.208599,7.1e-05,0.997432,0.883073
Integer,4,0.650294,0.212165,9.7e-05,0.997341,0.881173
Original,1,0.669721,0.2087,7.1e-05,0.997429,0.882969
Original,4,0.65032,0.212164,9.7e-05,0.997341,0.881173


# min_samples_split

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
DS,min_samples_split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Float,2,0.662465,0.210047,8.1e-05,0.997395,0.88223
Float,8,0.657866,0.210646,8.9e-05,0.997376,0.882015
Integer,2,0.662338,0.210081,8e-05,0.997397,0.882231
Integer,8,0.657788,0.210683,8.9e-05,0.997376,0.882015
Original,2,0.662248,0.210184,8e-05,0.997394,0.882127
Original,8,0.657793,0.21068,8.9e-05,0.997376,0.882015


# n_estimators

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
DS,n_estimators,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Float,200,0.659563,0.210755,8.5e-05,0.997377,0.881809
Float,400,0.659802,0.210877,8.2e-05,0.99738,0.881656
Float,800,0.661131,0.209407,8.6e-05,0.997399,0.882902
Integer,200,0.659625,0.210638,8.4e-05,0.997384,0.881967
Integer,400,0.659679,0.210908,8.2e-05,0.99738,0.881656
Integer,800,0.660883,0.209599,8.6e-05,0.997395,0.882745
Original,200,0.659493,0.210792,8.4e-05,0.99738,0.881811
Original,400,0.659689,0.21091,8.2e-05,0.99738,0.881656
Original,800,0.66088,0.209594,8.6e-05,0.997395,0.882745


# Baseline model.

In [40]:
#
N_fraud_test = 200
N_truth_test = int(2e4)
N_truth_train = int(2e5)

#
split_seeds = [13, 17, 19, 41]

# random_state used by RandomForestClassifier
random_state = 42

# Number of trees in random forest
n_estimators = [800, 1000, 1200, 1400]

# Number of features to consider at every split
max_features = ['auto']

# Minimum number of samples required to split a node
min_samples_split = [2]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1]

# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
search_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(search_grid)

{'n_estimators': [800, 1000, 1200, 1400], 'max_features': ['auto'], 'min_samples_split': [2], 'min_samples_leaf': [1], 'bootstrap': [True]}


In [41]:
glob_paths = ['/work/data/creditcard_trans_int.csv']
total_exps = len(glob_paths)*len(split_seeds)*len(ParameterGrid(search_grid))
print(total_exps)

16


In [43]:

with tqdm(total=total_exps) as progress_bar:

    def RunGrid(df_train, df_test, random_state):
        out = []
        for params in ParameterGrid(search_grid):
            params['random_state'] = random_state
            params['n_jobs'] = 8
            rf = RandomForestClassifier(**params)
            rf.fit(df_train[ds_cols].to_numpy(), df_train[target_col].to_numpy())
            probs = rf.predict_proba(df_test[ds_cols].to_numpy())
            exp = {
                'probs' : probs,
                'rf_classes': rf.classes_,
                'params': params
            }
            out.append(exp)
            progress_bar.update(1)
        return out


    Results = {}
    for ds_path in glob_paths:
        df = pd.read_csv(ds_path)
        df = df[ds_cols+[target_col]]
        df_fraud = df.query('Class == 1').reset_index(drop=True).copy()
        df_truth = df.query('Class == 0').reset_index(drop=True).copy()
        del df
        set_exp = {}
        for seed in split_seeds:
            df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=N_fraud_test, random_state=seed)
            df_truth_train, df_truth_test = train_test_split(df_truth, train_size=N_truth_train, test_size=N_truth_test, random_state=seed)
            df_train = pd.concat([df_fraud_train, df_truth_train]).reset_index(drop=True)
            df_test = pd.concat([df_fraud_test, df_truth_test]).reset_index(drop=True)
            out = RunGrid(df_train, df_test, random_state)
            set_exp[seed] = {
                'target_test': df_test[target_col].to_numpy(),
                'exps': out
            }
        Results[ds_path] = set_exp

100%|██████████| 16/16 [2:59:44<00:00, 674.03s/it]  


In [44]:
data = []
for ds_path,sets_exp in Results.items():
    for seed,set_exp in sets_exp.items():
        target_test = set_exp['target_test']
        for exp in set_exp['exps']:
            df_exp = pd.DataFrame(exp['probs'], columns=exp['rf_classes'])
            df_exp['pred'] = df_exp[[0, 1]].apply(lambda x: exp['rf_classes'][np.argmax(x)], axis=1)
            df_exp['target'] = target_test
            Fraud_True_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)][1].sum()/sum(df_exp.target == 1)
            Truth_False_Sum = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 1)][0].sum()/sum(df_exp.target == 1)
            Fraud_False_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 0)][1].sum()/sum(df_exp.target == 0)
            F1_M = f1_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            AUC_ROC_M = roc_auc_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            TP_0 = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 0)].shape[0]/sum(df_exp.target == 0)
            TP_1 = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)].shape[0]/sum(df_exp.target == 1)
            param = exp['params']
            data.append([
                ds_path, seed,
                param['bootstrap'], param['max_features'], param['min_samples_leaf'],
                param['min_samples_split'], param['n_estimators'], param['random_state'],
                Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_M, AUC_ROC_M, TP_0, TP_1
            ])
            
df_Results = pd.DataFrame(data, columns=cols)
df_Results.to_csv('/work/data/Results_creditcard_Baseline.csv', index=False)

In [45]:
df_Results

Unnamed: 0,ds_path,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,random_state,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
0,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,800,42,0.679956,0.206375,2.6e-05,0.997471,0.882475
1,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,1000,42,0.68016,0.206045,2.6e-05,0.997471,0.882475
2,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,1200,42,0.680117,0.206283,2.6e-05,0.997471,0.882475
3,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,1400,42,0.680086,0.206232,2.7e-05,0.997471,0.882475
4,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,800,42,0.698,0.197581,0.00011,0.997542,0.889925
5,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,1000,42,0.69245,0.202665,0.000111,0.997429,0.884925
6,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,1200,42,0.695008,0.200158,0.000111,0.997486,0.887425
7,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,1400,42,0.695154,0.20005,0.00011,0.997486,0.887425
8,/work/data/creditcard_trans_int.csv,19,True,auto,1,2,800,42,0.684669,0.195112,4.5e-05,0.997639,0.889975
9,/work/data/creditcard_trans_int.csv,19,True,auto,1,2,1000,42,0.68511,0.19511,4.5e-05,0.997639,0.889975


In [47]:
for metric in cols_metrics:
    md(f'# {metric}')
    display(df_Results.sort_values(metric, ascending=False).head(20)[cols_param[:-1]+cols_metrics])
    

# Fraud_True_Sum

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
4,True,auto,1,2,800,0.698,0.197581,0.00011,0.997542,0.889925
7,True,auto,1,2,1400,0.695154,0.20005,0.00011,0.997486,0.887425
6,True,auto,1,2,1200,0.695008,0.200158,0.000111,0.997486,0.887425
5,True,auto,1,2,1000,0.69245,0.202665,0.000111,0.997429,0.884925
9,True,auto,1,2,1000,0.68511,0.19511,4.5e-05,0.997639,0.889975
10,True,auto,1,2,1200,0.684854,0.195067,4.5e-05,0.997639,0.889975
11,True,auto,1,2,1400,0.6848,0.195125,4.5e-05,0.997639,0.889975
8,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.997639,0.889975
13,True,auto,1,2,1000,0.68086,0.206985,0.000135,0.997381,0.8849
12,True,auto,1,2,800,0.680594,0.207244,0.000136,0.997381,0.8849


# Truth_False_Sum

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
14,True,auto,1,2,1200,0.678117,0.209517,0.000109,0.997373,0.882425
15,True,auto,1,2,1400,0.678693,0.209443,0.00011,0.997373,0.882425
12,True,auto,1,2,800,0.680594,0.207244,0.000136,0.997381,0.8849
13,True,auto,1,2,1000,0.68086,0.206985,0.000135,0.997381,0.8849
0,True,auto,1,2,800,0.679956,0.206375,2.6e-05,0.997471,0.882475
2,True,auto,1,2,1200,0.680117,0.206283,2.6e-05,0.997471,0.882475
3,True,auto,1,2,1400,0.680086,0.206232,2.7e-05,0.997471,0.882475
1,True,auto,1,2,1000,0.68016,0.206045,2.6e-05,0.997471,0.882475
5,True,auto,1,2,1000,0.69245,0.202665,0.000111,0.997429,0.884925
6,True,auto,1,2,1200,0.695008,0.200158,0.000111,0.997486,0.887425


# Fraud_False_Sum

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
12,True,auto,1,2,800,0.680594,0.207244,0.000136,0.997381,0.8849
13,True,auto,1,2,1000,0.68086,0.206985,0.000135,0.997381,0.8849
5,True,auto,1,2,1000,0.69245,0.202665,0.000111,0.997429,0.884925
6,True,auto,1,2,1200,0.695008,0.200158,0.000111,0.997486,0.887425
4,True,auto,1,2,800,0.698,0.197581,0.00011,0.997542,0.889925
7,True,auto,1,2,1400,0.695154,0.20005,0.00011,0.997486,0.887425
15,True,auto,1,2,1400,0.678693,0.209443,0.00011,0.997373,0.882425
14,True,auto,1,2,1200,0.678117,0.209517,0.000109,0.997373,0.882425
9,True,auto,1,2,1000,0.68511,0.19511,4.5e-05,0.997639,0.889975
10,True,auto,1,2,1200,0.684854,0.195067,4.5e-05,0.997639,0.889975


# F1_W

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
8,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.997639,0.889975
9,True,auto,1,2,1000,0.68511,0.19511,4.5e-05,0.997639,0.889975
10,True,auto,1,2,1200,0.684854,0.195067,4.5e-05,0.997639,0.889975
11,True,auto,1,2,1400,0.6848,0.195125,4.5e-05,0.997639,0.889975
4,True,auto,1,2,800,0.698,0.197581,0.00011,0.997542,0.889925
6,True,auto,1,2,1200,0.695008,0.200158,0.000111,0.997486,0.887425
7,True,auto,1,2,1400,0.695154,0.20005,0.00011,0.997486,0.887425
0,True,auto,1,2,800,0.679956,0.206375,2.6e-05,0.997471,0.882475
1,True,auto,1,2,1000,0.68016,0.206045,2.6e-05,0.997471,0.882475
2,True,auto,1,2,1200,0.680117,0.206283,2.6e-05,0.997471,0.882475


# AUC_ROC_W

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
8,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.997639,0.889975
9,True,auto,1,2,1000,0.68511,0.19511,4.5e-05,0.997639,0.889975
10,True,auto,1,2,1200,0.684854,0.195067,4.5e-05,0.997639,0.889975
11,True,auto,1,2,1400,0.6848,0.195125,4.5e-05,0.997639,0.889975
4,True,auto,1,2,800,0.698,0.197581,0.00011,0.997542,0.889925
6,True,auto,1,2,1200,0.695008,0.200158,0.000111,0.997486,0.887425
7,True,auto,1,2,1400,0.695154,0.20005,0.00011,0.997486,0.887425
5,True,auto,1,2,1000,0.69245,0.202665,0.000111,0.997429,0.884925
12,True,auto,1,2,800,0.680594,0.207244,0.000136,0.997381,0.8849
13,True,auto,1,2,1000,0.68086,0.206985,0.000135,0.997381,0.8849
