# Baseline Model 

- Initial parameters search.

- Search parameter for baseline model.

#### Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [5]:
%load_ext watermark

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score

In [7]:
from tqdm import tqdm

from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

from IPython.display import Markdown, display
def md(arg):
    display(Markdown(arg))

# from pandas_profiling import ProfileReport
# #report = ProfileReport(#DataFrame here#, minimal=True)
# #report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import json
# def open_file_json(path,mode='r',var=None):
#     if mode == 'w':
#         with open(path,'w') as f:
#             json.dump(var, f)
#     if mode == 'r':
#         with open(path,'r') as f:
#             return json.load(f)

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])

# import json
# from glob import glob
# from typing import NewType


# DictsPathType = NewType("DictsPath", str)


# def open_file_json(path):
#     with open(path, "r") as f:
#         return json.load(f)

# class LoadDicts:
#     def __init__(self, dict_path: DictsPathType = "./data"):
#         Dicts_glob = glob(f"{dict_path}/*.json")
#         self.List = []
#         self.Dict = {}
#         for path_json in Dicts_glob:
#             name = path_json.split("/")[-1].replace(".json", "")
#             self.List.append(name)
#             self.Dict[name] = open_file_json(path_json)
#             setattr(self, name, self.Dict[name])


In [8]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.26.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 5.11.0-7620-generic
Machine     : x86_64
Processor   : 
CPU cores   : 8
Architecture: 64bit

Git hash: 38749d73a7d8f2b4d7906687d79829b7ff7b69d3

Git repo: https://github.com/ysraell/creditcardfraud.git

Git branch: main

pandas: 1.3.1
numpy : 1.19.5

CPU	: Intel(R) Xeon(R) CPU E3-1241 v3 @ 3.50GHz
Mem:           31G
Swap:         4.0G


# Initial search

In [9]:
#
n_jobs = 4

#
N_fraud_test = 200
N_truth_test = int(2e4)
N_truth_train = int(2e5)

#
split_seeds = [13, 17, 47, 53]

# random_state used by RandomForestClassifier
random_state = 42

# Number of trees in random forest
n_estimators = [200, 400, 800]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Minimum number of samples required to split a node
min_samples_split = [2, 8]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 4]

# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
search_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(search_grid)

{'n_estimators': [200, 400, 800], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 8], 'min_samples_leaf': [1, 4], 'bootstrap': [True]}


In [10]:
target_col = 'Class'
ds_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
glob_paths = glob('/work/data/creditcard*.csv')

total_exps = len(glob_paths)*len(split_seeds)*len(ParameterGrid(search_grid))
print(total_exps)

288


In [7]:

with tqdm(total=total_exps) as progress_bar:

    def RunGrid(df_train, df_test, random_state):
        out = []
        for params in ParameterGrid(search_grid):
            params['random_state'] = random_state
            params['n_jobs'] = n_jobs
            rf = RandomForestClassifier(**params)
            rf.fit(df_train[ds_cols].to_numpy(), df_train[target_col].to_numpy())
            probs = rf.predict_proba(df_test[ds_cols].to_numpy())
            exp = {
                'probs' : probs,
                'rf_classes': rf.classes_,
                'params': params
            }
            out.append(exp)
            progress_bar.update(1)
        return out


    Results = {}
    for ds_path in glob_paths:
        df = pd.read_csv(ds_path)
        df = df[ds_cols+[target_col]]
        df_fraud = df.query('Class == 1').reset_index(drop=True).copy()
        df_truth = df.query('Class == 0').reset_index(drop=True).copy()
        del df
        set_exp = {}
        for seed in split_seeds:
            df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=N_fraud_test, random_state=seed)
            df_truth_train, df_truth_test = train_test_split(df_truth, train_size=N_truth_train, test_size=N_truth_test, random_state=seed)
            df_train = pd.concat([df_fraud_train, df_truth_train]).reset_index(drop=True)
            df_test = pd.concat([df_fraud_test, df_truth_test]).reset_index(drop=True)
            out = RunGrid(df_train, df_test, random_state)
            set_exp[seed] = {
                'target_test': df_test[target_col].to_numpy(),
                'exps': out
            }
        Results[ds_path] = set_exp

100%|██████████| 288/288 [15:42:06<00:00, 196.27s/it]  


In [17]:
cols_results = ['ds_path', 'seed']
cols_param = ['bootstrap', 'max_features', 'min_samples_leaf', 'min_samples_split', 'n_estimators', 'random_state']
cols_metrics = ['Fraud_True_Sum','Truth_False_Sum', 'Fraud_False_Sum', 'F1_M', 'AUC_ROC_M', 'TP_0', 'TP_1']
cols = cols_results+cols_param+cols_metrics

In [9]:
', '.join(cols_metrics)

'Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_M, AUC_ROC_M, TP_0, TP_1'

In [10]:
''.join([ f'param[\'{col}\'], ' for col in cols_param])

"param['bootstrap'], param['max_features'], param['min_samples_leaf'], param['min_samples_split'], param['n_estimators'], param['random_state'], "

In [11]:
data = []
for ds_path,sets_exp in Results.items():
    for seed,set_exp in sets_exp.items():
        target_test = set_exp['target_test']
        for exp in set_exp['exps']:
            df_exp = pd.DataFrame(exp['probs'], columns=exp['rf_classes'])
            df_exp['pred'] = df_exp[[0, 1]].apply(lambda x: exp['rf_classes'][np.argmax(x)], axis=1)
            df_exp['target'] = target_test
            Fraud_True_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)][1].sum()/sum(df_exp.target == 1)
            Truth_False_Sum = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 1)][0].sum()/sum(df_exp.target == 1)
            Fraud_False_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 0)][1].sum()/sum(df_exp.target == 0)
            F1_M = f1_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            AUC_ROC_M = roc_auc_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            TP_0 = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 0)].shape[0]/sum(df_exp.target == 0)
            TP_1 = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)].shape[0]/sum(df_exp.target == 1)
            param = exp['params']
            data.append([
                ds_path, seed,
                param['bootstrap'], param['max_features'], param['min_samples_leaf'],
                param['min_samples_split'], param['n_estimators'], param['random_state'],
                Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_M, AUC_ROC_M, TP_0, TP_1
            ])

In [12]:
df_Results = pd.DataFrame(data, columns=cols)
#df_Results.to_csv('/work/data/Results_creditcard_Init.csv', index=False)

In [13]:
df_Results.to_csv('/work/data/Results_creditcard_Init.csv', index=False)

In [14]:
df_Results = pd.read_csv('/work/data/Results_creditcard_Init.csv')

In [11]:
map_ds_path = {
    '/work/data/creditcard_trans_float.csv': 'Float',
    '/work/data/creditcard.csv': 'Original',
    '/work/data/creditcard_trans_int.csv': 'Integer'
}

In [17]:
df_Results['DS'] = df_Results.ds_path.apply(lambda x: map_ds_path[x])

In [18]:
for metric in cols_metrics:
    md(f'# {metric}')
    display(df_Results.sort_values(metric, ascending=False).head(20)[['DS', 'seed']+cols_param[:-1]+cols_metrics])
    

# Fraud_True_Sum

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
228,Integer,17,True,sqrt,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
216,Integer,17,True,auto,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
217,Integer,17,True,auto,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
229,Integer,17,True,sqrt,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
37,Float,17,True,sqrt,1,2,400,0.699037,0.197313,0.00011,0.933953,0.889925,0.99985,0.78
25,Float,17,True,auto,1,2,400,0.699037,0.197313,0.00011,0.933953,0.889925,0.99985,0.78
121,Original,17,True,auto,1,2,400,0.6989,0.1974,0.000109,0.933953,0.889925,0.99985,0.78
133,Original,17,True,sqrt,1,2,400,0.6989,0.1974,0.000109,0.933953,0.889925,0.99985,0.78
230,Integer,17,True,sqrt,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
218,Integer,17,True,auto,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78


# Truth_False_Sum

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
261,Integer,47,True,sqrt,4,8,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
249,Integer,47,True,auto,4,8,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
258,Integer,47,True,sqrt,4,2,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
150,Original,47,True,auto,4,2,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
165,Original,47,True,sqrt,4,8,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
153,Original,47,True,auto,4,8,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
162,Original,47,True,sqrt,4,2,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
246,Integer,47,True,auto,4,2,200,0.629357,0.226146,0.000161,0.921848,0.874875,0.99975,0.75
57,Float,47,True,auto,4,8,200,0.629586,0.226125,0.000161,0.921848,0.874875,0.99975,0.75
66,Float,47,True,sqrt,4,2,200,0.629586,0.226125,0.000161,0.921848,0.874875,0.99975,0.75


# Fraud_False_Sum

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
64,Float,47,True,sqrt,1,8,400,0.642623,0.223501,0.000167,0.921848,0.874875,0.99975,0.75
52,Float,47,True,auto,1,8,400,0.642623,0.223501,0.000167,0.921848,0.874875,0.99975,0.75
244,Integer,47,True,auto,1,8,400,0.642436,0.223515,0.000167,0.921848,0.874875,0.99975,0.75
256,Integer,47,True,sqrt,1,8,400,0.642436,0.223515,0.000167,0.921848,0.874875,0.99975,0.75
160,Original,47,True,sqrt,1,8,400,0.642436,0.223515,0.000167,0.921848,0.874875,0.99975,0.75
148,Original,47,True,auto,1,8,400,0.642436,0.223515,0.000167,0.921848,0.874875,0.99975,0.75
65,Float,47,True,sqrt,1,8,800,0.643943,0.222702,0.000164,0.921848,0.874875,0.99975,0.75
53,Float,47,True,auto,1,8,800,0.643943,0.222702,0.000164,0.921848,0.874875,0.99975,0.75
245,Integer,47,True,auto,1,8,800,0.643792,0.22274,0.000164,0.921848,0.874875,0.99975,0.75
161,Original,47,True,sqrt,1,8,800,0.643792,0.22274,0.000164,0.921848,0.874875,0.99975,0.75


# F1_M

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
266,Integer,53,True,auto,1,2,800,0.670319,0.202844,0.0,0.937653,0.89,1.0,0.78
278,Integer,53,True,sqrt,1,2,800,0.670319,0.202844,0.0,0.937653,0.89,1.0,0.78
86,Float,53,True,sqrt,1,2,800,0.670625,0.202756,0.0,0.937653,0.89,1.0,0.78
74,Float,53,True,auto,1,2,800,0.670625,0.202756,0.0,0.937653,0.89,1.0,0.78
170,Original,53,True,auto,1,2,800,0.667919,0.205275,0.0,0.936058,0.8875,1.0,0.775
182,Original,53,True,sqrt,1,2,800,0.667919,0.205275,0.0,0.936058,0.8875,1.0,0.775
87,Float,53,True,sqrt,1,8,200,0.659752,0.205351,2.6e-05,0.934819,0.887475,0.99995,0.775
279,Integer,53,True,sqrt,1,8,200,0.659652,0.205451,2.6e-05,0.934819,0.887475,0.99995,0.775
171,Original,53,True,auto,1,8,200,0.659652,0.205451,2.6e-05,0.934819,0.887475,0.99995,0.775
267,Integer,53,True,auto,1,8,200,0.659652,0.205451,2.6e-05,0.934819,0.887475,0.99995,0.775


# AUC_ROC_M

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
266,Integer,53,True,auto,1,2,800,0.670319,0.202844,0.0,0.937653,0.89,1.0,0.78
74,Float,53,True,auto,1,2,800,0.670625,0.202756,0.0,0.937653,0.89,1.0,0.78
278,Integer,53,True,sqrt,1,2,800,0.670319,0.202844,0.0,0.937653,0.89,1.0,0.78
86,Float,53,True,sqrt,1,2,800,0.670625,0.202756,0.0,0.937653,0.89,1.0,0.78
229,Integer,17,True,sqrt,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
228,Integer,17,True,sqrt,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
217,Integer,17,True,auto,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
218,Integer,17,True,auto,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
220,Integer,17,True,auto,1,8,400,0.692122,0.197221,0.000111,0.933953,0.889925,0.99985,0.78
26,Float,17,True,auto,1,2,800,0.697856,0.197581,0.000111,0.933953,0.889925,0.99985,0.78


# TP_0

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
198,Integer,13,True,auto,4,2,200,0.652641,0.210827,0.0,0.931219,0.88,1.0,0.76
117,Original,13,True,sqrt,4,8,200,0.652663,0.210827,0.0,0.931219,0.88,1.0,0.76
21,Float,13,True,sqrt,4,8,200,0.65271,0.21084,0.0,0.931219,0.88,1.0,0.76
86,Float,53,True,sqrt,1,2,800,0.670625,0.202756,0.0,0.937653,0.89,1.0,0.78
213,Integer,13,True,sqrt,4,8,200,0.652641,0.210827,0.0,0.931219,0.88,1.0,0.76
266,Integer,53,True,auto,1,2,800,0.670319,0.202844,0.0,0.937653,0.89,1.0,0.78
102,Original,13,True,auto,4,2,200,0.652663,0.210827,0.0,0.931219,0.88,1.0,0.76
88,Float,53,True,sqrt,1,8,400,0.658557,0.208051,0.0,0.934454,0.885,1.0,0.77
268,Integer,53,True,auto,1,8,400,0.658433,0.208141,0.0,0.934454,0.885,1.0,0.77
18,Float,13,True,sqrt,4,2,200,0.65271,0.21084,0.0,0.931219,0.88,1.0,0.76


# TP_1

Unnamed: 0,DS,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
25,Float,17,True,auto,1,2,400,0.699037,0.197313,0.00011,0.933953,0.889925,0.99985,0.78
26,Float,17,True,auto,1,2,800,0.697856,0.197581,0.000111,0.933953,0.889925,0.99985,0.78
122,Original,17,True,auto,1,2,800,0.697844,0.19765,0.000111,0.933953,0.889925,0.99985,0.78
123,Original,17,True,auto,1,8,200,0.690953,0.197247,0.000136,0.932734,0.8899,0.9998,0.78
39,Float,17,True,sqrt,1,8,200,0.690904,0.197197,0.000136,0.932734,0.8899,0.9998,0.78
124,Original,17,True,auto,1,8,400,0.692067,0.197211,0.000111,0.933953,0.889925,0.99985,0.78
216,Integer,17,True,auto,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
38,Float,17,True,sqrt,1,2,800,0.697856,0.197581,0.000111,0.933953,0.889925,0.99985,0.78
27,Float,17,True,auto,1,8,200,0.690904,0.197197,0.000136,0.932734,0.8899,0.9998,0.78
266,Integer,53,True,auto,1,2,800,0.670319,0.202844,0.0,0.937653,0.89,1.0,0.78


In [19]:
for col in cols_param[:-1]:
    md(f'# {col}')
    display(df_Results[['DS', col]+cols_metrics].groupby(['DS', col]).mean())

# bootstrap

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
DS,bootstrap,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Float,True,0.660165,0.210346,8.5e-05,0.929424,0.882122,0.99987,0.764375
Integer,True,0.660063,0.210382,8.4e-05,0.929449,0.882123,0.999871,0.764375
Original,True,0.660021,0.210432,8.4e-05,0.929416,0.882071,0.999871,0.764271


# max_features

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
DS,max_features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Float,auto,0.660165,0.210346,8.5e-05,0.929424,0.882122,0.99987,0.764375
Float,sqrt,0.660165,0.210346,8.5e-05,0.929424,0.882122,0.99987,0.764375
Integer,auto,0.660063,0.210382,8.4e-05,0.929449,0.882123,0.999871,0.764375
Integer,sqrt,0.660063,0.210382,8.4e-05,0.929449,0.882123,0.999871,0.764375
Original,auto,0.660021,0.210432,8.4e-05,0.929416,0.882071,0.999871,0.764271
Original,sqrt,0.660021,0.210432,8.4e-05,0.929416,0.882071,0.999871,0.764271


# min_samples_leaf

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
DS,min_samples_leaf,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Float,1,0.669899,0.208542,7.2e-05,0.930613,0.883072,0.999894,0.76625
Float,4,0.650432,0.212151,9.7e-05,0.928235,0.881173,0.999846,0.7625
Integer,1,0.669831,0.208599,7.1e-05,0.930662,0.883073,0.999896,0.76625
Integer,4,0.650294,0.212165,9.7e-05,0.928235,0.881173,0.999846,0.7625
Original,1,0.669721,0.2087,7.1e-05,0.930598,0.882969,0.999896,0.766042
Original,4,0.65032,0.212164,9.7e-05,0.928235,0.881173,0.999846,0.7625


# min_samples_split

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
DS,min_samples_split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Float,2,0.662465,0.210047,8.1e-05,0.929667,0.88223,0.999877,0.764583
Float,8,0.657866,0.210646,8.9e-05,0.92918,0.882015,0.999862,0.764167
Integer,2,0.662338,0.210081,8e-05,0.929717,0.882231,0.999879,0.764583
Integer,8,0.657788,0.210683,8.9e-05,0.92918,0.882015,0.999862,0.764167
Original,2,0.662248,0.210184,8e-05,0.929652,0.882127,0.999879,0.764375
Original,8,0.657793,0.21068,8.9e-05,0.92918,0.882015,0.999862,0.764167


# n_estimators

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
DS,n_estimators,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Float,200,0.659563,0.210755,8.5e-05,0.929198,0.881809,0.999869,0.76375
Float,400,0.659802,0.210877,8.2e-05,0.929246,0.881656,0.999875,0.763437
Float,800,0.661131,0.209407,8.6e-05,0.929828,0.882902,0.999866,0.765938
Integer,200,0.659625,0.210638,8.4e-05,0.929374,0.881967,0.999872,0.764062
Integer,400,0.659679,0.210908,8.2e-05,0.929246,0.881656,0.999875,0.763437
Integer,800,0.660883,0.209599,8.6e-05,0.929726,0.882745,0.999866,0.765625
Original,200,0.659493,0.210792,8.4e-05,0.929275,0.881811,0.999872,0.76375
Original,400,0.659689,0.21091,8.2e-05,0.929246,0.881656,0.999875,0.763437
Original,800,0.66088,0.209594,8.6e-05,0.929728,0.882745,0.999866,0.765625


# Baseline model.

In [12]:
#
N_fraud_test = 200
N_truth_test = int(2e4)
N_truth_train = int(2e5)

#
split_seeds = [13, 17, 19, 41]

# random_state used by RandomForestClassifier
random_state = 42

# Number of trees in random forest
n_estimators = [100, 200, 400, 800, 100]

# Number of features to consider at every split
max_features = ['auto']

# Minimum number of samples required to split a node
min_samples_split = [2]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1]

# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
search_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(search_grid)

{'n_estimators': [100, 200, 400, 800, 100], 'max_features': ['auto'], 'min_samples_split': [2], 'min_samples_leaf': [1], 'bootstrap': [True]}


In [13]:
glob_paths = ['/work/data/creditcard_trans_int.csv']
total_exps = len(glob_paths)*len(split_seeds)*len(ParameterGrid(search_grid))
print(total_exps)

20


In [14]:

with tqdm(total=total_exps) as progress_bar:

    def RunGrid(df_train, df_test, random_state):
        out = []
        for params in ParameterGrid(search_grid):
            params['random_state'] = random_state
            params['n_jobs'] = n_jobs
            rf = RandomForestClassifier(**params)
            rf.fit(df_train[ds_cols].to_numpy(), df_train[target_col].to_numpy())
            probs = rf.predict_proba(df_test[ds_cols].to_numpy())
            exp = {
                'probs' : probs,
                'rf_classes': rf.classes_,
                'params': params
            }
            out.append(exp)
            progress_bar.update(1)
        return out


    Results = {}
    for ds_path in glob_paths:
        df = pd.read_csv(ds_path)
        df = df[ds_cols+[target_col]]
        df_fraud = df.query('Class == 1').reset_index(drop=True).copy()
        df_truth = df.query('Class == 0').reset_index(drop=True).copy()
        del df
        set_exp = {}
        for seed in split_seeds:
            df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=N_fraud_test, random_state=seed)
            df_truth_train, df_truth_test = train_test_split(df_truth, train_size=N_truth_train, test_size=N_truth_test, random_state=seed)
            df_train = pd.concat([df_fraud_train, df_truth_train]).reset_index(drop=True)
            df_test = pd.concat([df_fraud_test, df_truth_test]).reset_index(drop=True)
            out = RunGrid(df_train, df_test, random_state)
            set_exp[seed] = {
                'target_test': df_test[target_col].to_numpy(),
                'exps': out
            }
        Results[ds_path] = set_exp

100%|██████████| 20/20 [44:34<00:00, 133.75s/it]


In [20]:
data = []
for ds_path,sets_exp in Results.items():
    for seed,set_exp in sets_exp.items():
        target_test = set_exp['target_test']
        for exp in set_exp['exps']:
            df_exp = pd.DataFrame(exp['probs'], columns=exp['rf_classes'])
            df_exp['pred'] = df_exp[[0, 1]].apply(lambda x: exp['rf_classes'][np.argmax(x)], axis=1)
            df_exp['target'] = target_test
            Fraud_True_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)][1].sum()/sum(df_exp.target == 1)
            Truth_False_Sum = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 1)][0].sum()/sum(df_exp.target == 1)
            Fraud_False_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 0)][1].sum()/sum(df_exp.target == 0)
            F1_M = f1_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            AUC_ROC_M = roc_auc_score(target_test, df_exp['pred'].to_numpy(), average='macro')
            TP_0 = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 0)].shape[0]/sum(df_exp.target == 0)
            TP_1 = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)].shape[0]/sum(df_exp.target == 1)
            param = exp['params']
            data.append([
                ds_path, seed,
                param['bootstrap'], param['max_features'], param['min_samples_leaf'],
                param['min_samples_split'], param['n_estimators'], param['random_state'],
                Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_M, AUC_ROC_M, TP_0, TP_1
            ])

In [18]:
df_Results = pd.DataFrame(data, columns=cols)
df_Results.to_csv('/work/data/Results_creditcard_Baseline.csv', index=False)

In [19]:
df_Results

Unnamed: 0,ds_path,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,random_state,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
0,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,100,42,0.67765,0.20705,2.8e-05,0.931604,0.882475,0.99995,0.765
1,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,200,42,0.678025,0.206275,2.8e-05,0.931604,0.882475,0.99995,0.765
2,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,400,42,0.679075,0.206475,2.7e-05,0.931604,0.882475,0.99995,0.765
3,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,800,42,0.679956,0.206375,2.6e-05,0.931604,0.882475,0.99995,0.765
4,/work/data/creditcard_trans_int.csv,13,True,auto,1,2,100,42,0.67765,0.20705,2.8e-05,0.931604,0.882475,0.99995,0.765
5,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,100,42,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
6,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,200,42,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
7,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,400,42,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
8,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,800,42,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
9,/work/data/creditcard_trans_int.csv,17,True,auto,1,2,100,42,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78


In [21]:
for metric in cols_metrics:
    md(f'# {metric}')
    display(df_Results.sort_values(metric, ascending=False).head(20)[cols_param[:-1]+cols_metrics])
    

# Fraud_True_Sum

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
6,True,auto,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
5,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
9,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
7,True,auto,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
8,True,auto,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
11,True,auto,1,2,200,0.687875,0.192925,4.6e-05,0.937998,0.892475,0.99995,0.785
12,True,auto,1,2,400,0.685237,0.195187,4.6e-05,0.936413,0.889975,0.99995,0.78
13,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.936413,0.889975,0.99995,0.78
10,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
14,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775


# Truth_False_Sum

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
17,True,auto,1,2,400,0.675188,0.212313,0.000136,0.926317,0.8799,0.9998,0.76
16,True,auto,1,2,200,0.673975,0.212075,0.00016,0.925109,0.879875,0.99975,0.76
19,True,auto,1,2,100,0.67885,0.2102,0.000166,0.926725,0.882375,0.99975,0.765
15,True,auto,1,2,100,0.67885,0.2102,0.000166,0.926725,0.882375,0.99975,0.765
18,True,auto,1,2,800,0.680594,0.207244,0.000136,0.929543,0.8849,0.9998,0.77
0,True,auto,1,2,100,0.67765,0.20705,2.8e-05,0.931604,0.882475,0.99995,0.765
4,True,auto,1,2,100,0.67765,0.20705,2.8e-05,0.931604,0.882475,0.99995,0.765
2,True,auto,1,2,400,0.679075,0.206475,2.7e-05,0.931604,0.882475,0.99995,0.765
3,True,auto,1,2,800,0.679956,0.206375,2.6e-05,0.931604,0.882475,0.99995,0.765
1,True,auto,1,2,200,0.678025,0.206275,2.8e-05,0.931604,0.882475,0.99995,0.765


# Fraud_False_Sum

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
19,True,auto,1,2,100,0.67885,0.2102,0.000166,0.926725,0.882375,0.99975,0.765
15,True,auto,1,2,100,0.67885,0.2102,0.000166,0.926725,0.882375,0.99975,0.765
16,True,auto,1,2,200,0.673975,0.212075,0.00016,0.925109,0.879875,0.99975,0.76
17,True,auto,1,2,400,0.675188,0.212313,0.000136,0.926317,0.8799,0.9998,0.76
18,True,auto,1,2,800,0.680594,0.207244,0.000136,0.929543,0.8849,0.9998,0.77
8,True,auto,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
7,True,auto,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
6,True,auto,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
5,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
9,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78


# F1_M

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
11,True,auto,1,2,200,0.687875,0.192925,4.6e-05,0.937998,0.892475,0.99995,0.785
13,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.936413,0.889975,0.99995,0.78
12,True,auto,1,2,400,0.685237,0.195187,4.6e-05,0.936413,0.889975,0.99995,0.78
10,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
14,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
5,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
6,True,auto,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
7,True,auto,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
8,True,auto,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
9,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78


# AUC_ROC_M

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
11,True,auto,1,2,200,0.687875,0.192925,4.6e-05,0.937998,0.892475,0.99995,0.785
13,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.936413,0.889975,0.99995,0.78
12,True,auto,1,2,400,0.685237,0.195187,4.6e-05,0.936413,0.889975,0.99995,0.78
5,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
6,True,auto,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
7,True,auto,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
8,True,auto,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
9,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
10,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
14,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775


# TP_0

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
0,True,auto,1,2,100,0.67765,0.20705,2.8e-05,0.931604,0.882475,0.99995,0.765
1,True,auto,1,2,200,0.678025,0.206275,2.8e-05,0.931604,0.882475,0.99995,0.765
14,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
13,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.936413,0.889975,0.99995,0.78
12,True,auto,1,2,400,0.685237,0.195187,4.6e-05,0.936413,0.889975,0.99995,0.78
11,True,auto,1,2,200,0.687875,0.192925,4.6e-05,0.937998,0.892475,0.99995,0.785
10,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
4,True,auto,1,2,100,0.67765,0.20705,2.8e-05,0.931604,0.882475,0.99995,0.765
3,True,auto,1,2,800,0.679956,0.206375,2.6e-05,0.931604,0.882475,0.99995,0.765
2,True,auto,1,2,400,0.679075,0.206475,2.7e-05,0.931604,0.882475,0.99995,0.765


# TP_1

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
11,True,auto,1,2,200,0.687875,0.192925,4.6e-05,0.937998,0.892475,0.99995,0.785
12,True,auto,1,2,400,0.685237,0.195187,4.6e-05,0.936413,0.889975,0.99995,0.78
5,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
6,True,auto,1,2,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
7,True,auto,1,2,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
8,True,auto,1,2,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
9,True,auto,1,2,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
13,True,auto,1,2,800,0.684669,0.195112,4.5e-05,0.936413,0.889975,0.99995,0.78
10,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
14,True,auto,1,2,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
