# Baseline Model 

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score

In [3]:
from tqdm import tqdm

from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

from IPython.display import Markdown, display
def md(arg):
    display(Markdown(arg))

# from pandas_profiling import ProfileReport
# #report = ProfileReport(#DataFrame here#, minimal=True)
# #report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import json
# def open_file_json(path,mode='r',var=None):
#     if mode == 'w':
#         with open(path,'w') as f:
#             json.dump(var, f)
#     if mode == 'r':
#         with open(path,'r') as f:
#             return json.load(f)

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])

# import json
# from glob import glob
# from typing import NewType


# DictsPathType = NewType("DictsPath", str)


# def open_file_json(path):
#     with open(path, "r") as f:
#         return json.load(f)

# class LoadDicts:
#     def __init__(self, dict_path: DictsPathType = "./data"):
#         Dicts_glob = glob(f"{dict_path}/*.json")
#         self.List = []
#         self.Dict = {}
#         for path_json in Dicts_glob:
#             name = path_json.split("/")[-1].replace(".json", "")
#             self.List.append(name)
#             self.Dict[name] = open_file_json(path_json)
#             setattr(self, name, self.Dict[name])


In [4]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.26.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 5.11.0-7620-generic
Machine     : x86_64
Processor   : 
CPU cores   : 8
Architecture: 64bit

Git hash: f9c855f89c297f0a881fa7d4842bf4ddc0f4243c

Git repo: https://github.com/ysraell/creditcardfraud.git

Git branch: main

pandas: 1.3.1
numpy : 1.19.5

CPU	: Intel(R) Xeon(R) CPU E3-1241 v3 @ 3.50GHz
Mem:           31G
Swap:         4.0G


# Initial search

In [5]:
#
N_fraud_test = 200
N_truth_test = int(2e4)
N_truth_train = int(2e5)

#
split_seeds = [13, 17, 47, 53]

# random_state used by RandomForestClassifier
random_state = 42

# Number of trees in random forest
n_estimators = [200, 400, 800]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Minimum number of samples required to split a node
min_samples_split = [2, 8]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 4]

# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
search_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(search_grid)

{'n_estimators': [200, 400, 800], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 8], 'min_samples_leaf': [1, 4], 'bootstrap': [True]}


In [6]:
target_col = 'Class'
ds_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
glob_paths = glob('/work/data/creditcard*.csv')

total_exps = len(glob_paths)*len(split_seeds)*len(ParameterGrid(search_grid))
print(total_exps)

288


In [7]:

with tqdm(total=total_exps) as progress_bar:

    def RunGrid(df_train, df_test, random_state):
        out = []
        for params in ParameterGrid(search_grid):
            params['random_state'] = random_state
            params['n_jobs'] = 8
            rf = RandomForestClassifier(**params)
            rf.fit(df_train[ds_cols].to_numpy(), df_train[target_col].to_numpy())
            probs = rf.predict_proba(df_test[ds_cols].to_numpy())
            exp = {
                'probs' : probs,
                'rf_classes': rf.classes_,
                'params': params
            }
            out.append(exp)
            progress_bar.update(1)
        return out


    Results = {}
    for ds_path in glob_paths:
        df = pd.read_csv(ds_path)
        df = df[ds_cols+[target_col]]
        df_fraud = df.query('Class == 1').reset_index(drop=True).copy()
        df_truth = df.query('Class == 0').reset_index(drop=True).copy()
        del df
        set_exp = {}
        for seed in split_seeds:
            df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=N_fraud_test, random_state=seed)
            df_truth_train, df_truth_test = train_test_split(df_truth, train_size=N_truth_train, test_size=N_truth_test, random_state=seed)
            df_train = pd.concat([df_fraud_train, df_truth_train]).reset_index(drop=True)
            df_test = pd.concat([df_fraud_test, df_truth_test]).reset_index(drop=True)
            out = RunGrid(df_train, df_test, random_state)
            set_exp[seed] = {
                'target_test': df_test[target_col].to_numpy(),
                'exps': out
            }
        Results[ds_path] = set_exp

100%|██████████| 288/288 [12:17:57<00:00, 153.74s/it]  


In [8]:
cols_results = ['ds_path', 'seed']
cols_param = ['bootstrap', 'max_features', 'min_samples_leaf', 'min_samples_split', 'n_estimators', 'random_state']
cols_metrics = ['Fraud_True_Sum','Truth_False_Sum', 'Fraud_False_Sum', 'F1_W', 'AUC_ROC_W']
cols = cols_results+cols_param+cols_metrics

In [9]:
', '.join(cols_metrics)

'Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_W, AUC_ROC_W'

In [10]:
''.join([ f'param[\'{col}\'], ' for col in cols_param])

"param['bootstrap'], param['max_features'], param['min_samples_leaf'], param['min_samples_split'], param['n_estimators'], param['random_state'], "

In [11]:
data = []
for ds_path,sets_exp in Results.items():
    for seed,set_exp in sets_exp.items():
        target_test = set_exp['target_test']
        for exp in set_exp['exps']:
            df_exp = pd.DataFrame(exp['probs'], columns=exp['rf_classes'])
            df_exp['pred'] = df_exp[[0, 1]].apply(lambda x: exp['rf_classes'][np.argmax(x)], axis=1)
            df_exp['target'] = target_test
            Fraud_True_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)][1].sum()/sum(df_exp.target == 1)
            Truth_False_Sum = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 1)][0].sum()/sum(df_exp.target == 1)
            Fraud_False_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 0)][1].sum()/sum(df_exp.target == 0)
            F1_W = f1_score(target_test, df_exp['pred'].to_numpy(), average='weighted')
            AUC_ROC_W = roc_auc_score(target_test, df_exp['pred'].to_numpy(), average='weighted')
            param = exp['params']
            data.append([
                ds_path, seed,
                param['bootstrap'], param['max_features'], param['min_samples_leaf'],
                param['min_samples_split'], param['n_estimators'], param['random_state'],
                Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_W, AUC_ROC_W
            ])

In [12]:
df_Results = pd.DataFrame(data, columns=cols)
df_Results.to_csv('/work/data/Results_creditcard_Init.csv', index=False)

In [13]:
df_Results.sort_values('AUC_ROC_W', ascending=False)

Unnamed: 0,ds_path,seed,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,random_state,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_W,AUC_ROC_W
266,/work/data/creditcard_trans_int.csv,53,True,auto,1,2,800,42,0.670319,0.202844,0.000000,0.997688,0.890000
74,/work/data/creditcard_trans_float.csv,53,True,auto,1,2,800,42,0.670625,0.202756,0.000000,0.997688,0.890000
278,/work/data/creditcard_trans_int.csv,53,True,sqrt,1,2,800,42,0.670319,0.202844,0.000000,0.997688,0.890000
86,/work/data/creditcard_trans_float.csv,53,True,sqrt,1,2,800,42,0.670625,0.202756,0.000000,0.997688,0.890000
229,/work/data/creditcard_trans_int.csv,17,True,sqrt,1,2,400,42,0.699100,0.197350,0.000110,0.997542,0.889925
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,/work/data/creditcard_trans_float.csv,47,True,sqrt,1,2,400,42,0.650900,0.224213,0.000109,0.997146,0.872425
241,/work/data/creditcard_trans_int.csv,47,True,auto,1,2,400,42,0.650575,0.224225,0.000109,0.997146,0.872425
145,/work/data/creditcard.csv,47,True,auto,1,2,400,42,0.650662,0.224238,0.000109,0.997146,0.872425
157,/work/data/creditcard.csv,47,True,sqrt,1,2,400,42,0.650662,0.224238,0.000109,0.997146,0.872425


# Full search.

In [None]:
#
N_fraud_test = 200
N_truth_test = int(1e3)
N_truth_train = int(1e3)

#
split_seeds = [13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53]

# random_state used by RandomForestClassifier
random_state = 42

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
search_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(search_grid)