# Customized Random Forest Classifier Model 

- TODO: implement tqdm.contrib.concurrent for build trees in parallel.

## Fundamentals:

- Based on Random Forest method principles: ensemble of models (decision trees).

- In bootstrap process:

    - the data sampled ensure the balance between classes, for training and validation;

    - the list of features used are randomly sampled (with random number of features and order).
    
- For each tree:

    - fallowing the sequence of a given list of features, the data is splited half/half based on meadian value;
    
    - the splitting process ends when the samples have one only class;
    
    - validation process based on dynamic threshold can discard the tree.
    
- For use the forest:

    - all trees predictions are combined as a vote;
    
    - it is possible to use soft or hard-voting.
    
- Positive side-effects:

    - possible more generalization caused by the combination of overfitted trees, each tree is highly specialized in a smallest and different set of feature;
    
    - robustness for unbalanced and missing data, in case of missing data, the feature could be skipped without degrade the optimization process;
    
    - in prediction process, a missing value could be dealt with a tree replication considering the two possible paths;
    
    - the survived trees have a potential information about feature importance.

### Premises: 
- all features must be numeric.
   
- Is case of categorical data, the splitting is done for each categorical value, creating one branch for each value.

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
import random
import numpy as np
import pandas as pd
from collections import deque

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
from tqdm import tqdm

# from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

# from IPython.display import Markdown, display
# def md(arg):
#     display(Markdown(arg))

# from pandas_profiling import ProfileReport
# #report = ProfileReport(#DataFrame here#, minimal=True)
# #report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import json
# def open_file_json(path,mode='r',var=None):
#     if mode == 'w':
#         with open(path,'w') as f:
#             json.dump(var, f)
#     if mode == 'r':
#         with open(path,'r') as f:
#             return json.load(f)

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])

# import json
# from glob import glob
# from typing import NewType


# DictsPathType = NewType("DictsPath", str)


# def open_file_json(path):
#     with open(path, "r") as f:
#         return json.load(f)

# class LoadDicts:
#     def __init__(self, dict_path: DictsPathType = "./data"):
#         Dicts_glob = glob(f"{dict_path}/*.json")
#         self.List = []
#         self.Dict = {}
#         for path_json in Dicts_glob:
#             name = path_json.split("/")[-1].replace(".json", "")
#             self.List.append(name)
#             self.Dict[name] = open_file_json(path_json)
#             setattr(self, name, self.Dict[name])


In [4]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.26.0

Compiler    : GCC 8.3.0
OS          : Linux
Release     : 5.11.0-7620-generic
Machine     : x86_64
Processor   : 
CPU cores   : 8
Architecture: 64bit

Git hash: 6d920b4e4c7541ec2a07092b1e66afc5352c0acc

Git repo: https://github.com/ysraell/creditcardfraud.git

Git branch: main

pandas: 1.3.1
numpy : 1.19.5

CPU	: Intel(R) Xeon(R) CPU E3-1241 v3 @ 3.50GHz
Mem:           31G
Swap:         4.0G


In [5]:
df = pd.read_csv('/work/data/creditcard_trans_int.csv')
target_col = 'Class'
ds_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

## How much data for each class:

In [6]:
df[target_col].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [7]:
# How much for test step:
N_fraud_test = 200
N_truth_test = int(2e4)

# How much for training step:
N_truth_train = int(2e5)
# The remaing data for fraud is all used.

# How much (survived) trees:
T = 100

# Parameters for bootstrap

# Data for each tree:
N_T = df[target_col].value_counts()[1] - N_fraud_test # remaing data for fraud.
N_V = N_T # keep balanced amount for each classe.

# How much features:
n_F_max = len(ds_cols)
n_F_min = n_F_max//2
# From EDA: decision trees with small amount of features could be not so efficient.

# Droped trees limit:
D = 10
# When a total of D trees are dropped, the validation threshold decreases delta_Th, restarting the counting.

# Delta Threshold
delta_Th = 0.01
Th_start = 0.98
# The validation threshold decreases dynamically as need to selection more specialized tree.

# Metric in validation
M = 'Accuracy'
# This parameter have no effect, only to say which metric is used in validation process.

#Seeds
split_seeds = [13, 17, 19, 41]
# One for each experiment.

In [8]:
# Splits the data to build the decision tree.
def split_train_val(df_train, N_T, N_V):
    df_1 = df_train.query('Class == 1')
    df_0 = df_train.query('Class == 0')
    XT1, XV1 = train_test_split(df_1, train_size=df_1.shape[0]//2)
    XT0, XV0 = train_test_split(df_0, train_size=N_T, test_size=N_V)
    return pd.concat([XT1, XT0]).reset_index(drop=True), pd.concat([XV1, XV0]).reset_index(drop=True)

# Sample the features.
def sampleFeats(n_F_min, n_F_max):
    n_f = random.randint(n_F_min, n_F_max)
    return random.sample(ds_cols, n_f)

# Set the leaf of the descion tree.
def genLeaf(ds):
    return {
        'leaf' : int(ds.Class.mode()[0])
    }

# Splits the data during the tree's growth process.
def splitData(feat,ds):
    if ds.shape[0] > 2:
        split_val = int(ds[feat].quantile())
        ds_a = ds.query(f'{feat} >= {split_val}').reset_index(drop=True)
        ds_b = ds.query(f'{feat} < {split_val}').reset_index(drop=True)
        if (ds_a.shape[0] > 0) and (ds_b.shape[0] > 0):
            return (
                ds_a,
                ds_b,
                split_val
            )
        # I know that is a trick!
        ds_a = ds.query(f'{feat} > {split_val}').reset_index(drop=True)
        ds_b = ds.query(f'{feat} <= {split_val}').reset_index(drop=True)
        return (
            ds_a,
            ds_b,
            split_val
        )
               
    ds = ds.sort_values(feat).reset_index(drop=True)
    return (
            ds.loc[1].to_frame().T,
            ds.loc[0].to_frame().T,
            ds[feat].loc[1]
        )

# Make the tree grow
def growTree(F,ds):

    if ds.Class.nunique() == 1:
        return genLeaf(ds)

    Pass = False
    first_feat = F[0]
    while not Pass:
        feat = F[0]
        ds_a, ds_b, split_val = splitData(feat,ds)
        F.append(F.pop(0))
        if (ds_a.shape[0] > 0) and (ds_b.shape[0] > 0):
            Pass = True
        else:
            if first_feat == F[0]:
                return genLeaf(ds)
    
    return {
            feat: {
                'split': {
                    'split_val': split_val,
                    '>=' : growTree(F[:],ds_a),
                    '<' : growTree(F[:],ds_b)
                }
            }
        }

# Makes a prediction using a decision tree.
def useTree(Tree, sample):
    while True:
        node = list(Tree.keys())[0]
        if node == 'leaf':
            return Tree['leaf']
        val = sample[node]
        Tree = Tree[node]['split']['>='] if val >= Tree[node]['split']['split_val'] else Tree[node]['split']['<']

# Generates the prediction probabilities for each class (like `predict_proba` function of Scikit-Learn models).
def useForest(Forest, sample):
    y_pred = [useTree(Tree, sample) for Tree in Forest]
    return (y_pred.count(0)/len(y_pred), y_pred.count(1)/len(y_pred))

# Test step for a dataset `ds`.
def testForest(Tree,ds):
    return [useForest(Forest, sample) for _,sample in ds.iterrows()]

# Generates the metric for validation process.
def validationTree(Tree,ds):
    y_pred = [useTree(Tree, sample) for _,sample in ds.iterrows()]
    y_val = ds.Class.to_list()
    return sum([v == p for v,p in zip(y_val,y_pred)])/len(y_pred)

In [9]:
# Splitting by unique values of the classes.
df = df[ds_cols+[target_col]]
df_fraud = df.query('Class == 1').reset_index(drop=True).copy()
df_truth = df.query('Class == 0').reset_index(drop=True).copy()

Results = {}
total_exps = len(split_seeds)*T
with tqdm(total=total_exps) as progress_bar:
    
    # Each seed is a experiment.
    for seed in split_seeds:
        
        # Start the experiment
        df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=N_fraud_test, random_state=seed)
        df_truth_train, df_truth_test = train_test_split(df_truth, train_size=N_truth_train, test_size=N_truth_test, random_state=seed)
        df_train = pd.concat([df_fraud_train, df_truth_train]).reset_index(drop=True)
        df_test = pd.concat([df_fraud_test, df_truth_test]).reset_index(drop=True)
        del df_fraud_test, df_truth_test, df_fraud_train, df_truth_train

        # Builds the Forest (training step)
        Forest = []
        for t in range(T):
            # Builds the decision tree
            X_T, X_V = split_train_val(df_train, N_T, N_V)
            Threshold_for_drop = Th_start
            droped_trees = 0
            Pass = False
            # Only survived trees
            while not Pass:
                F = sampleFeats(n_F_min,n_F_max)
                Tree = growTree(F,X_T)
                if validationTree(Tree,X_V) < Threshold_for_drop:
                    droped_trees += 1
                else:
                    Pass = True
                if droped_trees >= D:
                    Threshold_for_drop -= delta_Th
                    droped_trees = 0
            Forest.append(Tree)
            progress_bar.update(1)

        # Test step
        probs = testForest(Tree,df_test)

        # Save results
        target_test = df_test.Class.to_list()
        Results[seed] = {
            'probs': probs,
            'target_test': target_test
        }

100%|██████████| 400/400 [7:01:07<00:00, 63.17s/it]  


In [10]:

# Generates the metrics
data = []
for seed,exp in Results.items():
    target_test = exp['target_test']
    rf_classes = [0, 1]
    df_exp = pd.DataFrame(exp['probs'], columns=rf_classes)
    df_exp['pred'] = df_exp[[0, 1]].apply(lambda x: rf_classes[np.argmax(x)], axis=1)
    df_exp['target'] = target_test
    Fraud_True_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)][1].sum()/sum(df_exp.target == 1)
    Truth_False_Sum = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 1)][0].sum()/sum(df_exp.target == 1)
    Fraud_False_Sum = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 0)][1].sum()/sum(df_exp.target == 0)
    F1_M = f1_score(target_test, df_exp['pred'].to_numpy(), average='macro')
    AUC_ROC_M = roc_auc_score(target_test, df_exp['pred'].to_numpy(), average='macro')
    TP_0 = df_exp.loc[(df_exp.pred == 0) & (df_exp.target == 0)].shape[0]/sum(df_exp.target == 0)
    TP_1 = df_exp.loc[(df_exp.pred == 1) & (df_exp.target == 1)].shape[0]/sum(df_exp.target == 1)


    data.append([
        seed, Fraud_True_Sum, Truth_False_Sum, Fraud_False_Sum, F1_M, AUC_ROC_M, TP_0, TP_1
    ])

In [11]:
columns = ['seed', 'Fraud_True_Sum','Truth_False_Sum', 'Fraud_False_Sum', 'F1_M', 'AUC_ROC_M', 'TP_0', 'TP_1']
df_Results = pd.DataFrame(data, columns=columns)
df_Results.to_csv('/work/data/Results_creditcard_Custom.csv', index=False)

# Results

In [12]:
df_Results

Unnamed: 0,seed,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
0,13,0.8047,0.1146,0.001283,0.912143,0.926425,0.99785,0.855
1,17,0.81315,0.10405,0.001009,0.928917,0.934225,0.99845,0.87
2,19,0.8068,0.1124,0.001514,0.907588,0.928775,0.99755,0.86
3,41,0.7905,0.1361,0.000699,0.930507,0.914525,0.99905,0.83


### Basic statistics for all experiments

In [13]:
df_Results[df_Results.columns[1:]].describe().loc[['mean', 'std', 'min', 'max']]

Unnamed: 0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
mean,0.803787,0.116787,0.001126,0.919789,0.925987,0.998225,0.85375
std,0.009559,0.013653,0.000352,0.011626,0.008311,0.000665,0.017017
min,0.7905,0.10405,0.000699,0.907588,0.914525,0.99755,0.83
max,0.81315,0.1361,0.001514,0.930507,0.934225,0.99905,0.87


# Baseline model metrics

In [14]:
df_Baseline = pd.read_csv('Results_creditcard_Baseline.csv')

In [15]:
columns = ['seed', 'n_estimators', 'Fraud_True_Sum','Truth_False_Sum', 'Fraud_False_Sum', 'F1_M', 'AUC_ROC_M', 'TP_0', 'TP_1']
df_Baseline[columns].groupby(['seed','n_estimators']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Fraud_True_Sum,Truth_False_Sum,Fraud_False_Sum,F1_M,AUC_ROC_M,TP_0,TP_1
seed,n_estimators,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
13,100,0.67765,0.20705,2.8e-05,0.931604,0.882475,0.99995,0.765
13,200,0.678025,0.206275,2.8e-05,0.931604,0.882475,0.99995,0.765
13,400,0.679075,0.206475,2.7e-05,0.931604,0.882475,0.99995,0.765
13,800,0.679956,0.206375,2.6e-05,0.931604,0.882475,0.99995,0.765
17,100,0.70015,0.1969,0.000104,0.933953,0.889925,0.99985,0.78
17,200,0.70035,0.196325,0.000109,0.933953,0.889925,0.99985,0.78
17,400,0.6991,0.19735,0.00011,0.933953,0.889925,0.99985,0.78
17,800,0.698,0.197581,0.00011,0.933953,0.889925,0.99985,0.78
19,100,0.6828,0.199,4.6e-05,0.934819,0.887475,0.99995,0.775
19,200,0.687875,0.192925,4.6e-05,0.937998,0.892475,0.99995,0.785
