# EDA 

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
import pandas as pd
import numpy as np
from random_forest_mc.model import RandomForestMC
from random_forest_mc.utils import LoadDicts, dump_file_json, load_file_json
from tqdm import notebook as tqdm
from datetime import datetime
from collections import Counter
from glob import glob

In [3]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [4]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.11.10
IPython version      : 8.27.0

Compiler    : GCC 10.2.1 20210110
OS          : Linux
Release     : 6.8.0-44-generic
Machine     : x86_64
Processor   : 
CPU cores   : 20
Architecture: 64bit

Git hash: 1fa21bde606ae8fa41e261b8bc96908a11a39755

Git repo: https://github.com/ysraell/random-forest-mc-utils.git

Git branch: main

numpy : 1.26.4
pandas: 2.2.2
tqdm  : 4.66.5

CPU	: 12th Gen Intel(R) Core(TM) i7-12700
Mem:            15G
Swap:          3.7G


In [5]:
dataset_path = '/work/tmp/credit-card-fraud-prediction'

In [6]:
!ls $dataset_path

model_eda_16trees_20240914_1212.json  sample_submission.csv  train.csv
model_eda_32trees_20240914_1216.json  test.csv


In [7]:
df_train = pd.read_csv(f'{dataset_path}/train.csv')

In [8]:
df_train.loc[0]

id                    0.000000
Time                  0.000000
feat1                 2.074329
feat2                -0.129425
feat3                -1.137418
feat4                 0.412846
feat5                -0.192638
feat6                -1.210144
feat7                 0.110697
feat8                -0.263477
feat9                 0.742144
feat10                0.108782
feat11               -1.070243
feat12               -0.234910
feat13               -1.099360
feat14                0.502467
feat15                0.169318
feat16                0.065688
feat17               -0.306957
feat18               -0.323800
feat19                0.103348
feat20               -0.292969
feat21               -0.334701
feat22               -0.887840
feat23                0.336701
feat24               -0.110835
feat25               -0.291459
feat26                0.207733
feat27               -0.076576
feat28               -0.059577
Transaction_Amount    1.980000
IsFraud               0.000000
Name: 0,

In [9]:
df_train['Transaction_Amount'].describe()

count    150000.000000
mean         71.065194
std         158.712510
min           0.000000
25%           6.990000
50%          24.900000
75%          74.000000
max        6513.350000
Name: Transaction_Amount, dtype: float64

In [10]:
df_train['Transaction_Amount'] = (df_train['Transaction_Amount'] * 100).astype(np.uint32)
df_train['Transaction_Amount'].describe()

count    150000.000000
mean       7106.476180
std       15871.250544
min           0.000000
25%         699.000000
50%        2490.000000
75%        7400.000000
max      651335.000000
Name: Transaction_Amount, dtype: float64

In [11]:
df_train = df_train.drop(columns=['id', 'Time'])

In [12]:
IsFraud_count, Not_IsFraud_count = df_train['IsFraud'].value_counts()[1], df_train['IsFraud'].value_counts()[0]
IsFraud_count, Not_IsFraud_count

(269, 149731)

In [13]:
df_test.loc[0]

id                    150000.000000
Time                   73899.000000
feat1                     -0.681234
feat2                     -0.371212
feat3                      0.385966
feat4                      0.955703
feat5                      2.064078
feat6                      0.338827
feat7                     -0.539452
feat8                     -0.254046
feat9                      0.010960
feat10                     0.753938
feat11                    -0.547338
feat12                     0.559844
feat13                     0.261633
feat14                     0.847703
feat15                    -0.992736
feat16                     0.552428
feat17                    -0.381524
feat18                    -0.063139
feat19                     0.919230
feat20                     0.183560
feat21                    -0.253324
feat22                     0.266668
feat23                    -0.153829
feat24                     0.455969
feat25                    -0.503628
feat26                     0

In [43]:
feat_cols = df_train.columns.to_list()
feat_cols

['feat1',
 'feat2',
 'feat3',
 'feat4',
 'feat5',
 'feat6',
 'feat7',
 'feat8',
 'feat9',
 'feat10',
 'feat11',
 'feat12',
 'feat13',
 'feat14',
 'feat15',
 'feat16',
 'feat17',
 'feat18',
 'feat19',
 'feat20',
 'feat21',
 'feat22',
 'feat23',
 'feat24',
 'feat25',
 'feat26',
 'feat27',
 'feat28',
 'Transaction_Amount',
 'IsFraud']

In [18]:
target_col = 'IsFraud'

In [19]:
IsFraud_count

269

In [20]:
batch_train_pclass = IsFraud_count - 100
batch_val_pclass = 100
max_depth = None
n_trees = 16
max_discard_trees = 128

cpu_cores = 8

cls = RandomForestMC(
    n_trees=n_trees, 
    target_col=target_col, 
    max_discard_trees=max_discard_trees,
    batch_train_pclass=batch_train_pclass,
    batch_val_pclass=batch_val_pclass,
    max_depth=max_depth
)
cls

RandomForestMC(len(Forest)=0,n_trees=16,model_version=1.2.0-dev,module_version=1.2.0-dev)

# Create the model

In [26]:
max_workers = cpu_cores

In [None]:
cls.fitParallel(dataset=df_train, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'{dataset_path}/model_eda_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
del ModelDict

In [30]:
y_test = df_train[target_col].to_list()

In [None]:
cls.setSoftVoting()
y_pred_soft  = cls.testForest(df_train)
accuracy_soft = sum([v == p for v, p in zip(y_test, y_pred_soft )]) / len(y_test)
accuracy_soft

accuracy_soft_fraud = sum([str(v) == str(p) for v, p in zip(y_test, y_pred_soft) if str(v) == '1']) / len([y for y in y_test if str(y) == '1'])
accuracy_soft_valid = sum([str(v) == str(p) for v, p in zip(y_test, y_pred_soft) if str(v) == '0']) / len([y for y in y_test if str(y) == '0'])
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
cls.fitParallel(dataset=df_train, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'{dataset_path}/model_eda_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
del ModelDict

In [None]:
cls.setSoftVoting()
y_pred_soft  = cls.testForest(df_train)
accuracy_soft = sum([v == p for v, p in zip(y_test, y_pred_soft )]) / len(y_test)
accuracy_soft

accuracy_soft_fraud = sum([str(v) == str(p) for v, p in zip(y_test, y_pred_soft) if str(v) == '1']) / len([y for y in y_test if str(y) == '1'])
accuracy_soft_valid = sum([str(v) == str(p) for v, p in zip(y_test, y_pred_soft) if str(v) == '0']) / len([y for y in y_test if str(y) == '0'])
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

# Load trained model

In [None]:
y_test = df_train[target_col].to_list()

In [21]:
glob(f'{dataset_path}/model_eda_*trees_*.json')

['/work/tmp/credit-card-fraud-prediction/model_eda_16trees_20240914_1212.json',
 '/work/tmp/credit-card-fraud-prediction/model_eda_32trees_20240914_1216.json']

In [22]:
cls.process_dataset(df_train)

In [23]:
for model_json in glob(f'{dataset_path}/model_eda_*trees_*.json'):
    model_dict = load_file_json(model_json)
    cls_tmp = RandomForestMC(target_col=target_col)
    cls_tmp.dict2model(model_dict)
    cls.mergeForest(cls_tmp)
cls

RandomForestMC(len(Forest)=48,n_trees=16,model_version=1.2.0-dev,module_version=1.2.0-dev)

In [24]:
cls.n_trees = 256 - cls.Forest_size
cls.n_trees

208

In [27]:
max_workers = 2*cpu_cores

In [28]:
cls.fitParallel(dataset=df_train, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'{dataset_path}/model_eda_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
del ModelDict

Planting the forest:   0%|          | 0/208 [00:00<?, ?it/s]

In [31]:
cls.setSoftVoting()
y_pred_soft  = cls.testForest(df_train)
accuracy_soft = sum([v == p for v, p in zip(y_test, y_pred_soft )]) / len(y_test)
accuracy_soft

accuracy_soft_fraud = sum([str(v) == str(p) for v, p in zip(y_test, y_pred_soft) if str(v) == '1']) / len([y for y in y_test if str(y) == '1'])
accuracy_soft_valid = sum([str(v) == str(p) for v, p in zip(y_test, y_pred_soft) if str(v) == '0']) / len([y for y in y_test if str(y) == '0'])
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(1.0, 0.8379493892380335, 0.83824)

In [32]:
cls

RandomForestMC(len(Forest)=256,n_trees=208,model_version=1.2.0-dev,module_version=1.2.0-dev)

In [33]:
fetPairImp = cls.featPairImportance()
sorted(fetPairImp.items(), key=lambda x: x[1], reverse=True)[:100]

Counting pair occurences: 100%|██████████| 256/256 [00:41<00:00,  6.22it/s]


[(('feat2', 'feat3'), 0.19921875),
 (('feat3', 'feat4'), 0.1953125),
 (('feat3', 'feat7'), 0.18359375),
 (('feat3', 'feat10'), 0.17578125),
 (('feat4', 'feat26'), 0.17578125),
 (('feat3', 'feat8'), 0.1640625),
 (('feat4', 'feat9'), 0.1640625),
 (('feat3', 'feat26'), 0.15625),
 (('feat1', 'feat3'), 0.15234375),
 (('feat2', 'feat4'), 0.14453125),
 (('feat4', 'feat10'), 0.14453125),
 (('feat7', 'feat8'), 0.14453125),
 (('feat3', 'feat16'), 0.140625),
 (('feat9', 'feat10'), 0.140625),
 (('feat4', 'feat7'), 0.13671875),
 (('feat4', 'feat13'), 0.13671875),
 (('feat4', 'feat18'), 0.13671875),
 (('feat8', 'feat26'), 0.13671875),
 (('feat3', 'feat9'), 0.1328125),
 (('feat3', 'Transaction_Amount'), 0.1328125),
 (('feat4', 'Transaction_Amount'), 0.1328125),
 (('feat1', 'feat4'), 0.12890625),
 (('feat2', 'feat7'), 0.12890625),
 (('feat2', 'feat26'), 0.12890625),
 (('feat3', 'feat18'), 0.12890625),
 (('feat4', 'feat19'), 0.12890625),
 (('feat9', 'feat26'), 0.12890625),
 (('feat3', 'feat19'), 0.125)

In [34]:
fc_stars, fc_list = cls.featCount()
fc_stars

(7.875, 2.0367713421000406, 2, 10)

In [35]:
Counter(fc_list).most_common()

[(9, 173),
 (7, 20),
 (4, 15),
 (6, 11),
 (2, 10),
 (8, 10),
 (5, 7),
 (3, 6),
 (10, 4)]

In [41]:
feat_imp = cls.featImportance()
sorted([(k, v) for k, v in feat_imp.items()], key=lambda x: x[1], reverse=True)

[('feat3', 0.4921875),
 ('feat4', 0.44921875),
 ('feat2', 0.37109375),
 ('feat10', 0.3515625),
 ('feat26', 0.3515625),
 ('feat8', 0.34765625),
 ('feat7', 0.33203125),
 ('feat18', 0.3203125),
 ('feat9', 0.3046875),
 ('feat1', 0.30078125),
 ('feat13', 0.27734375),
 ('feat20', 0.26953125),
 ('Transaction_Amount', 0.25390625),
 ('feat16', 0.25),
 ('feat21', 0.25),
 ('feat5', 0.24609375),
 ('feat14', 0.2421875),
 ('feat19', 0.2421875),
 ('feat12', 0.23828125),
 ('feat27', 0.23046875),
 ('feat17', 0.21875),
 ('feat25', 0.21484375),
 ('feat11', 0.203125),
 ('feat28', 0.203125),
 ('feat23', 0.19921875),
 ('feat22', 0.1953125),
 ('feat15', 0.18359375),
 ('feat6', 0.16796875),
 ('feat24', 0.16796875)]

# Conclusios: