# Exploratory Data Analysis

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
import pandas as pd
import numpy as np
from random_forest_mc.model import RandomForestMC
from random_forest_mc.utils import LoadDicts, dump_file_json, load_file_json
from tqdm import notebook as tqdm
from datetime import datetime
from collections import Counter

In [3]:
import warnings
# Silence pandas' SettingWithCopyWarning for the whole session. This only hides
# the message; it does not change how assignments to derived frames behave.
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [4]:
# Run this cell before closing the notebook: it records the environment the
# results were produced in (interpreter, OS/machine, git revision and versions
# of the imported packages), followed by the CPU model and the memory totals.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

Compiler    : GCC 10.2.1 20210110
OS          : Linux
Release     : 6.8.0-41-generic
Machine     : x86_64
Processor   : 
CPU cores   : 8
Architecture: 64bit

Git hash: 5831ff1cc24b1af22b0d6eac98686f23aa66d6d7

Git repo: https://github.com/ysraell/random-forest-mc-utils.git

Git branch: main

pandas: 2.2.2
tqdm  : 4.66.4
numpy : 1.26.4

CPU	: Intel(R) Xeon(R) CPU E3-1241 v3 @ 3.50GHz
Mem:            15G
Swap:           15G


In [5]:
# List the data directory: the input CSVs plus the model snapshots (JSON)
# written by the save cells of this notebook.
!ls /work/tmp/nus-fintech-recruitment

customer.csv
model2_tuning_4trees_20240903_2136.json
model_tmp__fitp_20240902_2002.json
model_tmp_fitp_20240902_2049.json
model_tmp_fitp_20240902_2143.json
model_tmp_fitp_20240902_2206.json
model_tmp_fitp_20240902_2330.json
model_tuning_112trees_20240903_1041.json
model_tuning_128trees_20240903_1217.json
model_tuning_144trees_20240903_1359.json
model_tuning_160trees_20240903_1546.json
model_tuning_16trees_20240903_0302.json
model_tuning_176trees_20240903_1742.json
model_tuning_32trees_20240903_0411.json
model_tuning_48trees_20240903_0519.json
model_tuning_64trees_20240903_0633.json
model_tuning_80trees_20240903_0750.json
model_tuning_96trees_20240903_0913.json
sample_submission.csv
terminal.csv
test.csv
train.csv


In [6]:
# Read the labelled training transactions and show a truncated preview.
train_csv = '/work/tmp/nus-fintech-recruitment/train.csv'
df = pd.read_csv(train_csv)
df

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD
0,59383,2021-08-01 00:04:37,323,217,4.60,0
1,59384,2021-08-01 00:12:10,6,429,8.61,0
2,59385,2021-08-01 00:12:34,714,1011,64.00,0
3,59386,2021-08-01 00:15:40,266,1969,12.72,0
4,59387,2021-08-01 00:16:01,890,1482,98.88,0
...,...,...,...,...,...,...
291226,350609,2021-12-30 23:33:02,221,41,61.26,0
291227,350610,2021-12-30 23:46:15,101,761,58.80,0
291228,350611,2021-12-30 23:54:38,7,1991,15.08,0
291229,350612,2021-12-30 23:56:36,161,1166,54.24,0


In [7]:
# Earliest and latest transaction timestamps. The column still holds strings
# at this point; the ISO-like format makes string order match time order.
df['TX_DATETIME'].min(), df['TX_DATETIME'].max()

('2021-08-01 00:04:37', '2021-12-30 23:58:21')

In [8]:
# Parse the timestamp strings into pandas datetimes so date parts can be extracted.
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])

In [9]:
# Calendar features derived from the transaction timestamp. The vectorised
# `.dt` accessor replaces a per-row Python lambda; TX_DATETIME must already be
# a datetime column (it is converted with pd.to_datetime beforehand).
df['DAY'] = df['TX_DATETIME'].dt.day              # day of the month
df['DAYOFWEEK'] = df['TX_DATETIME'].dt.dayofweek  # Monday=0 ... Sunday=6
df['HOUR'] = df['TX_DATETIME'].dt.hour
df['MINUTE'] = df['TX_DATETIME'].dt.minute

In [10]:
# Check that the derived DAY / DAYOFWEEK / HOUR / MINUTE columns are present.
df.columns

Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_FRAUD', 'DAY', 'DAYOFWEEK', 'HOUR', 'MINUTE'],
      dtype='object')

In [11]:
# Model inputs and the label column.
feat_cols = ['CUSTOMER_ID', 'TERMINAL_ID','TX_AMOUNT', 'DAY', 'DAYOFWEEK', 'HOUR', 'MINUTE']
target_col = 'TX_FRAUD'
# Keep only those columns. The explicit .copy() makes `df` an independent
# frame, so later column assignments on it do not act on a selection of the
# original frame (the situation SettingWithCopyWarning reports).
df = df[feat_cols + [target_col]].copy()
df

Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,DAY,DAYOFWEEK,HOUR,MINUTE,TX_FRAUD
0,323,217,4.60,1,6,0,4,0
1,6,429,8.61,1,6,0,12,0
2,714,1011,64.00,1,6,0,12,0
3,266,1969,12.72,1,6,0,15,0
4,890,1482,98.88,1,6,0,16,0
...,...,...,...,...,...,...,...,...
291226,221,41,61.26,30,3,23,33,0
291227,101,761,58.80,30,3,23,46,0
291228,7,1991,15.08,30,3,23,54,0
291229,161,1166,54.24,30,3,23,56,0


In [12]:
# Cast every column except TX_AMOUNT to str (object dtype); TX_AMOUNT stays
# float. A single astype call returns a new frame instead of assigning
# column-by-column into a frame that came from a column selection.
str_cols = ['CUSTOMER_ID', 'TERMINAL_ID','TX_FRAUD', 'DAY', 'DAYOFWEEK', 'HOUR', 'MINUTE']
df = df.astype({col: str for col in str_cols})
df.dtypes

CUSTOMER_ID     object
TERMINAL_ID     object
TX_AMOUNT      float64
DAY             object
DAYOFWEEK       object
HOUR            object
MINUTE          object
TX_FRAUD        object
dtype: object

In [13]:
# Class balance of the label: fraud ('1') is a small minority of the rows.
df['TX_FRAUD'].value_counts()

TX_FRAUD
0    284649
1      6582
Name: count, dtype: int64

In [14]:
# Hyper-parameters of the baseline forest; derived values are expressed
# relative to the base ones so they scale together.
n_trees = 32
max_discard_trees = 2 * n_trees
batch_train_pclass = 1000
batch_val_pclass = 2 * batch_train_pclass
max_depth = 2 * (len(feat_cols) + 1)

# Build the (still empty) model and show its repr.
cls = RandomForestMC(
    target_col=target_col,
    n_trees=n_trees,
    max_depth=max_depth,
    max_discard_trees=max_discard_trees,
    batch_train_pclass=batch_train_pclass,
    batch_val_pclass=batch_val_pclass,
)
cls

RandomForestMC(len(Forest)=0,n_trees=32,model_version=1.1.2,module_version=1.1.2)

In [69]:
# Worker count used for parallel fitting: two workers per assumed core.
cpu_cores = 4
max_workers = cpu_cores * 2

In [25]:
# Fit the forest on the prepared frame in parallel (progress is reported by
# the "Planting the forest" bar).
cls.fitParallel(dataset=df, max_workers=max_workers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[self.target_col] = dataset[self.target_col].astype(str)


Planting the forest:   0%|          | 0/32 [00:00<?, ?it/s]

In [26]:
# Export the model with model2dict() and write it to a JSON file whose name
# carries a minute-resolution timestamp.
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tmp_fitp_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [27]:
# Show the model repr again to check the forest size after fitting.
# NOTE(review): the saved output reports 160 trees with n_trees=32, which
# suggests the fit cell was executed more than once — confirm.
cls

RandomForestMC(len(Forest)=160,n_trees=32,model_version=1.1.2,module_version=1.1.2)

In [66]:
# Feature-count summary of the forest. Judging by the saved outputs, the first
# item is (mean, std, min, max) of the values in fc_list — TODO confirm against
# the library documentation.
fc_stars, fc_list = cls.featCount()
fc_stars

(2.7625, 0.9841716059712351, 2, 6)

In [67]:
# Tally of the fc_list values as (value, occurrences) pairs, most frequent first.
Counter(fc_list).most_common()

[(2, 81), (3, 51), (4, 18), (5, 5), (6, 5)]

In [30]:
# Feature-by-feature matrix produced by the model, returned as a DataFrame.
# NOTE(review): the values look like pair co-occurrence rates across trees
# (the progress bar reads "Counting pair occurences") — confirm.
df_featCorr = cls.featCorrDataFrame()
df_featCorr

Counting pair occurences: 100%|██████████| 160/160 [00:01<00:00, 82.08it/s]


Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,DAY,DAYOFWEEK,HOUR,MINUTE
CUSTOMER_ID,0.987305,0.575195,0.068726,0.412598,0.606445,0.024994,0.0625
TERMINAL_ID,0.575195,0.587402,0.0625,0.256348,0.21875,0.024994,0.056244
TX_AMOUNT,0.068726,0.0625,0.068726,0.056244,0.049988,0.024994,0.012497
DAY,0.412598,0.256348,0.056244,0.412598,0.293701,0.018753,0.03125
DAYOFWEEK,0.606445,0.21875,0.049988,0.293701,0.612305,0.024994,0.037506
HOUR,0.024994,0.024994,0.024994,0.018753,0.024994,0.024994,0.0
MINUTE,0.0625,0.056244,0.012497,0.03125,0.037506,0.0,0.068726


In [31]:
# Score for every unordered feature pair, as a dict keyed by (feature_a, feature_b).
fetPairImp = cls.featPairImportance()
fetPairImp

Counting pair occurences: 100%|██████████| 160/160 [00:01<00:00, 80.27it/s]


{('CUSTOMER_ID', 'TERMINAL_ID'): 0.574999999999999,
 ('CUSTOMER_ID', 'TX_AMOUNT'): 0.06874999999999999,
 ('CUSTOMER_ID', 'DAY'): 0.41249999999999953,
 ('CUSTOMER_ID', 'DAYOFWEEK'): 0.6062499999999988,
 ('CUSTOMER_ID', 'HOUR'): 0.025,
 ('CUSTOMER_ID', 'MINUTE'): 0.06249999999999999,
 ('TERMINAL_ID', 'TX_AMOUNT'): 0.06249999999999999,
 ('TERMINAL_ID', 'DAY'): 0.2562500000000001,
 ('TERMINAL_ID', 'DAYOFWEEK'): 0.2187500000000001,
 ('TERMINAL_ID', 'HOUR'): 0.025,
 ('TERMINAL_ID', 'MINUTE'): 0.056249999999999994,
 ('TX_AMOUNT', 'DAY'): 0.056249999999999994,
 ('TX_AMOUNT', 'DAYOFWEEK'): 0.049999999999999996,
 ('TX_AMOUNT', 'HOUR'): 0.025,
 ('TX_AMOUNT', 'MINUTE'): 0.0125,
 ('DAY', 'DAYOFWEEK'): 0.29374999999999996,
 ('DAY', 'HOUR'): 0.018750000000000003,
 ('DAY', 'MINUTE'): 0.03125,
 ('DAYOFWEEK', 'HOUR'): 0.025,
 ('DAYOFWEEK', 'MINUTE'): 0.0375,
 ('HOUR', 'MINUTE'): 0.0}

In [34]:
# Feature pairs ranked from the highest to the lowest score.
sorted(
    fetPairImp.items(),
    key=lambda pair_and_score: pair_and_score[1],
    reverse=True,
)

[(('CUSTOMER_ID', 'DAYOFWEEK'), 0.6062499999999988),
 (('CUSTOMER_ID', 'TERMINAL_ID'), 0.574999999999999),
 (('CUSTOMER_ID', 'DAY'), 0.41249999999999953),
 (('DAY', 'DAYOFWEEK'), 0.29374999999999996),
 (('TERMINAL_ID', 'DAY'), 0.2562500000000001),
 (('TERMINAL_ID', 'DAYOFWEEK'), 0.2187500000000001),
 (('CUSTOMER_ID', 'TX_AMOUNT'), 0.06874999999999999),
 (('CUSTOMER_ID', 'MINUTE'), 0.06249999999999999),
 (('TERMINAL_ID', 'TX_AMOUNT'), 0.06249999999999999),
 (('TERMINAL_ID', 'MINUTE'), 0.056249999999999994),
 (('TX_AMOUNT', 'DAY'), 0.056249999999999994),
 (('TX_AMOUNT', 'DAYOFWEEK'), 0.049999999999999996),
 (('DAYOFWEEK', 'MINUTE'), 0.0375),
 (('DAY', 'MINUTE'), 0.03125),
 (('CUSTOMER_ID', 'HOUR'), 0.025),
 (('TERMINAL_ID', 'HOUR'), 0.025),
 (('TX_AMOUNT', 'HOUR'), 0.025),
 (('DAYOFWEEK', 'HOUR'), 0.025),
 (('DAY', 'HOUR'), 0.018750000000000003),
 (('TX_AMOUNT', 'MINUTE'), 0.0125),
 (('HOUR', 'MINUTE'), 0.0)]

# First prediction (with train data)

In [36]:
# True labels of the frame as a plain Python list, with a peek at the first ten.
y_test = df.loc[:, target_col].to_list()
y_test[0:10]

['0', '0', '0', '0', '0', '0', '0', '0', '1', '0']

### Hard voting, unweighted

In [38]:
# Predict a label for every row of `df` with the forest's current voting mode.
# NOTE(review): presumably hard voting is the default, since soft voting is
# enabled explicitly later on — confirm.
y_pred_hard = cls.testForest(df)

In [61]:
# Overall accuracy of the hard-voting predictions: matching labels over all rows.
accuracy_hard = sum(truth == pred for truth, pred in zip(y_test, y_pred_hard)) / len(y_test)
accuracy_hard

0.9742369459295198

In [59]:
# Per-class accuracy: correct predictions among rows whose true label is '1'
# (fraud) or '0' (valid), divided by the number of rows in that class.
accuracy_hard_fraud = sum(truth == pred for truth, pred in zip(y_test, y_pred_hard) if truth == '1') / y_test.count('1')
accuracy_hard_valid = sum(truth == pred for truth, pred in zip(y_test, y_pred_hard) if truth == '0') / y_test.count('0')
accuracy_hard_fraud, accuracy_hard_valid

(0.07247037374658159, 0.9950886881738562)

### Soft voting, unweighted

In [62]:
# Switch the forest to soft voting, predict on `df` again and report the
# overall accuracy (matching labels over all rows).
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
accuracy_soft = sum(truth == pred for truth, pred in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft

0.9428632254121299

In [63]:
# Per-class accuracy of the soft-voting predictions: correct predictions among
# true-'1' (fraud) and true-'0' (valid) rows, divided by each class size.
accuracy_soft_fraud = sum(truth == pred for truth, pred in zip(y_test, y_pred_soft) if truth == '1') / y_test.count('1')
accuracy_soft_valid = sum(truth == pred for truth, pred in zip(y_test, y_pred_soft) if truth == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid

(0.42403524764509265, 0.9548601962416872)

# Tuning

In [79]:
# Hyper-parameters for the tuning run; this replaces `cls` with a fresh,
# empty model.
n_trees = 16
max_discard_trees = 6 * n_trees
max_feature = 4
batch_train_pclass = 2000
batch_val_pclass = batch_train_pclass
max_depth = 4 * (len(feat_cols) + 1)

cls = RandomForestMC(
    target_col=target_col,
    n_trees=n_trees,
    max_depth=max_depth,
    max_feature=max_feature,
    max_discard_trees=max_discard_trees,
    batch_train_pclass=batch_train_pclass,
    batch_val_pclass=batch_val_pclass,
)
cls

RandomForestMC(len(Forest)=0,n_trees=16,model_version=1.1.2,module_version=1.1.2)

In [84]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [85]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5765724703737466, 0.9383451197790964)

In [87]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [88]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid

(0.585232452142206, 0.9448232735755263)

In [89]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [90]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5862959586751747, 0.9518178528644049, 0.9435568328921028)

In [91]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [92]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5864478881798846, 0.9521761889203897, 0.9439105040328811)

In [93]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [94]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5879671832269827, 0.9524677761032008, 0.9442298381697003)

In [95]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [96]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5873594652081434, 0.9522253723006229, 0.9439791780407992)

In [97]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [98]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5902461257976299, 0.9533706424403388, 0.9451638046773867)

In [99]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [100]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.59009419629292, 0.9528612431450664, 0.9446624844195844)

In [101]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [102]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5902461257976299, 0.9526961275114264, 0.9445045342013728)

In [103]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:01<?, ?it/s]

In [104]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5902461257976299, 0.9529631230041209, 0.9447654954314616)

In [105]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the current forest size (len(cls)) and a
# minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/16 [00:00<?, ?it/s]

In [106]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.5913096323305986, 0.9530087932857659, 0.9448341694393797)

# N Experiment

In [15]:
# True labels of the frame as a plain Python list, with a peek at the first ten.
y_test = df.loc[:, target_col].to_list()
y_test[0:10]

['0', '0', '0', '0', '0', '0', '0', '0', '1', '0']

In [16]:
# Experiment number; it is embedded in the names of the model files saved below.
N = 2

In [17]:
# Hyper-parameters for this experiment; this replaces `cls` with a fresh,
# empty model.
n_trees = 4
max_discard_trees = 32
max_feature = 4
batch_train_pclass = 4000
batch_val_pclass = 1000
max_depth = 8 * (len(feat_cols) + 1)

cls = RandomForestMC(
    target_col=target_col,
    n_trees=n_trees,
    max_depth=max_depth,
    max_feature=max_feature,
    max_discard_trees=max_discard_trees,
    batch_train_pclass=batch_train_pclass,
    batch_val_pclass=batch_val_pclass,
)
cls

RandomForestMC(len(Forest)=0,n_trees=4,model_version=1.1.2,module_version=1.1.2)

In [18]:
# Worker count for this experiment: one worker per assumed core.
cpu_cores = 4
max_workers = int(cpu_cores)

In [19]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/4 [00:00<?, ?it/s]

In [20]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.8059860224855667, 0.9167009193779005, 0.9141986945071094)

In [21]:
# 4 discards: (0.6353691886964449, 0.9230912457096283, 0.9165885499826598)

In [22]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

(0.7839562443026435, 0.9482415185017337, 0.9445285701041441)

In [24]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

Planting the forest:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft

In [None]:
# Fit on `df` in parallel, then export the model with model2dict() and write it
# to a JSON file named after the experiment number N, the current forest size
# (len(cls)) and a minute-resolution timestamp taken after the fit.
cls.fitParallel(dataset=df, max_workers=max_workers)
ModelDict = cls.model2dict()
datetime_tag = datetime.now().strftime('%Y%m%d_%H%M')
path_dict = f'/work/tmp/nus-fintech-recruitment/model{N}_tuning_{len(cls)}trees_{datetime_tag}.json'
dump_file_json(path_dict, ModelDict)
# Drop the reference to the exported dict once it has been written.
del ModelDict

In [None]:
# Soft-voting predictions for `df` — the frame the forest was fitted on, so the
# figures below are training-set accuracies.
cls.setSoftVoting()
y_pred_soft = cls.testForest(df)
# Overall accuracy, then accuracy restricted to true-'1' (fraud) and true-'0'
# (valid) rows; y_test is a list, so .count() gives each class size.
accuracy_soft = sum(v == p for v, p in zip(y_test, y_pred_soft)) / len(y_test)
accuracy_soft_fraud = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '1') / y_test.count('1')
accuracy_soft_valid = sum(v == p for v, p in zip(y_test, y_pred_soft) if v == '0') / y_test.count('0')
accuracy_soft_fraud, accuracy_soft_valid, accuracy_soft