# Tuning 

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
import pandas as pd
import numpy as np
from random_forest_mc.utils import LoadDicts, dump_file_json, load_file_json
from tqdm.notebook import tqdm
from datetime import datetime
from collections import Counter, defaultdict
from glob import glob
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib 

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
from cycler import cycler

rcParams['figure.figsize'] = 6, 2 # 18, 5
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False
rcParams['axes.grid'] = True
rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
rcParams['lines.linewidth'] = 2.5

In [4]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [5]:
import functools
import operator
def flat(a):
    return functools.reduce(operator.iconcat, a, [])

In [6]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.11.10
IPython version      : 8.27.0

Compiler    : GCC 10.2.1 20210110
OS          : Linux
Release     : 6.8.0-44-generic
Machine     : x86_64
Processor   : 
CPU cores   : 20
Architecture: 64bit

Git hash: afffba9ffbd19f4c4b696c18f695f73af62368d0

Git repo: https://github.com/ysraell/random-forest-mc-utils.git

Git branch: main

numpy     : 1.26.4
sklearn   : 1.3.2
pandas    : 2.2.2
matplotlib: 3.9.2

CPU	: 12th Gen Intel(R) Core(TM) i7-12700
Mem:            15G
Swap:          3.7G


In [7]:
dataset_path = '/work/tmp/credit-card-fraud-prediction'

In [8]:
!ls $dataset_path

model_trans_tuning_256trees_20240917_1808.json
model_trans_tuning_32trees_20240917_1322.json
model_trans_tuning_32trees_20240917_1416.json
model_tuned_256trees_20240918_0134.json
model_tuned_not_trans_256trees_20240918_0559.json
sample_submission.csv
submission_exp1.csv
submission_exp2_trans.csv
submission_exp2_trans_probs.csv
submission_exp2_trans_probs2.csv
submission_exp2_trans_probs4.csv
submission_exp2_trans_t2.csv
submission_exp2_trans_t3.csv
submission_exp2_trans_t4.csv
submission_exp2_trans_t5.csv
submission_exp2_trans_t6.csv
submission_exp4_not_trans_probs1.csv
submission_sklearn_exp1.csv
submission_sklearn_exp1_proba.csv
submission_sklearn_exp1_proba2.csv
submission_sklearn_exp1_proba3.csv
submission_sklearn_exp2_proba1.csv
test.csv
test_trans.csv
train.csv
train_trans.csv
trees_trans_metrcis_sim_20240917_1950.json


In [9]:
df = pd.read_csv(f'{dataset_path}/train.csv')
#df = df.drop(columns='Time')
target_col = 'IsFraud'
df[target_col] = df[target_col].astype(str)
feat_cols = df.columns.to_list()
feat_cols.remove('id')
feat_cols.remove(target_col)
IsFraud_count, Not_IsFraud_count = df[target_col].value_counts()['1'], df[target_col].value_counts()['0']
IsFraud_count, Not_IsFraud_count

(269, 149731)

In [None]:
RandomForestClassifier?

In [None]:
HalvingGridSearchCV?

In [10]:
X, y = df[feat_cols].to_numpy(), df[target_col].to_numpy()
clf = RandomForestClassifier(random_state=0)
param_grid = {
    "max_depth": [2, 3, 4],
    "min_samples_split": [3],
    "max_features" : ["sqrt", "log2", None],
    "criterion" : ["gini", "entropy", "log_loss"]
}
search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', max_resources=512, random_state=0, n_jobs=12).fit(X, y)
search.best_params_  

{'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 'log2',
 'min_samples_split': 3,
 'n_estimators': 486}

In [10]:
X, y = df[feat_cols].to_numpy(), df[target_col].to_numpy()
clf = GradientBoostingClassifier(random_state=0)
param_grid = {
    "loss": ['log_loss', 'exponential'],
    "min_samples_split": [2, 3],
    "max_depth": [2, 3, 4],
    "max_features" : ["sqrt", "log2", None],
    "criterion" : ["friedman_mse", "squared_error"]
}
search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', max_resources=512, random_state=0, n_jobs=12).fit(X, y)
search.best_params_  

{'criterion': 'friedman_mse',
 'loss': 'exponential',
 'max_depth': 2,
 'max_features': 'log2',
 'min_samples_split': 2,
 'n_estimators': 486}

In [None]:
DecisionTreeClassifier?

In [25]:
X, y = df[feat_cols].to_numpy(), df[target_col].to_numpy()
clf = AdaBoostClassifier(random_state=0)
estimator1 = DecisionTreeClassifier(criterion= 'entropy', max_depth= 3, max_features= 'log2', min_samples_split=3, random_state=0)
estimator2 = DecisionTreeClassifier(criterion= 'gini', max_depth= 3, max_features= 'log2', min_samples_split=3, random_state=0)
estimator3 = DecisionTreeClassifier(criterion= 'entropy', max_depth= 3, max_features= 'sqrt', min_samples_split=3, random_state=0)
estimator4 = DecisionTreeClassifier(criterion= 'gini', max_depth= 3, max_features= 'sqrt', min_samples_split=3, random_state=0)
param_grid = {
    "estimator": [estimator1, estimator2, estimator3, estimator4],
}
search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', max_resources=512, random_state=0, n_jobs=12).fit(X, y)
search.best_params_  

{'estimator': DecisionTreeClassifier(max_depth=3, max_features='sqrt', min_samples_split=3,
                        random_state=0),
 'n_estimators': 510}

In [None]:
search.best_score_

In [11]:
cls = search.best_estimator_

In [None]:
search.cv_results_

# Generate the submission file

In [12]:
df_test = pd.read_csv(f'{dataset_path}/test.csv')
df_test

Unnamed: 0,id,Time,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat20,feat21,feat22,feat23,feat24,feat25,feat26,feat27,feat28,Transaction_Amount
0,150000,73899,-0.681234,-0.371212,0.385966,0.955703,2.064078,0.338827,-0.539452,-0.254046,...,0.183560,-0.253324,0.266668,-0.153829,0.455969,-0.503628,0.257588,-0.456685,-0.298919,30.42
1,150001,73899,-1.098947,-0.959377,0.324934,0.703908,1.090582,-1.595909,0.584548,0.260069,...,0.334764,0.130108,0.676928,-0.161070,-0.638011,-0.273424,0.711132,0.349967,0.141233,23.00
2,150002,73899,0.977029,-0.270984,0.471526,-1.232570,0.957537,-0.636602,-0.953060,-1.491744,...,0.355728,0.517912,1.175087,-0.325895,-0.362636,0.306037,0.004828,0.037389,0.058222,198.00
3,150003,73900,1.176658,-0.225816,-0.246600,0.015513,1.103831,1.229516,-1.527098,-0.459769,...,-0.152613,-0.104600,0.003800,-0.023180,-0.458338,0.481427,-0.381415,0.080165,0.027372,9.99
4,150004,73900,0.804828,-0.272967,0.959910,-1.117567,0.395748,0.589855,1.059880,-1.101203,...,-0.127818,-0.011849,0.078540,-0.241700,0.010130,0.302614,-0.259568,0.023127,0.056957,239.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69124,219124,120580,1.891079,-0.041423,0.858158,-1.272908,-3.783908,-1.388939,-0.280639,2.012789,...,-0.195703,-0.181369,-0.456538,-0.069571,0.756765,0.244479,-0.147566,-0.054725,-0.044588,198.65
69125,219125,120580,0.139724,-0.917395,-0.257933,0.948649,-2.913655,-2.184829,-0.457534,1.883716,...,-0.116538,0.491469,1.478823,-0.085398,-0.091409,-1.053488,0.467570,0.358918,0.294735,24.00
69126,219126,120580,2.058343,0.391801,-0.136498,-0.038993,-1.928553,0.330117,0.179926,0.270127,...,-0.384830,-0.306640,-0.965783,0.307799,-0.021434,-0.343989,0.181065,-0.098387,-0.044064,1.79
69127,219127,120580,2.079227,0.301966,1.536193,-2.162389,-1.785833,-2.804889,-0.058879,0.552845,...,-0.190984,0.109909,0.590401,0.286621,0.675660,-0.510736,-0.090044,0.056749,-0.017126,88.00


In [13]:
df_id = df_test['id'].to_frame()
df_test = df_test.drop(columns=['id'])

In [14]:
y_pred_class = cls.predict(df_test.to_numpy())
y_pred_proba = cls.predict_proba(df_test.to_numpy())

In [15]:
y_pred_proba[:3], y_pred_class[:3]

(array([[9.99427077e-01, 5.72922940e-04],
        [9.99581177e-01, 4.18822662e-04],
        [9.99556557e-01, 4.43443086e-04]]),
 array(['0', '0', '0'], dtype=object))

In [16]:
fraud_proba = [x[1] for x in y_pred_proba]

In [17]:
!head $dataset_path/sample_submission.csv

id,IsFraud
150000,0.5
150001,0.5
150002,0.5
150003,0.5
150004,0.5
150005,0.5
150006,0.5
150007,0.5
150008,0.5


In [18]:
#df_id['IsFraud'] = y_pred_soft
df_id['IsFraud'] = fraud_proba
df_id

Unnamed: 0,id,IsFraud
0,150000,0.000573
1,150001,0.000419
2,150002,0.000443
3,150003,0.000537
4,150004,0.001963
...,...,...
69124,219124,0.001198
69125,219125,0.002203
69126,219126,0.001915
69127,219127,0.001256


In [19]:
df_id.to_csv(f'{dataset_path}/submission_sklearn_exp3_proba1.csv', index=False)

In [None]:
!head $dataset_path/submission_exp2_trans_t2.csv

In [20]:
!mv /work/do /work/yes.do ||touch /work/yes

mv: cannot stat '/work/do': No such file or directory


# Kagle results

## Exp1:

Private Score: `0.53218`  (1st: `0.80611`)  
Public Score: `0.59206`  (1st: `0.83947`)  

## Exp2:
Private Score: `0.54502`  
Public score: `0.56385`  