# Usage 

- Author: Israel de Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [2]:
!pip install -U random-forest-mc

Collecting random-forest-mc
  Downloading random_forest_mc-1.0.0-py3-none-any.whl (13 kB)
Installing collected packages: random-forest-mc
  Attempting uninstall: random-forest-mc
    Found existing installation: random-forest-mc 0.4.0a0
    Uninstalling random-forest-mc-0.4.0a0:
      Successfully uninstalled random-forest-mc-0.4.0a0
Successfully installed random-forest-mc-1.0.0
[0m

In [3]:
%load_ext watermark

In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from random_forest_mc.model import RandomForestMC
from random_forest_mc.utils import load_file_json, dump_file_json


In [5]:
# from tqdm import notebook as tqdm

from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

from IPython.display import Markdown, display
def md(arg):
    """Render ``arg`` as Markdown in the cell output.

    ``arg`` is handed unchanged to ``IPython.display.Markdown`` and the
    resulting object is shown with ``display``. Nothing is returned.
    """
    rendered = Markdown(arg)
    display(rendered)

# from pandas_profiling import ProfileReport
# #report = ProfileReport(#DataFrame here#, minimal=True)
# #report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])


######### LoadDicts

import json
from glob import glob
from pathlib import Path
from typing import Any
from typing import NewType

def np_encoder(object):
    """JSON ``default`` hook that turns NumPy values into plain Python ones.

    Meant to be passed as ``default=np_encoder`` to ``json.dump`` /
    ``json.dumps``, which call it for every value they cannot serialize
    natively.

    Parameters
    ----------
    object
        The value the JSON encoder could not handle. The parameter name
        shadows the builtin; it is kept so keyword callers keep working.

    Returns
    -------
    The equivalent built-in scalar for a NumPy scalar (``np.generic``), or a
    (possibly nested) list for an ``np.ndarray``.

    Raises
    ------
    TypeError
        For any other type. Falling through and returning ``None`` would make
        the encoder silently write ``null`` in place of the value.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        # Arrays are not np.generic; tolist() also converts the elements.
        return object.tolist()
    raise TypeError(
        f"Object of type {type(object).__name__} is not JSON serializable"
    )


DictsPathType = NewType("DictsPath", str)


def load_file_json(path: DictsPathType):
    """Read the JSON document stored at ``path`` and return the decoded object.

    Parameters
    ----------
    path
        Location of the JSON file to read.

    Returns
    -------
    Whatever ``json.load`` decodes from the file (dict, list, str, number,
    bool or ``None``).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # JSON text is UTF-8 by specification; without an explicit encoding
    # open() falls back to the platform locale and may mis-decode the file.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def dump_file_json(path: DictsPathType, var: Any):
    """Serialize ``var`` as indented JSON into the file at ``path``.

    The file is created or truncated. Values the ``json`` encoder cannot
    handle natively are passed to ``np_encoder``.

    Parameters
    ----------
    path
        Destination file.
    var
        The object to serialize.

    Returns
    -------
    None (the result of ``json.dump``).
    """
    # Write UTF-8 explicitly so the file does not depend on the platform
    # locale and round-trips through a UTF-8 reader.
    with open(path, "w", encoding="utf-8") as f:
        return json.dump(var, f, indent=4, default=np_encoder)


class LoadDicts:
    """Load every ``*.json`` file of a directory and expose the contents by name.

    For each file ``<name>.json`` found directly inside ``dict_path`` the
    decoded content is made available in three ways:

    * ``self.List``      -- list of the names, in sorted path order;
    * ``self.Dict[name]`` -- the decoded content;
    * ``self.<name>``    -- the same object as an instance attribute.

    Parameters
    ----------
    dict_path
        Directory to scan (not recursive). Defaults to ``"./data"``.
    """

    # Names used for the instance's own bookkeeping. A file called
    # ``List.json`` or ``Dict.json`` is still loaded into ``self.Dict`` but is
    # not exposed as an attribute, otherwise it would overwrite the registry.
    _RESERVED = frozenset({"List", "Dict"})

    def __init__(self, dict_path: DictsPathType = "./data"):
        self.List = []
        self.Dict = {}
        # glob() returns matches in arbitrary order; sort so List and the
        # repr are deterministic.
        for path_json in sorted(glob(f"{dict_path}/*.json")):
            # stem drops the directory (with any path separator) and only the
            # final suffix, instead of every ".json" occurrence in the name.
            name = Path(path_json).stem
            self.List.append(name)
            self.Dict[name] = load_file_json(path_json)
            if name not in self._RESERVED:
                setattr(self, name, self.Dict[name])

    def __repr__(self) -> str:
        """Return ``"LoadDicts: "`` followed by the comma-separated loaded names."""
        return "LoadDicts: {}".format(", ".join(self.List))

In [6]:
# Run this cell before closing the notebook: it records the environment (package versions, git state, CPU and memory).
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.10.5
IPython version      : 8.4.0

Compiler    : GCC 10.2.1 20210110
OS          : Linux
Release     : 4.14.285-215.501.amzn2.x86_64
Machine     : x86_64
Processor   : 
CPU cores   : 4
Architecture: 64bit

Git hash: ed9d6a828038724ce12549dd4e4c79962779557a

Git repo: https://github.com/ysraell/random-forest-mc.git

Git branch: main

pandas: 1.4.3
numpy : 1.23.1
json  : 2.0.9

CPU	: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
Mem:            15G
Swap:             0B          0B          0B


In [7]:
# List the CSV datasets available to the notebook. glob() returns matches in
# arbitrary, filesystem-dependent order, so sort for a stable listing.
dataset_path_list = sorted(glob('/work/tmp/datasets/*.csv'))
dataset_path_list

['/work/tmp/datasets/creditcard.csv',
 '/work/tmp/datasets/creditcard_trans_float.csv',
 '/work/tmp/datasets/creditcard_trans_int.csv',
 '/work/tmp/datasets/iris.csv',
 '/work/tmp/datasets/titanic.csv']

In [8]:
# Metadata of each demo dataset: feature columns, target column and CSV path.
dataset_dict = {
    'titanic': {
        'ds_cols': ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'],
        'target_col': 'Survived',
        'csv_path': '/work/tmp/datasets/titanic.csv',
    },
    'iris': {
        'ds_cols': ['sepal.length', 'sepal.width', 'petal.length', 'petal.width'],
        'target_col': 'variety',
        'csv_path': '/work/tmp/datasets/iris.csv',
    },
}

# The three credit-card variants share one schema (V1..V28 plus Amount, target
# Class) and differ only in the file they are read from. Every entry gets its
# own freshly built column list.
dataset_dict.update(
    (
        variant,
        {
            'ds_cols': [f'V{idx}' for idx in range(1, 29)] + ['Amount'],
            'target_col': 'Class',
            'csv_path': f'/work/tmp/datasets/{variant}.csv',
        },
    )
    for variant in ('creditcard', 'creditcard_trans_int', 'creditcard_trans_float')
)

# dump_file_json('../tests/datasets_metadata.json',dataset_dict)

In [9]:
# Titanic: fit a forest and report hard- and soft-voting accuracy.
ds_name = 'titanic'
md(f'# {ds_name}')
params = dataset_dict[ds_name]

# Keep only the feature and target columns and drop rows with missing values.
dataset = pd.read_csv(params['csv_path'])[params['ds_cols']+[params['target_col']]].dropna().reset_index(drop=True)
# Casting to unsigned integers truncates any fractional part of Age and Fare.
dataset['Age'] = dataset['Age'].astype(np.uint8)
dataset['SibSp'] = dataset['SibSp'].astype(np.uint8)
# NOTE(review): presumably cast to str so Pclass is treated as a category
# rather than a number -- confirm against the library documentation.
dataset['Pclass'] = dataset['Pclass'].astype(str)
dataset['Fare'] = dataset['Fare'].astype(np.uint32)
cls = RandomForestMC(
    n_trees=8,
    target_col = params['target_col'],
    max_discard_trees = 4
)
cls.process_dataset(dataset)
cls.fit()
# NOTE(review): the forest is scored on the same dataframe that was handed to
# process_dataset, so the figures below are not a held-out estimate.
y_test = dataset[params['target_col']].to_list()
y_pred = cls.testForest(dataset)
# "\\%" yields the same backslash + percent text the cell always emitted, but
# without the invalid "\%" escape sequence that newer Python versions warn about.
accuracy_hard = "{:.2f}\\%".format(100*sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
# Switch the same fitted forest to soft voting and score it again.
cls.soft_voting=True
y_pred = cls.testForest(dataset)
accuracy_soft = "{:.2f}\\%".format(100*sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
md('## Accuracy:')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_hard} (hard-voting)')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_soft} (soft-voting)')

# titanic

Planting the forest: 100%|██████████| 8/8 [00:02<00:00,  3.94it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 70.93\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 70.79\% (soft-voting)

In [10]:
# Iris: fit a forest and report hard- and soft-voting accuracy.
ds_name = 'iris'
md(f'# {ds_name}')
# Work on a shallow copy: 'ds_cols' is rewritten below, and writing it back
# into dataset_dict would make a second run of this cell select columns that
# do not exist in the CSV.
params = dict(dataset_dict[ds_name])

# Keep only the feature and target columns, drop rows with missing values and
# replace the dots in the column names with underscores.
# NOTE(review): presumably the library cannot handle dots in column names -- confirm.
dataset = (
    pd.read_csv(params['csv_path'])[params['ds_cols']+[params['target_col']]]
    .dropna()
    .reset_index(drop=True)
    .rename(columns=lambda col: col.replace('.','_'))
)
params['ds_cols'] = [col.replace('.','_') for col in params['ds_cols']]
cls = RandomForestMC(
    n_trees=8,
    target_col = params['target_col'],
    max_discard_trees = 4
)
cls.process_dataset(dataset)
cls.fit()
# NOTE(review): the forest is scored on the same dataframe that was handed to
# process_dataset, so the figures below are not a held-out estimate.
y_test = dataset[params['target_col']].to_list()
y_pred = cls.testForest(dataset)
# "\\%" yields the same backslash + percent text the cell always emitted, but
# without the invalid "\%" escape sequence that newer Python versions warn about.
accuracy_hard = "{:.2f}\\%".format(100*sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
# Switch the same fitted forest to soft voting and score it again.
cls.soft_voting=True
y_pred = cls.testForest(dataset)
accuracy_soft = "{:.2f}\\%".format(100*sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
md('## Accuracy:')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_hard} (hard-voting)')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_soft} (soft-voting)')

# iris

Planting the forest: 100%|██████████| 8/8 [00:01<00:00,  5.73it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 92.00\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 94.00\% (soft-voting)

In [11]:
# Credit-card variants: for each one fit a forest, report hard- and
# soft-voting accuracy, and save the fitted model as JSON.
for ds_name in ['creditcard', 'creditcard_trans_int', 'creditcard_trans_float']:
    md(f'# {ds_name}')
    params = dataset_dict[ds_name]

    # Keep only the feature and target columns and drop rows with missing values.
    dataset = pd.read_csv(params['csv_path'])[params['ds_cols']+[params['target_col']]].dropna().reset_index(drop=True)
    cls = RandomForestMC(
        n_trees=8,
        target_col = params['target_col'],
        max_discard_trees = 4
    )
    cls.process_dataset(dataset)
    cls.fit()
    # Score on 1000 rows drawn at random from the dataframe given to
    # process_dataset. NOTE(review): the draw is unseeded, so the accuracy
    # changes between runs, and the rows are not held out from fitting.
    dataset = dataset.sample(n=1000)
    y_test = dataset[params['target_col']].to_list()
    y_pred = cls.testForest(dataset)
    # "\\%" yields the same backslash + percent text the cell always emitted,
    # but without the invalid "\%" escape sequence newer Python versions warn about.
    accuracy_hard = "{:.2f}\\%".format(100*sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
    # Switch the same fitted forest to soft voting and score it again.
    cls.soft_voting=True
    y_pred = cls.testForest(dataset)
    accuracy_soft = "{:.2f}\\%".format(100*sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
    md('## Accuracy:')
    md(f'## &nbsp;&nbsp;&nbsp; {accuracy_hard} (hard-voting)')
    md(f'## &nbsp;&nbsp;&nbsp; {accuracy_soft} (soft-voting)')
    # Export the fitted forest as a dict and write it next to the datasets' tmp folder.
    model_dict = cls.model2dict()
    model_path = f'/work/tmp/cls_rfmc_{ds_name}.json'
    dump_file_json(model_path, model_dict)

# creditcard

Planting the forest: 100%|██████████| 8/8 [00:02<00:00,  3.80it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 99.80\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 99.80\% (soft-voting)

# creditcard_trans_int

Planting the forest: 100%|██████████| 8/8 [00:01<00:00,  4.35it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 99.50\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 99.50\% (soft-voting)

# creditcard_trans_float

Planting the forest: 100%|██████████| 8/8 [00:02<00:00,  3.78it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 98.50\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 98.50\% (soft-voting)

In [12]:
# Show the trees held by `cls`, which at this point is the forest fitted in
# the last iteration of the loop above (creditcard_trans_float).
cls.data

[DecisionTreeMC(survived_score=0.85,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.8,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.85,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.75,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0)]

In [13]:
# Merge the forest with itself using the default arguments. Per the recorded
# outputs around this cell, cls.data grows from 8 to 16 entries, with the same
# sequence of survived scores appearing twice.
cls.mergeForest(cls)

In [14]:
# Inspect the tree list again after the self-merge.
cls.data

[DecisionTreeMC(survived_score=0.85,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.8,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.85,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.75,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.85,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.8,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.85,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.75,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0)]

In [15]:
# Self-merge again, this time passing 10 and 'random'; the repr displayed
# afterwards reports len(Forest)=10.
# NOTE(review): presumably 10 caps the number of trees kept and 'random' picks
# them at random -- confirm against the mergeForest documentation.
cls.mergeForest(cls, 10, 'random')

In [16]:
# Display the forest's repr (tree count and version info) after the merge.
cls

RandomForestMC(len(Forest)=10,n_trees=8,model_version=1.0.0,module_version=1.0.0)

In [17]:
# List the trees that remain after the 'random' merge.
cls.data

[DecisionTreeMC(survived_score=0.8,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.75,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.85,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0),
 DecisionTreeMC(survived_score=0.9,module_version=1.0.0)]

In [18]:
# Self-merge once more, passing 7 and 'score'; the repr displayed afterwards
# reports len(Forest)=7.
# NOTE(review): presumably 'score' keeps the trees with the best survived
# scores -- confirm against the mergeForest documentation.
cls.mergeForest(cls, 7,'score')

In [19]:
# Display the forest's repr after the 'score' merge.
cls

RandomForestMC(len(Forest)=7,n_trees=8,model_version=1.0.0,module_version=1.0.0)

In [20]:
# Keep a reference to the first tree of the forest.
# NOTE(review): no later executable cell in this notebook uses `Tree`.
Tree = cls.data[0]

In [21]:
# Number of entries in the forest's survived scores; the recorded value (7)
# matches the len(Forest)=7 shown by the repr above.
len(cls.survived_scores)

7

In [22]:
#Tree == Tree.data

In [23]:
# `dataset` is the 1000-row random sample left over from the last loop
# iteration, so its index still carries the original row labels. Renumber it
# from 0 so that .loc[0] selects the first sampled row.
dataset = dataset.reset_index(drop=True)
row = dataset.loc[0]

In [28]:
# Display the first row of the sample (the same selection as `row`).
# NOTE(review): execution count 28 means this cell was run out of order,
# after the predict cells below; the reset_index is redundant because
# `dataset` was already renumbered in the previous cell.
dataset.reset_index(drop=True).loc[0]

V1       -0.583218
V2        0.955092
V3        1.407827
V4        1.559776
V5        0.251586
V6       -0.166301
V7        0.567984
V8       -0.140859
V9       -1.370046
V10       0.559036
V11       0.290585
V12       0.437309
V13       1.041657
V14      -0.016315
V15       0.706518
V16      -0.317833
V17       0.007561
V18       -1.17142
V19      -0.812256
V20      -0.068132
V21       0.056497
V22       0.061986
V23       0.129657
V24       1.050658
V25      -1.380546
V26      -0.415294
V27      -0.241776
V28       0.569081
Amount        2.11
Class            0
Name: 0, dtype: object

In [24]:
# Predict for a single row (a pandas Series). Per the recorded output this
# returns a dict of per-class scores -- the same result as predict_proba(row)
# below -- rather than a single label.
cls.predict(row)

{'0': 1.0, '1': 0.0}

In [25]:
# Predict for a DataFrame of 10 randomly drawn rows (unseeded, so the rows
# differ between runs). Per the recorded output this returns one class label
# string per row.
cls.predict(dataset.sample(n=10))

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

In [26]:
# Per-class scores for the single row; the recorded output is a dict keyed by
# class label.
cls.predict_proba(row)

{'0': 1.0, '1': 0.0}

In [27]:
# Per-class scores for 10 randomly drawn rows (unseeded); the recorded output
# is a list with one dict per row.
cls.predict_proba(dataset.sample(n=10))

[{'0': 1.0, '1': 0.0},
 {'0': 1.0, '1': 0.0},
 {'0': 1.0, '1': 0.0},
 {'0': 1.0, '1': 0.0},
 {'0': 0.8571428571428571, '1': 0.14285714285714285},
 {'0': 1.0, '1': 0.0},
 {'0': 1.0, '1': 0.0},
 {'0': 1.0, '1': 0.0},
 {'0': 0.5714285714285714, '1': 0.42857142857142855},
 {'0': 0.5714285714285714, '1': 0.42857142857142855}]

In [None]:
# NOTE(review): this cell was never executed (no execution count) and is
# identical to the earlier `cls.predict(row)` cell.
cls.predict(row)