# Usage 

- Author: Israel Oliveira [\[e-mail\]](mailto:prof.israel@gmail.com)

In [1]:
# Install the package from the parent directory. `%pip` (rather than `!pip`)
# guarantees the install targets the environment of the running kernel.
%pip install -U ../

Processing /work/random-forest-mc
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: random-forest-mc
  Building wheel for random-forest-mc (pyproject.toml) ... [?25ldone
[?25h  Created wheel for random-forest-mc: filename=random_forest_mc-0.4.0a0-py3-none-any.whl size=14057 sha256=f29e79ccea3dd7e85eee71037d8e01a76d080717bbf512c560e1edccd9f067b4
  Stored in directory: /root/.cache/pip/wheels/eb/5f/19/4722d7f6bf0d13102d1af6ab98e4b53771da1ed44ce678108e
Successfully built random-forest-mc
Installing collected packages: random-forest-mc
  Attempting uninstall: random-forest-mc
    Found existing installation: random-forest-mc 0.4.0a0
    Uninstalling random-forest-mc-0.4.0a0:
      Successfully uninstalled random-forest-mc-0.4.0a0
Successfully installed random-forest-mc-0.4.0a0
[0m

In [2]:
%load_ext watermark

In [3]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from random_forest_mc.model import RandomForestMC
from random_forest_mc.utils import load_file_json, dump_file_json


In [4]:
# from tqdm import notebook as tqdm

from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

from IPython.display import Markdown, display
def md(arg):
    """Render ``arg`` as Markdown in the cell output."""
    rendered = Markdown(arg)
    display(rendered)

# from pandas_profiling import ProfileReport
# #report = ProfileReport(#DataFrame here#, minimal=True)
# #report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])


######### LoadDicts

import json
from glob import glob
from typing import Any
from typing import NewType

def np_encoder(object):
    """``default`` hook for ``json.dump``/``json.dumps`` that unwraps NumPy values.

    Parameters
    ----------
    object
        The value the JSON encoder could not serialise on its own.

    Returns
    -------
    A plain Python scalar for NumPy scalars (``np.generic``) or a (nested)
    list for ``np.ndarray``.

    Raises
    ------
    TypeError
        For any other type. The ``json`` module expects the ``default`` hook
        to raise instead of returning ``None``; otherwise unsupported objects
        are silently written as ``null``.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(
        f"Object of type {type(object).__name__} is not JSON serializable"
    )


DictsPathType = NewType("DictsPath", str)


def load_file_json(path: DictsPathType):
    """Read the JSON file at ``path`` and return the decoded Python object.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default locale encoding.

    NOTE: this definition shadows the ``load_file_json`` imported from
    ``random_forest_mc.utils`` earlier in the notebook.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def dump_file_json(path: DictsPathType, var: Any):
    """Serialise ``var`` as 4-space indented JSON into the file at ``path``.

    Values the encoder cannot handle natively are passed to ``np_encoder``.
    Returns ``None``.
    """
    with open(path, "w") as out_file:
        json.dump(var, out_file, indent=4, default=np_encoder)


class LoadDicts:
    """Load every ``*.json`` file found directly inside a directory.

    For each file the decoded content is stored under the file name without
    its extension:

    * ``self.List`` -- the names, in sorted path order;
    * ``self.Dict`` -- mapping of name to decoded JSON content;
    * an instance attribute with that name, set via ``setattr``.

    Note that a file whose stem is ``List`` or ``Dict`` would overwrite the
    corresponding container attribute, and stems that are not valid Python
    identifiers are only reachable through ``self.Dict``.
    """

    def __init__(self, dict_path: DictsPathType = "./data"):
        # Imported locally so the class does not rely on a module-level import.
        import os

        self.List = []
        self.Dict = {}
        # glob returns matches in arbitrary order; sort for a stable ``List``.
        for path_json in sorted(glob(f"{dict_path}/*.json")):
            # os.path handles the platform's separator and strips only the
            # trailing extension (not every ".json" occurrence in the name).
            name = os.path.splitext(os.path.basename(path_json))[0]
            self.List.append(name)
            self.Dict[name] = load_file_json(path_json)
            setattr(self, name, self.Dict[name])

    def __repr__(self) -> str:
        """Return ``"LoadDicts: "`` followed by the comma-separated names."""
        return "LoadDicts: {}".format(", ".join(self.List))

In [5]:
# Run this cell before closing the notebook.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.10.5
IPython version      : 8.4.0

Compiler    : GCC 10.2.1 20210110
OS          : Linux
Release     : 4.14.285-215.501.amzn2.x86_64
Machine     : x86_64
Processor   : 
CPU cores   : 4
Architecture: 64bit

Git hash: bcc395921b05b82fc8664b049faa785c4936800f

Git repo: https://github.com/ysraell/random-forest-mc.git

Git branch: dev

json  : 2.0.9
numpy : 1.23.1
pandas: 1.4.3

CPU	: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
Mem:            15G
Swap:             0B          0B          0B


In [6]:
# glob yields matches in arbitrary order; sort so the listing is reproducible.
dataset_path_list = sorted(glob('/work/tmp/datasets/*.csv'))
dataset_path_list

['/work/tmp/datasets/creditcard.csv',
 '/work/tmp/datasets/creditcard_trans_float.csv',
 '/work/tmp/datasets/creditcard_trans_int.csv',
 '/work/tmp/datasets/iris.csv',
 '/work/tmp/datasets/titanic.csv']

In [7]:
# Per-dataset metadata: feature columns, target column and CSV location.
dataset_dict = {
    'titanic': {
        'ds_cols': ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'],
        'target_col': 'Survived',
        'csv_path': '/work/tmp/datasets/titanic.csv',
    },
    'iris': {
        'ds_cols': ['sepal.length', 'sepal.width', 'petal.length', 'petal.width'],
        'target_col': 'variety',
        'csv_path': '/work/tmp/datasets/iris.csv',
    },
    # The three creditcard variants share the same columns (V1..V28 plus
    # Amount) and target; only the CSV file differs. Each entry gets its own
    # freshly built column list.
    **{
        variant: {
            'ds_cols': [f'V{idx}' for idx in range(1, 29)] + ['Amount'],
            'target_col': 'Class',
            'csv_path': f'/work/tmp/datasets/{variant}.csv',
        }
        for variant in ('creditcard', 'creditcard_trans_int', 'creditcard_trans_float')
    },
}

# dump_file_json('../tests/datasets_metadata.json',dataset_dict)

In [8]:
ds_name = 'titanic'
md(f'# {ds_name}')
params = dataset_dict[ds_name]

# Keep only the feature columns plus the target, drop incomplete rows and
# cast the columns to the dtypes used for this demo.
dataset = (
    pd.read_csv(params['csv_path'])[params['ds_cols'] + [params['target_col']]]
    .dropna()
    .reset_index(drop=True)
    .astype({'Age': np.uint8, 'SibSp': np.uint8, 'Pclass': str, 'Fare': np.uint32})
)
cls = RandomForestMC(
    n_trees=8,
    target_col=params['target_col'],
    max_discard_trees=4,
)
cls.process_dataset(dataset)
cls.fit()

# NOTE(review): the forest is scored on the same rows that were passed to
# process_dataset, so these are presumably training-set accuracies -- confirm.
y_test = dataset[params['target_col']].to_list()
y_pred = cls.testForest(dataset)
# Raw strings: "\%" is not a valid escape sequence in a normal string literal
# (SyntaxWarning on Python 3.12+). The emitted text is unchanged.
accuracy_hard = r"{:.2f}\%".format(100 * sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
cls.soft_voting = True
y_pred = cls.testForest(dataset)
accuracy_soft = r"{:.2f}\%".format(100 * sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
md('## Accuracy:')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_hard} (hard-voting)')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_soft} (soft-voting)')

# titanic

Planting the forest: 100%|██████████| 8/8 [00:01<00:00,  4.01it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 78.37\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 76.97\% (soft-voting)

In [9]:
ds_name = 'iris'
md(f'# {ds_name}')
# Shallow copy: 'ds_cols' is replaced further down, and writing the renamed
# list back into dataset_dict would make this cell fail with a KeyError when
# re-run (the columns are selected from the CSV by their dotted names).
params = dict(dataset_dict[ds_name])

dataset = (
    pd.read_csv(params['csv_path'])[params['ds_cols'] + [params['target_col']]]
    .dropna()
    .reset_index(drop=True)
    # Replace the dots in the column names with underscores.
    .rename(columns=lambda col: col.replace('.', '_'))
)
params['ds_cols'] = [col.replace('.', '_') for col in params['ds_cols']]
cls = RandomForestMC(
    n_trees=8,
    target_col=params['target_col'],
    max_discard_trees=4,
)
cls.process_dataset(dataset)
cls.fit()

# NOTE(review): the forest is scored on the same rows that were passed to
# process_dataset, so these are presumably training-set accuracies -- confirm.
y_test = dataset[params['target_col']].to_list()
y_pred = cls.testForest(dataset)
# Raw strings: "\%" is not a valid escape sequence in a normal string literal
# (SyntaxWarning on Python 3.12+). The emitted text is unchanged.
accuracy_hard = r"{:.2f}\%".format(100 * sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
cls.soft_voting = True
y_pred = cls.testForest(dataset)
accuracy_soft = r"{:.2f}\%".format(100 * sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
md('## Accuracy:')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_hard} (hard-voting)')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_soft} (soft-voting)')

# iris

Planting the forest: 100%|██████████| 8/8 [00:00<00:00,  8.19it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 93.33\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 93.33\% (soft-voting)

In [18]:
# Fit one forest per creditcard variant, report its accuracy on a 1000-row
# sample and save the fitted model as JSON.
for ds_name in ['creditcard', 'creditcard_trans_int', 'creditcard_trans_float']:
    md(f'# {ds_name}')
    params = dataset_dict[ds_name]

    dataset = (
        pd.read_csv(params['csv_path'])[params['ds_cols'] + [params['target_col']]]
        .dropna()
        .reset_index(drop=True)
    )
    cls = RandomForestMC(
        n_trees=8,
        target_col=params['target_col'],
        max_discard_trees=4,
    )
    cls.process_dataset(dataset)
    cls.fit()

    # Score on a fixed-seed sample so the reported numbers are reproducible
    # between runs (the sample is drawn from the rows used for fitting).
    dataset = dataset.sample(n=1000, random_state=42)
    y_test = dataset[params['target_col']].to_list()
    y_pred = cls.testForest(dataset)
    # Raw strings: "\%" is not a valid escape sequence in a normal string
    # literal (SyntaxWarning on Python 3.12+). The emitted text is unchanged.
    accuracy_hard = r"{:.2f}\%".format(100 * sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
    cls.soft_voting = True
    y_pred = cls.testForest(dataset)
    accuracy_soft = r"{:.2f}\%".format(100 * sum(v == p for v, p in zip(y_test, y_pred)) / len(y_pred))
    md('## Accuracy:')
    md(f'## &nbsp;&nbsp;&nbsp; {accuracy_hard} (hard-voting)')
    md(f'## &nbsp;&nbsp;&nbsp; {accuracy_soft} (soft-voting)')

    # Persist the fitted forest next to the datasets' working directory.
    model_dict = cls.model2dict()
    model_path = f'/work/tmp/cls_rfmc_{ds_name}.json'
    dump_file_json(model_path, model_dict)

# creditcard

Planting the forest: 100%|██████████| 8/8 [00:02<00:00,  3.45it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 98.80\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 98.40\% (soft-voting)

# creditcard_trans_int

Planting the forest: 100%|██████████| 8/8 [00:01<00:00,  4.81it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 99.80\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 99.80\% (soft-voting)

# creditcard_trans_float

Planting the forest: 100%|██████████| 8/8 [00:02<00:00,  3.89it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 99.00\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 99.00\% (soft-voting)

In [19]:
cls.mergeForest(cls)

In [20]:
cls.data

[DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.7,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.6,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.7,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.6,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.

In [21]:
cls.Forest_size

16

In [22]:
cls

RandomForestMC(len(Forest)=16,n_trees=8,model_version=0.4.0-alpha,module_version=0.4.0-alpha)

In [23]:
cls.mergeForest(cls, 10, 'random')

In [24]:
cls

RandomForestMC(len(Forest)=10,n_trees=8,model_version=0.4.0-alpha,module_version=0.4.0-alpha)

In [25]:
cls.data

[DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.7,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.7,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.9,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.6,module_version=0.4.0-alpha),
 DecisionTreeMC(survived_score=0.85,module_version=0.4.0-alpha)]

In [26]:
cls.mergeForest(cls, 7,'score')

In [27]:
cls

RandomForestMC(len(Forest)=7,n_trees=8,model_version=0.4.0-alpha,module_version=0.4.0-alpha)

In [32]:
Tree = cls.data[0]

In [31]:
len(cls.survived_scores)

7

In [35]:
Tree == Tree.data

TypeError: Both objects must be instances of 'DecisionTreeMC' class.

In [39]:
dataset = dataset.reset_index(drop=True)
row = dataset.loc[0]

In [42]:
cls.predict(dataset.sample(n=10))

TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union