# Save and Load Model 

- Author: Israel Oliveira [\[e-mail\]](mailto:prof.israel@gmail.com)

In [1]:
#!pip3 install -U random-forest-mc

In [2]:
%load_ext watermark

In [6]:
%load_ext autoreload
%autoreload 2

import pandas as pd

import sys
sys.path.append('../src/random_forest_mc')
from model import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# from tqdm import notebook as tqdm

from glob import glob

# import matplotlib.pyplot as plt
# %matplotlib inline
# from matplotlib import rcParams
# from cycler import cycler

# rcParams['figure.figsize'] = 12, 8 # 18, 5
# rcParams['axes.spines.top'] = False
# rcParams['axes.spines.right'] = False
# rcParams['axes.grid'] = True
# rcParams['axes.prop_cycle'] = cycler(color=['#365977'])
# rcParams['lines.linewidth'] = 2.5

# import seaborn as sns
# sns.set_theme()

# pd.set_option("max_columns", None)
# pd.set_option("max_rows", None)
# pd.set_option('display.max_colwidth', None)

from IPython.display import Markdown, display
def md(arg):
    """Render ``arg`` as Markdown in the cell's output area."""
    rendered = Markdown(arg)
    display(rendered)

# from pandas_profiling import ProfileReport
# #report = ProfileReport(#DataFrame here#, minimal=True)
# #report.to

# import pyarrow.parquet as pq
# #df = pq.ParquetDataset(path_to_folder_with_parquets, filesystem=None).read_pandas().to_pandas()

# import functools
# import operator
# def flat(a):
#     return functools.reduce(operator.iconcat, a, [])


######### LoadDicts

import json
from glob import glob
from typing import Any
from typing import NewType

def np_encoder(object):
    """``default=`` hook for ``json.dump``/``json.dumps`` that unwraps NumPy values.

    Args:
        object: The value the JSON encoder could not serialize on its own.

    Returns:
        The native Python equivalent: ``.item()`` for NumPy scalars and
        ``.tolist()`` for NumPy arrays.

    Raises:
        TypeError: If ``object`` is not a NumPy scalar or array. The json
            module expects the hook to raise in that case.
    """
    # `np` must already be in the notebook namespace
    # (presumably via the `from model import *` star import; verify).
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    # Falling through (implicitly returning None) would make the encoder
    # write `null` for every unsupported object, silently corrupting the dump.
    raise TypeError(
        f"Object of type {type(object).__name__} is not JSON serializable"
    )


# Distinct str subtype used to annotate filesystem paths in the JSON helpers.
# The name passed to NewType must equal the variable it is bound to; a
# mismatch is rejected by static type checkers and makes __name__ misleading.
DictsPathType = NewType("DictsPathType", str)


def load_file_json(path: DictsPathType):
    """Parse the JSON document stored at ``path`` and return the result.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # breaks on non-ASCII content wherever that default is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def dump_file_json(path: DictsPathType, var: Any):
    """Write ``var`` to ``path`` as JSON indented by four spaces.

    Values the encoder cannot handle natively are routed through
    ``np_encoder``. The result of ``json.dump`` is passed back to the caller.
    """
    with open(path, "w") as sink:
        outcome = json.dump(var, sink, indent=4, default=np_encoder)
    return outcome


class LoadDicts:
    """Load every ``*.json`` file found directly inside a directory.

    Each file is parsed with ``load_file_json`` and exposed three ways:
    its stem (file name without the extension) is appended to ``self.List``,
    the parsed content is stored under ``self.Dict[stem]``, and the same
    object is bound as an instance attribute named after the stem.
    """

    def __init__(self, dict_path: DictsPathType = "./data"):
        """Scan ``dict_path`` and load all JSON files it contains.

        Args:
            dict_path: Directory searched (non-recursively) for ``*.json``.
        """
        # Function-scope import: `os` is not among the notebook's visible imports.
        import os

        self.List = []
        self.Dict = {}
        # glob() gives no ordering guarantee; sorting keeps self.List (and
        # therefore __repr__) stable across runs and filesystems.
        for path_json in sorted(glob(f"{dict_path}/*.json")):
            # basename/splitext respect the platform's path separator and drop
            # only the trailing extension. The previous split("/") plus
            # replace(".json", "") broke on backslash paths and removed every
            # ".json" occurrence inside the file name.
            name = os.path.splitext(os.path.basename(path_json))[0]
            self.List.append(name)
            self.Dict[name] = load_file_json(path_json)
            # Caveat: a file called List.json or Dict.json would overwrite the
            # containers above, because the stem doubles as an attribute name.
            setattr(self, name, self.Dict[name])

    def __repr__(self) -> str:
        """Return ``"LoadDicts: "`` followed by the comma-separated loaded names."""
        return "LoadDicts: {}".format(", ".join(self.List))

In [8]:
# Run this cell before closing the notebook.
%watermark -d --iversion -b -r -g -m -v
!cat /proc/cpuinfo |grep 'model name'|head -n 1 |sed -e 's/model\ name/CPU/'
!free -h |cut -d'i' -f1  |grep -v total

Python implementation: CPython
Python version       : 3.10.5
IPython version      : 8.4.0

Compiler    : GCC 10.2.1 20210110
OS          : Linux
Release     : 4.14.285-215.501.amzn2.x86_64
Machine     : x86_64
Processor   : 
CPU cores   : 4
Architecture: 64bit

Git hash: 7371f16915e157920474ed4312c148ba08863bbe

Git repo: https://github.com/ysraell/random-forest-mc.git

Git branch: dev

logging: 0.5.1.2
pandas : 1.4.3
json   : 2.0.9
numpy  : 1.23.1
sys    : 3.10.5 (main, Jul 12 2022, 11:32:11) [GCC 10.2.1 20210110]
re     : 2.2.1

CPU	: Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
Mem:            15G
Swap:             0B          0B          0B


In [9]:
# glob() returns matches in filesystem-dependent order; sort them so the
# listing (and anything derived from it) is reproducible across runs.
dataset_path_list = sorted(glob('/work/tmp/datasets/*.csv'))
dataset_path_list

['/work/tmp/datasets/creditcard.csv',
 '/work/tmp/datasets/creditcard_trans_float.csv',
 '/work/tmp/datasets/creditcard_trans_int.csv',
 '/work/tmp/datasets/iris.csv',
 '/work/tmp/datasets/titanic.csv']

In [10]:
def _dataset_entry(name, ds_cols, target_col):
    """Build one registry record for the CSV called ``name``.

    The column list is copied so that no two records share a list object.
    """
    return {
        'ds_cols': list(ds_cols),
        'target_col': target_col,
        'csv_path': f'/work/tmp/datasets/{name}.csv',
    }


# The three credit-card variants share the same feature columns: V1..V28 plus Amount.
_creditcard_cols = [f'V{i}' for i in range(1, 29)] + ['Amount']

# Registry of available datasets: feature columns, target column and CSV path.
dataset_dict = {
    'titanic': _dataset_entry(
        'titanic',
        ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'],
        'Survived',
    ),
    'iris': _dataset_entry(
        'iris',
        ['sepal.length', 'sepal.width', 'petal.length', 'petal.width'],
        'variety',
    ),
}
dataset_dict.update(
    (variant, _dataset_entry(variant, _creditcard_cols, 'Class'))
    for variant in ('creditcard', 'creditcard_trans_int', 'creditcard_trans_float')
)



In [11]:
ds_name = 'titanic'
md(f'# {ds_name}')
params = dataset_dict[ds_name]


def accuracy_pct(y_true, y_hat):
    """Return the share of positions where both label lists agree, as text.

    The result is formatted with two decimals and ends in a backslash
    followed by '%', the same text the cell produced before.
    """
    hits = sum(v == p for v, p in zip(y_true, y_hat))
    # '\\%' spells the backslash explicitly. The former '\%' was an invalid
    # escape sequence that only worked by accident and triggers a warning.
    return "{:.2f}\\%".format(100 * hits / len(y_hat))


# Keep only the configured feature columns plus the target, then drop
# incomplete rows and renumber the index.
dataset = pd.read_csv(params['csv_path'])[params['ds_cols']+[params['target_col']]].dropna().reset_index(drop=True)
dataset['Age'] = dataset['Age'].astype(np.uint8)
dataset['SibSp'] = dataset['SibSp'].astype(np.uint8)
# Cast to str so the class column is not treated as a number
# (the exported model lists Pclass as categorical).
dataset['Pclass'] = dataset['Pclass'].astype(str)
dataset['Fare'] = dataset['Fare'].astype(np.uint32)
cls = RandomForestMC(
    n_trees=8,
    target_col = params['target_col'],
    max_discard_trees = 4
)
cls.process_dataset(dataset)
cls.fit()
# NOTE(review): predictions are scored on the same `dataset` that was passed
# to process_dataset() before fit(), so this is not a held-out estimate.
y_test = dataset[params['target_col']].to_list()
y_pred = cls.testForest(dataset)
accuracy_hard = accuracy_pct(y_test, y_pred)
# Switch the same fitted forest to soft voting and score it again.
cls.soft_voting=True
y_pred = cls.testForest(dataset)
accuracy_soft = accuracy_pct(y_test, y_pred)
md('## Accuracy:') 
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_hard} (hard-voting)')
md(f'## &nbsp;&nbsp;&nbsp; {accuracy_soft} (soft-voting)')

# titanic

Planting the forest: 100%|██████████| 8/8 [00:02<00:00,  3.93it/s]


## Accuracy:

## &nbsp;&nbsp;&nbsp; 77.81\% (hard-voting)

## &nbsp;&nbsp;&nbsp; 77.81\% (soft-voting)

In [12]:
# Export the fitted forest through the model's own model2dict() hook.
model_dict = cls.model2dict()

In [13]:
model_dict

{'batch_train_pclass': 10,
 'batch_val_pclass': 10,
 '_N': 20,
 'min_feature': 2,
 'max_feature': 6,
 'th_start': 0.9,
 'delta_th': 0.1,
 'max_discard_trees': 4,
 'n_trees': 8,
 'class_vals': ['0', '1'],
 'survived_scores': [0.7, 0.7, 0.6, 0.7, 0.7, 0.8, 0.6, 0.6],
 'version': '0.4.0-alpha',
 'numeric_cols': ['Age', 'SibSp', 'Fare'],
 'feature_cols': ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'],
 'type_of_cols': {'Age': 'numeric',
  'SibSp': 'numeric',
  'Fare': 'numeric',
  'Embarked': 'categorical',
  'Pclass': 'categorical',
  'Sex': 'categorical'},
 'target_col': 'Survived',
 'data': [{'data': {'Sex': {'split': {'feat_type': 'categorical',
      'split_val': 'male',
      '>=': {'Fare': {'split': {'feat_type': 'numeric',
         'split_val': 8,
         '>=': {'SibSp': {'split': {'feat_type': 'numeric',
            'split_val': 0,
            '>=': {'leaf': {'0': 1.0}},
            '<': {'Embarked': {'split': {'feat_type': 'categorical',
               'split_val': 'S',
 

In [19]:
!ls /work/tmp/cls_rfmc_titanic.json
!rm /work/tmp/cls_rfmc_titanic.json

/work/tmp/cls_rfmc_titanic.json


In [20]:
# Persist the exported model as JSON (dump_file_json writes with indent=4
# and routes non-native values through np_encoder).
model_path = '/work/tmp/cls_rfmc_titanic.json'
dump_file_json(model_path, model_dict)

In [21]:
# Read the file back to check the round trip; the bare name on the last
# line displays the loaded content.
modeldict = load_file_json(model_path)
modeldict

{'batch_train_pclass': 10,
 'batch_val_pclass': 10,
 '_N': 20,
 'min_feature': 2,
 'max_feature': 6,
 'th_start': 0.9,
 'delta_th': 0.1,
 'max_discard_trees': 4,
 'n_trees': 8,
 'class_vals': ['0', '1'],
 'survived_scores': [0.7, 0.7, 0.6, 0.7, 0.7, 0.8, 0.6, 0.6],
 'version': '0.4.0-alpha',
 'numeric_cols': ['Age', 'SibSp', 'Fare'],
 'feature_cols': ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Embarked'],
 'type_of_cols': {'Age': 'numeric',
  'SibSp': 'numeric',
  'Fare': 'numeric',
  'Embarked': 'categorical',
  'Pclass': 'categorical',
  'Sex': 'categorical'},
 'target_col': 'Survived',
 'data': [{'data': {'Sex': {'split': {'feat_type': 'categorical',
      'split_val': 'male',
      '>=': {'Fare': {'split': {'feat_type': 'numeric',
         'split_val': 8,
         '>=': {'SibSp': {'split': {'feat_type': 'numeric',
            'split_val': 0,
            '>=': {'leaf': {'0': 1.0}},
            '<': {'Embarked': {'split': {'feat_type': 'categorical',
               'split_val': 'S',
 

In [22]:
# Start from a default-constructed forest so that whatever it holds afterwards
# can only have come from the loaded dict.
# NOTE: this rebinds `cls`; the forest fitted in the training cell is no
# longer reachable under that name.
cls = RandomForestMC()
cls

RandomForestMC(len(Forest)=0,n_trees=16,model_version=0.4.0-alpha,module_version=0.4.0-alpha)

In [25]:
modeldict.keys()

dict_keys(['batch_train_pclass', 'batch_val_pclass', '_N', 'min_feature', 'max_feature', 'th_start', 'delta_th', 'max_discard_trees', 'n_trees', 'class_vals', 'survived_scores', 'version', 'numeric_cols', 'feature_cols', 'type_of_cols', 'target_col', 'data'])

In [23]:
# NOTE(review): the recorded output of this call is
#   AttributeError: 'dict' object has no attribute 'data'
# which suggests dict2model reads `.data` by attribute access from a plain
# dict somewhere (the argument or a nested item). The cause lives in model.py,
# which is not visible here. TODO confirm and fix there.
cls.dict2model(modeldict)

AttributeError: 'dict' object has no attribute 'data'

In [18]:
cls

RandomForestMC(len(Forest)=0,n_trees=8,model_version=0.4.0-alpha,module_version=0.4.0-alpha)

In [24]:
cls.data

[]