In [None]:
# The following is from "big run"
import sys
sys.path.append('..')
from lib import *
import pandas as pd
import numpy as np

# Setup
# Load the LSA output matrix and the pickled metadata frame.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading;
# only point it at pickle files from a trusted source.
lsa_np = np.load('../data/parsed/lsa_output.npy')
metadata = pd.read_pickle('../data/parsed/pickles/pickled_data_test.pickle')

# Keep only the rows of scenario '401' (compared as a string) and renumber
# them from 0 so they can be joined with the LSA rows by position.
# NOTE(review): this assumes row i of lsa_output.npy belongs to row i of the
# filtered metadata - confirm against the code that produced the .npy file.
metadata = metadata.loc[metadata['Scenario'] == '401']
metadata = metadata.reset_index(drop=True)
lsa_df = pd.DataFrame(lsa_np)

# Column-wise join restricted to metadata's rows. The `join_axes` keyword was
# deprecated in pandas 0.25 and removed in 1.0; reindexing the concatenated
# frame on metadata.index is the documented replacement.
df = pd.concat([metadata, lsa_df], axis=1).reindex(metadata.index)

# Drop rows whose label is the string '-1', then renumber again.
df = df.loc[df['Label'] != '-1']
df = df.reset_index(drop=True)

# Feature columns: the integer-labelled columns 0..99 that come from lsa_df,
# followed by the categorical columns and 'Date'.
cat_features = ['To','From']
features = list(range(100))
features.extend(cat_features + ['Date'])
# Alternative feature list, kept for reference:
# features.extend(cat_features + ['ID'])

# Final column order: features, then 'Label', then 'ID'.
df = df[features + ['Label'] + ['ID']]
# Alternative selection without 'Label', kept for reference:
# df = df[features + ['ID']]

# Built-in incremental learning vs. trees trained on larger initial sizes

## Initializing control variables

In [None]:
# Control variables shared by every RNF(...) call in the cells below; they are
# passed positionally in this order: n_trees, tree_depth, random_seed,
# n_max_features (the exact meaning of each is defined by RNF in lib).
n_trees = 64
tree_depth = 10
random_seed = 42
n_max_features = 11
# Same two categorical columns as in the setup cell; repeated here so this
# cell lists every value handed to RNF.
cat_features = ['To', 'From']

## Forests Trained on increasing datasets

In [None]:
# One forest per size setting. Every forest is handed all rows except the
# last 100 (those are used for evaluation); only the sixth positional
# argument differs - presumably the initial training-set size, confirm
# against the RNF signature in lib.
incremental_forests = [
    RNF(df[:-100], n_trees, tree_depth, random_seed, n_max_features, size, cat_features)
    for size in (100, 200, 300, 400, 500)
]
# Individual names are still needed by later cells (e.g. forest_500).
forest_100, forest_200, forest_300, forest_400, forest_500 = incremental_forests

In [None]:
# Fit each forest in list order by calling its fit_parallel() method.
for forest in incremental_forests:
    forest.fit_parallel()

In [None]:
# NOTE(review): forest_500 is also fitted by the loop over incremental_forests,
# so running both cells fits it twice. Presumably this cell exists to fit the
# largest forest on its own - confirm whether it is still needed.
forest_500.fit_parallel()

In [None]:
# Score forest_500 on the last 100 rows of df. Element [1] of the
# predict_parallel() result is what evalStats compares against those rows.
stats_500 = evalStats(forest_500.predict_parallel(df[-100:])[1], df[-100:])
print(stats_500, end='\n\n')

In [None]:
# Evaluation: score every forest on the same last-100-rows slice and print
# one stats tuple per forest, separated by a blank line.
for forest in incremental_forests:
    predictions = forest.predict_parallel(df[-100:])[1]
    print(evalStats(predictions, df[-100:]), end='\n\n')

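Test with n_trees=64, tree_depth=100, random_seed=42, n_max_features=11 (tuple values are consistent with recall, precision, accuracy, F1):  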
(0.8285714285714286, 0.8055555555555556, 0.87, 0.8169014084507044)  
(0.6285714285714286, 0.9565217391304348, 0.86, 0.7586206896551724)  
(0.2571428571428571, 0.75, 0.71, 0.3829787234042553)  
(0.2, 0.7777777777777778, 0.7, 0.3181818181818182)  
(0.02857142857142857, 0.3333333333333333, 0.64, 0.05263157894736842)  

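Test with n_trees=64, tree_depth=10, random_seed=42, n_max_features=11 (tuple values are consistent with recall, precision, accuracy, F1):  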
(0.7428571428571429, 0.7878787878787878, 0.84, 0.7647058823529412)  
(0.8, 0.7368421052631579, 0.83, 0.7671232876712328)  
(0.9142857142857143, 0.5333333333333333, 0.69, 0.6736842105263158)  
(0.9714285714285714, 0.4788732394366197, 0.62, 0.6415094339622641)  
(0.9714285714285714, 0.4857142857142857, 0.63, 0.6476190476190476)  

##  Incremental Forests

In [None]:
# Build the forest that will be grown incrementally: it starts from the first
# 100 rows of df only; later cells pass further 100-row slices to update().
incremental_forest = RNF(df[0:100], n_trees, tree_depth, random_seed, n_max_features, 100, cat_features)

In [None]:
# Fit on the initial 100 rows, then record the baseline score on the last
# 100 rows of df before any update() call.
incremental_forest.fit_parallel()
baseline_stats = evalStats(incremental_forest.predict_parallel(df[-100:])[1], df[-100:])
print(baseline_stats)

In [None]:
# Feed the forest four further batches of 100 rows each (rows 100-199,
# 200-299, 300-399, 400-499) and re-score on the last 100 rows of df after
# every batch.
for start in range(100, 500, 100):
    incremental_forest.update(df[start:start + 100])
    print(evalStats(incremental_forest.predict_parallel(df[-100:])[1], df[-100:]))

In [None]:
# One more batch (rows 500-599), then two checks: how many distinct objects
# are in the forest's `trees` collection, and the score on the last 100 rows.
# NOTE(review): if df has fewer than 700 rows, rows 500-599 overlap the
# evaluation slice df[-100:] - confirm the frame is large enough.
incremental_forest.update(df[500:600])
distinct_trees = set(incremental_forest.trees)
print(len(distinct_trees))
print(evalStats(incremental_forest.predict_parallel(df[-100:])[1], df[-100:]))

# Testing limited core usage

In [None]:
# Build and fit a forest on the first 500 rows of df.
# NOTE(review): nothing in this cell restricts the number of cores; presumably
# that is configured inside RNF / fit_parallel in lib - confirm.
f = RNF(df[0:500], n_trees, tree_depth, random_seed, n_max_features, 100, cat_features)
f.fit_parallel()

In [None]:
# Call visualize() on the tree at index 20 of the forest fitted in the previous cell.
f.trees[20].visualize()