In [2]:
# The following is from "big run"
import sys
sys.path.append('..')
from lib import *
import pandas as pd
import numpy as np

# Setup
# Load the LSA output matrix and the pickled metadata table.
lsa_np = np.load('../data/parsed/lsa_output.npy')
# NOTE(review): unpickling can execute arbitrary code; only load trusted files here.
metadata = pd.read_pickle('../data/parsed/pickles/pickled_data_test.pickle')

# Keep only the rows of scenario '401' and renumber them from 0 so they line up
# positionally with the rows of the LSA matrix.
# NOTE(review): assumes the LSA rows correspond to the filtered metadata rows — confirm.
metadata = metadata.loc[metadata['Scenario'] == '401'].reset_index(drop=True)
lsa_df = pd.DataFrame(lsa_np)

# `join_axes` was deprecated in pandas 0.25 and removed in 1.0. Reindexing the
# concatenated frame on metadata's index is the documented replacement and
# yields the same rows on every pandas version.
df = pd.concat([metadata, lsa_df], axis=1).reindex(metadata.index)

# Drop rows whose label is the string '-1', then renumber again.
df = df.loc[df['Label'] != '-1'].reset_index(drop=True)

cat_features = ['To', 'From']
# Integer labels 0..99 are the default column names of lsa_df.
# NOTE(review): assumes the LSA matrix has at least 100 columns — confirm.
features = list(range(100))
features.extend(cat_features + ['Date'])
# features.extend(cat_features + ['ID'])

df = df[features + ['Label'] + ['ID']]
# df = df[features + ['ID']]

# Built-in incremental learning vs. trees trained on larger initial sizes

## Initializing control variables

In [3]:
# Control variables passed to every RNF(...) construction in the cells below.
n_trees, tree_depth = 64, 10
random_seed, n_max_features = 42, 11
# Columns handed to RNF as categorical features.
cat_features = ['To', 'From']

## Forests Trained on increasing datasets

In [None]:
# One forest per size setting (100, 200, ..., 500). Each forest receives its own
# df[:-100] slice, i.e. everything except the last 100 rows of df.
# NOTE(review): the sixth RNF argument is presumably the initial training
# size — confirm against RNF's signature.
incremental_forests = [
    RNF(df[:-100], n_trees, tree_depth, random_seed, n_max_features, initial_size, cat_features)
    for initial_size in (100, 200, 300, 400, 500)
]
# Keep the individual names bound as well, in the same order as the list.
forest_100, forest_200, forest_300, forest_400, forest_500 = incremental_forests

In [None]:
# Fit each forest in list order; the return value of fit_parallel() is not used.
for forest in incremental_forests:
    forest.fit_parallel()

In [None]:
# evaluation
# Score every forest on the last 100 rows of df and print one result per forest,
# separated by a blank line.
for forest in incremental_forests:
    # Element 1 of predict_parallel's result is what evalStats consumes.
    predictions = forest.predict_parallel(df[-100:])[1]
    print(evalStats(predictions, df[-100:]), end='\n\n')

Test with n_trees=64, tree_depth=100, random_seed=42, n_max_features=11:  
(0.8285714285714286, 0.8055555555555556, 0.87, 0.8169014084507044)  
(0.6285714285714286, 0.9565217391304348, 0.86, 0.7586206896551724)  
(0.2571428571428571, 0.75, 0.71, 0.3829787234042553)  
(0.2, 0.7777777777777778, 0.7, 0.3181818181818182)  
(0.02857142857142857, 0.3333333333333333, 0.64, 0.05263157894736842)  

Test with n_trees=64, tree_depth=10, random_seed=42, n_max_features=11:  
(0.7428571428571429, 0.7878787878787878, 0.84, 0.7647058823529412)  
(0.8, 0.7368421052631579, 0.83, 0.7671232876712328)  
(0.9142857142857143, 0.5333333333333333, 0.69, 0.6736842105263158)  
(0.9714285714285714, 0.4788732394366197, 0.62, 0.6415094339622641)  
(0.9714285714285714, 0.4857142857142857, 0.63, 0.6476190476190476)  

##  Incremental Forests

In [3]:
# Forest constructed from only the first 100 rows of df; later cells pass it
# further row batches through update().
# NOTE(review): the literal 100 after n_max_features presumably matches the
# initial sample size — confirm against RNF's signature.
incremental_forest = RNF(df[0:100], n_trees, tree_depth, random_seed, n_max_features, 100, cat_features)

In [4]:
# Fit the forest, then score it on the last 100 rows of df.
incremental_forest.fit_parallel()
# Element 1 of predict_parallel's result is what evalStats consumes.
baseline_predictions = incremental_forest.predict_parallel(df[-100:])[1]
print(evalStats(baseline_predictions, df[-100:]))

(0.2, 1.0, 0.72, 0.33333333333333337)


In [5]:
# Pass rows 100-199 of df to update(), then print whatever traverse() returns
# for the first tree (the recorded output is a list of Node objects).
incremental_forest.update(df[100:200])
first_tree_nodes = incremental_forest.trees[0].traverse()
print(first_tree_nodes)

[<lib.node.Node object at 0x7fb88d3767b8>, <lib.node.Node object at 0x7fb88d272470>, <lib.node.Node object at 0x7fb88d272240>, <lib.node.Node object at 0x7fb88d2724e0>, <lib.node.Node object at 0x7fb88d2724a8>, <lib.node.Node object at 0x7fb88d272550>, <lib.node.Node object at 0x7fb88d272518>, <lib.node.Node object at 0x7fb88d2725c0>, <lib.node.Node object at 0x7fb88d272588>, <lib.node.Node object at 0x7fb88d2726a0>, <lib.node.Node object at 0x7fb88d2725f8>, <lib.node.Node object at 0x7fb88d272710>, <lib.node.Node object at 0x7fb88d2726d8>, <lib.node.Node object at 0x7fb88d272668>, <lib.node.Node object at 0x7fb88d272630>, <lib.node.Node object at 0x7fb88d2727f0>, <lib.node.Node object at 0x7fb88d272748>, <lib.node.Node object at 0x7fb88d2727b8>, <lib.node.Node object at 0x7fb88d272780>]


In [6]:
# Print the index of every tree that has at least one node whose `rows` is empty.
for tree_index, tree in enumerate(incremental_forest.trees):
    rows_per_node = [len(node.rows) for node in tree.traverse()]
    if 0 in rows_per_node:
        print(tree_index)

37


In [7]:
# Pass four consecutive 100-row batches (rows 100-499 of df) to update(). After
# each batch, print the number of distinct tree objects in the forest and the
# evaluation on the last 100 rows of df.
# NOTE(review): rows 100-199 were already passed to update() in an earlier cell,
# so that batch is applied a second time when the cells run top to bottom —
# confirm this is intended. The recorded run of this cell ended in
# ZeroDivisionError.
for batch_start in range(100, 500, 100):
    incremental_forest.update(df[batch_start:batch_start + 100])
    print(len(set(incremental_forest.trees)))
    print(evalStats(incremental_forest.predict_parallel(df[-100:])[1], df[-100:]))

ZeroDivisionError: division by zero

# Testing limited core usage

In [10]:
# Build a forest from the first 500 rows of df and fit it in one go.
# NOTE(review): nothing in this cell restricts the number of cores, so the
# "limited core usage" setup presumably lives inside RNF/fit_parallel or the
# environment — confirm.
f = RNF(df[0:500], n_trees, tree_depth, random_seed, n_max_features, 100, cat_features)
f.fit_parallel()

when splitting categorically, no best address found: [310 487]


In [14]:
# Inspect tree 20 of the fitted forest. The recorded output is a Graphviz DOT
# string ('digraph Tree {...') shown as a raw repr, not a rendered graph.
f.trees[20].visualize()

'digraph Tree {\nnode [shape=box];\n8934927891 [label="X[10] < 0.003076105946907499\ngini = 0.41794704861111115\nsamples = 100\ndistribution = [36, 64]"];\n5127460020 [label="X[50] < 0.011433239453815455\ngini = 0.29327286470143604\nsamples = 36\ndistribution = [21, 15]"];\n8934927891 -> 5127460020 [labeldistance=8, labelangle=30, xlabel="True"]\n5989794006 [label="X[49] < -0.0011506426194517398\ngini = 0.29699753770390885\nsamples = 64\ndistribution = [19, 45]"];\n8934927891 -> 5989794006 [labeldistance=8, labelangle=-30, xlabel="False"]\n5993162332 [label="X[92] < 0.03385265871022291\ngini = 0.08595238095238103\nsamples = 21\ndistribution = [20, 1]"];\n5127460020 -> 5993162332 [labeldistance=8, labelangle=30, xlabel="True"]\n6982570830 [label="X[49] < 0.03796428181838095\ngini = 0.20370370370370364\nsamples = 15\ndistribution = [12, 3]"];\n5127460020 -> 6982570830 [labeldistance=8, labelangle=-30, xlabel="False"]\n5796857334 [label="X[92] < 0.04800362738305229\ngini = 0.3496630850482