In [14]:
import numpy as np
import sys
from pyinstrument import Profiler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from bart_playground import *
import bartz

In [15]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Tree-move proposal weights handed to the BART sampler: grow and prune only,
# with equal weight.
proposal_probs = {"grow": 0.5, "prune": 0.5}

# Synthetic regression data ("piecewise_flat" scenario), seeded for
# reproducibility.
generator = DataGenerator(
    n_samples=160,
    n_features=2,
    noise=0.1,
    random_seed=42,
)
X, y = generator.generate(scenario="piecewise_flat")

# Seeded train/test split using scikit-learn's default split proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Peek at the first few training targets.
print(y_train[:12])

[ 0.50327821  0.60672224  0.26898966  0.55211673  0.50693811  0.66162097
 -0.64127659  0.65112284  0.03487759  0.23276531  0.44055996  0.38216964]


In [16]:
# Profile the construction and fitting of the playground BART model.
profiler = Profiler()
profiler.start()
try:
    # ndpost / nskip: presumably 300 kept draws after 100 burn-in iterations
    # (the sampler logs 400 iterations in total) -- confirm against DefaultBART.
    bart = DefaultBART(ndpost=300, nskip=100, n_trees=100, proposal_probs=proposal_probs)
    bart.fit(X_train, y_train)
finally:
    # Always stop the profiler, even when fitting raises; otherwise it stays
    # active in the kernel and a re-run of this cell can fail to start a new one.
    profiler.stop()
profiler.print()

Running iteration 0
Running iteration 10
Running iteration 20
Running iteration 30
Running iteration 40
Running iteration 50
Running iteration 60
Running iteration 70
Running iteration 80
Running iteration 90
Running iteration 100
Running iteration 110
Running iteration 120
Running iteration 130
Running iteration 140
Running iteration 150
Running iteration 160
Running iteration 170
Running iteration 180
Running iteration 190
Running iteration 200
Running iteration 210
Running iteration 220
Running iteration 230
Running iteration 240
Running iteration 250
Running iteration 260
Running iteration 270
Running iteration 280
Running iteration 290
Running iteration 300
Running iteration 310
Running iteration 320
Running iteration 330
Running iteration 340
Running iteration 350
Running iteration 360
Running iteration 370
Running iteration 380
Running iteration 390

  _     ._   __/__   _ _  _  _ _/_   Recorded: 21:00:57  Samples:  18806
 /_//_/// /_\ / //_// / //_'/ //     Duration: 19.062    

In [17]:
# Per-tree `vars` arrays taken from the last entry of the trace.
last_draw = bart.trace[-1]
arrays = [tree.vars for tree in last_draw.trees]

# Number of non-negative entries in each tree's `vars` array. In the printed
# tree further down, the split node holds a feature index (>= 0) while leaves
# and unused slots hold negative codes, so this counts the splits per tree.
counts = np.array([np.count_nonzero(arr >= 0) for arr in arrays])
print(counts)

# Boolean mask of trees with at least three such entries, then their indices.
deep_trees = counts >= 3
print(np.where(deep_trees))

[1 1 3 1 1 2 1 1 1 1 1 1 1 1 1 3 1 0 2 1 1 2 1 1 1 0 1 1 0 1 1 1 2 1 0 2 1
 1 2 1 1 2 2 1 2 1 1 1 1 1 1 2 1 1 3 0 1 1 1 2 1 0 1 1 2 1 2 1 1 2 1 1 2 3
 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1]
(array([ 2, 15, 54, 73], dtype=int64),)


In [18]:
# Global (non-tree) parameters stored with the last entry of the sampler trace.
final_draw = bart.sampler.trace[-1]
print(final_draw.global_params)

{'eps_sigma2': 0.006508667493912149}


In [19]:
from bart_playground import visualize_tree

# Inspect a single tree: the first tree of trace entry 1.
tree_sp: Tree = bart.sampler.trace[1].trees[0]

# Show the readable tree summary, then its raw `vars` and `leaf_vals` arrays,
# one per line.
print(tree_sp, tree_sp.vars, tree_sp.leaf_vals, sep="\n")

# Optional extra inspection, left disabled:
# print(tree_sp.node_indicators)
# visualize_tree(tree_sp, tree_sp)

X_0 <= 0.830 (split, n = 120)
	Val: -0.046 (leaf, n = 102)
	Val: -0.032 (leaf, n = 18)
[ 0 -1 -1 -2 -2 -2 -2 -2]
[        nan -0.04601371 -0.0316645          nan         nan         nan
         nan         nan]


In [20]:
# Split the training targets at the threshold shown in the inspected tree
# (X_0 <= 0.830) and compare the raw target means on either side.
split_point = 0.830
first_feature = X_train[:, 0]

y_train_le = y_train[first_feature <= split_point]
y_train_gt = y_train[first_feature > split_point]

print("y_train where X_train[:,0] <= 0.830 mean:", y_train_le.mean())
print("y_train where X_train[:,0] > 0.830 mean:", y_train_gt.mean())

y_train where X_train[:,0] <= 0.830 mean: 0.28478345154810286
y_train where X_train[:,0] > 0.830 mean: 0.5282667945838203


In [21]:
# Baseline regressors fitted on the same training split.
# random_state pins the forest's internal randomness so the reported test
# error is reproducible across Restart & Run All (same seed as the data split).
rf = RandomForestRegressor(random_state=42)
lr = LinearRegression()
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# Reference BART implementation from bartz. The design matrices are passed
# transposed -- presumably bartz expects (n_features, n_samples); confirm
# against the bartz documentation.
btz = bartz.BART.gbart(np.transpose(X_train), y_train, ntree=100, ndpost=200, nskip=100)
btpred_all = btz.predict(np.transpose(X_test))

# Collapse axis 0 of the stacked predictions (presumably the posterior draws)
# into a single point prediction per test row.
btpred = np.mean(np.array(btpred_all), axis=0)

Iteration 100/300 P_grow=0.55 P_prune=0.45 A_grow=0.36 A_prune=0.36 (burnin)
Iteration 200/300 P_grow=0.57 P_prune=0.43 A_grow=0.35 A_prune=0.37
Iteration 300/300 P_grow=0.57 P_prune=0.43 A_grow=0.39 A_prune=0.40


In [22]:
# `lamda` attribute of the fitted bartz model.
print(btz.lamda)

# `_show_tree` writes the tree to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
# NOTE(review): the arguments (1, 0) presumably select a draw and a tree --
# confirm the order against bartz.
btz._show_tree(1, 0)  # , print_all=True)
# btz.first_sigma

0.009311548
  1 ┐(1: 10)
  2 ├── -0.0022
  3 └── 0.14
None


In [23]:

# Fitted models to compare on the held-out test set.
models = {"bart": bart, "rf": rf, "lr": lr, "btz": btz}

# Test-set mean squared error per model.
results = {}
for model_name, model in models.items():
    # The bartz predictions were already averaged into `btpred` in an earlier
    # cell; every other model is asked for predictions directly.
    y_pred = btpred if model_name == "btz" else model.predict(X_test)
    results[model_name] = mean_squared_error(y_test, y_pred)
results

{'bart': 0.023760391537674806,
 'rf': 0.020302800535874868,
 'lr': 0.048045521328019404,
 'btz': 0.02328283761397566}

In [24]:
# Compare the last trace entry's evaluation on the training inputs with the
# observed targets, for the first few rows.
n_preview = 12
fitted_preview = bart.sampler.trace[-1].evaluate(X_train)[:n_preview]
print(fitted_preview)
print(y_train[:n_preview])

[ 0.42521308  0.34242477  0.21400383  0.38150004  0.32231214  0.35365035
 -0.31397141  0.48707495 -0.04346153  0.08063366  0.18502987  0.34382066]
[ 0.50327821  0.60672224  0.26898966  0.55211673  0.50693811  0.66162097
 -0.64127659  0.65112284  0.03487759  0.23276531  0.44055996  0.38216964]


In [25]:
# Reference error of a constant predictor: every test point is predicted as
# the mean of y_test, so this MSE equals the variance of y_test.
# NOTE(review): the constant is the *test* mean, not the training mean --
# confirm that is the intended baseline.
constant_pred = np.full(y_test.shape, y_test.mean())
mean_squared_error(y_test, constant_pred)

0.10534048469161521

In [26]:
# Sanity check on the last trace entry: for every tree, calling evaluate()
# with no argument must give exactly the same values as evaluating on X_train
# explicitly. NOTE(review): presumably the no-argument call returns values
# cached for the training data -- confirm against Tree.evaluate.
# Iterating over the trees themselves (rather than a hard-coded range(100))
# keeps the check complete and valid when n_trees changes.
final_trees = bart.sampler.trace[-1].trees
evaluations_match = all(
    (tree.evaluate() == tree.evaluate(X_train)).all()
    for tree in final_trees
)
print("True" if evaluations_match else "False")

True
