In [1]:
import numpy as np
import sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from bart_playground.bcf.bcf import BCF
from bart_playground.params import Tree
from bart_playground import *

In [2]:
# Proposal move probabilities (defined here; not passed to any model in the cells shown).
proposal_probs = dict(grow=0.5, prune=0.5)

# Simulated covariates and baseline outcome.
n_samples = 640
generator = DataGenerator(
    n_samples=n_samples,
    n_features=2,
    noise=0.1,
    random_seed=42,
)
X, y = generator.generate(scenario="piecewise_flat")

# Two mutually exclusive treatment arms: arm 2 can only be switched on for
# units that were not assigned to arm 1.
z_rng = np.random.default_rng(0)
z1 = z_rng.binomial(1, 0.5, n_samples).astype(bool)
z2 = (~z1) & z_rng.binomial(1, 0.5, n_samples).astype(bool)
z = np.column_stack((z1, z2))

# Arm 1 shifts the outcome by +0.5, arm 2 by -0.5.
y = y + 0.5 * z[:, 0] - 0.5 * z[:, 1]
# Single-arm alternative (disabled):
# z = z1.reshape(-1, 1)
# y = y + z[:, 0] * 0.5 - 0.5

# Hold out a test set; the same split is applied to X, y and z.
X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
    X, y, z, random_state=42
)
np.set_printoptions(suppress=True)  # print floats without scientific notation
print(y_train[:10])

[ 0.56517481 -0.04621271 -0.27779943  0.53019707  0.98857901  0.44933973
  0.77858126  0.43837069 -0.01191688  0.875094  ]


In [3]:
# First five training rows (all feature columns).
print(X_train[:5])

[[0.73489316 0.20240459]
 [0.74882078 0.80138943]
 [0.58106114 0.3468698 ]
 [0.97069802 0.89312112]
 [0.23855282 0.84940884]]


In [None]:
# Build the BCF model only; it is fitted later, inside the profiling cell.
bcf = BCF(
    random_state=42,
    nskip=100,                # burn-in iterations
    ndpost=100,               # posterior samples
    n_mu_trees=100,           # number of prognostic effect trees
    n_tau_trees=[50, 50],     # number of treatment effect trees (presumably one entry per arm; verify against BCF)
    n_treat_arms=z.shape[1],  # number of treatment arms = columns of z
)


Iterations: 100%|██████████| 200/200 [00:03<00:00, 55.26it/s]


In [5]:
# Warm-up run: a throw-away model with one burn-in and one posterior iteration,
# fitted first so that one-off start-up cost (compilation, per the original
# note) is paid here rather than in the profiled fit of `bcf`.
bcf2 = BCF(
    n_treat_arms=z.shape[1],
    n_mu_trees=100,
    n_tau_trees=[50, 50],
    ndpost=1,
    nskip=1,
    random_state=42,
)
bcf2.fit(X_train, y_train, z_train)

Iterations:   0%|          | 0/2 [00:00<?, ?it/s]

Iterations: 100%|██████████| 2/2 [00:03<00:00,  1.91s/it]


In [6]:
# Profile the fit of `bcf`: sort by cumulative time (-s cumtime), dump the raw
# stats to profile.prof (-D) and suppress the pager (-q).
# NOTE: this is the only cell shown that fits `bcf`; the later cells that read
# bcf.sampler.trace or call bcf.predict / bcf.predict_components rely on it.
%prun -s cumtime -D profile.prof -q bcf.fit(X_train, y_train, z_train)

Iterations: 100%|██████████| 200/200 [00:05<00:00, 36.41it/s]

 
*** Profile stats marshalled to file 'profile.prof'.





In [7]:
# Convert the dumped profile into a call-graph image. Both commands run in a
# shell, so the gprof2dot and Graphviz `dot` executables must be on PATH.
!gprof2dot -f pstats profile.prof -o profile.dot
!dot -Tpng profile.dot -o profile.png

In [8]:
# Inspect one prognostic tree (index 70) from the last state in the sampler trace.
final_state = bcf.sampler.trace[-1]
tree_sp: Tree = final_state.mu_trees[70]

for shown in (tree_sp, tree_sp.vars, tree_sp.leaf_vals):
    print(shown)
# print(bcf.sampler.trace[-1].evaluate(z = np.zeros_like(y_train, dtype = bool), X = X_train)[0:10])

# Evaluating without X must match evaluating explicitly on the training
# covariates (presumably the X-less call falls back to the training data).
np.testing.assert_allclose(
    final_state.evaluate(z_train),
    final_state.evaluate(z_train, X_train),
)

X_0 <= 0.441 (split, n = 480)
	X_1 <= 0.416 (split, n = 221)
		Val: -0.038 (leaf, n = 90)
		Val: -0.009 (leaf, n = 131)
	Val: 0.014 (leaf, n = 259)
[ 0  1 -1 -1 -1 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2]
[        nan         nan  0.0139598  -0.03781563 -0.00940739         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan]


In [9]:
# Per-component predictions on the held-out data, indexed below as [0], [1], [2].
# Judging by the printed output, [1] has one column per treatment arm and [2] is
# the overall outcome prediction (its MSE equals that of bcf.predict); [0] is
# presumably the prognostic part. Confirm against BCF.predict_components.
bcf_result = bcf.predict_components(X_test, z_test)

In [10]:
# First ten rows of components 0 and 1.
print(bcf_result[0][:10])
print(bcf_result[1][:10])

[ 0.18912502  0.1735647  -0.01754936  0.15050377  0.01464981  0.19353664
  0.18154303  0.16566044  0.17730278 -0.00757243]
[[ 0.20732212 -0.17476558]
 [ 0.20647187 -0.18653574]
 [ 0.21389026 -0.19276108]
 [ 0.26435612 -0.18038378]
 [ 0.25933444 -0.18163852]
 [ 0.20825599 -0.19500159]
 [ 0.20737898 -0.18777154]
 [ 0.2571358  -0.17876338]
 [ 0.2078754  -0.1760631 ]
 [ 0.26130818 -0.18408952]]


In [11]:
# Shape of the ten-row slice of component 1: (rows, treatment arms).
print(bcf_result[1][:10].shape)

(10, 2)


#### MLearner class

In [12]:
from sklearn import clone

def control_indices(z):
    """Return a boolean mask selecting the control units (rows with no active arm).

    Parameters
    ----------
    z : array-like of shape (n_samples, n_arms) or (n_samples,)
        Treatment indicators; any non-zero / truthy entry counts as "assigned".
        A 1-D input is treated as a single treatment arm.

    Returns
    -------
    numpy.ndarray of bool, shape (n_samples,)
        True where the unit is assigned to none of the arms.
    """
    z = np.asarray(z)
    if z.ndim == 1:
        z = z.reshape(-1, 1)
    # Cast before negating: with 0/1 integer indicators `~` is a bitwise NOT
    # (yielding -1 / -2), so the result would no longer be a usable boolean mask.
    return ~z.astype(bool).any(axis=1)

class MLearner:
    """Multi-arm learner that fits one independent regressor per treatment group.

    A separate clone of ``model_treated`` is fitted on the units of each
    treatment arm, and a clone of ``model_control`` on the units assigned to no
    arm. A unit's prediction comes from the model of the group it belongs to.

    Parameters
    ----------
    n_treated_arms : int
        Number of treatment arms, i.e. the number of columns expected in ``z``.
    model_treated : estimator
        Template cloned once per treatment arm; must offer ``fit(X, y)`` and
        ``predict(X)``.
    model_control : estimator
        Template cloned for the control group.
    """

    def __init__(self, n_treated_arms, model_treated, model_control):
        # clone() gives every group its own unfitted copy, so the same template
        # object may be passed for both arguments.
        self.model_treated_list = [clone(model_treated) for _ in range(n_treated_arms)]
        self.model_control = clone(model_control)

    def _arm_masks(self, z):
        """Validate ``z`` and return it as a boolean (n_samples, n_arms) array.

        Raises
        ------
        ValueError
            If ``z`` does not have exactly one column per treated model.
        """
        z = np.asarray(z)
        if z.ndim == 1:
            z = z.reshape(-1, 1)
        n_arms = len(self.model_treated_list)
        if z.ndim != 2 or z.shape[1] != n_arms:
            raise ValueError(
                f"z must have {n_arms} treatment column(s), got shape {z.shape}"
            )
        # Boolean dtype guarantees mask indexing below; 0/1 integers would be
        # interpreted as row positions instead.
        return z.astype(bool)

    def fit(self, X, y, z):
        """Fit each arm's model on that arm's units and the control model on the rest.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
        y : array of shape (n_samples,)
        z : array-like of shape (n_samples, n_treated_arms)
            Treatment indicators (boolean or 0/1).

        Returns
        -------
        None
            Unlike scikit-learn estimators, ``fit`` does not return ``self``.
        """
        arm_masks = self._arm_masks(z)
        control_mask = control_indices(arm_masks)

        for arm, model in enumerate(self.model_treated_list):
            in_arm = arm_masks[:, arm]
            model.fit(X[in_arm], y[in_arm])
        self.model_control.fit(X[control_mask], y[control_mask])

    def predict(self, X, z):
        """Predict each unit's outcome with the model of the group it belongs to.

        Units with no active arm use the control model; units in arm ``k`` use
        the ``k``-th treated model. If a unit is flagged in several arms, the
        highest-numbered arm wins because arms are written in order.

        Returns
        -------
        numpy.ndarray of float, shape (len(X),)
        """
        arm_masks = self._arm_masks(z)
        preds = np.empty(len(X))

        groups = [(control_indices(arm_masks), self.model_control)]
        groups += [
            (arm_masks[:, arm], model)
            for arm, model in enumerate(self.model_treated_list)
        ]
        for mask, model in groups:
            # Skip groups absent from this batch: scikit-learn estimators
            # reject zero-row input.
            if mask.any():
                preds[mask] = model.predict(X[mask])
        return preds

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Base estimators used as templates for the per-group models; the tree-based
# ones are seeded for reproducibility.
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)

#### Comparison

In [14]:
nta = z.shape[1]  # Number of treatment arms

# One learner per base estimator; the same template is used for the treated
# arms and for the control group.
tlearner_rf, tlearner_lr, tlearner_dt = (
    MLearner(n_treated_arms=nta, model_treated=base, model_control=base)
    for base in (rf, lr, dt)
)
for learner in (tlearner_rf, tlearner_lr, tlearner_dt):
    learner.fit(X_train, y_train, z_train)

In [15]:
# Test-set MSE of the outcome predictions, keyed by model label.
models = dict(bcf=bcf, rf=tlearner_rf, lr=tlearner_lr, dt=tlearner_dt)
results = {
    model_name: mean_squared_error(y_test, model.predict(X_test, z_test))
    for model_name, model in models.items()
}
results

{'bcf': 0.0234999515646783,
 'rf': 0.021441280609918477,
 'lr': 0.05466893346335514,
 'dt': 0.03223509459291045}

In [16]:
# Cross-check: component [2] of predict_components should give the same MSE as
# the "bcf" entry above. Arguments are in (y_true, y_pred) order.
print(mean_squared_error(y_test, bcf_result[2]))

0.0234999515646783
