In [None]:
import sys, os

import numpy as np
import pandas as pd

# Move the working directory two levels up before the project-local imports
# below (presumably so `CART` and `Utils` resolve from there -- TODO confirm).
# NOTE(review): this is relative to the *current* directory, so re-running the
# cell without restarting the kernel climbs another two levels each time.
os.chdir(os.path.join(os.getcwd(), "../.."))

from CART import *
from Utils.plotting import  *
from scipy.stats import norm as ndist
import joblib

# For tree-values
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector

# Load R's 'utils' package through rpy2 and pick a CRAN mirror to download from
utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)  # Select the first mirror

# Install the R package 'remotes' if it's not already installed
# (it is used further down to install 'treevalues' from GitHub)
if not rpackages.isinstalled('remotes'):
    utils.install_packages(StrVector(('remotes',)))

import rpy2.robjects as ro

from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri

In [None]:
# Run the GitHub installation command for 'treevalues'.
# This is executed on every run of the cell; per the saved output below,
# 'remotes' skips the install when the remote SHA1 has not changed.
ro.r('remotes::install_github("anna-neufeld/treevalues")')
# Attach the R packages used by tree_values_inference (rpart trees + treevalues inference)
ro.r('library(treevalues)')
ro.r('library(rpart)')

R[write to console]: Using GitHub PAT from the git credential store.

R[write to console]: Skipping install of 'treevalues' from a github remote, the SHA1 (55573782) has not changed since last install.
  Use `force = TRUE` to force installation



0,1,2,3,4,5,6
'rpart','treevalu...,'tools',...,'datasets','methods','base'


In [None]:
def generate_test(mu, sd_y):
    """Draw a fresh response vector around the mean vector `mu`.

    Parameters
    ----------
    mu : np.ndarray
        Mean vector; its first dimension determines how many noise draws are made.
    sd_y : float
        Standard deviation of the independent Gaussian noise.

    Returns
    -------
    np.ndarray
        `mu` plus N(0, sd_y^2) noise drawn from NumPy's global random state.
    """
    noise = np.random.normal(scale=sd_y, size=(mu.shape[0],))
    return mu + noise

# Tree-values inference

In [None]:
def tree_values_inference(X, y, mu, sd_y, max_depth=5, level=0.1,
                          X_test=None):
    """Fit an rpart tree in R and compute tree-values and naive leaf intervals.

    For every leaf of the fitted tree, two confidence intervals for the leaf
    mean of `mu` are built: the selective interval from
    `treevalues::branchInference` and the naive z-interval that ignores
    selection. Both are run at the same miscoverage level `level`.

    Parameters
    ----------
    X : np.ndarray, shape (n, p)
        Training design matrix.
    y : np.ndarray, shape (n,)
        Training responses.
    mu : np.ndarray, shape (n,)
        True mean vector, used only to evaluate coverage.
    sd_y : float
        Noise standard deviation, passed to R as `sigma_y`.
    max_depth : int
        `maxdepth` passed to `rpart.control`.
    level : float
        Miscoverage level (intervals have nominal coverage 1 - level).
    X_test : np.ndarray or None
        If given, rows to predict on with the fitted tree (must have p columns).

    Returns
    -------
    tuple
        (mean tree-values coverage, mean tree-values length,
         mean naive coverage, mean naive length, pred) where `pred` is the R
        vector of predictions for `X_test`, or None when `X_test` is None.
        Lengths are multiplied by 1/||contrast||, i.e. sqrt(leaf size).
    """
    # Hand the training data to the R global environment
    ro.globalenv['X_r'] = numpy2ri.py2rpy(X)
    ro.globalenv['y_r'] = numpy2ri.py2rpy(y)
    ro.globalenv['p'] = X.shape[1]

    # Build the data frame with columns "y", "x1", ..., "xp"
    ro.r('data <- cbind(y_r, X_r)')
    ro.r('colnames(data) <- c("y", paste0("x", 1:p))')
    ro.r('data = data.frame(data)')

    # Fit the rpart regression tree (no complexity pruning: cp = 0)
    tree_cmd = ('bls.tree <- rpart(y ~ ., data=data, model = TRUE, '
                'control = rpart.control(cp=0.00, minsplit = 50, '
                f'minbucket = 20, maxdepth={max_depth}))')
    ro.r(tree_cmd)

    # Node labels (row names of the rpart frame) of the leaves
    ro.r('leaf_idx <- (row.names(bls.tree$frame)[bls.tree$frame$var == "<leaf>"])')
    leaf_idx = ro.r['leaf_idx']

    # Row positions of the leaves in the frame; bls.tree$where stores these
    # positions for each observation, so they identify leaf membership.
    ro.r('idx_full <- 1:nrow(bls.tree$frame)')
    ro.r('mapped_idx <- idx_full[bls.tree$frame$var == "<leaf>"]')

    # Named `lengths` rather than `len` so the builtin is not shadowed
    lengths = []
    coverage = []
    lengths_naive = []
    coverage_naive = []

    # Normal quantile for the naive interval; does not depend on the leaf
    z_quantile = ndist.ppf(1 - level / 2)

    for i, idx in enumerate(leaf_idx):
        # Branch (sequence of splits) leading to this leaf
        ro.r('branch <- getBranch(bls.tree, ' + str(idx) + ')')
        # Selective inference for the leaf mean, at the caller's level
        # (this was previously hard-coded to alpha = 0.10)
        ro.r(f'result <- branchInference(bls.tree, branch, type="reg", '
             f'alpha = {level}, sigma_y={sd_y})')
        confint = ro.r('result$confint')

        # Indicator of membership in this leaf (R indices are 1-based)
        ro.r("contrast <- (bls.tree$where == mapped_idx[" + str(i + 1) + "])")
        contrast = np.array(ro.r('contrast'))
        # Normalise so that contrast.dot(v) is the leaf average of v
        contrast = np.array(contrast * 1 / np.sum(contrast))
        contrast_norm = np.linalg.norm(contrast)

        target = contrast.dot(mu)
        root_n = 1 / contrast_norm
        coverage.append(target >= confint[0] and target <= confint[1])
        lengths.append((confint[1] - confint[0]) * root_n)

        # Naive z-interval that ignores the data-driven choice of the tree
        center = contrast.dot(y)
        half_width = contrast_norm * sd_y * z_quantile
        naive_CI = [center - half_width, center + half_width]
        coverage_naive.append((target >= naive_CI[0] and target <= naive_CI[1]))
        lengths_naive.append((naive_CI[1] - naive_CI[0]) * root_n)

    if X_test is not None:
        ro.globalenv['X_test_r'] = numpy2ri.py2rpy(X_test)
        # predict.rpart takes `newdata` (a data frame whose column names match
        # the training predictors). The previous `data = X_test_r` was not a
        # recognised argument, so the fitted values on the training data were
        # returned regardless of X_test.
        ro.r('test_data <- data.frame(X_test_r)')
        ro.r('colnames(test_data) <- paste0("x", 1:p)')
        ro.r('pred <- predict(bls.tree, newdata = test_data)')
        pred = ro.r['pred']
    else:
        pred = None

    return (np.mean(coverage), np.mean(lengths),
            np.mean(coverage_naive), np.mean(lengths_naive), pred)

# RRT inference

In [13]:
def randomized_inference(reg_tree, sd_y, y, mu, level=0.1):
    """Selective confidence intervals for every terminal node of a fitted tree.

    For each node in `reg_tree.terminal_nodes`, `reg_tree.node_inference` is
    queried and the equal-tailed interval of the returned distribution is
    rescaled by ||contrast|| * sd_y to the scale of contrast'mu.

    Parameters
    ----------
    reg_tree : fitted tree exposing `terminal_nodes` and `node_inference`
    sd_y : float
        Noise standard deviation.
    y : np.ndarray
        Observed responses.
    mu : np.ndarray
        True mean vector, used only to evaluate coverage.
    level : float
        Miscoverage level passed as `alpha` to `equal_tailed_interval`.

    Returns
    -------
    (list, list)
        Per-node coverage indicators, and per-node interval lengths divided
        by ||contrast||. Progress is printed after every node.
    """
    covered_flags = []
    scaled_lengths = []

    for leaf_no, leaf in enumerate(reg_tree.terminal_nodes):
        inference_out = reg_tree.node_inference(node=leaf,
                                                ngrid=10000,
                                                ncoarse=300,
                                                grid_w_const=5,
                                                sd=sd_y,
                                                interp_kind='cubic',
                                                query_size=100)
        # Only the pivot distribution and the two contrasts are used here
        (_pval, dist, contrast, norm_contrast, _obs_tar, _logW, _suff,
         _sel_probs, _ref_hat_layer) = inference_out

        target = contrast.dot(mu)

        # The distribution yields an interval for the standardised target
        # eta'mu / (norm(eta) * sd_y); multiply back to the eta'mu scale.
        selective_CI = np.array(
            dist.equal_tailed_interval(observed=norm_contrast.dot(y),
                                       alpha=level))
        selective_CI *= np.linalg.norm(contrast) * sd_y

        lower, upper = selective_CI[0], selective_CI[1]
        covered_flags.append(lower <= target <= upper)
        scaled_lengths.append((upper - lower) / np.linalg.norm(contrast))

        if not covered_flags[-1]:
            print(f"{leaf_no} not covered, depth {leaf.depth}")
        print(f'Mean coverage so far: {np.mean(covered_flags)}, Mean length so far: {np.mean(scaled_lengths)}')

    return covered_flags, scaled_lengths

# Inference with UV decomposition

In [14]:
def UV_decomposition(X, y, mu, sd_y,
                     max_depth=5, min_prop=0, min_sample=10, min_bucket=5,
                     level=0.1, gamma=1,
                     X_test=None):
    """Leaf-mean intervals via a U/V split of the response.

    Gaussian noise W ~ N(0, gamma * sd_y^2) is drawn; the tree is grown on
    U = y + W (without extra randomization, `sd=0`) and z-intervals for the
    leaf means are computed from V = y - W / gamma, whose standard deviation
    is sd_y * sqrt(1 + 1 / gamma).

    Parameters
    ----------
    X, y : training design matrix and responses
    mu : true mean vector, used only to evaluate coverage
    sd_y : noise standard deviation
    max_depth, min_prop, min_sample, min_bucket : passed to `RegressionTree`
    level : miscoverage level of the intervals
    gamma : variance ratio of the added noise
    X_test : optional design matrix to predict on with the fitted tree

    Returns
    -------
    (list, list, pred)
        Per-leaf coverage indicators, per-leaf lengths multiplied by
        1/||contrast||, and predictions for `X_test` (None if not given).
    """
    n_obs = X.shape[0]
    noise = np.random.normal(loc=0, scale=sd_y * np.sqrt(gamma), size=(n_obs,))
    y_select = y + noise          # used to grow the tree
    y_infer = y - noise / gamma   # used for inference
    sd_infer = sd_y * np.sqrt(1 + 1 / gamma)

    tree = RegressionTree(min_samples_split=min_sample, max_depth=max_depth,
                          min_proportion=min_prop, min_bucket=min_bucket)
    tree.fit(X, y_select, sd=0)

    coverage, lengths = [], []

    for leaf in tree.terminal_nodes:
        # Normalised membership vector: weights.dot(v) is the leaf average of v
        weights = np.array(leaf.membership * 1 / np.sum(leaf.membership))
        weights_norm = np.linalg.norm(weights)

        target = weights.dot(mu)

        # z-interval based on the inference copy of the response
        center = weights.dot(y_infer)
        half_width = weights_norm * sd_infer * ndist.ppf(1 - level / 2)
        lower = center - half_width
        upper = center + half_width

        root_n = 1 / weights_norm
        coverage.append((target >= lower and target <= upper))
        lengths.append((upper - lower) * root_n)

    pred = tree.predict(X_test) if X_test is not None else None

    return coverage, lengths, pred

# Replicating Figure 2

In [None]:
def terminal_inference_sim(n=50, p=5, a=0.1, b=0.1,
                           sd_y=1,
                           noise_sd_list=[0.5, 1, 2, 5],
                           UV_gamma_list=[],
                           use_nonrand=True,
                           start=0, end=100,
                           level=0.1, path=None):
    """Simulate coverage, interval length and prediction MSE of leaf inference.

    Each replication `i` in `range(start, end)` seeds NumPy with `i + 10000`,
    draws Gaussian `X`, builds the mean `mu`, draws `y` and an independent
    `y_test` at the same `X`, and then evaluates the requested methods.

    Parameters
    ----------
    n, p : int
        Sample size and number of features.
    a, b : float
        Signal parameters of the mean function.
    sd_y : float
        Noise standard deviation.
    noise_sd_list : list of float
        Randomization scales c; RRT is fitted with `sd = c * sd_y`.
        (The default list is only read, never mutated.)
    UV_gamma_list : list of float
        Gamma values for `UV_decomposition`.
    use_nonrand : bool
        Whether to run tree-values and naive inference.
    start, end : int
        Range of replication indices.
    level : float
        Miscoverage level, applied to every method.
    path : str or None
        If given, results so far are dumped there with joblib after each
        replication.

    Returns
    -------
    (dict, dict, dict)
        Coverage, length and MSE dictionaries keyed by method name
        ("RRT_<c>", "Tree val", "Naive", "UV_<gamma>"), one entry per replication.
    """
    method_list = [f"RRT_{sd}" for sd in noise_sd_list]
    if use_nonrand:
        method_list += ["Tree val", "Naive"]
    for gamma in UV_gamma_list:
        method_list.append("UV_" + str(gamma))

    coverage_dict = {m: [] for m in method_list}
    length_dict = {m: [] for m in method_list}
    MSE_dict = {m: [] for m in method_list}

    for i in range(start, end):
        print(i, "th simulation")
        np.random.seed(i + 10000)
        X = np.random.normal(size=(n, p))

        mu = b * ((X[:, 0] <= 0) * (1 + a * (X[:, 1] > 0) + (X[:, 2] * X[:, 1] <= 0)))
        y = mu + np.random.normal(size=(n,), scale=sd_y)
        # Independent responses at the same design points, for the MSE
        y_test = generate_test(mu, sd_y)

        if use_nonrand:
            # Tree value & naive inference & prediction.
            # `level` is forwarded so these intervals use the same nominal
            # coverage as the RRT intervals (previously the default was used).
            (coverage_treeval, avg_len_treeval,
             coverage_treeval_naive, avg_len_treeval_naive,
             pred_test_treeval) = tree_values_inference(X, y, mu, sd_y=sd_y,
                                                        X_test=X, max_depth=3,
                                                        level=level)
            MSE_test_treeval = (np.mean((y_test - pred_test_treeval) ** 2))

            coverage_dict["Tree val"].append(coverage_treeval)
            length_dict["Tree val"].append(avg_len_treeval)
            MSE_dict["Tree val"].append(MSE_test_treeval)
            # Naive inference uses the same tree, hence the same MSE
            coverage_dict["Naive"].append(coverage_treeval_naive)
            length_dict["Naive"].append(avg_len_treeval_naive)
            MSE_dict["Naive"].append(MSE_test_treeval)
            print(f"Tree val coverage: {coverage_treeval}, length: {avg_len_treeval}")
            print(f"Naive coverage: {coverage_treeval_naive}, length: {avg_len_treeval_naive}")

        for gamma in UV_gamma_list:
            gamma_key = "UV_" + str(gamma)
            # UV decomposition, at the same level as the other methods
            coverage_UV, len_UV, pred_UV = UV_decomposition(X, y, mu, sd_y, X_test=X,
                                                            min_prop=0., max_depth=3,
                                                            min_sample=50, min_bucket=20,
                                                            level=level,
                                                            gamma=gamma)
            MSE_UV = (np.mean((y_test - pred_UV) ** 2))
            coverage_dict[gamma_key].append(np.mean(coverage_UV))
            length_dict[gamma_key].append(np.mean(len_UV))
            MSE_dict[gamma_key].append(MSE_UV)
            print(f"UV {gamma} coverage: {np.mean(coverage_UV)}, length: {np.mean(len_UV)}")

        for noise_sd in noise_sd_list:
            rrt_key = f"RRT_{noise_sd}"
            # Create and train the randomized regression tree
            reg_tree = RegressionTree(min_samples_split=50, max_depth=3,
                                      min_proportion=0., min_bucket=20)

            reg_tree.fit(X, y, sd=noise_sd * sd_y)

            coverage_i, lengths_i = randomized_inference(reg_tree=reg_tree,
                                                         y=y, sd_y=sd_y, mu=mu,
                                                         level=level)
            pred_test = reg_tree.predict(X)
            MSE_test = (np.mean((y_test - pred_test) ** 2))
            # Record results
            coverage_dict[rrt_key].append(np.mean(coverage_i))
            length_dict[rrt_key].append(np.mean(lengths_i))
            MSE_dict[rrt_key].append(MSE_test)

        # Checkpoint after every replication so partial runs are not lost
        if path is not None:
            joblib.dump([coverage_dict, length_dict, MSE_dict], path, compress=1)

    return coverage_dict, length_dict, MSE_dict

In [None]:
# Compare RRT at four randomization scales against tree-values and naive
# inference (use_nonrand=True), without UV decomposition, over 5 replications.
# NOTE(review): the saved result tables further down show columns
# RRT_1, RRT_2.5, RRT_5, RRT_10 and 10 rows, which do not match
# noise_sd_list=[1, 2, 3, 4] and end=5 here -- re-run to refresh the outputs.
coverage_dict_fig1, length_dict_fig1, MSE_dict_fig1\
    = terminal_inference_sim(n=200, p=5, a=1, b=2,
                             sd_y=2,
                             noise_sd_list=[1, 2, 3, 4],
                             UV_gamma_list=[],
                             use_nonrand=True,
                             start=0, end=5,
                             level=0.1, path=None)

0 th simulation
Tree val coverage: 0.7142857142857143, length: 122.35742192633414
Naive coverage: 0.7142857142857143, length: 6.579414507805888
Mean coverage so far: 1.0, Mean length so far: 7.657584246359095
Mean coverage so far: 1.0, Mean length so far: 7.656083174219065
Mean coverage so far: 1.0, Mean length so far: 7.496330663326972
Mean coverage so far: 1.0, Mean length so far: 7.477347852582906
4 not covered, depth 3
Mean coverage so far: 0.8, Mean length so far: 7.060710441606699
Mean coverage so far: 0.8333333333333334, Mean length so far: 6.7710874947873805
Mean coverage so far: 0.8571428571428571, Mean length so far: 6.500319559150195


In [41]:
# Per-replication mean coverage over terminal nodes.
# Columns:
# RRT_c: RRT with external randomization N(0, (c*sd_y)^2)
# Tree val: Tree-values
# Naive: naive inference
# Rows: each row corresponds to one round of simulation
pd.DataFrame(coverage_dict_fig1)

Unnamed: 0,RRT_1,RRT_2.5,RRT_5,RRT_10,Tree val,Naive
0,0.857143,0.833333,0.5,0.333333,0.714286,0.714286
1,1.0,1.0,0.333333,0.5,0.833333,0.5
2,1.0,0.5,0.333333,0.571429,1.0,0.666667
3,1.0,0.833333,0.833333,0.833333,1.0,0.666667
4,0.833333,0.833333,0.5,0.0,0.833333,0.5
5,0.666667,0.5,0.2,0.833333,1.0,0.666667
6,0.666667,0.666667,0.5,0.8,0.833333,0.833333
7,0.833333,0.571429,0.333333,0.666667,1.0,0.833333
8,0.666667,1.0,0.5,0.0,1.0,0.833333
9,0.857143,0.5,0.714286,0.8,0.857143,0.714286


In [42]:
# Per-replication mean (scaled) interval length over terminal nodes.
# Columns:
# RRT_c: RRT with external randomization N(0, (c*sd_y)^2)
# Tree val: Tree-values
# Naive: naive inference
# Rows: each row corresponds to one round of simulation
pd.DataFrame(length_dict_fig1)

Unnamed: 0,RRT_1,RRT_2.5,RRT_5,RRT_10,Tree val,Naive
0,7.784962,3.223355,2.088642,1.555218,25.308622,1.258967
1,8.140773,3.335638,2.374394,3.588022,2.290091,1.166525
2,7.670032,24.522497,2.66647,1.834126,1.871536,1.178633
3,8.087814,3.729001,2.017375,1.61279,8.596456,1.175594
4,7.680314,3.294272,2.768114,1.531076,4.511139,1.1484
5,6.478428,3.037022,2.229905,1.681322,3.92466,1.162317
6,6.590405,3.178615,1.91074,2.3196,6.900845,1.187061
7,7.964584,3.795824,2.184275,2.927199,4.945598,1.183292
8,8.461452,9.54808,2.340645,1.824263,2.397167,1.196362
9,7.91885,3.688424,2.469129,2.797241,3.960502,1.246913


In [43]:
# Per-replication prediction MSE against the independently drawn y_test.
# Columns:
# RRT_c: RRT with external randomization N(0, (c*sd_y)^2)
# Tree val: Tree-values
# Naive: naive inference (same tree as Tree val, hence identical MSE)
# Rows: each row corresponds to one round of simulation
pd.DataFrame(MSE_dict_fig1)

Unnamed: 0,RRT_1,RRT_2.5,RRT_5,RRT_10,Tree val,Naive
0,4.754626,5.385438,6.062667,5.715567,4.774639,4.774639
1,5.007723,4.517161,5.284633,5.598432,4.747049,4.747049
2,5.004523,4.47755,4.93636,5.975434,4.853868,4.853868
3,6.023852,5.498041,6.179695,6.447653,5.547141,5.547141
4,3.863381,4.093041,4.473121,4.453665,4.162055,4.162055
5,4.755865,5.053366,4.820317,4.622594,4.527319,4.527319
6,3.988342,4.205111,5.472791,4.571316,3.973017,3.973017
7,4.853487,4.825132,4.799683,5.419323,4.97682,4.97682
8,4.664046,4.499219,4.788677,4.708756,4.531241,4.531241
9,3.778134,3.744351,4.469582,4.835217,3.530005,3.530005


# Replicating Figure 3

In [None]:
# Compare RRT with randomization scale 1 against UV decomposition at several
# gamma values; tree-values / naive inference are skipped (use_nonrand=False).
# NOTE(review): the saved logs and tables below contain no UV_0.05 entries
# although 0.05 is listed here -- the outputs look stale; re-run to refresh.
coverage_dict_fig2, length_dict_fig2, MSE_dict_fig2\
    = terminal_inference_sim(n=200, p=5, a=1, b=2,
                             sd_y=2,
                             noise_sd_list=[1],
                             UV_gamma_list=[0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
                             use_nonrand=False,
                             start=0, end=5,
                             level=0.1, path=None)

0 th simulation
UV 0.1 coverage: 1.0, length: 21.82144926261298
UV 0.2 coverage: 0.6666666666666666, length: 16.116208350389353
UV 0.3 coverage: 1.0, length: 13.696143477293719
UV 0.4 coverage: 0.8333333333333334, length: 12.30895744688977
UV 0.5 coverage: 0.5714285714285714, length: 11.395880211575573
Mean coverage so far: 1.0, Mean length so far: 7.5291315994555035
Mean coverage so far: 1.0, Mean length so far: 7.516834509667259
Mean coverage so far: 1.0, Mean length so far: 7.435125732792668
Mean coverage so far: 1.0, Mean length so far: 7.425790412954589
Mean coverage so far: 1.0, Mean length so far: 6.981822927715513
Mean coverage so far: 1.0, Mean length so far: 6.675017724826809
Mean coverage so far: 1.0, Mean length so far: 6.537833754193566
1 th simulation
UV 0.1 coverage: 0.6666666666666666, length: 21.821449262612983
UV 0.2 coverage: 1.0, length: 16.116208350389357
UV 0.3 coverage: 1.0, length: 13.696143477293719
UV 0.4 coverage: 1.0, length: 12.308957446889766
UV 0.5 covera

  self._partition *= np.exp(_largest)


2 not covered, depth 2
Mean coverage so far: 0.6666666666666666, Mean length so far: 6.883913188468905
Mean coverage so far: 0.75, Mean length so far: 6.426837659472451
Mean coverage so far: 0.8, Mean length so far: 6.089736657812509
Mean coverage so far: 0.8333333333333334, Mean length so far: 5.8852722533124835
2 th simulation
UV 0.1 coverage: 1.0, length: 21.821449262612983
UV 0.2 coverage: 0.8333333333333334, length: 16.116208350389353
UV 0.3 coverage: 1.0, length: 13.696143477293719
UV 0.4 coverage: 1.0, length: 12.30895744688977
UV 0.5 coverage: 1.0, length: 11.395880211575575
Mean coverage so far: 1.0, Mean length so far: 7.370520465180864
Mean coverage so far: 1.0, Mean length so far: 7.415275464809743
Mean coverage so far: 1.0, Mean length so far: 7.255436055869914
Mean coverage so far: 1.0, Mean length so far: 7.119708797031176
Mean coverage so far: 1.0, Mean length so far: 6.873134477287263
Mean coverage so far: 1.0, Mean length so far: 6.638299298931652
3 th simulation
UV 0

  self._partition *= np.exp(_largest)


Mean coverage so far: 1.0, Mean length so far: 7.10103788833613
Mean coverage so far: 1.0, Mean length so far: 6.472146428250577
Mean coverage so far: 1.0, Mean length so far: 6.458274582138017
Mean coverage so far: 1.0, Mean length so far: 6.482020360052393


In [10]:
# Per-replication mean coverage over terminal nodes.
# Columns:
# RRT_c: RRT with external randomization N(0, (c*sd_y)^2)
# UV_k: UV decomposition with gamma = k
# Rows: each row corresponds to one round of simulation
pd.DataFrame(coverage_dict_fig2)

Unnamed: 0,RRT_1,UV_0.1,UV_0.2,UV_0.3,UV_0.4,UV_0.5
0,1.0,1.0,0.666667,1.0,0.833333,0.571429
1,0.833333,0.666667,1.0,1.0,1.0,1.0
2,1.0,1.0,0.833333,1.0,1.0,1.0
3,1.0,0.666667,1.0,0.833333,0.8,1.0
4,1.0,0.833333,1.0,1.0,0.833333,0.666667


In [11]:
# Per-replication mean (scaled) interval length over terminal nodes.
# Columns:
# RRT_c: RRT with external randomization N(0, (c*sd_y)^2)
# UV_k: UV decomposition with gamma = k
# Rows: each row corresponds to one round of simulation
pd.DataFrame(length_dict_fig2)

Unnamed: 0,RRT_1,UV_0.1,UV_0.2,UV_0.3,UV_0.4,UV_0.5
0,6.537834,21.821449,16.116208,13.696143,12.308957,11.39588
1,5.885272,21.821449,16.116208,13.696143,12.308957,11.39588
2,6.638299,21.821449,16.116208,13.696143,12.308957,11.39588
3,6.868349,21.821449,16.116208,13.696143,12.308957,11.39588
4,6.48202,21.821449,16.116208,13.696143,12.308957,11.39588


In [12]:
# Per-replication prediction MSE against the independently drawn y_test.
# Columns:
# RRT_c: RRT with external randomization N(0, (c*sd_y)^2)
# UV_k: UV decomposition with gamma = k
# Rows: each row corresponds to one round of simulation
pd.DataFrame(MSE_dict_fig2)

Unnamed: 0,RRT_1,UV_0.1,UV_0.2,UV_0.3,UV_0.4,UV_0.5
0,4.850155,4.786383,5.834787,4.83821,5.457604,4.927014
1,4.759003,4.710673,4.840395,4.753985,4.710537,4.774679
2,4.351734,4.559251,4.849142,4.902989,4.852253,5.041466
3,5.217155,5.623968,5.686481,5.39728,6.000632,6.032576
4,3.904771,4.187648,4.118202,4.335947,4.209115,4.169053


In [62]:
# Scratch check: 5x5 matrix with 2 on the diagonal and 1 elsewhere
# (identity plus the all-ones matrix).
cov = np.eye(5) + np.ones((5,5))

In [63]:
# Upper-triangular factor of the inverse of `cov`: the transpose of the
# (lower) Cholesky factor, so that L.T @ L equals inv(cov).
L = np.linalg.cholesky(np.linalg.inv(cov)).T

In [64]:
# 100000 columns of 5 half-normal draws (all entries are non-negative).
# NOTE(review): no seed is set here, and `X` reuses the name used for the
# design matrix inside the simulation functions.
X = np.abs(np.random.normal(size=(5,100000)))

In [65]:
L

array([[ 0.91287093, -0.18257419, -0.18257419, -0.18257419, -0.18257419],
       [ 0.        ,  0.89442719, -0.2236068 , -0.2236068 , -0.2236068 ],
       [ 0.        ,  0.        ,  0.8660254 , -0.28867513, -0.28867513],
       [ 0.        ,  0.        ,  0.        ,  0.81649658, -0.40824829],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.70710678]])

In [25]:
# Smallest entry of L @ X; the saved output is negative even though X >= 0,
# because L has negative off-diagonal entries.
# NOTE(review): this cell's execution count (25) is out of order with the
# cells above (62-65); confirm it reproduces under Restart & Run All.
np.min(L @ X)

np.float64(-1.6623417197792252)