In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from bart_playground import *
import bartz
from stochtree import BARTModel
import time

import itertools
import pandas as pd
from sklearn.model_selection import KFold

## Abalone Data

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# Download the Abalone dataset (UCI repository id 1) through ucimlrepo.
abalone = fetch_ucirepo(id=1)

# Split the fetched bundle into its feature table and its target table.
X, y = abalone.data.features, abalone.data.targets

# Print the per-variable metadata (name, role, type, units, ...).
print(abalone.variables)

             name     role         type demographic  \
0             Sex  Feature  Categorical        None   
1          Length  Feature   Continuous        None   
2        Diameter  Feature   Continuous        None   
3          Height  Feature   Continuous        None   
4    Whole_weight  Feature   Continuous        None   
5  Shucked_weight  Feature   Continuous        None   
6  Viscera_weight  Feature   Continuous        None   
7    Shell_weight  Feature   Continuous        None   
8           Rings   Target      Integer        None   

                   description  units missing_values  
0         M, F, and I (infant)   None             no  
1    Longest shell measurement     mm             no  
2      perpendicular to length     mm             no  
3           with meat in shell     mm             no  
4                whole abalone  grams             no  
5               weight of meat  grams             no  
6  gut weight (after bleeding)  grams             no  
7        

In [3]:
# Derive the feature frame from the fetched dataset instead of from the
# previous value of X. The original `X = X.drop(...)` could only run once per
# kernel: a second run raised because 'Sex' was already gone (or because X had
# since been turned into a NumPy array by a later cell).
# 'Sex' is the only categorical column; it is removed so the remaining
# columns can be cast to float.
X = abalone.data.features.drop(columns=['Sex'])
X

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960


In [4]:
# Convert features and target to plain NumPy arrays.
# np.array(..., dtype=float) accepts both a DataFrame and an ndarray, so the
# cell can be re-run safely; the original `X.values` raised AttributeError on
# a second run because X was already an ndarray.
X = np.array(X, dtype=float)
# Flatten the single-column target into a 1-D vector.
y = np.array(y).reshape(-1)

In [5]:
# Hold out a test split with a fixed seed; no test_size is passed, so
# scikit-learn's default split fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Alternative grid kept for reference (presumably an earlier search round;
# confirm against the rows already stored in the memory file):
#     ndpost     : [200, 300]
#     nskip      : [100]
#     n_trees    : [200, 300]
#     tree_alpha : [0.95, 0.8]
#     tree_beta  : [2.0, 3.0]
#     f_k        : [2.0, 3.0]
#     eps_q      : [0.9, 0.8]
#     eps_nu     : [3.0, 5.0]

# Active grid: only n_trees, f_k and eps_nu vary; everything else is fixed.
param_grid = dict(
    ndpost=[300],
    nskip=[100],
    n_trees=[300, 500],
    tree_alpha=[0.95],
    tree_beta=[2.0],
    f_k=[4.0, 5.0, 7.0],
    eps_q=[0.9],
    eps_nu=[3.0, 5.0, 7.0],
)

# Column order of every parameter tuple follows the key order of the grid.
param_names = list(param_grid)
# Full Cartesian product of the grid, one tuple per configuration.
param_list = list(itertools.product(*(param_grid[name] for name in param_names)))

# CSV file that accumulates the results of every evaluated configuration.
memory_file = "Abalone_tuning_memory_btz.csv"

In [7]:
# Resume from the results file when it exists; otherwise start an empty table
# whose columns are the parameter names followed by the three CV metrics.
memory_df = (
    pd.read_csv(memory_file)
    if os.path.exists(memory_file)
    else pd.DataFrame(columns=param_names + ["cv_mse", "cv_pi_length", "cv_coverage"])
)


In [8]:
# Index of the first grid point to evaluate (0 = the whole grid).
start_idx = 0
# Rebuild the list from param_grid before slicing. The original sliced
# param_list in place (`param_list = param_list[start_idx:]`), so re-running
# the cell with a non-zero start_idx silently dropped additional grid points
# each time.
param_list = list(itertools.product(*param_grid.values()))[start_idx:]

In [9]:
# Shared 5-fold splitter (shuffled, fixed seed) used by the tuning loop below.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## Bartz

In [10]:
from joblib import Parallel, delayed

def evaluate_params(params):
    """Cross-validate one bartz hyper-parameter combination.

    Parameters
    ----------
    params : sequence of 8 values
        ``(ndpost, nskip, n_trees, tree_alpha, tree_beta, f_k, eps_q, eps_nu)``.

    Returns
    -------
    list
        The eight parameter values followed by the fold-averaged MSE,
        interval length and coverage.

    Notes
    -----
    Reads the notebook-level ``kf``, ``X_train`` and ``y_train``. Prints a
    one-line summary for the combination before returning.

    NOTE(review): the 2.5/97.5 percentiles are taken over whatever
    ``model.predict`` returns. If those are draws of the regression function
    (without observation noise), the band is a credible band for the mean, not
    a prediction interval, and its coverage of ``y_val`` will sit well below
    95% -- TODO confirm against the bartz documentation.
    """
    ndpost, nskip, n_trees, tree_alpha, tree_beta, f_k, eps_q, eps_nu = params

    fold_mse, fold_length, fold_coverage = [], [], []
    for fit_rows, held_out_rows in kf.split(X_train):
        X_tr, y_tr = X_train[fit_rows], y_train[fit_rows]
        X_val, y_val = X_train[held_out_rows], y_train[held_out_rows]

        # bartz is given the transposed design matrix.
        model = bartz.BART.gbart(
            np.transpose(X_tr),
            y_tr,
            ntree=n_trees,
            ndpost=ndpost,
            nskip=nskip,
            base=tree_alpha,
            power=tree_beta,
            k=f_k,
            sigquant=eps_q,
            sigdf=eps_nu,
        )
        draws = model.predict(np.transpose(X_val))

        # Point prediction: average over the first axis of the returned draws.
        y_pred = np.mean(np.array(draws), axis=0)
        fold_mse.append(mean_squared_error(y_val, y_pred))

        # Central 95% band from the draws, its mean width and empirical coverage.
        band_low = np.percentile(draws, 2.5, axis=0)
        band_high = np.percentile(draws, 97.5, axis=0)
        fold_length.append(np.mean(band_high - band_low))
        inside_band = (y_val >= band_low) & (y_val <= band_high)
        fold_coverage.append(np.mean(inside_band))

    avg_mse = np.mean(fold_mse)
    avg_pi_length = np.mean(fold_length)
    avg_coverage = np.mean(fold_coverage)
    print(f"Params: {params}, CV MSE: {avg_mse:.4f}, PI Length: {avg_pi_length:.4f}, Coverage: {avg_coverage:.4f}")
    return [*params, avg_mse, avg_pi_length, avg_coverage]

# Keep only the grid points that do not already appear as a row of memory_df
# (a row matches when all of its parameter columns equal the candidate tuple).
params_to_run = [
    candidate
    for candidate in param_list
    if not (memory_df[param_names] == candidate).all(axis=1).any()
]

# Evaluate the remaining grid points on three joblib workers.
results = Parallel(n_jobs=3)(
    delayed(evaluate_params)(candidate) for candidate in params_to_run
)

In [11]:
# Append every newly evaluated configuration to the results table.
for res in results:
    memory_df.loc[len(memory_df)] = res

# Write the table once, after all rows are appended. The original called
# to_csv inside the loop and so rewrote the whole file once per new row.
memory_df.to_csv(memory_file, index=False)

In [12]:
import pandas as pd

# Path of the tuning-results CSV; repeats the value set in the grid cell,
# presumably so the analysis cells can run without re-running the search.
memory_file = "Abalone_tuning_memory_btz.csv"

# Reload the full results table from disk.
memory_df = pd.read_csv(memory_file)

In [13]:
# Five configurations with the lowest cross-validated MSE.
print(
    memory_df
    .sort_values(by="cv_mse", ascending=True)
    .head(n=5)
)

     ndpost  nskip  n_trees  tree_alpha  tree_beta  f_k  eps_q  eps_nu  \
101   300.0  100.0    300.0        0.95        2.0  3.0    0.9     5.0   
144   300.0  100.0    500.0        0.95        2.0  4.0    0.9     5.0   
145   300.0  100.0    500.0        0.95        2.0  4.0    0.9     7.0   
102   300.0  100.0    300.0        0.95        2.0  3.0    0.8     3.0   
140   300.0  100.0    300.0        0.95        2.0  4.0    0.9     3.0   

       cv_mse  cv_pi_length  cv_coverage  
101  4.637364      2.065311     0.438383  
144  4.640900      1.852411     0.400396  
145  4.647007      1.853728     0.395601  
102  4.649617      2.072455     0.429123  
140  4.650125      1.873226     0.396564  


In [14]:
# Five configurations with the shortest average interval length.
print(
    memory_df
    .sort_values(by="cv_pi_length", ascending=True)
    .head(n=5)
)

     ndpost  nskip  n_trees  tree_alpha  tree_beta  f_k  eps_q  eps_nu  \
139   300.0  100.0    500.0        0.95        2.0  7.0    0.9     7.0   
137   300.0  100.0    500.0        0.95        2.0  7.0    0.9     3.0   
138   300.0  100.0    500.0        0.95        2.0  7.0    0.9     5.0   
132   300.0  100.0    300.0        0.95        2.0  7.0    0.9     5.0   
133   300.0  100.0    300.0        0.95        2.0  7.0    0.9     7.0   

       cv_mse  cv_pi_length  cv_coverage  
139  4.778555      1.439065     0.305248  
137  4.772989      1.439781     0.305248  
138  4.773483      1.442573     0.306203  
132  4.757532      1.480303     0.310674  
133  4.749754      1.481503     0.309076  


In [15]:
# Five configurations with the highest average coverage.
print(
    memory_df
    .sort_values(by="cv_coverage", ascending=False)
    .head(n=5)
)

    ndpost  nskip  n_trees  tree_alpha  tree_beta  f_k  eps_q  eps_nu  \
96   300.0  100.0    300.0        0.95        2.0  2.0    0.9     3.0   
98   300.0  100.0    300.0        0.95        2.0  2.0    0.8     3.0   
99   300.0  100.0    300.0        0.95        2.0  2.0    0.8     5.0   
97   300.0  100.0    300.0        0.95        2.0  2.0    0.9     5.0   
34   200.0  100.0    300.0        0.95        2.0  2.0    0.8     3.0   

      cv_mse  cv_pi_length  cv_coverage  
96  4.714252      2.321143     0.486911  
98  4.747186      2.344612     0.485309  
99  4.707543      2.334672     0.477964  
97  4.703890      2.339878     0.475741  
34  4.761030      2.253083     0.472217  


In [16]:
import plotly.express as px

# Reload the results table so the plot reflects what is stored on disk.
memory_df = pd.read_csv(memory_file)

# Hyper-parameter columns shown in the hover tooltip.
param_names = [
    'ndpost', 'nskip', 'n_trees', 'tree_alpha',
    'tree_beta', 'f_k', 'eps_q', 'eps_nu',
]

# Tooltip columns: the parameters followed by the three CV metrics.
hover_columns = param_names + ["cv_mse", "cv_pi_length", "cv_coverage"]
# Human-readable axis / colour-bar labels for the metric columns.
axis_labels = {
    "cv_pi_length": "Prediction Interval Length",
    "cv_coverage": "Coverage",
    "cv_mse": "MSE",
}

# One marker per evaluated configuration: interval length vs coverage,
# coloured by MSE.
fig = px.scatter(
    memory_df,
    x="cv_pi_length",
    y="cv_coverage",
    color="cv_mse",
    hover_data=hover_columns,
    labels=axis_labels,
    title="Parameter Search: PI Length vs Coverage (Color = MSE)",
)
fig.update_traces(marker={"size": 12, "line_width": 1})
fig.show()

In [17]:
# Keep configurations with CV MSE below 5 and add coverage per unit of
# interval length ("slope") as a ranking score.
low_mse_rows = memory_df["cv_mse"] < 5
filtered = memory_df.loc[low_mse_rows].assign(
    slope=lambda rows: rows["cv_coverage"] / rows["cv_pi_length"]
)

# Rank from highest to lowest slope.
filtered_sorted = filtered.sort_values(by="slope", ascending=False)

# Show the parameters, the three metrics and the score for the top ten rows.
cols_to_show = [*param_names, "cv_mse", "cv_pi_length", "cv_coverage", "slope"]
print(filtered_sorted[cols_to_show].head(10))

     ndpost  nskip  n_trees  tree_alpha  tree_beta  f_k  eps_q  eps_nu  \
144   300.0  100.0    500.0        0.95        2.0  4.0    0.9     5.0   
124   300.0  100.0    300.0        0.80        3.0  3.0    0.9     3.0   
60    200.0  100.0    300.0        0.80        3.0  3.0    0.9     3.0   
129   300.0  100.0    300.0        0.95        2.0  5.0    0.9     5.0   
142   300.0  100.0    300.0        0.95        2.0  4.0    0.9     7.0   
103   300.0  100.0    300.0        0.95        2.0  3.0    0.8     5.0   
130   300.0  100.0    300.0        0.95        2.0  5.0    0.9     7.0   
145   300.0  100.0    500.0        0.95        2.0  4.0    0.9     7.0   
134   300.0  100.0    500.0        0.95        2.0  5.0    0.9     3.0   
143   300.0  100.0    500.0        0.95        2.0  4.0    0.9     3.0   

       cv_mse  cv_pi_length  cv_coverage     slope  
144  4.640900      1.852411     0.400396  0.216149  
124  4.688760      1.931974     0.415709  0.215173  
60   4.703759      1.86650