In [4]:
from ucimlrepo import fetch_ucirepo
import numpy as np
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.datasets import fetch_covtype
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import time

from licketyresplit_rashomon_importance_distribution import RashomonImportanceDistribution as LicketyRID
# TreeFARMS currently has a bug, so "TreeFarmsRID" is only an alias for LicketyRID.
# Lickety run with maximum lookahead is treated as the TreeFARMS stand-in because
# it is optimal in that setting.
TreeFarmsRID = LicketyRID

# Dataset selection.
# Exactly one loader runs, chosen by name, so `df` is always defined exactly once
# (boolean on/off toggles could leave it undefined or silently overwrite it).
# Every branch builds `df` as the feature columns followed by the target column.
DATASET = "bike"  # one of: "spambase", "bike", "openml_42193", "adult", "covtype"

if DATASET == "spambase":
    spambase = fetch_ucirepo(id=94)
    X = spambase.data.features
    y = spambase.data.targets
    df = pd.concat([X, y], axis=1)  # y already binary

elif DATASET == "bike":
    bike = fetch_ucirepo(id=275)
    X = bike.data.features
    y = bike.data.targets
    # One-hot encode the categorical columns, keeping every level.
    X = pd.get_dummies(X, columns=['season', 'mnth', 'weekday', 'weathersit'], drop_first=False)
    # Reduce the date to its calendar year so the column is numeric.
    # NOTE(review): this presumably carries the same information as `yr` -- confirm
    # whether keeping both columns is intended.
    X['dteday'] = pd.to_datetime(X['dteday']).dt.year
    # Binarize the count target at the chosen quantile: label is 1 when cnt >= threshold.
    label_col = "cnt"
    label_quantile = 0.50
    thr = y[label_col].quantile(label_quantile)
    y_bin = (y[label_col] >= thr).astype(int).rename("label")
    df = pd.concat([X, y_bin], axis=1)

elif DATASET == "openml_42193":
    X_raw, y_raw = fetch_openml(data_id=42193, as_frame=True, return_X_y=True)
    df_all = X_raw.copy()
    df_all["label"] = y_raw
    df_all = df_all.dropna(axis=0, how="any").reset_index(drop=True)  # drop rows with any NA
    X_clean = df_all.drop(columns=["label"])
    y_clean = df_all["label"]
    X = pd.get_dummies(
        X_clean,
        drop_first=False,
        dtype="uint8"
    )
    y = pd.to_numeric(y_clean, errors="coerce").astype("uint8")
    df = pd.concat([X.reset_index(drop=True), y.rename("label").reset_index(drop=True)], axis=1)

elif DATASET == "adult":
    adult = fetch_openml("adult", version=2, as_frame=True)
    X_raw = adult.data.copy()
    y_raw = adult.target.copy()

    # Treat '?' as missing, drop those rows, and keep the target aligned with them.
    X_raw = X_raw.replace('?', pd.NA)
    X_raw = X_raw.dropna()
    y_raw = y_raw.loc[X_raw.index]
    X = pd.get_dummies(X_raw, drop_first=False)
    # Label is 1 when the target string contains ">50K".
    y = (y_raw.astype(str).str.contains(">50K")).astype(int)
    df = pd.concat([X.reset_index(drop=True), y.rename("income_gt_50k").reset_index(drop=True)], axis=1)

elif DATASET == "covtype":
    cov = fetch_covtype(as_frame=True)
    X = cov.data
    y = cov.target
    # One-vs-rest target: label is 1 for class 2, 0 otherwise.
    y_bin = (y == 2).astype(np.uint8).rename("label")
    df = pd.concat([X.reset_index(drop=True), y_bin.reset_index(drop=True)], axis=1)

else:
    raise ValueError(f"Unknown DATASET {DATASET!r}")


# Keyword arguments passed unchanged to every RID constructed in this script;
# the individual runs add their own `lickety_lookahead` on top of these.
common_kwargs = {
    "input_df": df,
    "binning_map": None,  # let RID binarize internally
    "db": 8,
    "lam": 0.007,
    "eps": 0.01,
    "vi_metric": "sub_mr",
    "dataset_name": "bike2",  # share bootstraps
    "n_resamples": 1,
    "verbose": False,
    "max_par_for_gosdt": 2,
    "allow_binarize_internally": True,
}

# Run 1: Lickety with a lookahead of 1; report how long the constructor call takes.
run_start = time.perf_counter()
LRID = LicketyRID(**common_kwargs, lickety_lookahead=1)
lickety_elapsed = time.perf_counter() - run_start
print(f"LicketyRID (lookahead=1) runtime: {lickety_elapsed:.3f} sec")

# Run 2: exposed under the "TreeFarms" name, but it is the same class with the
# lookahead set to the `db` value from the shared arguments.
run_start = time.perf_counter()
TRID = TreeFarmsRID(**common_kwargs, lickety_lookahead=common_kwargs["db"])
treefarms_elapsed = time.perf_counter() - run_start
print(f"TreeFarmsRID (optimal; lookahead=depth) runtime: {treefarms_elapsed:.3f} sec")

def _summary_line(prefix, low, high, mean, median):
    """Return one report line: `prefix` followed by range, mean and median to 4 decimals."""
    return (f"{prefix}range=({float(low):.4f}, {float(high):.4f}), "
            f"mean={float(mean):.4f}, median={float(median):.4f}")


# Both runs must describe the same variables before they are compared side by side.
assert TRID.n_vars == LRID.n_vars, "TRID and LRID must have same number of variables"
for v in range(TRID.n_vars):
    # Statistics from the "TreeFarms" (optimal) run.
    opt_low, opt_high = TRID.bwr(v)
    opt_mean = TRID.mean(v)
    opt_median = TRID.median(v)

    # Statistics from the Lickety (lookahead=1) run.
    lh1_low, lh1_high = LRID.bwr(v)
    lh1_mean = LRID.mean(v)
    lh1_median = LRID.median(v)

    # Variable index v is looked up positionally among the columns of df.
    print(f"Variable {v} ({df.columns[v]}) --------------")
    print(_summary_line("TreeFarms (opt) : ", opt_low, opt_high, opt_mean, opt_median))
    print(_summary_line("Lickety (lh=1)  : ", lh1_low, lh1_high, lh1_mean, lh1_median))
    print()

from scipy.stats import pearsonr, spearmanr

def _print_correlations(header, reference, candidate):
    """Print `header`, then the Pearson and Spearman correlation of two vectors.

    Returns the tuple (pearson_r, pearson_p, spearman_rho, spearman_p).
    """
    r_value, r_pvalue = pearsonr(reference, candidate)
    print(header)
    print(f"Pearson r = {r_value:.4f}, p-value = {r_pvalue:.4e}")

    rho_value, rho_pvalue = spearmanr(reference, candidate)
    print(f"Spearman rho = {rho_value:.4f}, p-value = {rho_pvalue:.4e}")
    return r_value, r_pvalue, rho_value, rho_pvalue


# Mean importance of every variable, one vector per run.
tree_means = np.array([float(TRID.mean(idx)) for idx in range(TRID.n_vars)])
lickety_means = np.array([float(LRID.mean(idx)) for idx in range(LRID.n_vars)])

# Correlation across all variables.
corr, pval, rank_corr, rank_pval = _print_correlations(
    "=== Linear Correlation: TreeFarms (opt) vs Lickety (lh=1) ===",
    tree_means,
    lickety_means,
)

# --- correlation on top-20 by "TreeFarms (opt)" ---
# argsort is ascending, so the last 20 indices are the largest TreeFarms means.
top20_idx = np.argsort(tree_means)[-20:]
tree_top20 = tree_means[top20_idx]
lickety_top20 = lickety_means[top20_idx]

corr, pval, rank_corr, rank_pval = _print_correlations(
    "=== Linear Correlation on Top 20 (TreeFarms opt features) ===",
    tree_top20,
    lickety_top20,
)




LicketyRID (lookahead=1) runtime: 40.901 sec




TreeFarmsRID (optimal; lookahead=depth) runtime: 48.732 sec
Variable 0 (dteday) --------------
TreeFarms (opt) : range=(-0.0425, 0.0709), mean=0.0152, median=0.0097
Lickety (lh=1)  : range=(-0.0425, 0.0709), mean=0.0152, median=0.0097

Variable 1 (yr) --------------
TreeFarms (opt) : range=(-0.0400, 0.0667), mean=0.0147, median=0.0095
Lickety (lh=1)  : range=(-0.0400, 0.0667), mean=0.0147, median=0.0095

Variable 2 (hr) --------------
TreeFarms (opt) : range=(0.2260, 0.2864), mean=0.2587, median=0.2563
Lickety (lh=1)  : range=(0.2260, 0.2864), mean=0.2587, median=0.2563

Variable 3 (holiday) --------------
TreeFarms (opt) : range=(-0.0000, 0.0000), mean=-0.0000, median=-0.0000
Lickety (lh=1)  : range=(-0.0000, 0.0000), mean=-0.0000, median=-0.0000

Variable 4 (workingday) --------------
TreeFarms (opt) : range=(-0.0000, 0.0000), mean=-0.0000, median=-0.0000
Lickety (lh=1)  : range=(-0.0000, 0.0000), mean=-0.0000, median=-0.0000

Variable 5 (temp) --------------
TreeFarms (opt) : range=