Why are the $\tau$ estimates biased?

In [None]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict
from pathlib import Path
from typing import Final

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from tabrel.utils.treatment import load_ihdp_data, generate_indices


# Load the IHDP data through the project helper. Only the first three of the
# six returned values are used here; the remaining three are discarded.
ihdp_data, ihdp_exclude_cols, ihdp_tau_colname, _, _, _ = load_ihdp_data(Path("../CEVAE/datasets/IHDP"))
# Feature table: every column except the excluded ones and the column named by
# `ihdp_tau_colname`.
# NOTE(review): the `+` below presumes `ihdp_exclude_cols` is a list - confirm against the helper.
x_all = ihdp_data.drop(columns=ihdp_exclude_cols + [ihdp_tau_colname])


def split_treated_non_treated(x: pd.DataFrame, treatment: np.ndarray, y_fact: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Partition features and factual outcomes by treatment assignment.

    Args:
        x: Feature table, one row per sample.
        treatment: Treatment indicator per sample; rows equal to 1 are
            considered treated, all other rows non-treated.
        y_fact: Factual outcomes, row-aligned with ``x``.

    Returns:
        ``(x_treated, y_treated, x_non_treated, y_non_treated)`` as NumPy arrays.
    """
    is_treated = treatment == 1
    is_control = ~is_treated
    return (
        x.loc[is_treated].to_numpy(),
        y_fact.loc[is_treated].to_numpy(),
        x.loc[is_control].to_numpy(),
        y_fact.loc[is_control].to_numpy(),
    )


# Column names of the factual and counterfactual outcomes in the IHDP table.
y_fact_colname = "y_factual"
y_cfact_colname = "y_cfactual"

# Outcome and treatment columns, kept as pandas objects.
data_y_fact = ihdp_data[y_fact_colname]
data_y_cfact = ihdp_data[y_cfact_colname]
data_treatment = ihdp_data["treatment"]


# NumPy views of the same columns for the S-learner.
y_s = data_y_fact.to_numpy()  # Y for S-learner
y_s_cfact = data_y_cfact.to_numpy()
treatment_np = data_treatment.to_numpy()
tau_true = ihdp_data[ihdp_tau_colname].to_numpy()

# Training hyper-parameters; `lr` and `n_epochs` are not referenced again in this cell.
lr = 1e-3
n_epochs = 50
lgb_params: Final[dict[str, str | int]] = {"objective": "regression", "metric": "rmse", "verbosity": -1}

# Row count of the feature table.
x_len = len(x_all)

# `x_s` is an alias of `x_all`, not a copy: adding the treatment column below
# also adds it to `x_all`. A later cell drops "treatment" from `x_all`, so
# this sharing is relied upon.
x_s = x_all
x_s["treatment"] = data_treatment

# S-learner design matrix: features plus the treatment indicator.
x_s_np = x_s.to_numpy()
n_samples, n_feats = x_s_np.shape

# Split the samples into three index sets and build the validation set, which
# is then doubled by appending a counterfactual copy of every validation row.
seed: Final[int] = 42
np.random.seed(seed)
# NOTE(review): `generate_indices` is a project helper; the names suggest
# query / background / validation index arrays - confirm against its definition.
ids_q, ids_b, ids_v = generate_indices(seed, n_total=x_len)
ids_train = np.concatenate((ids_q, ids_b))
xb, yb, xq, yq = x_s_np[ids_b], y_s[ids_b], x_s_np[ids_q], y_s[ids_q]
xv, yv, yv_cfact, treatment_v = x_s_np[ids_v], y_s[ids_v], y_s_cfact[ids_v], treatment_np[ids_v]
# Number of validation samples before the counterfactual rows are appended.
n_val = len(xv)

# Targets for the doubled validation set: factual outcomes first, then counterfactual ones.
yv = np.concatenate([yv, yv_cfact])
# Counterfactual inputs: same covariates with the treatment indicator flipped.
xv_cfact = xv.copy()
xv_cfact[:, -1] = 1 - treatment_v # assuming treatment is the last col
# First n_val rows of xv are factual inputs, the remaining n_val rows are counterfactual.
xv = np.concatenate([xv, xv_cfact])
# Append the counterfactual validation rows after the len(x_s) original rows
# of the full matrix and target vector.
x_s_np = np.concatenate([x_s_np[:len(x_s)], xv_cfact])
y_s = np.concatenate([y_s[:len(x_s)], yv_cfact])
# Taken before ids_v is extended below, so this has one entry per validation sample.
tau_val_true = tau_true[ids_v]
# Extend ids_v with the positions the appended counterfactual rows occupy in x_s_np / y_s.
ids_v = np.concatenate([ids_v, np.arange(start=len(x_s), stop=len(ids_v) + len(x_s))])

# Fit LightGBM on the q and b splits (factual rows only) and predict on the
# doubled validation set.
x_train = np.concatenate((xq, xb))
y_train = np.concatenate((yq, yb))
lgb_model = lgb.train(
    params=lgb_params,
    train_set=lgb.Dataset(x_train, label=y_train),
)
y_pred_lgb = lgb_model.predict(xv)

# The first n_val predictions belong to the factual inputs, the rest to the
# counterfactual (treatment-flipped) inputs.
y_lgb_fact = y_pred_lgb[:n_val]
y_lgb_cfact = y_pred_lgb[n_val:]
# Sign factor is +1 where treatment_v == 1 and -1 where treatment_v == 0, so the
# difference is always oriented as (treated prediction - untreated prediction).
# Assumes treatment_v only holds 0/1 values.
tau_lgb_pred = (y_lgb_fact - y_lgb_cfact) * (-1) ** (1 - treatment_v)


# Plot the true treatment effect against the LightGBM estimate, one point per
# validation sample (x axis is the position within the validation split).
plt.figure()
plt.title(f"LGB seed {seed}")
plt.plot(range(len(tau_val_true)), tau_val_true, label="tau validate true")
plt.plot(range(len(tau_lgb_pred)), tau_lgb_pred, label="tau val LGB")
# Without an explicit legend call the `label=` values above are never drawn.
plt.legend()
plt.show()

In [None]:
# Inspect the feature table. It is the same object as `x_s`, so once the cell
# above has run it also carries the "treatment" column.
x_all

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
from scipy.cluster.hierarchy import linkage, dendrogram

# Hierarchically cluster the covariates (not the samples) with Ward linkage and
# draw the resulting dendrogram, one leaf per feature.
x_notreatment = x_all.drop(columns=["treatment"])
X_feat = x_notreatment.T  # shape: (n_features x n_samples)
scaler = StandardScaler()
# NOTE(review): StandardScaler standardizes column-wise. After the transpose the
# columns are samples, so each sample is scaled across features rather than each
# feature across samples. Confirm this is intended; the per-feature variant
# would be `StandardScaler().fit_transform(x_notreatment).T`.
X_feat_scaled = scaler.fit_transform(X_feat)  # np.array, shape (n_features, n_samples)

Z = linkage(X_feat_scaled, method="ward")

plt.figure(figsize=(10, 5))
dendrogram(
    Z,
    # Pass a plain list of names rather than a pandas Index.
    labels=x_notreatment.columns.tolist(),
    leaf_rotation=90,
    leaf_font_size=8,
)
plt.title("Feature dendrogram")
plt.xlabel("Feature")
plt.ylabel("Distance")
# Render explicitly, as the first cell does; this also keeps the return value
# of the last call from being echoed as the cell output.
plt.show()

In [None]:
# Shape of the linkage matrix returned by scipy's `linkage` for the feature clustering.
Z.shape

In [None]:
# X_feat is the transposed feature table, so its columns are the row labels
# (samples) of x_notreatment, not feature names.
X_feat.columns

In [None]:
# Cluster the samples with HDBSCAN on a hand-picked subset of covariates.
# `scaler` is the StandardScaler instance created in the dendrogram cell; it is
# re-fitted here, which overwrites its previously fitted statistics.
x_scaled = scaler.fit_transform(
    x_all.loc[:, ["x1", "x2", "x14", "x16", "x18"]]
)
clusterer = hdbscan.HDBSCAN(min_cluster_size=30)
clusters = clusterer.fit_predict(x_scaled)
# Distinct cluster labels found (shown as the cell output).
np.unique(clusters)

In [None]:
# Per-sample cluster labels returned by HDBSCAN's fit_predict in the cell above.
clusters