In [1]:
import os
from pathlib import Path

# The relative paths used further down (src/, data/, outputs/) are resolved
# against the working directory, so step up one level when the kernel was
# started from inside an "experiments" folder.
cwd = Path.cwd()
started_in_experiments = cwd.name == "experiments"
if started_in_experiments:
    os.chdir(cwd.parent)
print("Working dir:", Path.cwd())

Working dir: c:\Users\vikto\Desktop\mat-stk2011


In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from src.utils.seeds import seed_everything
from src.utils.metrics import quadratic_weighted_kappa
from src.utils.splits import get_stratified_folds

# Seed once, before any data is loaded or any model is fit.
# NOTE(review): seed_everything is a project helper (src.utils.seeds); presumably it
# seeds the Python/NumPy global RNGs — confirm which libraries it actually covers.
seed_everything(312)

In [3]:
# np.load on an .npz archive returns a lazily-read handle that keeps the file open.
# Read the arrays inside a context manager so the handle is closed as soon as the
# data is in memory (an open handle also blocks overwriting the cache on Windows).
with np.load("data/cached_features_spacy.npz") as spacy:
    X = spacy["X"]

with np.load("outputs/2026-02-20_16-53_teacher_cv/oof_predictions.npz") as teacher:
    y_true = teacher["y"].astype(int)
    soft_probs = teacher["probs"]

# Weighted sum of the scores 1..K with the teacher outputs as weights, i.e. column j
# is treated as score j + 1.
# NOTE(review): presumably each row of `probs` is a probability distribution, which
# makes this the teacher's expected score — confirm.
soft_targets = soft_probs @ np.arange(1, soft_probs.shape[1] + 1)

# Features and teacher predictions come from two separate files; fail fast if they
# do not even agree on the number of rows instead of mis-scoring silently later.
if not (X.shape[0] == len(y_true) == soft_probs.shape[0]):
    raise ValueError(
        f"Row count mismatch: X has {X.shape[0]} rows, y has {len(y_true)}, "
        f"probs has {soft_probs.shape[0]}"
    )

folds = get_stratified_folds(y_true, n_splits=5, seed=42)
n = len(y_true)

print("Features:", X.shape)

Features: (17307, 84)


In [5]:
# helper
def evaluate_model(model_fn, X, targets, y_true, folds, regression=True):
    """Cross-validate one model configuration and report QWK and model size.

    Parameters
    ----------
    model_fn : callable
        Zero-argument factory returning a fresh, unfitted estimator that
        exposes ``fit`` and ``predict``.
    X : array
        Feature matrix, indexed by the fold index arrays. Features are
        standardised per fold with a scaler fit on the training rows only.
    targets : array
        Training targets (hard labels or soft targets), indexed like ``X``.
    y_true : array
        Labels the out-of-fold predictions are scored against.
    folds : sequence of (train_idx, val_idx)
        Index pairs; must support ``len()`` and iteration.
    regression : bool, default True
        If True, predictions are rounded and clipped to the range [1, 6]
        before scoring; otherwise they are used as returned by ``predict``.

    Returns
    -------
    tuple
        ``(qwk, avg_leaves)`` where ``qwk`` is the value returned by
        ``quadratic_weighted_kappa`` on the out-of-fold predictions and
        ``avg_leaves`` is the total leaf count divided by the number of folds.
    """
    # Size the out-of-fold buffer from the labels being scored rather than from
    # a notebook-level global, so the function works on any dataset passed in.
    # Rows not covered by any validation fold keep the initial value 0.
    oof = np.zeros(len(y_true))

    total_leaves = 0

    for tr_idx, va_idx in folds:
        # Fit the scaler on the training rows only to avoid leaking validation statistics.
        scaler = StandardScaler()
        X_tr = scaler.fit_transform(X[tr_idx])
        X_va = scaler.transform(X[va_idx])

        model = model_fn()
        model.fit(X_tr, targets[tr_idx])

        raw = model.predict(X_va)
        if regression:
            # Continuous predictions are mapped onto the integer scale 1..6.
            oof[va_idx] = np.clip(np.round(raw), 1, 6)
        else:
            oof[va_idx] = raw

        # count leaves: a single tree exposes `tree_`, ensembles expose `estimators_`
        if hasattr(model, "tree_"):
            total_leaves += model.tree_.n_leaves
        elif hasattr(model, "estimators_"):
            # np.array(...).flat handles both a flat list and a 2-D array of estimators.
            for est in np.array(model.estimators_).flat:
                total_leaves += est.tree_.n_leaves

    qwk = quadratic_weighted_kappa(y_true, oof.astype(int))
    avg_leaves = total_leaves / len(folds)

    return qwk, avg_leaves

# Decision Tree — max_depth sweep

In [6]:
# Sweep max_depth for a single decision tree, once as a classifier on the hard
# labels and once as a regressor on the teacher soft targets.
depths = [2, 3, 4, 5, 6, 8, 10, 12, 15, 20, None]

dt_hard = []
dt_soft = []

for d in depths:
    label = str(d)  # str(None) is already "None"

    # random_state is pinned (same value as the forest/boosting sweeps) because
    # scikit-learn trees permute features at every split; without a fixed seed,
    # ties between equally good splits depend on the global RNG state and hence
    # on which cells were run before this one.
    fn_hard = lambda d=d: DecisionTreeClassifier(
        max_depth=d, min_samples_leaf=10, random_state=42
    )
    qwk_h, leaves_h = evaluate_model(fn_hard, X, y_true, y_true, folds, regression=False)
    dt_hard.append({"depth": label, "qwk": qwk_h, "leaves": leaves_h})

    fn_soft = lambda d=d: DecisionTreeRegressor(
        max_depth=d, min_samples_leaf=10, random_state=42
    )
    qwk_s, leaves_s = evaluate_model(fn_soft, X, soft_targets, y_true, folds)
    dt_soft.append({"depth": label, "qwk": qwk_s, "leaves": leaves_s})

    print(f"depth={label:>4s}  hard={qwk_h:.4f} ({leaves_h:.0f} leaves)  soft={qwk_s:.4f} ({leaves_s:.0f} leaves)")


depth=   2  hard=0.5756 (4 leaves)  soft=0.5710 (4 leaves)
depth=   3  hard=0.6310 (8 leaves)  soft=0.6546 (8 leaves)
depth=   4  hard=0.6527 (16 leaves)  soft=0.6646 (16 leaves)
depth=   5  hard=0.6635 (32 leaves)  soft=0.6675 (32 leaves)
depth=   6  hard=0.6650 (63 leaves)  soft=0.6823 (61 leaves)
depth=   8  hard=0.6678 (189 leaves)  soft=0.6823 (175 leaves)
depth=  10  hard=0.6547 (392 leaves)  soft=0.6872 (372 leaves)
depth=  12  hard=0.6440 (615 leaves)  soft=0.6837 (623 leaves)
depth=  15  hard=0.6339 (833 leaves)  soft=0.6769 (934 leaves)
depth=  20  hard=0.6304 (946 leaves)  soft=0.6740 (1077 leaves)
depth=None  hard=0.6305 (953 leaves)  soft=0.6741 (1082 leaves)


# Random Forest — n_estimators sweep

In [7]:
n_trees_list = [5, 10, 20, 50, 100, 200]

rf_hard = []
rf_soft = []


def _forest_builder(n_estimators):
    """Return a zero-argument factory for a fresh forest with `n_estimators` trees."""

    def build():
        return RandomForestRegressor(
            n_estimators=n_estimators, max_depth=8, min_samples_leaf=10, random_state=42
        )

    return build


for nt in n_trees_list:
    # Both runs use the same forest configuration; only the training target differs.
    fn_hard = fn_soft = _forest_builder(nt)

    # Hard labels are fit as a regression target too (simpler, works fine for QWK).
    qwk_h, leaves_h = evaluate_model(fn_hard, X, y_true.astype(float), y_true, folds)
    rf_hard.append(dict(n_trees=nt, qwk=qwk_h, leaves=leaves_h))

    # Same forest, trained on the teacher soft targets instead.
    qwk_s, leaves_s = evaluate_model(fn_soft, X, soft_targets, y_true, folds)
    rf_soft.append(dict(n_trees=nt, qwk=qwk_s, leaves=leaves_s))

    print(f"RF n={nt:>3d}  hard={qwk_h:.4f} ({leaves_h:.0f} leaves)  soft={qwk_s:.4f} ({leaves_s:.0f} leaves)")


RF n=  5  hard=0.6888 (757 leaves)  soft=0.6988 (776 leaves)
RF n= 10  hard=0.6981 (1505 leaves)  soft=0.7012 (1557 leaves)
RF n= 20  hard=0.6978 (2996 leaves)  soft=0.7020 (3124 leaves)
RF n= 50  hard=0.7001 (7457 leaves)  soft=0.7013 (7772 leaves)
RF n=100  hard=0.6998 (14951 leaves)  soft=0.7035 (15512 leaves)
RF n=200  hard=0.7013 (29964 leaves)  soft=0.7043 (30975 leaves)


# GBM — n_estimators sweep


In [None]:
gbm_hard = []
gbm_soft = []

# Sweep the number of boosting stages; every other hyper-parameter stays fixed.
for nt in n_trees_list:
    gbm_params = dict(
        n_estimators=nt, max_depth=4, learning_rate=0.1, subsample=0.8, random_state=42
    )
    # Bind the current parameters as a default so each factory keeps its own config.
    fn_hard = lambda params=gbm_params: GradientBoostingRegressor(**params)
    fn_soft = lambda params=gbm_params: GradientBoostingRegressor(**params)

    # Hard labels, fit as a regression target.
    qwk_h, leaves_h = evaluate_model(fn_hard, X, y_true.astype(float), y_true, folds)
    gbm_hard.append(dict(n_trees=nt, qwk=qwk_h, leaves=leaves_h))

    # Teacher soft targets.
    qwk_s, leaves_s = evaluate_model(fn_soft, X, soft_targets, y_true, folds)
    gbm_soft.append(dict(n_trees=nt, qwk=qwk_s, leaves=leaves_s))

    print(f"GBM n={nt:>3d}  hard={qwk_h:.4f} ({leaves_h:.0f} leaves)  soft={qwk_s:.4f} ({leaves_s:.0f} leaves)")


In [None]:
fig, ax = plt.subplots(figsize=(10, 6))


def plot_series(data, label, color, marker, linestyle="-"):
    """Draw one QWK-vs-leaves curve for a list of sweep records on the shared `ax`."""
    xs, ys = [], []
    for record in data:
        xs.append(record["leaves"])
        ys.append(record["qwk"])
    ax.plot(xs, ys, marker=marker, color=color, linestyle=linestyle, label=label, markersize=6)


# One (name, colour, marker) entry per model family, shared by both target types.
model_styles = [
    ("Decision Tree", "tab:blue", "o"),
    ("Random Forest", "tab:orange", "s"),
    ("GBM", "tab:green", "^"),
]

# Hard-label curves are dashed and drawn first; soft-target curves are solid.
curve_groups = [
    ("hard", "--", (dt_hard, rf_hard, gbm_hard)),
    ("soft", "-", (dt_soft, rf_soft, gbm_soft)),
]
for target_kind, line_style, sweeps in curve_groups:
    for (model_name, color, marker), sweep in zip(model_styles, sweeps):
        plot_series(sweep, f"{model_name} ({target_kind})", color, marker, line_style)

# Horizontal reference line for the teacher network (hard-coded approximate QWK).
ax.axhline(y=0.80, color="red", linestyle=":", linewidth=1.5, label="Teacher NN (~0.80)")

ax.set(
    xscale="log",
    xlabel="Total Leaves (avg across folds, log scale)",
    ylabel="QWK",
    title="Interpretability–Performance Tradeoff",
)
ax.legend(loc="lower right", fontsize=8)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
