In [1]:
import os
from pathlib import Path

# When the kernel starts inside the experiments/ folder, step up one level so
# the relative "data/..." and "outputs/..." paths used below resolve.
cwd = Path.cwd()
started_in_experiments = cwd.name == "experiments"
if started_in_experiments:
    os.chdir(cwd.parent)

# Report the directory actually in effect after the optional hop.
print("Working dir:", Path.cwd())


Working dir: c:\Users\vikto\Desktop\mat-stk2011


In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler

from src.utils.seeds import seed_everything
from src.utils.metrics import quadratic_weighted_kappa
from src.utils.splits import get_stratified_folds
from src.models.student import StudentConfig, StudentTree

# Fix the random seed before any splitting or model fitting. The same value
# (312) is passed again to get_stratified_folds and StudentConfig further down.
# NOTE(review): which RNGs seed_everything covers is defined in src.utils.seeds — confirm.
seed_everything(312)



In [3]:
# Cached feature matrix used as the student's input.
spacy = np.load("data/cached_features_spacy.npz")
X = spacy["X"]

# Teacher cross-validation output: hard labels ("y") and per-class probabilities ("probs").
teacher = np.load("outputs/2026-02-20_16-53_teacher_cv/oof_predictions.npz")
y_true = teacher["y"].astype(int)
soft_probs = teacher["probs"]  # (N, K)

# Features and teacher outputs come from two independent files, and every later
# cell indexes them with the same fold indices. A row-count mismatch would
# silently pair samples with the wrong targets, so fail loudly here instead.
assert X.shape[0] == y_true.shape[0] == soft_probs.shape[0], (
    f"row mismatch: X={X.shape[0]}, y={y_true.shape[0]}, probs={soft_probs.shape[0]}"
)

# expected-value soft targets (Approach 2): column k of soft_probs is weighted
# by class value k + 1, i.e. classes are numbered 1..K
class_values = np.arange(1, soft_probs.shape[1] + 1)
soft_targets = soft_probs @ class_values

print("Features:    ", X.shape)
print("Hard labels: ", np.unique(y_true))
print("Soft targets:", soft_targets.min().round(2), "to", soft_targets.max().round(2))




Features:     (17307, 84)
Hard labels:  [1 2 3 4 5 6]
Soft targets: 1.0 to 5.89


In [4]:
# Sanity-check the teacher output: overall shape, a few raw rows, their row
# sums, and one row spelled out class by class.
print("soft_probs shape:", soft_probs.shape)

print("\nFirst 5 soft label vectors:")
print(soft_probs[:5])

print("\nRow sums (should be 1 if probabilities):")
print(soft_probs[:5].sum(axis=1))

print("\nExample interpretation:")
i = 0  # row to spell out; the header below is derived from this value
print(f"Teacher distribution for sample {i}:")
# Classes are reported 1-based, matching the 1..K label convention.
for k, p in enumerate(soft_probs[i], start=1):
    print(f"class {k}: {p:.3f}")


soft_probs shape: (17307, 6)

First 5 soft label vectors:
[[1.84676170e-01 6.26029432e-01 1.79044291e-01 9.66996700e-03
  4.67232778e-04 1.12863927e-04]
 [2.85923388e-03 3.47079873e-01 6.44006014e-01 6.05355855e-03
  1.29321995e-06 1.57624367e-08]
 [1.56955302e-05 4.81895462e-04 7.46353343e-02 8.51702869e-01
  7.27309212e-02 4.33198467e-04]
 [1.11122546e-03 6.99999789e-03 1.55102417e-01 7.18026876e-01
  1.15953557e-01 2.80598039e-03]
 [1.57676190e-01 5.20404935e-01 2.99094826e-01 2.24472620e-02
  3.31540650e-04 4.52884342e-05]]

Row sums (should be 1 if probabilities):
[0.99999996 0.99999999 0.99999991 1.00000005 1.00000004]

Example interpretation:
Teacher distribution for sample 0:
class 1: 0.185
class 2: 0.626
class 3: 0.179
class 4: 0.010
class 5: 0.000
class 6: 0.000


# Problem: training a classifier from soft (distributional) targets

We consider a classification problem with labels
$$
y \in \mathcal C=\{1,\dots,K\}.
$$

However, the teacher model provides for each input $x$ a probability vector
$$
p(x)=(p_1(x),\dots,p_K(x)), \qquad \sum_{k=1}^K p_k(x)=1,
$$
rather than a single class label.

Thus the supervision is a distribution over classes, not a categorical target.

The problem is therefore:

$$
\text{How do we train a classifier } f(x)\in\mathcal C
\text{ when the training targets are } p(x)\in\Delta^{K-1}?
$$

## Approach 1: Argmax reduction

Discard the distribution entirely:
$$
\hat y(x) = \arg\max_k\, p_k(x).
$$
Reduces to standard classification. Loses all inter-class information —
a teacher outputting $(0.05, 0.45, 0.50)$ produces the same target as $(0, 0, 1)$.

Works with: any classifier (decision tree, SVM, anything).

## Approach 2: Expected value regression

Collapse the distribution to a scalar:
$$
\tilde y(x) = \sum_{k=1}^K k\, p_k(x) = \mathbb{E}_{Y \sim p(x)}[Y].
$$
Train a regressor $g(x) \in \mathbb{R}$, classify by rounding $f(x) = \mathrm{round}(g(x))$.

Preserves ordinal information — uncertainty between 3 and 4 gives $\tilde y \approx 3.5$.
But two very different distributions with the same mean produce identical targets.

Works with: any regressor, including a single decision tree (minimises MSE natively).

## Approach 3: Class-probability regression

Fit $K$ separate regressors:
$$
g_k(x) \approx p_k(x), \qquad k = 1,\dots,K,
$$
classify via $f(x) = \arg\max_k\, g_k(x)$.

Preserves the full distributional shape. Note that approach 2 is a linear
projection of this: $\tilde y = \sum_k k\, g_k(x)$. So this strictly generalises approach 2.

Cost: $K$ models instead of one.

Works with: any regressor, including $K$ separate decision trees.

## Approach 4: Weighted mixture

Interpolate between hard ground truth and soft teacher target:
$$
\tilde y_\alpha(x) = \alpha\, \tilde y_{\text{soft}}(x) + (1 - \alpha)\, y_{\text{hard}}(x), \qquad \alpha \in [0,1].
$$
Balances unbiased but noisy (hard) vs smooth but biased (soft) supervision.
Optimal $\alpha$ depends on teacher quality.

Works with: any regressor (same as approach 2 but with mixed targets).

## Approach 5: Direct KL minimisation (Frosst & Hinton, 2017)

**Why approaches 1–4 exist as workarounds:** a standard decision tree splits
greedily and never optimises a global loss. There is nowhere to plug in KL
divergence. The above approaches are projections of the distributional problem
into something a tree can handle natively.

**Hinton's solution:** replace the standard tree with a *soft decision tree* that
is fully differentiable and trained with gradient descent.

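Each internal node $i$ has a learned filter $w_i$ and bias $b_i$. The probability
of taking the right branch at node $i$ (indexed by node — not to be confused with
the teacher's class probabilities $p_k(x)$ above) is: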
$$
p_i(x) = \sigma\bigl(\beta(x w_i + b_i)\bigr),
$$
where $\beta$ is an inverse temperature. Each leaf $\ell$ stores a learned distribution
over classes:
$$
Q_k^\ell = \frac{\exp(\phi_k^\ell)}{\sum_{k'} \exp(\phi_{k'}^\ell)}.
$$
The path probability to leaf $\ell$ is the product of all branch probabilities along
the path from the root. Because everything is differentiable, the loss directly
minimises cross-entropy against the teacher's soft targets:
$$
L(x) = -\log \sum_{\ell} P^\ell(x) \left( \sum_k T_k \log Q_k^\ell \right),
$$
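where $T$ is the teacher's target distribution and $P^\ell(x)$ is the path probability
to leaf $\ell$. (The formula is reproduced as given in the paper. Read literally, the
inner sum $\sum_k T_k \log Q_k^\ell$ is non-positive, so the argument of the outer
logarithm is not positive; it should be understood as a path-probability-weighted
cross-entropy between $T$ and the leaf distributions $Q^\ell$.)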

This is the "proper" distillation into a tree — no projection, no workaround,
the full distribution is matched via KL/cross-entropy.

**Their results (MNIST):**
- Soft tree on hard labels: 94.45%
- Soft tree on teacher's soft labels: 96.76%
- Teacher NN: 99.21%

The distilled tree recovers roughly half of the gap between the hard-label tree and
the teacher (2.31 of 4.76 percentage points). Same pattern as our AES experiments.

---


In [5]:
# setup folds
# One seed drives both the fold assignment and the student tree, so the two
# cannot drift apart; 312 is the same value passed to seed_everything() earlier.
SEED = 312
# `folds` is iterated as (train_idx, val_idx) pairs by the CV cells below.
folds = get_stratified_folds(y_true, n_splits=5, seed=SEED)
# Shared configuration for every StudentTree fitted in this notebook.
cfg = StudentConfig(max_depth=8, min_samples_leaf=10, random_state=SEED)

In [6]:
# get baseline
# Student tree fitted on the ground-truth labels; predictions are collected
# per validation fold and scored once over all rows.

oof_preds_hard = np.zeros(len(y_true), dtype=int)

# No fold counter is needed here; iterating the pairs directly also avoids
# overwriting the global `i` set by the inspection cell above.
for tr_idx, va_idx in folds:
    # Scaling statistics come from the training fold only and are reused for
    # the validation fold.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X[tr_idx])
    X_va = scaler.transform(X[va_idx])

    tree = StudentTree(cfg, mode="classification")
    tree.fit(X_tr, y_true[tr_idx])
    oof_preds_hard[va_idx] = tree.predict(X_va)

qwk_hard = quadratic_weighted_kappa(y_true, oof_preds_hard)
print(f"Student (hard labels):  QWK = {qwk_hard:.4f}")


Student (hard labels):  QWK = 0.6625


In [7]:
# Approach 1 — argmax reduction: collapse each teacher distribution to its
# most probable class and fit the student on those labels.

y_teacher = soft_probs.argmax(axis=1).astype(int) + 1  # column index 0..K-1 -> class 1..K
teacher_label_counts = np.unique(y_teacher, return_counts=True)
print("Teacher argmax label distribution:", teacher_label_counts)

oof_preds_argmax = np.zeros(y_true.shape[0], dtype=int)

for tr_idx, va_idx in folds:
    # Scaling statistics are estimated on the training fold and applied to both parts.
    scaler = StandardScaler().fit(X[tr_idx])
    X_tr, X_va = scaler.transform(X[tr_idx]), scaler.transform(X[va_idx])

    tree = StudentTree(cfg, mode="classification")
    tree.fit(X_tr, y_teacher[tr_idx])
    oof_preds_argmax[va_idx] = tree.predict(X_va)

# Scored against the ground-truth labels, not the teacher labels used for fitting.
qwk_argmax = quadratic_weighted_kappa(y_true, oof_preds_argmax)

print("=" * 40)
print(f"Hard labels (baseline): {qwk_hard:.4f}")
print(f"Teacher argmax:         {qwk_argmax:.4f}")
print(f"Delta vs hard:          {qwk_argmax - qwk_hard:+.4f}")

Teacher argmax label distribution: (array([1, 2, 3, 4, 5, 6]), array([1210, 4575, 6452, 3941, 1042,   87]))
Hard labels (baseline): 0.6625
Teacher argmax:         0.6882
Delta vs hard:          +0.0256
