In [4]:
import numpy as np
import pandas as pd
from patsy import dmatrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# data loading...
df = pd.read_csv("https://hastie.su.domains/ElemStatLearn/datasets/phoneme.data")
feature_cols = [col for col in df.columns if col.startswith("x.")]
X = df[feature_cols]
y = df["g"] # since this is the response columns

n_samples, n_features = X.shape
grid = np.arange(1, n_features + 1)

# candidate models and percentile where a knot is located
candidate_configs = {
    "1_knot": [50],
    "2_knots": [33, 66],
    "3_knots": [25, 50, 75],
    "4_knots": [20, 40, 60, 80],
    "5_knots": [10, 30, 50, 70, 90]
}

candidate_results = {}

for candidate_name, percentiles in candidate_configs.items():
    # to get the actual coordinate for the knots
    knots = np.percentile(grid, percentiles)

    # to create a spline basis without an intercept, can use formula "cr(grid, knots=knots) - 1"
    design = dmatrix("cr(grid, knots=knots) - 1",
                     {"grid": grid, "knots": knots},
                     return_type="dataframe")
    H = np.asarray(design)  # H w/ shape (256, m)

    # x* = x dot H, w/ shape (n_samples, m).
    X_transformed = X.values.dot(H)

    # then QDA classify on the filtered data...
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(X_transformed, y)
    y_pred = qda.predict(X_transformed)
    loss = np.mean(y_pred != y)  # misclassification error

    print(f"Candidate {candidate_name}:")
    print(f"knots (percentile): {percentiles}")
    print(f"knots (actual): {knots}")
    print(f"spline basis functions: {H.shape[1]}")
    print(f"Misclassification error: {loss:.4f}\n")

    candidate_results[candidate_name] = {
        "loss": loss,
        "model": qda,
        "knots": knots,
        "percentiles": percentiles,
        "H": H,
        "X_transformed": X_transformed
    }

# now just print which model is lowest error
best_candidate = min(candidate_results, key=lambda k: candidate_results[k]["loss"])
best_info = candidate_results[best_candidate]

print(f"Final model selection: {best_candidate}")


Candidate 1_knot:
knots (percentile): [50]
knots (actual): [128.5]
spline basis functions: 3
Misclassification error: 0.1668

Candidate 2_knots:
knots (percentile): [33, 66]
knots (actual): [ 85.15 169.3 ]
spline basis functions: 4
Misclassification error: 0.1175

Candidate 3_knots:
knots (percentile): [25, 50, 75]
knots (actual): [ 64.75 128.5  192.25]
spline basis functions: 5
Misclassification error: 0.1098

Candidate 4_knots:
knots (percentile): [20, 40, 60, 80]
knots (actual): [ 52. 103. 154. 205.]
spline basis functions: 6
Misclassification error: 0.0916

Candidate 5_knots:
knots (percentile): [10, 30, 50, 70, 90]
knots (actual): [ 26.5  77.5 128.5 179.5 230.5]
spline basis functions: 7
Misclassification error: 0.0847

Final model selection: 5_knots
