In [2]:
import warnings

# Silence two specific sklearn LogisticRegression transition messages; every
# other warning stays visible. Each entry is (message regex, warning category).
_LOGISTIC_MODULE = r"sklearn\.linear_model\._logistic"
for _message, _category in (
    (".*'penalty' was deprecated.*", FutureWarning),
    (".*Setting penalty=None will ignore the C and l1_ratio parameters.*", UserWarning),
):
    warnings.filterwarnings(
        "ignore",
        message=_message,
        category=_category,
        module=_LOGISTIC_MODULE,
    )


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

In [None]:
# 1) Digits dataset: 10 target classes, returned as (features, labels)
X, y = load_digits(return_X_y=True)

# 2) Append many pure-noise columns -> irrelevant features that invite overfitting
rng = np.random.RandomState(42)
n_noise = 2000
X_noise = rng.normal(size=(len(X), n_noise))
X_aug = np.concatenate((X, X_noise), axis=1)

# 3) Hold out 80% for testing so the training split stays small -> more overfitting
X_train, X_test, y_train, y_test = train_test_split(
    X_aug,
    y,
    stratify=y,
    test_size=0.8,
    random_state=42,
)

In [9]:
# Report the augmented matrix dimensions, then preview its first five rows
print(f"X has {X_aug.shape[0]} samples and {X_aug.shape[1]} features")
print(X_aug[:5])

X has 1797 samples and 2064 features
[[ 0.          0.          5.         ... -0.88187465 -0.16306696
  -0.74490264]
 [ 0.          0.          0.         ... -0.3202978   1.64337816
   0.36064789]
 [ 0.          0.          0.         ... -1.21740379  0.46795042
  -1.17028071]
 [ 0.          0.          7.         ... -0.8351245   1.65291305
   2.07777759]
 [ 0.          0.          0.         ... -0.70531672  0.49576557
   0.64438845]]


In [15]:
# Distinct labels first, then how many samples carry each label
print("Class labels:")
print(np.unique(y))
print("Class Frequency Counts")
print(pd.Series(y).value_counts())


Class labels:
[0 1 2 3 4 5 6 7 8 9]
Class Frequency Counts
3    183
1    182
5    182
4    181
6    181
9    180
7    179
0    178
2    177
8    174
Name: count, dtype: int64


In [None]:
def eval_softmax(name, model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The passed ``model`` is fitted in place.

    Parameters
    ----------
    name : str
        Label stored under the ``"model"`` key of the result.
    model : estimator
        Object providing ``fit``, ``predict_proba`` and, once fitted, a
        ``classes_`` attribute (a classifier or a Pipeline ending in one).

    Returns
    -------
    dict
        Keys ``"model"``, ``"accuracy"`` and ``"log_loss"``, the two metrics
        being computed on the test split.
    """
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)
    # Columns of ``proba`` follow ``model.classes_``, so translate the winning
    # column index back to its label instead of assuming labels are 0..K-1.
    classes = np.asarray(model.classes_)
    yhat = classes[np.argmax(proba, axis=1)]
    return {
        "model": name,
        "accuracy": accuracy_score(y_test, yhat),
        # Passing the labels keeps the probability columns aligned even when
        # the test split happens to miss one of the classes.
        "log_loss": log_loss(y_test, proba, labels=classes),
    }

In [None]:
# 4) Softmax models
# "Plain" baseline: C is huge, so there is almost no regularization
plain_softmax = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "lr",
        LogisticRegression(
            solver="lbfgs",
            l1_ratio=0.0,  # L2; no `penalty` argument is passed here
            C=1e6,
            max_iter=2000,
        ),
    ),
])

In [None]:
# L2 regularized softmax
# Same setup as the plain model except for C=1.0, so the L2 penalty actually bites.
l2_softmax = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(
        solver="lbfgs",
        # l1_ratio must lie in [0, 1]; 0.0 selects a pure L2 penalty.
        # The previous value (10.0) was outside that range.
        l1_ratio=0.0,
        C=1.0,
        max_iter=2000
    ))
])

In [None]:
# L1 regularized softmax (needs saga)
l1_softmax = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "lr",
        LogisticRegression(
            solver="saga",
            # Both spellings of "L1" are passed: the explicit penalty name and
            # l1_ratio=1.0.
            penalty="l1",
            l1_ratio=1.0,
            C=1.0,
            max_iter=4000,
        ),
    ),
])

In [5]:
# Evaluate the three pipelines in a fixed order: plain, L2, L1
results = [
    eval_softmax(label, estimator)
    for label, estimator in (
        ("Plain Softmax (C=1e6)", plain_softmax),
        ("Softmax + L2 (C=1.0)", l2_softmax),
        ("Softmax + L1 (C=1.0)", l1_softmax),
    )
]

# Best accuracy first; left as the last expression so the table is displayed
pd.DataFrame(results).sort_values("accuracy", ascending=False)

Unnamed: 0,model,accuracy,log_loss
2,Softmax + L1 (C=1.0),0.902643,0.354954
1,Softmax + L2 (C=1.0),0.64395,1.213797
0,Plain Softmax (C=1e6),0.638387,1.147474
