In [26]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, log_loss

In [27]:
import warnings

# Hide only sklearn's transition warnings (keeps other important warnings visible)
warnings.filterwarnings(
    "ignore",
    message=".*'penalty' was deprecated.*",
    category=FutureWarning,
    module=r"sklearn\.linear_model\._logistic"
)

warnings.filterwarnings(
    "ignore",
    message=".*Setting penalty=None will ignore the C and l1_ratio parameters.*",
    category=UserWarning,
    module=r"sklearn\.linear_model\._logistic"
)

In [28]:
# -------------------------
# 1) Load data (sparse high-dim)
# -------------------------
X, y = fetch_20newsgroups_vectorized(subset="all", return_X_y=True)

# Optional: downsample for a faster in-class demo
n_samples = 6000
X, y = X[:n_samples], y[:n_samples]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [29]:
pd.DataFrame(X[:5].toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130097,130098,130099,130100,130101,130102,130103,130104,130105,130106
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
print(y[:10])

[17  7 10 10  7  0 12 15  9  0]


In [31]:
def pct_nonzero_coef(model, is_ovr: bool) -> float:
    if is_ovr:
        coef = np.vstack([est.coef_ for est in model.estimators_])  # (K, d)
    else:
        coef = model.coef_  # (K, d)
    return 100.0 * np.mean(coef != 0)

def eval_model(name, model, is_ovr: bool):
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)  # (n_samples, n_classes)
    yhat = np.argmax(proba, axis=1)
    return {
        "model": name,
        "accuracy": accuracy_score(y_test, yhat),
        "log_loss": log_loss(y_test, proba),
        "% nonzero coef": pct_nonzero_coef(model, is_ovr=is_ovr),
    }

In [32]:
# -------------------------
# 2) Models
# -------------------------
# "Plain" = effectively no regularization via huge C
C_l2 = 1.0
C_l1 = 1.0

# Use saga because it supports sparse + multinomial + L1/L2
softmax = LogisticRegression(solver="lbfgs", max_iter=2000)  # multiclass => multinomial automatically


In [33]:
from sklearn.multiclass import OneVsRestClassifier
ovr = OneVsRestClassifier(LogisticRegression(solver="lbfgs", max_iter=2000))


In [34]:
# L2 (ridge-like)
l2_softmax = LogisticRegression(solver="lbfgs", l1_ratio=0.0, C=C_l2, max_iter=2000)

In [35]:
# L1 (lasso-like) needs saga (and supports multinomial + L1)
l1_softmax = LogisticRegression(solver="saga", l1_ratio=1.0, C=C_l1, max_iter=4000)

In [36]:
l2_ovr = OneVsRestClassifier(LogisticRegression(solver="lbfgs", l1_ratio=0.0, C=C_l2, max_iter=2000))
l1_ovr = OneVsRestClassifier(LogisticRegression(solver="saga",  l1_ratio=1.0, C=C_l1, max_iter=4000))


In [37]:
# -------------------------
# 3) Run + compare
# -------------------------
results = []
results.append(eval_model("Plain OVR (C=1e6)", ovr, is_ovr=True))
results.append(eval_model("Plain Softmax (C=1e6)", softmax, is_ovr=False))

results.append(eval_model("OVR + L2 (C=1.0)", l2_ovr, is_ovr=True))
results.append(eval_model("OVR + L1 (C=1.0)", l1_ovr, is_ovr=True))

results.append(eval_model("Softmax + L2 (C=1.0)", l2_softmax, is_ovr=False))
results.append(eval_model("Softmax + L1 (C=1.0)", l1_softmax, is_ovr=False))

df = pd.DataFrame(results).sort_values(["accuracy", "log_loss"], ascending=[False, True])
print(df.to_string(index=False))

                model  accuracy  log_loss  % nonzero coef
Plain Softmax (C=1e6)  0.713333  1.638435       62.782172
 Softmax + L2 (C=1.0)  0.713333  1.638435       62.782172
    Plain OVR (C=1e6)  0.710833  1.716110       62.782172
     OVR + L2 (C=1.0)  0.710833  1.716110       62.782172
     OVR + L1 (C=1.0)  0.675000  1.455780        0.029015
 Softmax + L1 (C=1.0)  0.665000  1.387920        0.025863
