In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import src.train as train
import importlib
importlib.reload(train)

# -----------------------------
# 0. Column schema
# -----------------------------
# The input file is read with header=None, so the column names are supplied
# here: one label column, 13 numeric columns (I1..I13) and 26 categorical
# columns (C1..C26).
num_features = [f"I{i}" for i in range(1, 14)]
cat_features = [f"C{i}" for i in range(1, 27)]
columns = ["label"] + num_features + cat_features

# -----------------------------
# 1. Load small subset of dataset
# -----------------------------
DATA_PATH = "../data/criteo/train.txt"
N_ROWS = 100_000  # only the first N_ROWS rows of the file are read

# Categorical columns are pinned to str so pandas never infers a numeric dtype
# for tokens that merely look like numbers (e.g. "1e5" or "00012345").
# Empty fields are still parsed as NaN, so the fill step below is unaffected.
data = pd.read_csv(
    DATA_PATH,
    sep="\t",
    header=None,
    names=columns,
    nrows=N_ROWS,
    dtype={col: str for col in cat_features},
)

# Fill missing values in a single pass: 0 for numeric features and the
# placeholder "missing" for categorical features. The label column is not in
# the mapping and is therefore left untouched.
fill_values = {
    **dict.fromkeys(num_features, 0),
    **dict.fromkeys(cat_features, "missing"),
}
data = data.fillna(value=fill_values)

# -----------------------------
# 2. Separate target and features
# -----------------------------
y = data["label"]
X = data.drop(columns=["label"])

In [None]:
# 3. Tune alpha: call train.run_kfold_cv once per candidate value and keep the
#    mean / standard deviation of the AUC and log-loss scores it returns.
#    NOTE(review): the first two return values are presumably one score per
#    fold and the third is ignored here -- confirm against src.train.
alphas = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3]
results = []

for alpha in alphas:
    print(f"\n### alpha = {alpha}")

    fold_aucs, fold_loglosses, _ = train.run_kfold_cv(
        X=X,
        y=y,
        cat_features=cat_features,
        num_features=num_features,
        alpha=alpha,
        max_iter=10,
    )

    # Summarize this alpha's scores before storing them as one record.
    auc_mean, auc_std = np.mean(fold_aucs), np.std(fold_aucs)
    logloss_mean, logloss_std = np.mean(fold_loglosses), np.std(fold_loglosses)

    results.append(
        {
            "alpha": alpha,
            "auc_mean": auc_mean,
            "auc_std": auc_std,
            "logloss_mean": logloss_mean,
            "logloss_std": logloss_std,
        }
    )

print(results)



### alpha = 1e-06





### alpha = 3e-06





### alpha = 1e-05


In [None]:
import matplotlib.pyplot as plt

# Plot the mean CV metric against alpha, one figure per metric.
# The x-values are taken from `results` rather than the tuning grid, so only
# alphas that actually produced a record are plotted. They are stored under
# their own name so the `alphas` grid used by the sweep is not overwritten.
tuned_alphas = [row["alpha"] for row in results]

# (key in each results record, y-axis label, figure title)
metric_plots = [
    ("logloss_mean", "CV LogLoss (mean)", "SGD Regularization Tuning (LogLoss)"),
    ("auc_mean", "CV AUC (mean)", "SGD Regularization Tuning (AUC)"),
]

for metric_key, y_label, title in metric_plots:
    metric_values = [row[metric_key] for row in results]

    # Explicit figure/axes objects instead of the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.semilogx(tuned_alphas, metric_values, marker="o")
    ax.set_xlabel("alpha (log scale)")
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)
    plt.show()

SGDClassifier on 100k examples

Everything on:
===== CV Summary =====
AUC    : 0.7510 ± 0.0043
LogLoss: 0.4724 ± 0.0060

No Rare handling, Freq Encoding, Hashing:
===== CV Summary =====
AUC    : 0.7582 ± 0.0048
LogLoss: 0.4639 ± 0.0061

Rare handling + Freq Encoding, No Hashing:
===== CV Summary =====
AUC    : 0.6954 ± 0.0064
LogLoss: 0.5030 ± 0.0047

Hashing Only:
===== CV Summary =====
AUC    : 0.5005 ± 0.0029
LogLoss: 9.7953 ± 1.8763


SGDClassifier on 500k examples

Everything on:
===== CV Summary =====
AUC    : 0.7663 ± 0.0022
LogLoss: 0.4798 ± 0.0015


