comp4:

BATCH_SIZE = 200
TRAIN_SIZE = 26000
VAL_SIZE = 1000

# Build the train/validation streams with the improved data pipeline.
TRAIN_PATH = './dataset/train.csv'
train_stream, val_stream, train_size, val_size = get_data_streams(
    TRAIN_PATH,
    train_size=TRAIN_SIZE,
    val_size=VAL_SIZE,
    batch_size=BATCH_SIZE,
)


# Number of batches needed to cover TRAIN_SIZE rows (ceiling division).
iters = -(-TRAIN_SIZE // BATCH_SIZE)


# ========== Vectorizer (memory-friendly hashing) ==========
hashvec = HashingVectorizer(
    n_features=2**22,                # tune to available memory, e.g. 2**19 to 2**22
    alternate_sign=False,            # keep hashed counts non-negative: optional for PassiveAggressive/SVM, required for NB
    ngram_range=(2,3),              # 2-grams and 3-grams only (unigrams are NOT included)
    preprocessor=preprocessor,
    tokenizer=tokenizer_stem_keepmeta
)

def featurize(html_series: pd.Series) -> sp.csr_matrix:
    """Vectorize a series of documents with the module-level ``hashvec``.

    Kept as a single wrapper so that stronger features (for example combined
    word + character hashing) can be swapped in later in one place.
    """
    as_text = html_series.astype(str)
    return hashvec.transform(as_text)

# ========== Model ==========
classes = np.array([0, 1])

# Linear model trained by SGD on the hinge loss.  Note this is SGDClassifier,
# not PassiveAggressiveClassifier.
clf = SGDClassifier(
    loss="hinge",
    penalty="elasticnet",
    alpha=1e-6,
    l1_ratio=0.15,
    # `learning_rate` takes a schedule name ("constant", "optimal",
    # "invscaling" or "adaptive"); the numeric step size belongs in `eta0`.
    # NOTE(review): this snippet previously passed learning_rate=1000, which
    # scikit-learn rejects.  The "constant" schedule is an assumption --
    # confirm which schedule produced the recorded scores.
    learning_rate="constant",
    eta0=1000,               # step size
    average=True,
    max_iter=1, tol=None,
    random_state=42,
)


    目前最佳 0.583

   eta=900 0.5940

   n_features=2**23,    0.5942
    

# =========================
# K 折交叉驗證 + 每折最佳模型集成輸出 test 預測
# =========================
import os, gc, copy, _pickle as pkl
import numpy as np
import pandas as pd
from math import ceil
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
from scipy.special import expit
import re
from bs4 import BeautifulSoup

# ---------- Basic configuration ----------
TRAIN_PATH = './dataset/train.csv'
TEST_PATH  = './dataset/test.csv'
OUT_DIR    = './output'
os.makedirs(OUT_DIR, exist_ok=True)  # fold models and the submission are written here

CV_MODE     = 'group'   # 'group' or 'stratified'
N_SPLITS    = 5
EPOCHS      = 3
BATCH_SIZE  = 2000
SEED        = 42
PATIENCE    = 2         # early stopping: stop after PATIENCE consecutive epochs without improvement
DO_FOLD_LDA = False     # True: pretrain an LDA on each fold's training split (more stable but slower)

# ---------- Grouping-key extraction (publisher) ----------
_MONTH = {
    'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
    'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
    'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12',
}


def _norm(s):
    """Lower-case *s* and collapse each run of non-word characters to one space.

    A falsy value (``None`` or ``""``) yields the empty string; leading and
    trailing whitespace is stripped from the result.
    """
    lowered = (s or '').lower()
    return re.sub(r'[\W]+', ' ', lowered).strip()


def _slug(s):
    """Return a key for *s* made only of ``a-z``, ``0-9`` and ``_``.

    The text is normalized with ``_norm``, spaces become underscores, and
    every remaining character outside that set is dropped.
    """
    underscored = _norm(s).replace(' ', '_')
    return re.sub(r'[^a-z0-9_]+', '', underscored)

def extract_publisher_slug(html: str) -> str:
    """Return a slug identifying the publisher linked from an HTML page.

    The first ``<a>`` whose ``href`` matches ``/publishers/<name>`` is used.
    Its visible text is preferred; the ``<name>`` path segment of the link is
    the fallback.

    Args:
        html: Raw page markup.

    Returns:
        The slugged publisher name, or ``"unknown"`` when *html* is not a
        non-blank string, no publisher link is present, or no candidate
        yields a non-empty slug.
    """
    if not isinstance(html, str) or not html.strip():
        return "unknown"

    soup = BeautifulSoup(html, 'html.parser')
    anchor = soup.find('a', href=re.compile(r'/publishers/[^/]+/?', re.I))
    if anchor is None:
        return "unknown"

    link_text = anchor.get_text(' ', strip=True)
    href_name = re.sub(r'.*/publishers/([^/]+)/?.*', r'\1', anchor['href'], flags=re.I)

    # Text with no [a-z0-9_] characters (e.g. a non-Latin publisher name)
    # slugs to "", which would otherwise be returned as an empty group key.
    # Try the link's path segment next and only then give up.
    for candidate in (link_text, href_name):
        slug = _slug(candidate)
        if slug:
            return slug
    return "unknown"

# ---------- Classifier factory (stable settings for text) ----------
def make_clf():
    """Build a fresh, unfitted ``SGDClassifier``.

    Logistic loss is used so that the model exposes ``predict_proba``.
    """
    settings = dict(
        loss="log_loss",
        penalty="elasticnet",
        alpha=1e-5,
        l1_ratio=0.05,
        learning_rate="optimal",
        average=True,
        random_state=SEED,
    )
    return SGDClassifier(**settings)

# ---------- Load data ----------
df = pd.read_csv(TRAIN_PATH)
# Binarize the target: values equal to 1 stay 1, everything else becomes 0.
df['Popularity'] = df['Popularity'].astype(int).eq(1).astype(int)
y = df['Popularity'].values
texts = df['Page content'].astype(str)

# Prepare the CV splitter.
if CV_MODE != 'group':
    splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    split_iter = splitter.split(texts, y)
    print("Using StratifiedKFold")
else:
    # One group per publisher slug, so a publisher never spans train and validation.
    groups = texts.apply(extract_publisher_slug).values
    splitter = GroupKFold(n_splits=N_SPLITS)
    split_iter = splitter.split(texts, y, groups)
    print(f"Using GroupKFold by publisher (groups={len(np.unique(groups))})")

# Per-fold bookkeeping.
fold_artifacts = []  # paths of each fold's best model and (optional) LDA
oof_scores = np.zeros(len(df), dtype=float)
fold_aucs = []
fold_epochs = []

print(f"\nStart {N_SPLITS}-fold CV: EPOCHS={EPOCHS}, BATCH_SIZE={BATCH_SIZE}, DO_FOLD_LDA={DO_FOLD_LDA}")

# Train one model per fold.  After every epoch the held-out split is scored
# and a snapshot of the best-scoring model is kept (early stopping on AUC).
for fold, (tr_idx, va_idx) in enumerate(split_iter, start=1):
    print(f"\n========== Fold {fold}/{N_SPLITS} ==========")
    tr_df = df.iloc[tr_idx].reset_index(drop=True)
    va_df = df.iloc[va_idx].reset_index(drop=True)
    y_val = va_df['Popularity'].values

    # Optional per-fold LDA, fitted on the training split only to avoid leakage.
    if DO_FOLD_LDA:
        lda_vec_f, lda_mod_f = pretrain_lda(tr_df, column='Page content', n_components=10, max_features=1000)
    else:
        lda_vec_f, lda_mod_f = (None, None)

    # Validation features are computed once per fold and reused every epoch.
    X_val = featurize_split(va_df['Page content'].astype(str), lda_vec_f, lda_mod_f, n_jobs=1)

    # Model and early-stopping state.
    clf = make_clf()
    best_auc, best_epoch = -1, -1
    best_state = None
    no_improve = 0

    # Multi-epoch training.
    for epoch in range(1, EPOCHS+1):
        # Reshuffle with an epoch-dependent seed: a different but reproducible order.
        tr_shuf = tr_df.sample(frac=1.0, random_state=SEED+epoch).reset_index(drop=True)
        n_batches = ceil(len(tr_shuf)/BATCH_SIZE)

        for b in range(n_batches):
            batch = tr_shuf.iloc[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
            X_tr = featurize_split(batch['Page content'].astype(str), lda_vec_f, lda_mod_f, n_jobs=1)
            y_tr = batch['Popularity'].values
            if epoch == 1 and b == 0:
                # The label set is passed on the very first partial_fit call only.
                clf.partial_fit(X_tr, y_tr, classes=np.array([0,1]))
            else:
                clf.partial_fit(X_tr, y_tr)

        # End of epoch: score this fold's validation split.
        if hasattr(clf, "predict_proba"):
            val_prob = clf.predict_proba(X_val)[:, 1]
        else:
            val_prob = expit(clf.decision_function(X_val))
        val_auc = roc_auc_score(y_val, val_prob)
        print(f"Fold {fold} | epoch {epoch}/{EPOCHS} | Val AUC={val_auc:.4f}")

        if val_auc > best_auc:
            best_auc = val_auc
            best_epoch = epoch
            # Snapshot the model; later epochs keep mutating `clf`.
            best_state = copy.deepcopy(clf)
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= PATIENCE:
                print(f"  Early stopping at epoch {epoch} (no improve {PATIENCE}×)")
                break

    # Store out-of-fold scores using the best snapshot.
    if hasattr(best_state, "predict_proba"):
        oof_scores[va_idx] = best_state.predict_proba(X_val)[:, 1]
    else:
        oof_scores[va_idx] = expit(best_state.decision_function(X_val))

    fold_aucs.append(best_auc); fold_epochs.append(best_epoch)

    # Persist the fold's best model.  The context manager guarantees the file
    # is flushed and closed even if pickling fails.
    model_path = os.path.join(OUT_DIR, f'clf_sgd_fold{fold}.pkl')
    with open(model_path, 'wb') as model_file:
        pkl.dump(best_state, model_file)

    # If LDA was used, persist this fold's LDA objects as well.
    lda_path = None
    if DO_FOLD_LDA:
        lda_path = os.path.join(OUT_DIR, f'lda_fold{fold}.pkl')
        with open(lda_path, 'wb') as lda_file:
            pkl.dump({'lda_vec': lda_vec_f, 'lda_model': lda_mod_f}, lda_file)

    fold_artifacts.append({
        'model_path': model_path,
        'lda_path': lda_path
    })
    print(f"Fold {fold} BEST: epoch={best_epoch}, AUC={best_auc:.4f} | saved {model_path}")

    # Release the validation matrix before the next fold builds its own.
    del X_val; gc.collect()

# CV summary: per-fold AUCs, their mean/std, and the pooled out-of-fold AUC.
oof_auc = roc_auc_score(y, oof_scores)
print("\n========== CV Summary ==========")
auc_strings = ["%.4f" % fold_auc for fold_auc in fold_aucs]
print("Fold AUCs:", auc_strings)
auc_mean, auc_std = np.mean(fold_aucs), np.std(fold_aucs)
print("Mean AUC = %.4f | Std = %.4f" % (auc_mean, auc_std))
print("OOF  AUC = %.4f" % oof_auc)

# ---------- Predict test with each fold's best model and average ----------
df_test = pd.read_csv(TEST_PATH)
test_texts = df_test['Page content'].astype(str)
test_preds_each_fold = []

for fold, art in enumerate(fold_artifacts, start=1):
    # Load the fold's model.  Only unpickle files this script wrote itself;
    # pickle must never be used on files from an untrusted source.
    with open(art['model_path'], 'rb') as model_file:
        clf = pkl.load(model_file)
    # Load the fold's LDA objects (optional).
    if DO_FOLD_LDA and art['lda_path'] is not None:
        with open(art['lda_path'], 'rb') as lda_file:
            lda_pack = pkl.load(lda_file)
        lda_vec_f, lda_mod_f = lda_pack['lda_vec'], lda_pack['lda_model']
    else:
        lda_vec_f, lda_mod_f = (None, None)

    # Featurize per fold: when each fold has its own LDA the features differ.
    X_test = featurize_split(test_texts, lda_vec_f, lda_mod_f, n_jobs=1)

    # Positive-class probability (sigmoid of the margin if no predict_proba).
    if hasattr(clf, "predict_proba"):
        prob = clf.predict_proba(X_test)[:, 1]
    else:
        prob = expit(clf.decision_function(X_test))
    test_preds_each_fold.append(prob)
    print(f"Fold {fold} test predicted. Shape={prob.shape}")

# Ensemble: simple mean over the fold models.
test_pred = np.mean(np.vstack(test_preds_each_fold), axis=0)

# Write the submission file.
sub_path = os.path.join(OUT_DIR, f'submission_k{N_SPLITS}_ens.csv')
pd.DataFrame({'Id': df_test['Id'], 'Popularity': test_pred}).to_csv(sub_path, index=False)
print("Submission saved ->", sub_path)


目前最佳 0.584