Cell 1 — Imports & Paths

In [25]:
# =============================================
# Cell 1 — Imports & Config
# =============================================
import os
import re
import json
import math
import _pickle as pkl
from pathlib import Path
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# boosting libs
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_DIR = Path("./dataset")
OUT_DIR = Path("./output")
OUT_DIR.mkdir(parents=True, exist_ok=True)

SUB_PATH   = OUT_DIR / "submission_model9_mimic_model8_plus_author_publisher.csv"
MODEL_PATH = OUT_DIR / "voting_model9_mimic_model8_plus_author_publisher.pkl"


In [26]:
# =============================================
# Cell 2 — Load Data
# =============================================
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH  = DATA_DIR / "test.csv"

assert TRAIN_PATH.exists() and TEST_PATH.exists(), "train.csv / test.csv 未找到（应位于 ./dataset/）"

df_train = pd.read_csv(TRAIN_PATH)
df_test  = pd.read_csv(TEST_PATH)

assert {"Id", "Page content", "Popularity"}.issubset(df_train.columns)
assert {"Id", "Page content"}.issubset(df_test.columns)

print("train/test:", df_train.shape, df_test.shape)


train/test: (27643, 3) (11847, 2)


In [27]:
# =============================================
# Cell 3 — HTML parsing（mimic model8 + Publisher）
# =============================================
import re
from datetime import datetime
from urllib.parse import urlparse

def _s(x) -> str:
    return x if isinstance(x, str) else ""

def _clean(s: str) -> str:
    s = (s or "").strip().lower()
    return s if s else "unknown"

# 时间解析正则（与 model8 类似的优先级：ISO → 英文月名 → 仅时间）
_RE_ISO       = re.compile(r"\b(?P<y>\d{4})[-/](?P<m>\d{1,2})[-/](?P<d>\d{1,2})(?:[ T](?P<H>\d{1,2}):(?P<M>\d{2})(?::(?P<S>\d{2}))?)?", re.I)
_RE_MMM_D_Y   = re.compile(r"\b(?P<mon>jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*[ ,.-]+(?P<d>\d{1,2})[ ,.-]+(?P<y>\d{4})(?:[ ,T](?P<H>\d{1,2}):(?P<M>\d{2})(?::(?P<S>\d{2}))?)?", re.I)
_RE_HMS       = re.compile(r"\b(?P<H>\d{1,2}):(?P<M>\d{2})(?::(?P<S>\d{2}))?\b")
_MON_MAP      = {m:i for i,m in enumerate(['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'], start=1)}

def _first_datetime(text: str):
    text = _s(text)

    m = _RE_ISO.search(text)
    if m:
        y = int(m.group('y')); mth = int(m.group('m')); d = int(m.group('d'))
        H = int(m.group('H')) if m.group('H') else None
        M = int(m.group('M')) if m.group('M') else None
        S = int(m.group('S')) if m.group('S') else None
        return y, mth, d, H, M, S

    m = _RE_MMM_D_Y.search(text)
    if m:
        y = int(m.group('y')); d = int(m.group('d'))
        mmm = m.group('mon').lower()
        mth = _MON_MAP.get(mmm, None)
        H = int(m.group('H')) if m.group('H') else None
        M = int(m.group('M')) if m.group('M') else None
        S = int(m.group('S')) if m.group('S') else None
        return y, mth, d, H, M, S

    m = _RE_HMS.search(text)
    if m:
        H = int(m.group('H')) if m.group('H') else None
        M = int(m.group('M')) if m.group('M') else None
        S = int(m.group('S')) if m.group('S') else None
        return None, None, None, H, M, S

    return None, None, None, None, None, None

def _ymd_to_weekday(y, m, d) -> Optional[int]:
    if y is None or m is None or d is None:
        return None
    try:
        return datetime(int(y), int(m), int(d)).weekday() + 1  # Monday=1 ... Sunday=7
    except Exception:
        return None

def _guess_publisher(soup: BeautifulSoup, text: str) -> str:
    """
    Publisher 提取策略：
    1) <meta property="og:site_name" content="...">
    2) <meta name="publisher" content="..."> 或 <meta property="article:publisher" ...>
    3) 若页面里有 <link rel="canonical" href="..."> 或首个 <a href> 域名，取域名主体（如 nytimes, bbc）
    """
    # og:site_name
    tag = soup.find("meta", attrs={"property": "og:site_name"})
    if tag and tag.get("content"):
        return _clean(tag.get("content"))

    # name=publisher / property=article:publisher (Facebook often uses a URL here)
    for key in ["publisher", "article:publisher"]:
        tag = soup.find("meta", attrs={"name": key}) or soup.find("meta", attrs={"property": key})
        if tag and tag.get("content"):
            val = tag.get("content")
            # 如果是 URL，提取域名主干
            if re.match(r"^https?://", val, re.I):
                try:
                    netloc = urlparse(val).netloc
                    base = netloc.split(".")
                    core = base[-2] if len(base) >= 2 else netloc
                    return _clean(core)
                except Exception:
                    pass
            return _clean(val)

    # canonical 或第一个 a[href] 的域名
    link = soup.find("link", rel="canonical")
    href = None
    if link and link.get("href"):
        href = link.get("href")
    if not href:
        a = soup.find("a", href=True)
        if a:
            href = a.get("href")

    if href and re.match(r"^https?://", href, re.I):
        try:
            netloc = urlparse(href).netloc
            base = netloc.split(".")
            core = base[-2] if len(base) >= 2 else netloc
            return _clean(core)
        except Exception:
            pass

    return "unknown"

def parse_html_like_model8_plus_publisher(html: str) -> Dict[str, Any]:
    # 输出字段（完全覆盖 model8 的列 + 新增 Publisher）
    # 'Title','Author','Channel','Topic','Publisher',
    # 'Day','Date','Month','Year','Hour','Minute','Second',
    # 'Content_Len','Num_See_Also','Num_Image','Num_A'
    html = _s(html)
    if not html:
        return dict(
            Title="unknown", Author="unknown", Channel="unknown", Topic="unknown", Publisher="unknown",
            Day=np.nan, Date=np.nan, Month=np.nan, Year=np.nan,
            Hour=np.nan, Minute=np.nan, Second=np.nan,
            Content_Len=0, Num_See_Also=0, Num_Image=0, Num_A=0
        )

    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(" ", strip=True)

    # Title
    title = _clean(soup.title.get_text(strip=True) if soup.title else "")

    # Author：优先 meta[name=author]，再 author/byline 容器；清掉 'by ' 与时间串
    author = ""
    tag_author = soup.find("meta", attrs={"name": re.compile(r"author", re.I)})
    if tag_author and tag_author.get("content"):
        author = tag_author.get("content", "")
    if not author:
        cand = soup.select('[class*="author" i], [id*="author" i], [class*="byline" i]')
        if cand:
            author = cand[0].get_text(" ", strip=True)
    author = re.sub(r"^\s*by\s+", "", author, flags=re.I)
    author = re.sub(r"\b\d{4}-\d{2}-\d{2}.*$", "", author).strip()
    author = re.sub(r"\b(?:utc|gmt)\b.*$", "", author, flags=re.I).strip()
    author = _clean(author)

    # Channel：只采 data-channel 或明显 channel 容器的短文本
    channel = ""
    node = soup.find(attrs={"data-channel": True})
    if node:
        channel = node.get("data-channel", "")
    if not channel:
        cand_ch = soup.select('[class*="channel" i], [id*="channel" i]')
        if cand_ch:
            tmp = (cand_ch[0].get("data-channel") or cand_ch[0].get_text(" ", strip=True) or "").strip()
            if tmp and len(tmp) <= 64:
                channel = tmp
    channel = _clean(channel)

    # Topic：从 data-topic 或典型话题列表容器聚合
    topics = []
    for node in soup.select("[data-topic]"):
        topics.append(_clean(node.get("data-topic", "")))
    if not topics:
        for node in soup.select('footer [class*="topic" i], [class*="article-topics" i] a, [class*="article-topics" i] li'):
            t = _clean(node.get_text(" ", strip=True))
            if t != "unknown":
                topics.append(t)
    topics = [t for t in dict.fromkeys(topics) if t and t != "unknown"]
    topic_str = "topics: " + " , ".join(topics) if topics else "unknown"

    # Publisher（新增）
    publisher = _guess_publisher(soup, text)

    # 时间：meta 优先，正则兜底
    raw_dt = ""
    for key in ["article:published_time", "article:modified_time", "og:updated_time", "pubdate", "date", "publishdate"]:
        tag = soup.find("meta", attrs={"property": key}) or soup.find("meta", attrs={"name": key})
        if tag and tag.get("content"):
            raw_dt = tag.get("content")
            break
    if not raw_dt:
        raw_dt = text

    y, m, d, H, M, S = _first_datetime(raw_dt)
    wk = _ymd_to_weekday(y, m, d)

    # 统计
    content_len   = len(text)
    num_img       = len(soup.find_all("img"))
    num_a         = len(soup.find_all("a"))
    num_see_also  = len(re.findall(r"\bsee also\b", text, flags=re.I))

    return dict(
        Title=title,
        Author=author,
        Channel=channel,
        Topic=topic_str,
        Publisher=publisher,
        Day=np.nan if wk is None else wk,
        Date=np.nan if d is None else d,
        Month=np.nan if m is None else m,
        Year=np.nan if y is None else y,
        Hour=np.nan if H is None else H,
        Minute=np.nan if M is None else M,
        Second=np.nan if S is None else S,
        Content_Len=content_len,
        Num_See_Also=num_see_also,
        Num_Image=num_img,
        Num_A=num_a
    )

def html_to_table(df: pd.DataFrame) -> pd.DataFrame:
    rows = [parse_html_like_model8_plus_publisher(s) for s in df["Page content"].astype(str).tolist()]
    meta = pd.DataFrame(rows, index=df.index)
    meta.insert(0, "Id", df["Id"].values)
    return meta

meta_train_raw = html_to_table(df_train)
meta_test_raw  = html_to_table(df_test)

# 快速查看
meta_train_raw.head(5)


Unnamed: 0,Id,Title,Author,Channel,Topic,Publisher,Day,Date,Month,Year,Hour,Minute,Second,Content_Len,Num_See_Also,Num_Image,Num_A
0,0,unknown,clara moskowitz,world,"topics: asteroid , asteroids , challenge , ear...",unknown,3,19,6,2013,15,4,30,3787,4,1,22
1,1,unknown,christina warren,tech,"topics: apps and software , google , open sour...",unknown,4,28,3,2013,17,40,55,2081,1,2,18
2,2,unknown,sam laird,entertainment,"topics: entertainment , nfl , nfl draft , spor...",unknown,3,7,5,2014,19,15,20,6761,1,2,11
3,3,unknown,sam laird,watercooler,"topics: sports , video , videos , watercooler",unknown,5,11,10,2013,2,26,50,1751,1,1,13
4,4,unknown,connor finnegan,entertainment,"topics: entertainment , instagram , instagram ...",unknown,4,17,4,2014,3,31,43,8720,1,52,16


In [38]:
# =============================================
# Cell 4 — Clean & Select (keep all model8 features + Author + Publisher)
# =============================================

# 文本列（在 model8 基础上新增 Author、Publisher）
text_cols = ["Title", "Author", "Channel", "Topic", "Publisher"]

# 数值列（model8）
num_cols_model8 = [
    "Day","Date","Month","Year","Hour","Minute","Second",
    "Content_Len","Num_See_Also","Num_Image","Num_A"
]

# 合并统一做缺失/类型处理后再切回
all_df = pd.concat([
    meta_train_raw.assign(_is_train=1),
    meta_test_raw.assign(_is_train=0)
], axis=0, ignore_index=True)

# 文本缺失填充
for c in text_cols:
    all_df[c] = all_df[c].fillna("unknown").astype(str)

# 数值转数值（保留 NaN，稍后 SimpleImputer 处理）
for c in num_cols_model8:
    all_df[c] = pd.to_numeric(all_df[c], errors="coerce")

meta_train = all_df[all_df["_is_train"]==1].drop(columns=["_is_train"]).reset_index(drop=True)
meta_test  = all_df[all_df["_is_train"]==0].drop(columns=["_is_train"]).reset_index(drop=True)

X_train_base = meta_train[text_cols + num_cols_model8].copy()
X_test_base  = meta_test[text_cols + num_cols_model8].copy()
y = df_train["Popularity"].values

# 统一标签到 {0,1}
y = pd.Series(y).map({-1: 0, 0: 0, 1: 1}).astype(int).values
print("Unique y after mapping:", np.unique(y))


print("X shapes:", X_train_base.shape, X_test_base.shape)
X_train_base.head(3)


Unique y after mapping: [0 1]
X shapes: (27643, 16) (11847, 16)


Unnamed: 0,Title,Author,Channel,Topic,Publisher,Day,Date,Month,Year,Hour,Minute,Second,Content_Len,Num_See_Also,Num_Image,Num_A
0,unknown,clara moskowitz,world,"topics: asteroid , asteroids , challenge , ear...",unknown,3,19,6,2013,15.0,4.0,30.0,3787,4,1,22
1,unknown,christina warren,tech,"topics: apps and software , google , open sour...",unknown,4,28,3,2013,17.0,40.0,55.0,2081,1,2,18
2,unknown,sam laird,entertainment,"topics: entertainment , nfl , nfl draft , spor...",unknown,3,7,5,2014,19.0,15.0,20.0,6761,1,2,11


In [40]:
# =============================================
# Cell 5 — ColumnTransformer（5 文本 + 数值 with imputer）
# =============================================
from sklearn.pipeline import Pipeline as SkPipe  # 仅用于数值子流水线

# 与 Cell 4 保持一致的列
numeric_cols = num_cols_model8  # 只用 model8 的数值列
# 文本列在 ColumnTransformer 中逐一指定

# 数值列填充：避免 RF 因 NaN 报错
numeric_transformer = SkPipe(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
])

# 五个文本列分别做 BOW（适度限维 & 过滤低频）
title_bow     = ("title_bow",     CountVectorizer(lowercase=True, min_df=3, max_features=2**14), "Title")
author_bow    = ("author_bow",    CountVectorizer(lowercase=True, min_df=3, max_features=2**13), "Author")
channel_bow   = ("channel_bow",   CountVectorizer(lowercase=True, min_df=3, max_features=2**12), "Channel")
topic_bow     = ("topic_bow",     CountVectorizer(lowercase=True, min_df=3, max_features=2**14), "Topic")
publisher_bow = ("publisher_bow", CountVectorizer(lowercase=True, min_df=2, max_features=2**12), "Publisher")

trans_all = ColumnTransformer(
    transformers=[
        title_bow,
        author_bow,
        channel_bow,
        topic_bow,
        publisher_bow,
        ("num", numeric_transformer, numeric_cols),
    ],
    remainder="drop",        # 只保留上面列，避免意外列混入
    sparse_threshold=0.3     # 输出为稀疏矩阵（高维 BOW 更省内存）
)


In [41]:
# =============================================
# Cell 6 — Base model builders (mimic model8 style)
# =============================================

def make_rf():
    rf = RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        class_weight="balanced_subsample"
    )
    return Pipeline([("prep", trans_all), ("clf", rf)])

def make_lgbm():
    lgbm = LGBMClassifier(
        n_estimators=600,
        learning_rate=0.02,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        objective="binary",
        metric="auc",
    )
    return Pipeline([("prep", trans_all), ("clf", lgbm)])

def make_xgb():
    xgb = XGBClassifier(
        n_estimators=600,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=6,
        min_child_weight=2.0,
        reg_alpha=0.0,
        reg_lambda=1.0,
        eval_metric="auc",
        n_jobs=-1,
        random_state=RANDOM_STATE,
        tree_method="hist"
    )
    return Pipeline([("prep", trans_all), ("clf", xgb)])

def make_cat():
    cat = CatBoostClassifier(
        depth=6,
        learning_rate=0.06,
        n_estimators=800,
        l2_leaf_reg=6.0,
        random_seed=RANDOM_STATE,
        loss_function="Logloss",
        eval_metric="AUC",
        verbose=False,
    )
    return Pipeline([("prep", trans_all), ("clf", cat)])


In [42]:
# =============================================
# Cell 7 — CV helpers
# =============================================

def train_one(pipe: Pipeline, X: pd.DataFrame, y: np.ndarray, name: str = "model"):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    scoring = "roc_auc"
    cvres = cross_validate(
        pipe, X, y,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=1
    )
    print(json.dumps({
        "name": name,
        "cv_train_auc_mean": float(np.mean(cvres["train_score"])),
        "cv_train_auc_std":  float(np.std(cvres["train_score"])),
        "cv_val_auc_mean":   float(np.mean(cvres["test_score"])),
        "cv_val_auc_std":    float(np.std(cvres["test_score"])),
    }, ensure_ascii=False, indent=2))

    # Holdout
    X_tr, X_va, y_tr, y_va = train_test_split(
        X, y, test_size=0.15, random_state=RANDOM_STATE, stratify=y
    )
    pipe.fit(X_tr, y_tr)
    proba = pipe.predict_proba(X_va)[:, 1]
    auc  = roc_auc_score(y_va, proba)
    print(f"[Holdout] {name} AUC = {auc:.4f}")
    return pipe


In [43]:
# =============================================
# Cell 8 — Train base models
# =============================================
rf_pipe  = train_one(make_rf(),   X_train_base, y, name="RF")
lgb_pipe = train_one(make_lgbm(), X_train_base, y, name="LGBM")
xgb_pipe = train_one(make_xgb(),  X_train_base, y, name="XGB")
cat_pipe = train_one(make_cat(),  X_train_base, y, name="CatBoost")


{
  "name": "RF",
  "cv_train_auc_mean": 1.0,
  "cv_train_auc_std": 0.0,
  "cv_val_auc_mean": 0.5841340850938112,
  "cv_val_auc_std": 0.00505939946386798
}
[Holdout] RF AUC = 0.5814
[Holdout] RF AUC = 0.5814
[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3666
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 1155
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493172 -> initscore=-0.027315
[LightGBM] [Info] Start training from score -0.027315
[LightGBM] [Info] Number of positive: 10906, number of negative: 11208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049178 seconds.
You can set `force_row_wise=true` to remove the overhead.



[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3664
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 1153
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




[LightGBM] [Info] Number of positive: 10905, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3683
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 1163
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493127 -> initscore=-0.027496
[LightGBM] [Info] Start training from score -0.027496




[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3670
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 1151
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404




[LightGBM] [Info] Number of positive: 10906, number of negative: 11209
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3687
[LightGBM] [Info] Number of data points in the train set: 22115, number of used features: 1165
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493149 -> initscore=-0.027404
[LightGBM] [Info] Start training from score -0.027404




{
  "name": "LGBM",
  "cv_train_auc_mean": 0.7931560795748053,
  "cv_train_auc_std": 0.0038484736413856627,
  "cv_val_auc_mean": 0.5879211432218467,
  "cv_val_auc_std": 0.010913289496506608
}
[LightGBM] [Info] Number of positive: 11587, number of negative: 11909
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3844
[LightGBM] [Info] Number of data points in the train set: 23496, number of used features: 1217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493148 -> initscore=-0.027411
[LightGBM] [Info] Start training from score -0.027411
[LightGBM] [Info] Number of positive: 11587, number of negative: 11909
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory i



[Holdout] LGBM AUC = 0.5765
{
  "name": "XGB",
  "cv_train_auc_mean": 0.8098966267202515,
  "cv_train_auc_std": 0.002493867059203662,
  "cv_val_auc_mean": 0.5875622469160587,
  "cv_val_auc_std": 0.007337565189395578
}
{
  "name": "XGB",
  "cv_train_auc_mean": 0.8098966267202515,
  "cv_train_auc_std": 0.002493867059203662,
  "cv_val_auc_mean": 0.5875622469160587,
  "cv_val_auc_std": 0.007337565189395578
}
[Holdout] XGB AUC = 0.5775
[Holdout] XGB AUC = 0.5775
{
  "name": "CatBoost",
  "cv_train_auc_mean": 0.7929363309555538,
  "cv_train_auc_std": 0.003405093136667055,
  "cv_val_auc_mean": 0.5894721098788737,
  "cv_val_auc_std": 0.012503460200810798
}
{
  "name": "CatBoost",
  "cv_train_auc_mean": 0.7929363309555538,
  "cv_train_auc_std": 0.003405093136667055,
  "cv_val_auc_mean": 0.5894721098788737,
  "cv_val_auc_std": 0.012503460200810798
}
[Holdout] CatBoost AUC = 0.5804
[Holdout] CatBoost AUC = 0.5804


In [45]:
# =============================================
# Cell 9 — Weighted soft voting + save
# =============================================
voting = VotingClassifier(
    estimators=[
        ("lgbm", make_lgbm()),
        ("forest", make_rf()),
        ("catboost", make_cat()),
        ("xgboost", make_xgb()),
    ],
    voting="soft",
    weights=[1.0, 0.05, 0.05, 0.10],  # mimic model8：LGBM 主力，其它轻权纠错
    n_jobs=None
)

# quick holdout check
X_tr, X_va, y_tr, y_va = train_test_split(
    X_train_base, y, test_size=0.15, random_state=RANDOM_STATE, stratify=y
)
voting.fit(X_tr, y_tr)
proba_va = voting.predict_proba(X_va)[:, 1]
print(f"[Holdout] Voting AUC = {roc_auc_score(y_va, proba_va):.4f}")

# —— 使用标准 pickle 保存（_pickle 没有 HIGHEST_PROTOCOL）——
import pickle

with open(MODEL_PATH, "wb") as f:
    pickle.dump(voting, f, protocol=pickle.HIGHEST_PROTOCOL)

print("Saved voting model to:", MODEL_PATH)



[LightGBM] [Info] Number of positive: 11587, number of negative: 11909
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3844
[LightGBM] [Info] Number of data points in the train set: 23496, number of used features: 1217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493148 -> initscore=-0.027411
[LightGBM] [Info] Start training from score -0.027411




[Holdout] Voting AUC = 0.5782
Saved voting model to: output\voting_model9_mimic_model8_plus_author_publisher.pkl
Saved voting model to: output\voting_model9_mimic_model8_plus_author_publisher.pkl


In [46]:
# =============================================
# Cell 10 — Fit on full train & predict test
# =============================================
voting_full = VotingClassifier(
    estimators=[
        ("lgbm", make_lgbm()),
        ("forest", make_rf()),
        ("catboost", make_cat()),
        ("xgboost", make_xgb()),
    ],
    voting="soft",
    weights=[1.0, 0.05, 0.05, 0.10],
    n_jobs=None
)
voting_full.fit(X_train_base, y)

# 保险拿到“正类=1”的概率索引
pos_idx = int(np.where(voting_full.classes_ == 1)[0][0])
test_proba = voting_full.predict_proba(X_test_base)[:, pos_idx]

sub = pd.DataFrame({"Id": df_test["Id"], "Popularity": test_proba})
sub.to_csv(SUB_PATH, index=False)
print("Submission saved to:", SUB_PATH)


[LightGBM] [Info] Number of positive: 13632, number of negative: 14011
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064642 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4271
[LightGBM] [Info] Number of data points in the train set: 27643, number of used features: 1378
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493145 -> initscore=-0.027423
[LightGBM] [Info] Start training from score -0.027423




Submission saved to: output\submission_model9_mimic_model8_plus_author_publisher.csv
