前置設定

In [None]:

# ===== 基礎 =====
import os, re, numpy as np, pandas as pd
from scipy import sparse as sp
from bs4 import BeautifulSoup

# Sklearn
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import pos_tag, word_tokenize, ne_chunk

# 其他
from joblib import Parallel, delayed
from math import ceil

# ===== Paths and random seed =====
DATA_DIR = './dataset'
TRAIN_PATH = f'{DATA_DIR}/train.csv'
TEST_PATH  = f'{DATA_DIR}/test.csv'
OUT_DIR = './output'
os.makedirs(OUT_DIR, exist_ok=True)  # create the output directory if it is missing
np.random.seed(42)                   # seeds NumPy's global RNG only

# ===== Speed switches (turn the slow modules off while iterating on training) =====
FAST_NO_NER = True        # skip named-entity recognition in preprocessor
FAST_NO_LDA = True        # skip LDA topic features
FAST_NO_TEXTSTAT = True   # skip the readability score

# ===== NLTK resources (download only what is missing) =====
# Each pair is (path probed with nltk.data.find, package name passed to nltk.download).
for rid, name in [
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('sentiment/vader_lexicon', 'vader_lexicon'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
    ('corpora/words', 'words'),
]:
    try:
        nltk.data.find(rid)
    except LookupError:
        nltk.download(name)

# Global VADER analyzer (built once instead of once per sample).
# Falls back to None when construction fails; preprocessor then uses a 0.0 sentiment score.
try:
    VADER = SentimentIntensityAnalyzer()
except Exception:
    VADER = None


導入數據與切分

In [None]:
# Map Popularity to {0, 1} (1 stays 1, every other value becomes 0; the original
# note says the raw labels are {-1, 1}), then slice out train/val partitions.
df_all = pd.read_csv(TRAIN_PATH)
df_all['Popularity'] = (df_all['Popularity'].astype(int) == 1).astype(int)

TRAIN_SIZE = 26000
VAL_SIZE   = 1000

# Positional (unshuffled) split: the first TRAIN_SIZE rows train, the next VAL_SIZE rows validate.
train_df = df_all.iloc[:TRAIN_SIZE].reset_index(drop=True)
val_df   = df_all.iloc[TRAIN_SIZE:TRAIN_SIZE+VAL_SIZE].reset_index(drop=True)

print(train_df.shape, val_df.shape)
train_df.head(2)


工具函數

In [None]:
# Small helpers
# _BODY_OPEN matches the opening <section>/<div>/<article> tag whose attributes contain one of
# the listed article-body markers; callers use the match start to split header from body HTML.
# Flags: case-insensitive (i) and dot-matches-newline (s).
_BODY_OPEN = re.compile(
    r'(?is)<\s*(section|div|article)\b[^>]*\b'
    r'(?:article-content|article-body|content-body|post-content)\b[^>]*>'
)
# Three-letter month abbreviation -> zero-padded month number (as a string).
_MONTH = dict(jan='01', feb='02', mar='03', apr='04', may='05', jun='06',
              jul='07', aug='08', sep='09', oct='10', nov='11', dec='12')

def _norm(s: str) -> str:
    return re.sub(r'[\W]+', ' ', (s or '').lower()).strip()

def _slug(s: str) -> str:
    """Turn *s* into an identifier made only of ``a-z``, ``0-9`` and ``_``.

    The text is first normalised with ``_norm``, spaces become underscores,
    and every remaining character outside the allowed set is removed.
    """
    underscored = _norm(s).replace(' ', '_')
    return re.sub(r'[^a-z0-9_]+', '', underscored)

def _bucket(n, edges):
    if n is None: return 'unk'
    for i in range(len(edges)-1):
        if edges[i] <= n < edges[i+1]:
            return f"b{edges[i]}_{edges[i+1]}"
    return f"b{edges[-1]}p"

def _aspect_bucket(w, h):
    if not w or not h: return 'unk'
    r = w / h
    if r < 0.9: return 'tall'
    if r < 1.2: return 'squareish'
    if r < 1.8: return 'landscape'
    return 'ultrawide'

def _img_size_bucket(w, h):
    if not w or not h: return 'unk'
    area = (w or 0) * (h or 0)
    if area < 80_000: return 'xs'
    if area < 230_000: return 'sm'
    if area < 920_000: return 'md'
    if area < 2_100_000: return 'lg'
    return 'xl'

def _parse_wh_from_src(src: str):
    if not src: return (None, None)
    m = re.search(r'/(\d{2,5})x(\d{2,5})/', src)
    return (int(m.group(1)), int(m.group(2))) if m else (None, None)

# Hand-picked topic keywords; preprocessor counts how many header tokens
# (lower-cased) are members of this set.
# NOTE(review): the entries containing '_' (e.g. 'elon_musk', 'climate_change') are
# compared against single tokens produced by word_tokenize — confirm such tokens can occur.
TRENDING_TOPICS = {
    'elon_musk', 'ai', 'climate_change', 'covid', 'blockchain', 'taiwan',
    'tesla', 'space', 'crypto', 'elections'
}

# LDA (optional)
def pretrain_lda(df, column='Page content', n_components=10, max_features=1000, max_text_len=500):
    """Fit a bag-of-words vectorizer and an LDA topic model on header text.

    For every row of ``df[column]`` the HTML before the first ``_BODY_OPEN``
    match (or the whole document when there is no match) is reduced to its
    first ``max_text_len`` lower-cased whitespace-separated words.

    Returns:
        ``(vectorizer, lda)`` — the fitted ``CountVectorizer`` and
        ``LatentDirichletAllocation`` — or ``(None, None)`` when every
        extracted text is empty.
    """
    def _header_text(html):
        # Blank or non-string documents contribute an empty text.
        if not (isinstance(html, str) and html.strip()):
            return ""
        body_start = _BODY_OPEN.search(html)
        if body_start:
            html = html[:body_start.start()]
        words = BeautifulSoup(html, 'html.parser').get_text().lower().split()
        return ' '.join(words[:max_text_len])

    corpus = list(map(_header_text, df[column].astype(str)))
    if not any(corpus):
        return None, None

    vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
    counts = vectorizer.fit_transform(corpus)
    topic_model = LatentDirichletAllocation(n_components=n_components, random_state=42)
    topic_model.fit(counts)
    return vectorizer, topic_model

# Default to "no LDA"; preprocessor emits 'topic_unk' when either object is None.
lda_vectorizer, lda_model = (None, None)
if not FAST_NO_LDA:
    # NOTE(review): the model is fitted on df_all, which also contains the rows
    # later sliced into val_df — confirm this is acceptable for evaluation.
    _df_lda = df_all.dropna(subset=['Page content']).astype({'Page content':'str'})
    lda_vectorizer, lda_model = pretrain_lda(_df_lda, 'Page content', n_components=10, max_features=1000)
    print("LDA pretrained")
else:
    print("LDA skipped by FAST_NO_LDA=True")


預處理函數

In [None]:
# ====== Helper: clean text for the bag-of-words channels ======
porter = PorterStemmer()
STOP = set(stopwords.words('english'))

def _bow_clean(txt: str) -> str:
    """Return the Porter stems of the non-stopword alphabetic words in *txt*.

    The text is lower-cased, split into maximal ``[A-Za-z]+`` runs, filtered
    against ``STOP`` and joined back with single spaces. Falsy input yields
    an empty string.
    """
    words = re.findall(r'[A-Za-z]+', (txt or '').lower())
    return ' '.join(porter.stem(word) for word in words if word not in STOP)

# ====== Revised preprocessor: returns 6 values (adds body_feats, body_bow_text) ======
def preprocessor(html: str, lda_vectorizer=None, lda_model=None, max_text_len=500):
    """Turn one article's HTML into title, entity, header and body features.

    The document is split at the first tag matched by ``_BODY_OPEN``: everything
    before it is the "header", the rest is the "body" (empty when nothing matches).
    Most features are computed on the header only.

    Args:
        html: raw page HTML. Non-string or blank input short-circuits.
        lda_vectorizer, lda_model: fitted objects from ``pretrain_lda``; topic
            features are produced only when both are given and FAST_NO_LDA is off.
        max_text_len: number of leading header words kept as ``text_content``.

    Returns:
        A 6-tuple ``(title, entities, header_feats, header_bow, body_feats, body_bow)``:
        normalised title text, a set of ``entity_*`` tokens, space-joined header
        feature tokens, stemmed header bag-of-words text, space-joined body feature
        tokens, and stemmed body bag-of-words text. Blank input returns
        ``("empty_content", set(), "", "", "", "")``.

    Reads the module globals VADER, STOP, TRENDING_TOPICS and the FAST_NO_* switches.
    """
    if not isinstance(html, str) or not html.strip():
        # Return 6 empty values
        return "empty_content", set(), "", "", "", ""

    # Split the HTML into header and body
    m = _BODY_OPEN.search(html)
    if m:
        header_html = html[:m.start()]
        body_html = html[m.start():]
    else:
        header_html = html
        body_html = ""

    soup = BeautifulSoup(header_html, 'html.parser')

    # --- [Original header feature extraction, kept unchanged] ---
    # Title: prefer an <h1> whose class mentions 'title', then any <h1>, then <title>
    title_raw = None
    h1 = soup.find('h1', class_=lambda c: (isinstance(c, list) and any('title' in x for x in c)) or (isinstance(c, str) and 'title' in c)) \
         or soup.find('h1')
    if h1: title_raw = h1.get_text(' ', strip=True)
    elif soup.title: title_raw = soup.title.get_text(' ', strip=True)
    title_tokens = _norm(title_raw)

    # Author / channel / publisher
    author = None
    by = soup.find(class_=lambda c: c and ('byline' in c or 'author_name' in c))
    if by: author = by.get_text(' ', strip=True)
    if not author:
        # Fallback: first link whose href ends in /author/<name>
        a = soup.find('a', href=re.compile(r'/author/[^/]+/?$', re.I))
        if a: author = a.get_text(' ', strip=True)
    # Strip a leading "by " before slugging
    author_slug = _slug(re.sub(r'^\s*by\s+', '', author or '', flags=re.I))
    channel = None
    art = soup.find('article')
    if art and art.has_attr('data-channel'): channel = art['data-channel']
    if not channel and art:
        # Fallback: look for a known channel name among the <article> classes
        cls = ' '.join(art.get('class', []))
        mch = re.search(r'\b(news|tech|world|sports?|business|entertainment|culture|life|science)\b', cls, re.I)
        if mch: channel = mch.group(1)
    channel_slug = _slug(channel or 'unknown')
    publisher = None
    pub = soup.find('a', href=re.compile(r'/publishers/[^/]+/?', re.I))
    # Use the link text, or the path segment after /publishers/ when the text is empty
    if pub: publisher = pub.get_text(' ', strip=True) or re.sub(r'.*/publishers/([^/]+)/?.*', r'\1', pub['href'], flags=re.I)
    publisher_slug = _slug(publisher or 'unknown')

    # Time: parsed with regexes from the <time> tag's datetime attribute (or its text)
    year = month = weekday = tod = season = None
    is_weekend = None
    tm = soup.find('time')
    dt = tm['datetime'] if (tm and tm.has_attr('datetime')) else (tm.get_text(' ', strip=True) if tm else None)
    if dt:
        y = re.search(r'(20\d{2}|19\d{2})', dt);  year = y.group(1) if y else None
        # Numeric "-MM-" first, otherwise a three-letter month abbreviation
        mo = re.search(r'-(\d{2})-', dt) or re.search(r'\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b', dt, re.I)
        if mo:
            mm = mo.group(1).lower() if mo.lastindex else mo.group(0).lower()
            month = _MONTH.get(mm, mm)
        wd = re.search(r'\b(mon|tue|wed|thu|fri|sat|sun)\b', dt, re.I)
        if wd: weekday = wd.group(1).lower(); is_weekend = weekday in ('sat','sun')
        hh = re.search(r'\b(\d{2}):(\d{2})', dt)
        if hh:
            h = int(hh.group(1))
            tod = 'morning' if 5<=h<12 else 'afternoon' if 12<=h<17 else 'evening' if 17<=h<22 else 'night'
        if month:
            m_i = int(month)
            season = 'spring' if 3<=m_i<=5 else 'summer' if 6<=m_i<=8 else 'autumn' if 9<=m_i<=11 else 'winter'

    # Media elements
    imgs = soup.find_all('img'); img_count = len(imgs); has_image = img_count>0
    leadimg = soup.find(attrs={'data-fragment':'lead-image'}) is not None
    # Track the largest image (by area) whose size can be parsed from its src
    max_w=max_h=None
    for im in imgs:
        w,h = _parse_wh_from_src(im.get('src',''))
        if w and h:
            if not max_w or (w*h) > ((max_w or 0)*(max_h or 0)):
                max_w,max_h = w,h
    img_size_bucket = _img_size_bucket(max_w, max_h)
    img_aspect_bucket = _aspect_bucket(max_w, max_h)
    videos = soup.find_all('video'); iframes = soup.find_all('iframe')
    # A <video> tag, or an iframe pointing at a known video host, counts as video
    has_video = bool(videos) or any(re.search(r'(youtube|vimeo|dailymotion)', (fr.get('src') or ''), re.I) for fr in iframes)
    audio = soup.find_all('audio'); has_audio = len(audio)>0
    interactive_elements = soup.find_all(['canvas','svg', lambda tag: tag.name=='div' and 'interactive' in (tag.get('class') or [])])
    has_interactive = len(interactive_elements)>0
    link_count = len(soup.find_all('a'))
    link_bucket = _bucket(link_count, [0,1,3,6,10])
    img_bucket  = _bucket(img_count, [0,1,3,5])

    # Links whose href contains one of these substrings are counted as authoritative
    authoritative_domains = ['.edu','.gov','.org']
    authoritative_links = sum(1 for a in soup.find_all('a') if any(d in (a.get('href') or '').lower() for d in authoritative_domains))
    authoritative_link_bucket = _bucket(authoritative_links, [0,1,3,5])

    # Title shape
    raw = title_raw or ''
    title_has_num = bool(re.search(r'\d', raw))
    title_has_year = bool(re.search(r'\b(19|20)\d{2}\b', raw))
    title_has_q = '?' in raw
    title_has_exclaim = '!' in raw
    title_has_colon = ':' in raw
    is_listicle = bool(re.match(r'^\s*\d+', raw))  # title starts with a number
    upper_ratio = (sum(ch.isupper() for ch in raw) / max(1, sum(ch.isalpha() for ch in raw)))
    upper_bucket = 'low' if upper_ratio < 0.15 else 'mid' if upper_ratio < 0.4 else 'high'
    title_word_len = len(_norm(raw).split()); tw_bucket = _bucket(title_word_len, [0,4,8,12,20])
    title_char_len = len(re.sub(r'\s+','',raw)); tc_bucket = _bucket(title_char_len, [0,30,60,90,140])

    # Social
    social_keywords = ['share','twitter','facebook','linkedin','whatsapp','telegram']
    # A tag matches when a keyword is one of its classes, a substring of its id,
    # or a substring of its (lower-cased) text.
    social_elements = soup.find_all(lambda tag: any(
        kw in (tag.get('class') or []) or kw in (tag.get('id') or '') or kw in tag.get_text().lower()
        for kw in social_keywords
    ))
    social_count = len(social_elements)
    social_count_bucket = _bucket(social_count, [0,1,3,5])
    share_count = 0
    for elem in social_elements:
        # NOTE: rebinds `m` (the _BODY_OPEN match above); header/body were already split
        m = re.search(r'(\d+)\s*(shares?|likes?|retweets?)', elem.get_text(), re.I)
        if m: share_count += int(m.group(1))
    shares_bucket = _bucket(share_count, [0,10,100,1000])

    comment_selectors = ['.comments','#comments','.comment','.discussion']
    comment_count = sum(len(soup.select(sel)) for sel in comment_selectors)
    comment_count_bucket = _bucket(comment_count, [0,1,3,5])

    # Content and sentiment
    text_content = ' '.join(soup.get_text().lower().split()[:max_text_len])
    sentiment_compound = VADER.polarity_scores(text_content)['compound'] if 'VADER' in globals() and VADER else 0.0
    sentiment_bucket = ('strong_positive' if sentiment_compound>0.5 else
                        'positive' if sentiment_compound>0.05 else
                        'strong_negative' if sentiment_compound<-0.5 else
                        'negative' if sentiment_compound<-0.05 else 'neutral')
    positive_words = ['amazing','great','excellent','wonderful','best','success','win','good','positive']
    negative_words = ['terrible','awful','bad','worst','failure','lose','problem','negative']
    # NOTE(review): these are substring tests on the whole text (e.g. 'win' also matches
    # 'window'), and each list entry counts at most once.
    pos_count = sum(1 for w in positive_words if w in text_content)
    neg_count = sum(1 for w in negative_words if w in text_content)

    # Urgency / question words / nouns / CTA (same substring-presence counting as above)
    cta_phrases = ['read more','subscribe now','click here','learn more','join us','sign up']
    cta_count = sum(1 for p in cta_phrases if p in text_content)
    cta_count_bucket = _bucket(cta_count, [0,1,3,5])

    urgency_indicators = ['breaking','urgent','alert','crisis','emergency','important']
    urgency_count = sum(1 for w in urgency_indicators if w in text_content)
    urgency_count_bucket = _bucket(urgency_count, [0,1,3,5])

    question_words = ['what','why','how','when','where','who']
    question_count = sum(1 for w in question_words if w in text_content)
    question_count_bucket = _bucket(question_count, [0,1,3,5])

    tokens = word_tokenize(text_content)
    try:
        tagged = pos_tag(tokens)
    except Exception:
        # If tagging fails, treat every token as a noun
        tagged = [(w,'NN') for w in tokens]
    nouns = [w for w,pos_ in tagged if pos_.startswith('NN') and w.lower() not in STOP]
    noun_count_bucket = _bucket(len(nouns), [0,5,10,20,50])

    # NER (can be switched off)
    entities = set()
    if 'FAST_NO_NER' not in globals() or not FAST_NO_NER:
        try:
            from nltk import ne_chunk
            chunked = ne_chunk(tagged)
            for ch in chunked:
                if hasattr(ch,'label') and ch.label() in ['PERSON','ORGANIZATION','GPE']:
                    ent = '_'.join(c[0].lower() for c in ch)
                    entities.add(f'entity_{ent}')
        except Exception:
            # Best effort: any NER failure leaves the entity set as collected so far
            pass
    entity_count_bucket = _bucket(len(entities), [0,1,3,5])

    # LDA (can be switched off)
    if ('FAST_NO_LDA' not in globals() or not FAST_NO_LDA) and (lda_vectorizer is not None) and (lda_model is not None):
        X_ = lda_vectorizer.transform([text_content] if text_content else [''])
        topic_dist = lda_model.transform(X_)
        dom = int(np.argmax(topic_dist[0])) if topic_dist.size>0 else 0
        score = float(topic_dist[0][dom]) if topic_dist.size>0 else 0.0
        # NOTE: _bucket labels already start with 'b', so this token reads 'topic_<k>_bb<lo>_<hi>'
        topic_bucket = f'topic_{dom}_b{_bucket(score,[0,0.5,0.7,0.9])}'
    else:
        topic_bucket = 'topic_unk'

    # Engagement
    engagement_metrics = {'clicks':0, 'shares':share_count, 'comments':comment_count}
    click_elements = soup.find_all(lambda tag: 'click' in tag.get_text().lower() and re.search(r'\d+', tag.get_text()))
    for elem in click_elements:
        m = re.search(r'(\d+)\s*clicks?', elem.get_text(), re.I)
        if m: engagement_metrics['clicks'] += int(m.group(1))
    clicks_bucket   = _bucket(engagement_metrics['clicks'],   [0,100,1000,10000])
    comments_bucket = _bucket(engagement_metrics['comments'], [0,1,10,50])

    # Readability (can be switched off)
    if 'FAST_NO_TEXTSTAT' not in globals() or not FAST_NO_TEXTSTAT:
        try:
            from textstat import flesch_reading_ease
            readability_score = flesch_reading_ease(text_content) if text_content else 0
        except Exception:
            # textstat missing or failing: fall back to a score of 0
            readability_score = 0
    else:
        readability_score = 50  # neutral
    readability_bucket = ('very_easy' if readability_score>80 else
                          'easy' if readability_score>60 else
                          'standard' if readability_score>50 else
                          'difficult' if readability_score>30 else 'very_difficult')

    # Page layout structure
    div_count = len(soup.find_all('div'))
    section_count = len(soup.find_all('section'))
    list_count = len(soup.find_all(['ul','ol']))
    div_count_bucket = _bucket(div_count, [0,5,10,20,50])
    section_count_bucket = _bucket(section_count, [0,1,3,5])
    list_count_bucket = _bucket(list_count, [0,1,3,5])
    header_word_count_bucket = _bucket(len(text_content.split()), [0,50,100,200,500])

    # Assemble the meta-feature tokens
    feats = []
    feats += [
        f'author_{author_slug or "unknown"}',
        f'channel_{channel_slug}',
        f'publisher_{publisher_slug}',
        f'year_{year or "unk"}', f'month_{month or "unk"}',
        f'weekday_{weekday or "unk"}', f'tod_{tod or "unk"}', f'season_{season or "unk"}',
        'weekend' if is_weekend else 'weekday' if is_weekend is not None else 'weekend_unk',
    ]
    feats += [
        'has_image' if has_image else 'no_image',
        f'imgcnt_{img_bucket}', 'has_leadimg' if leadimg else 'no_leadimg',
        f'imgsize_{img_size_bucket}', f'imgaspect_{img_aspect_bucket}',
        'has_video' if has_video else 'no_video',
        'has_audio' if has_audio else 'no_audio',
        'has_interactive' if has_interactive else 'no_interactive',
        f'linkcnt_{link_bucket}', f'authoritative_links_{authoritative_link_bucket}',
    ]
    feats += [
        'is_listicle' if is_listicle else 'not_listicle',
        'title_has_num' if title_has_num else 'title_no_num',
        'title_has_year' if title_has_year else 'title_no_year',
        'title_has_q' if title_has_q else 'title_no_q',
        'title_has_exclaim' if title_has_exclaim else 'title_no_exclaim',
        'title_has_colon' if title_has_colon else 'title_no_colon',
        f'title_len_word_{tw_bucket}', f'title_len_char_{tc_bucket}', f'title_upper_{upper_bucket}',
    ]
    feats += [f'social_buttons_{social_count_bucket}', f'comment_sections_{comment_count_bucket}', f'share_count_{shares_bucket}']
    feats += [f'positive_words_{pos_count}', f'negative_words_{neg_count}', f'sentiment_{sentiment_bucket}']
    feats += [f'urgency_indicators_{urgency_count_bucket}', f'question_words_{question_count_bucket}',
              f'noun_count_{noun_count_bucket}', f'cta_count_{cta_count_bucket}']
    feats += [f'div_count_{div_count_bucket}', f'section_count_{section_count_bucket}',
              f'list_count_{list_count_bucket}', f'readability_{readability_bucket}']
    feats += [f'header_word_count_{header_word_count_bucket}']
    feats += [f'entity_count_{entity_count_bucket}', topic_bucket,
              f'trending_matches_{_bucket(sum(1 for w in tokens if w.lower() in TRENDING_TOPICS),[0,1,3,5])}',
              f'clicks_{clicks_bucket}', f'shares_{shares_bucket}', f'comments_{comments_bucket}']
    feats += list(entities)

    header_bow_text = _bow_clean(text_content)

    # ====== Added: body feature extraction ======
    body_feats = []
    body_bow_text = ""
    if body_html:
        soup_body = BeautifulSoup(body_html, 'html.parser')
        body_text = ' '.join(soup_body.get_text().lower().split()) # plain text, whitespace-normalised

        # Body length (in words)
        body_len_bucket = _bucket(len(body_text.split()), [0, 100, 300, 600, 1000])
        body_feats.append(f'body_len_{body_len_bucket}')

        # Paragraphs, block quotes, code blocks
        p_count = len(soup_body.find_all('p'))
        p_bucket = _bucket(p_count, [0, 5, 15, 30, 50])
        body_feats.append(f'body_p_{p_bucket}')

        bq_count = len(soup_body.find_all('blockquote'))
        bq_bucket = _bucket(bq_count, [0, 1, 3, 5])
        body_feats.append(f'body_bq_{bq_bucket}')

        code_count = len(soup_body.find_all(['pre', 'code']))
        body_feats.append('body_has_code' if code_count > 0 else 'body_no_code')

        # Body bag-of-words text
        body_bow_text = _bow_clean(body_text)
    else:
        # No body section found: emit the single 'no_body' feature
        body_feats.append('no_body')


    # Return 6 values
    return _norm(title_tokens), entities, ' '.join(feats), header_bow_text, ' '.join(body_feats), body_bow_text

# ====== Added / modified per-block encoders ======
# Existing: stateless hashing vectorizers for title, header bag-of-words and entity text
title_vec = HashingVectorizer(n_features=2**20, alternate_sign=False, ngram_range=(1,2), token_pattern=r'(?u)\b\w+\b')
header_vec = HashingVectorizer(n_features=2**18, alternate_sign=True, ngram_range=(1,2), token_pattern=r'(?u)\b\w+\b')
entities_vec = HashingVectorizer(n_features=2**12, alternate_sign=False, token_pattern=r'(?u)\b\w+\b')

# Added: bag-of-words vectorizer for the body text
body_vec = HashingVectorizer(n_features=2**20, alternate_sign=False, ngram_range=(1,2), token_pattern=r'(?u)\b\w+\b')

# Existing: one FeatureHasher per feature group; make_hasher(n) hashes string tokens into 2**n columns
make_hasher = lambda n: FeatureHasher(n_features=2**n, input_type='string', alternate_sign=False)
author_h, channel_h, publisher_h = make_hasher(15), make_hasher(12), make_hasher(14)
time_h, media_h, titleshape_h   = make_hasher(10), make_hasher(12), make_hasher(10)
social_h, content_h, structure_h= make_hasher(14), make_hasher(14), make_hasher(14)
length_h, extra_h               = make_hasher(12), make_hasher(14)

# Added: hasher for the categorical body features
bodyfeats_h = make_hasher(12)

# ====== tokenizer (behaviour kept as-is) ======
def tokenizer_stem_keepmeta(text: str, entities: set) -> list:
    """Tokenise *text* on whitespace, keeping meta tokens and stemming plain words.

    A token containing ``_`` or any digit is kept verbatim. Any other token is
    kept only when it is purely ``[a-zA-Z]`` and not a stop word, in which case
    its lower-cased Porter stem is emitted. Everything else is dropped.

    ``entities`` is accepted for interface compatibility; every ``entity_*``
    token already contains an underscore and is therefore kept by the first rule.
    """
    kept = []
    for word in re.split(r'\s+', (text or '').strip()):
        if not word:
            continue
        if '_' in word or any(map(str.isdigit, word)):
            kept.append(word)
            continue
        lowered = word.lower()
        if lowered not in STOP and re.fullmatch(r'[a-zA-Z]+', word):
            kept.append(porter.stem(lowered))
    return kept

# ====== feats_string -> feature groups ======
# Maps a token prefix to the feature group (and therefore the hasher) it belongs to.
# Lookup is first-match in insertion order, so more specific prefixes must come
# before any shorter prefix they start with (e.g. 'weekday_' before 'weekday').
# Tokens matching no prefix are routed to 'extra' by _split_feat_tokens.
PREFIX_MAP = {
    'author_':'author','channel_':'channel','publisher_':'publisher',
    'year_':'time','month_':'time','weekday_':'time','tod_':'time','season_':'time',
    'weekend':'time','weekday':'time',
    'has_image':'media','no_image':'media','imgcnt_':'media','has_leadimg':'media','no_leadimg':'media',
    'imgsize_':'media','imgaspect_':'media','has_video':'media','no_video':'media','has_audio':'media','no_audio':'media',
    'has_interactive':'media','no_interactive':'media','linkcnt_':'media','authoritative_links_':'media',
    # 'title_no_' keeps the negative title flags (title_no_num, title_no_q, ...) in the
    # same group as their 'title_has_' counterparts instead of falling through to 'extra'.
    'is_listicle':'titleshape','not_listicle':'titleshape','title_has_':'titleshape','title_no_':'titleshape',
    'title_len_word_':'titleshape','title_len_char_':'titleshape','title_upper_':'titleshape',
    'social_buttons_':'social','comment_sections_':'social','share_count_':'social',
    'urgency_indicators_':'content','question_words_':'content','noun_count_':'content','cta_count_':'content',
    'div_count_':'structure','section_count_':'structure','list_count_':'structure','readability_':'structure',
    'header_word_count_':'length',
    'entity_count_':'extra','topic_':'extra','trending_matches_':'extra','clicks_':'extra','shares_':'extra','comments_':'extra',
    'sentiment_':'extra','positive_words_':'extra','negative_words_':'extra',
}
def _split_feat_tokens(feats_string: str):
    buckets = {k: [] for k in ['author','channel','publisher','time','media','titleshape','social','content','structure','length','extra']}
    if not feats_string:
        for k in buckets: buckets[k] = [f'{k}=_none']; return buckets
    for tok in feats_string.split():
        placed = False
        for pref, grp in PREFIX_MAP.items():
            if tok.startswith(pref): buckets[grp].append(tok); placed=True; break
        if not placed: buckets['extra'].append(tok)
    for k,v in buckets.items():
        if not v: buckets[k] = [f'{k}=_none']
    return buckets

# ====== Revised featurize_split: consumes the 6-tuple from preprocessor and stacks all feature blocks ======
def featurize_split(html_series: pd.Series, lda_vectorizer=None, lda_model=None, n_jobs=1) -> sp.csr_matrix:
    """Convert a series of HTML documents into one sparse feature matrix.

    Every document is run through ``preprocessor`` (serially when ``n_jobs``
    is ``None`` or 1, otherwise in loky worker processes). The resulting
    pieces are encoded block by block and stacked horizontally in this order:
    title, header bag-of-words, entities, the eleven hashed header feature
    groups (author ... extra), hashed body features, body bag-of-words.

    Returns:
        A CSR matrix with one row per document.
    """
    documents = html_series.astype(str).tolist()
    if n_jobs is None or n_jobs == 1:
        processed_data = [preprocessor(doc, lda_vectorizer, lda_model) for doc in documents]
    else:
        pool = Parallel(n_jobs=n_jobs, backend="loky", prefer="processes")
        processed_data = pool(delayed(preprocessor)(doc, lda_vectorizer, lda_model) for doc in documents)

    # Header feature groups paired with their hashers, in final column order.
    group_hashers = [
        ('author', author_h), ('channel', channel_h), ('publisher', publisher_h),
        ('time', time_h), ('media', media_h), ('titleshape', titleshape_h),
        ('social', social_h), ('content', content_h), ('structure', structure_h),
        ('length', length_h), ('extra', extra_h),
    ]
    grouped_tokens = {group: [] for group, _ in group_hashers}
    titles, header_bows, entity_texts = [], [], []
    body_feats_tokens, body_bows = [], []

    # Unpack the 6-tuple of every document into the per-block collectors.
    for title, ents, header_feats, header_bow, body_feats, body_bow in processed_data:
        if title:
            titles.append(' '.join(tokenizer_stem_keepmeta(title, set(ents))))
        else:
            titles.append('')
        header_bows.append(header_bow or '')
        entity_texts.append(' '.join(sorted(ents)) if ents else '')

        per_group = _split_feat_tokens(header_feats)
        for group, collected in grouped_tokens.items():
            collected.append(per_group[group])

        body_feats_tokens.append(body_feats.split() if body_feats else ['no_body'])
        body_bows.append(body_bow or '')

    # Encode each block and concatenate the columns.
    blocks = [
        title_vec.transform(titles),
        header_vec.transform(header_bows),
        entities_vec.transform(entity_texts),
    ]
    blocks.extend(hasher.transform(grouped_tokens[group]) for group, hasher in group_hashers)
    blocks.append(bodyfeats_h.transform(body_feats_tokens))
    blocks.append(body_vec.transform(body_bows))
    return sp.hstack(blocks, format='csr')

把驗證集轉成特徵，後面評估直接用

In [None]:
# Featurise the validation set once so that later evaluation can reuse it directly.
X_val = featurize_split(val_df['Page content'].astype(str), lda_vectorizer, lda_model, n_jobs=1)
y_val = val_df['Popularity'].values
# Persist the features and labels under OUT_DIR.
sp.save_npz(f'{OUT_DIR}/X_val_split.npz', X_val)
np.save(f'{OUT_DIR}/y_val.npy', y_val)
# Notebook cell output: matrix shape and label counts.
X_val.shape, np.bincount(y_val)


k 折

In [None]:
# =========================
# K 折交叉驗證 + 每折最佳模型集成輸出 test 預測（支援 group / stratified / kfold）
# —— 新增：epoch 內早停（batch 級評估 + 回退本 epoch 最佳狀態）
# =========================
import os, gc, copy, _pickle as pkl
import numpy as np
import pandas as pd
from math import ceil
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
from scipy.special import expit

# ---------- Basic configuration ----------
TRAIN_PATH = './dataset/train.csv'
TEST_PATH  = './dataset/test.csv'
OUT_DIR    = './output'
os.makedirs(OUT_DIR, exist_ok=True)

# Switch the split strategy here: 'group' (group by publisher to prevent leakage)
# | 'stratified' (shuffled, label-stratified) | 'kfold' (plain shuffled)
CV_MODE     = 'stratified'
N_SPLITS    = 5
EPOCHS      = 5
BATCH_SIZE  = 2500
SEED        = 42

# Early stopping (across epochs)
PATIENCE    = 2          # stop after PATIENCE consecutive epochs without improvement

# Early stopping (within an epoch)
BATCH_EVAL_EVERY = 1    # evaluate on the validation set every this many batches (1 = every batch)
BATCH_PATIENCE   = 20     # end the current epoch early after this many in-epoch evaluations without improvement

DO_FOLD_LDA = False      # True: pretrain LDA on each fold's training part only (more robust, but slower)

# ----------（僅 group 模式會用到）publisher 提取 ----------
import re
from bs4 import BeautifulSoup
def _norm(s): return re.sub(r'[\W]+', ' ', (s or '').lower()).strip()
def _slug(s): return re.sub(r'[^a-z0-9_]+', '', _norm(s).replace(' ', '_'))

def extract_publisher_slug(html: str) -> str:
    """Return a slug identifying the publisher linked from *html*.

    The first ``<a>`` whose href contains ``/publishers/<name>`` is used: its
    text if non-empty, otherwise the ``<name>`` path segment. Blank or
    non-string input, and documents without such a link, yield ``"unknown"``.
    """
    if not (isinstance(html, str) and html.strip()):
        return "unknown"

    document = BeautifulSoup(html, 'html.parser')
    link = document.find('a', href=re.compile(r'/publishers/[^/]+/?', re.I))
    if not link:
        return _slug("unknown")

    name = link.get_text(' ', strip=True)
    if not name:
        # Empty link text: fall back to the path segment after /publishers/.
        name = re.sub(r'.*/publishers/([^/]+)/?.*', r'\1', link['href'], flags=re.I)
    return _slug(name or 'unknown')

# ---------- Splitter factory ----------
def make_split_iter(cv_mode, n_splits, seed, y, texts=None, groups=None):
    """
    Return an iterator of (tr_idx, va_idx) index pairs:
      - 'group': GroupKFold (requires groups=publisher_keys)
      - 'stratified': StratifiedKFold (shuffle=True)
      - 'kfold': KFold (shuffle=True)

    Raises ValueError for an unknown cv_mode, or for 'group' without groups.
    `texts` is accepted but not used.
    """
    placeholder = np.zeros(len(y))  # the splitters only need something of the right length

    if cv_mode not in ('group', 'stratified', 'kfold'):
        raise ValueError(f"Unknown CV_MODE: {cv_mode}")

    if cv_mode == 'group':
        if groups is None:
            raise ValueError("CV_MODE='group' 需要提供 groups（publisher_keys）")
        folds = GroupKFold(n_splits=n_splits)
        print(f"Using GroupKFold by publisher (groups={pd.Series(groups).nunique()})")
        return folds.split(placeholder, y, groups)

    if cv_mode == 'stratified':
        folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        print("Using StratifiedKFold (shuffle=True)")
        return folds.split(placeholder, y)

    # Only 'kfold' remains.
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    print("Using KFold (shuffle=True) — no stratification")
    return folds.split(placeholder)

# ---------- 分類器工廠（文本穩定配置） ----------
def make_clf():
    """Build a fresh, unfitted ``SGDClassifier`` with the settings used for every fold.

    The model uses hinge loss with an elastic-net penalty and weight averaging,
    seeded with the module-level ``SEED``. Per the original note, switch
    ``loss`` to ``"log_loss"`` and use ``predict_proba`` if smoother
    probabilities are wanted.
    """
    settings = dict(
        loss="hinge",
        penalty="elasticnet",
        l1_ratio=0.15,
        alpha=5e-4,
        learning_rate="optimal",
        # NOTE(review): scikit-learn documents eta0 for the 'constant',
        # 'invscaling' and 'adaptive' schedules; confirm it has any effect
        # with learning_rate="optimal".
        eta0=1e-5,
        average=True,
        random_state=SEED,
    )
    return SGDClassifier(**settings)

# ---------- Load data ----------
df = pd.read_csv(TRAIN_PATH)
# Re-encode the label: 1 where the raw value equals 1, otherwise 0.
df['Popularity'] = df['Popularity'].astype(int).eq(1).astype(int)
texts = df['Page content'].astype(str)
y = df['Popularity'].values

# Publisher keys are only needed for 'group' mode, so the per-page HTML
# parsing is skipped for the other modes.
if CV_MODE != 'group':
    split_iter = make_split_iter(CV_MODE, N_SPLITS, SEED, y, texts, None)
else:
    publisher_keys = texts.apply(extract_publisher_slug).values
    split_iter = make_split_iter(CV_MODE, N_SPLITS, SEED, y, texts, publisher_keys)

# Containers
# Intended for each fold's best model and optional LDA; nothing in the
# visible code appends to it.
fold_artifacts = []
# Out-of-fold score for every training row, filled in fold by fold.
oof_scores = np.zeros(len(df), dtype=float)
fold_aucs = []
fold_epochs = []

print(
    f"\nStart {N_SPLITS}-fold CV: EPOCHS={EPOCHS}, BATCH_SIZE={BATCH_SIZE}, "
    f"DO_FOLD_LDA={DO_FOLD_LDA}, MODE={CV_MODE}, "
    f"BATCH_EVAL_EVERY={BATCH_EVAL_EVERY}, BATCH_PATIENCE={BATCH_PATIENCE}"
)

def _positive_class_scores(model, X):
    """Return one positive-class score per row of ``X``.

    Uses ``predict_proba`` when the estimator exposes it. Otherwise the
    ``decision_function`` margins are passed through ``expit`` (logistic
    sigmoid). Those squashed margins preserve the ranking, which is all AUC
    needs, but they are not calibrated probabilities.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    return expit(model.decision_function(X))


# Cross-validated training with two levels of early stopping:
#   * inside an epoch: stop after BATCH_PATIENCE evaluations without improvement;
#   * across epochs:   stop after PATIENCE epochs without improvement.
# NOTE: checkpoints are selected on the same validation fold that is then
# reported, so the per-fold and OOF AUCs are optimistic estimates.
for fold, (tr_idx, va_idx) in enumerate(split_iter, start=1):
    print(f"\n========== Fold {fold}/{N_SPLITS} ==========")
    tr_df = df.iloc[tr_idx].reset_index(drop=True)
    va_df = df.iloc[va_idx].reset_index(drop=True)
    y_val = va_df['Popularity'].values

    # Optional per-fold LDA, fitted on this fold's training rows only so the
    # validation rows do not leak into the topic model.
    if DO_FOLD_LDA:
        lda_vec_f, lda_mod_f = pretrain_lda(tr_df, column='Page content', n_components=10, max_features=1000)
    else:
        lda_vec_f, lda_mod_f = (None, None)

    # Validation features are computed once per fold and reused for every evaluation.
    X_val = featurize_split(va_df['Page content'].astype(str), lda_vec_f, lda_mod_f, n_jobs=1)

    # Model and cross-epoch early-stopping state.
    clf = make_clf()
    fold_best_auc, fold_best_epoch = -1, -1
    fold_best_state = None
    epoch_no_improve = 0

    # ========== Multi-epoch training (with in-epoch early stopping) ==========
    for epoch in range(1, EPOCHS+1):
        # Epoch-dependent seed: a different but reproducible order each epoch.
        tr_shuf = tr_df.sample(frac=1.0, random_state=SEED+epoch).reset_index(drop=True)
        n_batches = ceil(len(tr_shuf)/BATCH_SIZE)

        # In-epoch early-stopping state.
        epoch_best_auc  = -1
        epoch_best_state = None
        batch_no_improve = 0

        for b in range(n_batches):
            batch = tr_shuf.iloc[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
            X_tr = featurize_split(batch['Page content'].astype(str), lda_vec_f, lda_mod_f, n_jobs=1)
            y_tr = batch['Popularity'].values

            # The class list only has to be supplied on the very first call.
            if epoch == 1 and b == 0:
                clf.partial_fit(X_tr, y_tr, classes=np.array([0,1]))
            else:
                clf.partial_fit(X_tr, y_tr)

            # Release the batch features right after fitting so they are
            # also freed when the early-stop `break` below fires.
            del X_tr; gc.collect()

            # In-epoch evaluation every BATCH_EVAL_EVERY batches, and always
            # on the last batch.
            do_eval = ((b + 1) % BATCH_EVAL_EVERY == 0) or (b == n_batches - 1)
            if do_eval:
                val_prob_now = _positive_class_scores(clf, X_val)
                val_auc_now = roc_auc_score(y_val, val_prob_now)

                # Track the best checkpoint seen in this epoch.
                if val_auc_now > epoch_best_auc:
                    epoch_best_auc = val_auc_now
                    epoch_best_state = copy.deepcopy(clf)
                    batch_no_improve = 0
                else:
                    batch_no_improve += 1

                # In-epoch early stop.
                if batch_no_improve >= BATCH_PATIENCE:
                    print(f"  Fold {fold} | epoch {epoch} | early-stop in-epoch at batch {b+1}/{n_batches} "
                          f"(no improve {BATCH_PATIENCE}×); best AUC so far = {epoch_best_auc:.4f}")
                    break

        # End of epoch: roll back to this epoch's best checkpoint and use its
        # AUC as the epoch score. The next epoch continues from that point.
        if epoch_best_state is not None:
            clf = copy.deepcopy(epoch_best_state)
            val_auc = epoch_best_auc
        else:
            # No evaluation happened in this epoch (only possible with zero
            # batches); evaluate once so the epoch still gets a score.
            val_prob = _positive_class_scores(clf, X_val)
            val_auc = roc_auc_score(y_val, val_prob)

        print(f"Fold {fold} | epoch {epoch}/{EPOCHS} | Val AUC={val_auc:.4f}")

        # Cross-epoch early stopping, based on each epoch's best score.
        if val_auc > fold_best_auc:
            fold_best_auc = val_auc
            fold_best_epoch = epoch
            fold_best_state = copy.deepcopy(clf)
            epoch_no_improve = 0
        else:
            epoch_no_improve += 1
            if epoch_no_improve >= PATIENCE:
                print(f"  Early stopping (across epochs) at epoch {epoch} (no improve {PATIENCE}×)")
                break

    # With EPOCHS < 1 no checkpoint exists; fail with a clear message instead
    # of an AttributeError on None further down.
    if fold_best_state is None:
        raise RuntimeError(
            f"Fold {fold}: no model was trained (EPOCHS={EPOCHS}); nothing to score or save"
        )

    # Out-of-fold scores from the fold's best checkpoint.
    oof_scores[va_idx] = _positive_class_scores(fold_best_state, X_val)

    fold_aucs.append(fold_best_auc); fold_epochs.append(fold_best_epoch)

    # Persist the fold's best model; `with` guarantees the file is flushed and closed.
    model_path = os.path.join(OUT_DIR, f'model_4_{CV_MODE}_clf_sgd_fold{fold}.pkl')
    with open(model_path, 'wb') as fh:
        pkl.dump(fold_best_state, fh)

    # When per-fold LDA is used, store it next to the model so the test-time
    # features can be built consistently.
    lda_path = None
    if DO_FOLD_LDA:
        lda_path = os.path.join(OUT_DIR, f'lda_{CV_MODE}_fold{fold}.pkl')
        with open(lda_path, 'wb') as fh:
            pkl.dump({'lda_vec': lda_vec_f, 'lda_model': lda_mod_f}, fh)

    print(f"Fold {fold} BEST: epoch={fold_best_epoch}, AUC={fold_best_auc:.4f} | saved {model_path}")
    del X_val; gc.collect()

# CV summary: per-fold best AUCs, their mean/std, and the AUC of the pooled
# out-of-fold scores.
oof_auc = roc_auc_score(y, oof_scores)
print("\n========== CV Summary ==========")
print("Fold AUCs:", [f"{a:.4f}" for a in fold_aucs])
print(f"Mean AUC = {np.mean(fold_aucs):.4f} | Std = {np.std(fold_aucs):.4f}")
print(f"OOF  AUC = {oof_auc:.4f}")

# ---------- Predict the test set with every fold's best model and average ----------
df_test = pd.read_csv(TEST_PATH)
test_texts = df_test['Page content'].astype(str)
test_preds_each_fold = []

# Without per-fold LDA every fold would call the featurizer with identical
# arguments, so the test features are built once and shared by all folds.
# This assumes featurize_split returns the same matrix for the same inputs,
# which the training loop above already relies on.
X_test = None
if not DO_FOLD_LDA:
    X_test = featurize_split(test_texts, None, None, n_jobs=1)

for fold in range(1, len(fold_aucs)+1):
    # Load the fold's best model. These pickles are written by the training
    # cell; never load pickle files from an untrusted source.
    model_path = os.path.join(OUT_DIR, f'model_4_{CV_MODE}_clf_sgd_fold{fold}.pkl')
    with open(model_path, 'rb') as fh:
        clf = pkl.load(fh)

    # Per-fold LDA (optional): features depend on the fold, so rebuild them.
    if DO_FOLD_LDA:
        lda_path = os.path.join(OUT_DIR, f'lda_{CV_MODE}_fold{fold}.pkl')
        with open(lda_path, 'rb') as fh:
            lda_pack = pkl.load(fh)
        lda_vec_f, lda_mod_f = lda_pack['lda_vec'], lda_pack['lda_model']
        X_test = featurize_split(test_texts, lda_vec_f, lda_mod_f, n_jobs=1)
    else:
        lda_vec_f, lda_mod_f = (None, None)

    # Positive-class score: probabilities when available, otherwise the
    # sigmoid of the decision margin.
    if hasattr(clf, "predict_proba"):
        prob = clf.predict_proba(X_test)[:, 1]
    else:
        prob = expit(clf.decision_function(X_test))

    test_preds_each_fold.append(prob)
    print(f"Fold {fold} test predicted. Shape={prob.shape}")

# Ensemble by simple averaging across folds.
test_pred = np.mean(np.vstack(test_preds_each_fold), axis=0)

# Write the submission file.
sub_path = os.path.join(OUT_DIR, f'model_4_submission_k{N_SPLITS}_3.csv')
pd.DataFrame({'Id': df_test['Id'], 'Popularity': test_pred}).to_csv(sub_path, index=False)
print("Submission saved ->", sub_path)


可視化權重

In [None]:
# Extra cell: estimate how much each feature block contributes to a trained
# model, using mean(abs(coef)) over the block's columns as the indicator.
# Requires the training cell to have been run so the model files exist.

import numpy as np
import scipy.sparse as sp
import pickle as pkl
from sklearn.metrics import roc_auc_score  # optional; not used below

# A handful of training rows is enough to learn the feature-matrix width.
sample_html = train_df['Page content'].astype(str).head(10)

# NOTE: uses the global lda_vectorizer / lda_model from earlier cells; adjust
# if the model was trained with per-fold LDA (DO_FOLD_LDA=True).
X_sample = featurize_split(sample_html, lda_vectorizer, lda_model, n_jobs=1)

# Feature blocks, in the order they are assumed to be stacked by featurize_split.
feature_blocks = [
    'title', 'header', 'entities',
    'author', 'channel', 'publisher', 'time',
    'media', 'titleshape', 'social', 'content', 'structure', 'length',
    'extra',
    'body_feats', 'body_bow'
]

# PLACEHOLDER widths: every block is assumed to be 2**20 = 1048576 columns
# wide. Replace with the real n_features of each vectorizer/hasher (e.g.
# print X_title.shape[1] and friends); the check below rejects wrong values.
block_dims = [1048576] * len(feature_blocks)

# Scaffolding for measuring the real widths: pre-processed rows that can be
# fed to the individual vectorizers copied from featurize_split.
rows = sample_html.tolist()
processed_data = [preprocessor(h, lda_vectorizer, lda_model) for h in rows]

# Column offsets: block i occupies coef[offsets[i]:offsets[i+1]].
offsets = [0]
for dim in block_dims:
    offsets.append(offsets[-1] + dim)

# Load one model; the last fold is used as the example (any fold works).
fold = N_SPLITS
model_path = f'{OUT_DIR}/model_4_{CV_MODE}_clf_sgd_fold{fold}.pkl'
with open(model_path, 'rb') as fh:
    clf = pkl.load(fh)

# coef_ has shape (1, total_features) for a binary model.
coef = clf.coef_[0]

# The analysis is meaningless unless the assumed block widths tile the
# coefficient vector exactly; otherwise slices are misaligned or empty.
if offsets[-1] != coef.shape[0]:
    raise ValueError(
        f"block_dims sum to {offsets[-1]} but the model has {coef.shape[0]} coefficients "
        f"(featurize_split produced {X_sample.shape[1]} columns on the sample); "
        "set block_dims to the real width of each feature block"
    )

# Mean absolute weight per block; an empty block counts as 0 instead of NaN.
block_importances = []
for name, start, end in zip(feature_blocks, offsets, offsets[1:]):
    block_coef = coef[start:end]
    mean_abs = float(np.mean(np.abs(block_coef))) if block_coef.size else 0.0
    block_importances.append((name, mean_abs))
block_importances.sort(key=lambda x: x[1], reverse=True)

# Each block's share of the summed mean-abs weights (0 when all weights are 0).
total_mean_abs = sum(r for _, r in block_importances)
block_ratios = [
    (name, mean_abs / total_mean_abs if total_mean_abs > 0 else 0.0)
    for name, mean_abs in block_importances
]

for (name, mean_abs), (_, ratio) in zip(block_importances, block_ratios):
    print(f"{name}: {mean_abs:.6f} ({ratio:.4f})")

# Shares only, already sorted in descending order.
print("特徵塊權重貢獻佔比（mean(abs(coef)) / total）：")
for name, ratio in block_ratios:
    print(f"{name}: {ratio:.4f}")

# Visualisation (optional): plot the shares so the axis label matches the data.
import matplotlib.pyplot as plt
names, ratios = zip(*block_ratios)
plt.barh(names, ratios)
plt.xlabel('貢獻佔比')
plt.title('特徵塊重要性')
plt.show()

# Based on this you can drop low-contribution blocks, e.g. comment out the
# corresponding X_ matrix in the hstack when its share is < 0.01.