Cell 1 — Imports, Paths & Data Loading

In [None]:
# === Cell 1: Imports & Load Data ===
import warnings, re, numpy as np, pandas as pd
warnings.filterwarnings("ignore")

from bs4 import BeautifulSoup
import nltk

# Locations of the train / test CSV files (relative to the notebook)
TRAIN_PATH = './dataset/train.csv'
TEST_PATH  = './dataset/test.csv'

# Read both files in one pass; tuple order decides which frame is which
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Sanity check: shapes of both frames, then a peek at the first two training rows
print(df_train.shape, df_test.shape)
df_train.head(2)


In [None]:
# === Cell 2: Robust HTML -> Structured features (Title/Author/Channel/Topic/DateTime/Counts) ===
from datetime import datetime

_day_map  = {'mon':1,'tue':2,'wed':3,'thu':4,'fri':5,'sat':6,'sun':7}
_month_map= {'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,
             'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}

def _text(x):
    try:
        return re.sub(r"\s+", " ", x.get_text(" ", strip=True)).strip()
    except Exception:
        return ""

def _find_first(soup, selectors):
    for sel in selectors:
        try:
            node = soup.select_one(sel)
            if node: return node
        except Exception:
            pass
    return None

# Three-letter abbreviations in calendar order (index + 1 is the ordinal).
_WEEKDAY_ABBR = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
_MONTH_ABBR = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec')


def _node_value(node):
    """Return the string a node carries: ``content`` for <meta>, visible text otherwise."""
    if node is None:
        return ""
    try:
        # <meta> tags have no text; their payload lives in the content attribute
        if getattr(node, "name", None) == "meta":
            return re.sub(r"\s+", " ", str(node.get("content") or "")).strip()
    except Exception:
        return ""
    return _text(node)


def _first_value(soup, selectors):
    """Return the first non-empty ``_node_value`` over ``selectors`` (tried in order), else ""."""
    for sel in selectors:
        value = _node_value(_find_first(soup, [sel]))
        if value:
            return value
    return ""


def _split_datetime(dt_txt):
    """Split a date-like string into (day, date, month, year, hour, minute, second).

    Every part is a string and is "" when it cannot be found. ``day`` and
    ``month`` are lower-case three-letter abbreviations; ``date`` is the day
    of the month without leading zero.
    """
    from datetime import date as _calendar_date  # local import keeps this block self-contained

    day = date = month = year = hour = minute = second = ""
    dt_txt = dt_txt or ""
    month_alt = "|".join(_MONTH_ABBR)

    # Names must not start in the middle of another word (e.g. "mon" in "salmon")
    m = re.search(r"(?<![a-z])(" + "|".join(_WEEKDAY_ABBR) + ")", dt_txt, flags=re.I)
    if m:
        day = m.group(1).lower()
    m = re.search(r"(?<![a-z])(" + month_alt + ")", dt_txt, flags=re.I)
    if m:
        month = m.group(1).lower()
    m = re.search(r"\b(20\d{2}|19\d{2})\b", dt_txt)
    if m:
        year = m.group(1)

    # Day of month next to a month name: "19 Jun" / "19th June" or "Jun 19" / "June 19,"
    m = (re.search(r"(?<![\d:])(\d{1,2})(?:st|nd|rd|th)?[\s\-]+(?:" + month_alt + ")",
                   dt_txt, flags=re.I)
         or re.search(r"(?<![a-z])(?:" + month_alt + r")[a-z]*\.?,?\s+(\d{1,2})(?![\d:])",
                      dt_txt, flags=re.I))
    if m and 1 <= int(m.group(1)) <= 31:
        date = str(int(m.group(1)))

    # Numeric ISO-style date (YYYY-MM-DD) fills whatever the name-based pass missed
    iso = re.search(r"(?<!\d)(\d{4})-(\d{1,2})-(\d{1,2})(?!\d)", dt_txt)
    if iso:
        iso_month, iso_day = int(iso.group(2)), int(iso.group(3))
        if not month and 1 <= iso_month <= 12:
            month = _MONTH_ABBR[iso_month - 1]
        if not date and 1 <= iso_day <= 31:
            date = str(iso_day)

    # Look-arounds instead of \b so that ISO stamps such as "T15:04:30Z" still match
    m = re.search(r"(?<![\d:])(\d{1,2}):(\d{2})(?::(\d{2}))?(?!\d)", dt_txt)
    if m:
        hour, minute, second = m.group(1), m.group(2), (m.group(3) or "0")

    # No weekday name in the text: derive it from the calendar date when that is complete
    if not day and year and month and date:
        try:
            weekday_idx = _calendar_date(
                int(year), _MONTH_ABBR.index(month) + 1, int(date)).weekday()  # Monday == 0
            day = _WEEKDAY_ABBR[weekday_idx]
        except ValueError:
            pass  # impossible date such as Feb 31: leave the weekday unknown

    return day, date, month, year, hour, minute, second


def preprocessor(html: str):
    """Turn one page's raw HTML into a flat tuple of structured features.

    Parameters
    ----------
    html : str
        Raw HTML of the page. Anything that is not ``str``/``bytes``
        (e.g. NaN from a missing CSV cell) is treated as an empty document.

    Returns
    -------
    tuple
        ``(title, author, channel, topic, day, date, month, year, hour,
        minute, second, content_len, num_see_also, num_image, num_a)``.
        Missing title/author/channel/topic are ``"unknown"``, missing
        day/month are ``"unk"``, other missing date-time parts are ``""``.
        The last four entries are integers.
    """
    # Parse HTML safely; NaN is truthy, so `html or ""` would hand a float to the parser
    soup = BeautifulSoup(html if isinstance(html, (str, bytes)) else "", 'html.parser')

    # Title: first matching heading (or <title>) in priority order
    title_node = _find_first(soup, ["body h1", "header h1", "h1", "title"])
    title = _text(title_node).lower() if title_node else ""

    # Author (common patterns); <meta name="author"> is read through its content attribute
    author = _first_value(soup, [
        "head .article-info .author_name", "head .author_name", "span.author a",
        "span.author", "div.byline", "p.byline", "meta[name='author']"
    ]).lower()
    if author.startswith("by "):
        author = author[3:].strip()

    # Channel: an explicit data-channel attribute wins over everything else
    channel = ""
    node = _find_first(soup, ["[data-channel]"])
    if node is not None:
        channel = str(node.get('data-channel') or "").strip().lower()
    if not channel:
        # NOTE(review): the '.article' text fallback is kept from the original
        # heuristic; it can be very long when it matches a whole article body.
        channel = _first_value(soup, ["meta[property='article:section']", ".article"]).lower()

    # Topic (heuristics)
    topic = _first_value(
        soup, [".article-topics", "footer .article-topics", "a[rel='tag']", "a.tag"]).lower()

    # Datetime: collect candidate strings, the first non-empty one is parsed
    dt_candidates = []
    # meta datetime
    for attrs in [{'property': 'article:published_time'}, {'name': 'pubdate'},
                  {'itemprop': 'datePublished'}, {'name': 'date'}]:
        try:
            meta = soup.find('meta', attrs=attrs)
            if meta and meta.get('content'):
                dt_candidates.append(str(meta['content']))
        except Exception:
            pass
    # visible datetime nodes: prefer the machine-readable datetime attribute over the text
    for sel in ["time", "span.time", "div.time", ".date", ".published", ".pubdate"]:
        n = _find_first(soup, [sel])
        if n:
            stamp = n.get('datetime') if hasattr(n, 'get') else None
            if stamp:
                dt_candidates.append(str(stamp))
            dt_candidates.append(_text(n))
    dt_txt = next((t for t in dt_candidates if t), "")
    day, date, month, year, hour, minute, second = _split_datetime(dt_txt)

    # Content length and counts, measured after stripping non-content tags
    for t in soup(["script", "style", "noscript"]):
        t.decompose()
    content = _text(soup.body or soup)
    content_len = len(content)

    num_see_also = len(soup.find_all(string=re.compile(r"see also|related", re.I)))
    num_image    = len(soup.find_all("img"))
    num_a        = len(soup.find_all("a"))

    return (title or "unknown", author or "unknown", channel or "unknown", topic or "unknown",
            day or "unk", date or "", month or "unk", year or "", hour or "", minute or "", second or "",
            content_len, num_see_also, num_image, num_a)

# Extract features for every page: training rows first, test rows appended after them
feature_list = (
    list(map(preprocessor, df_train['Page content']))
    + list(map(preprocessor, df_test['Page content']))
)

# One column per element of the tuple returned by preprocessor()
combined_columns = [
    'Title', 'Author', 'Channel', 'Topic', 'Day', 'Date', 'Month', 'Year',
    'Hour', 'Minute', 'Second', 'Content_Len', 'Num_See_Also', 'Num_Image', 'Num_A'
]
df_combine = pd.DataFrame(feature_list, columns=combined_columns)
df_combine.head()


In [None]:
# === Cell 3: Map day/month, drop low-value cols, create df_copy ===
# Ordinal encodings for the weekday / month abbreviations
day_map = dict(mon=1, tue=2, wed=3, thu=4, fri=5, sat=6, sun=7)
month_map = dict(jan=1, feb=2, mar=3, apr=4, may=5, jun=6,
                 jul=7, aug=8, sep=9, oct=10, nov=11, dec=12)

# Values absent from the maps become NaN under .map() and are then encoded as 0
df_copy = df_combine.copy().assign(
    Day=lambda frame: frame['Day'].map(day_map).fillna(0).astype(int),
    Month=lambda frame: frame['Month'].map(month_map).fillna(0).astype(int),
)

# High-cardinality or weak signals (keep Author/Topic text; drop others similar to model_8)
drop_cols = ['Title', 'Channel', 'Minute', 'Second', 'Num_See_Also', 'Num_Image', 'Num_A', 'Date']
df_copy = df_copy.drop(columns=drop_cols, errors='ignore')  # names not present are skipped

# Fill empties
for c in ['Author', 'Topic', 'Year', 'Hour']:
    if c not in df_copy.columns:
        continue
    df_copy[c] = df_copy[c].fillna('unknown')

# Everything except the two text columns is coerced to numbers; unparseable values become 0
for c in df_copy.columns:
    if c in ['Author', 'Topic']:
        continue
    df_copy[c] = pd.to_numeric(df_copy[c], errors='coerce').fillna(0)

df_copy.head()


In [None]:
# === Cell 4: Tokenizers (WordNet Lemmatizer) ===
import re, numpy as np
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources requested by this cell, in the original order
for _nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

def tokenizer(text):
    """Split ``text`` on whitespace runs.

    A numpy array is reduced to its first element before splitting. The input
    is passed through ``str()`` and stripped first, so an empty string yields
    ``['']``.
    """
    raw = text[0] if isinstance(text, np.ndarray) else text
    stripped = str(raw).strip()
    return re.compile(r'\s+').split(stripped)

def tokenizer_wnl(text):
    """Normalise ``text`` and return its whitespace-separated tokens, each lemmatized.

    A numpy array is reduced to its first element. Normalisation, in order:
    keep only the part before an apostrophe inside a word, delete full stops,
    then turn every run of non-word characters into a single space.
    """
    raw = text[0] if isinstance(text, np.ndarray) else text
    cleaned = str(raw)
    rewrite_rules = (
        (r"([\w]+)'[\w]+", lambda m: m.group(1)),  # drop the suffix after an apostrophe
        (r"\.", ""),                                # delete full stops
        (r"[^\w]+", " "),                           # non-word runs -> one space
    )
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, re.split(r'\s+', cleaned.strip())))


In [None]:
# === Cell 5: ColumnTransformers (CountVectorizer on Author/Topic) ===
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Column layout of the model matrix: the two text columns first, every other column after them
text_cols = ['Author', 'Topic']
num_cols  = list(filter(lambda name: name not in text_cols, df_copy.columns))

def build_mats():
    """Assemble the raw train/test matrices and the binary training target.

    Reads the module-level ``df_copy``, ``df_train``, ``text_cols`` and
    ``num_cols``. The first ``len(df_train)`` rows of ``df_copy`` are taken as
    training rows and the remainder as test rows.

    Returns
    -------
    (X_train_raw, y_train_raw, X_test)
        ``y_train_raw`` is 1 where ``Popularity == 1`` and 0 otherwise.
    """
    ordered = pd.concat([df_copy[text_cols], df_copy[num_cols]], axis=1)
    features = ordered.values
    n_labelled = df_train.shape[0]
    labels = (df_train['Popularity'].values == 1).astype(int)
    return features[:n_labelled], labels, features[n_labelled:]

X_train_raw, y_train_raw, X_test = build_mats()

# Inspect the data layout
print("X_train_raw shape:", X_train_raw.shape)
print("Text columns:", text_cols)
print("Numeric columns:", num_cols)
print("Sample data:")
print("Author (col 0):", X_train_raw[0, 0])
print("Topic (col 1):", X_train_raw[0, 1])
print("First numeric col:", X_train_raw[0, 2])

# Column positions inside the raw matrix, derived from text_cols / num_cols so the
# transformers below cannot silently drift if the column order is ever changed.
AUTHOR_IDX = text_cols.index('Author')
TOPIC_IDX  = text_cols.index('Topic')
num_col_indices = list(range(len(text_cols), len(text_cols) + len(num_cols)))  # numeric columns follow the text columns

# The matrix must have exactly one column per declared text/numeric column
assert X_train_raw.shape[1] == len(text_cols) + len(num_cols), "unexpected column count"
assert X_test.shape[1] == X_train_raw.shape[1], "train/test column mismatch"

# ColumnTransformer for models that prefer only Topic text + numeric passthrough.
# lowercase=False because each sample reaches the vectorizer as a 1-element array
# (the tokenizers unwrap it), not as a plain string.
trans_other = ColumnTransformer(
    transformers=[
        ('Topic', CountVectorizer(tokenizer=tokenizer_wnl, lowercase=False), [TOPIC_IDX]),
        ('numeric', 'passthrough', num_col_indices)  # explicitly listed numeric columns
    ],
    n_jobs=-1
)

# ColumnTransformer for RandomForest using both Author & Topic
trans_forest = ColumnTransformer(
    transformers=[
        ('Author', CountVectorizer(tokenizer=tokenizer, lowercase=False), [AUTHOR_IDX]),
        ('Topic',  CountVectorizer(tokenizer=tokenizer_wnl, lowercase=False), [TOPIC_IDX]),
        ('numeric', 'passthrough', num_col_indices)  # explicitly listed numeric columns
    ],
    n_jobs=-1
)

In [None]:
# === Cell 6: Train/Valid split + training() helper (AUC) ===
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score
import numpy as np

# Stratified hold-out split of the labelled rows: 20% validation, fixed seed
holdout_opts = dict(test_size=0.2, random_state=0, stratify=y_train_raw)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_raw, y_train_raw, **holdout_opts)

def training(clf, cv=5):
    """Cross-validate ``clf``, then fit it on the training split and score the hold-out.

    Prints mean and standard deviation of the train / validation ROC AUC from
    ``cv``-fold cross-validation on ``X_train_raw``/``y_train_raw``, fits
    ``clf`` in place on ``X_train``/``y_train`` and prints its ROC AUC on
    ``X_valid``/``y_valid``.

    Returns the same ``clf`` object, fitted on ``X_train`` only.
    """
    scores = cross_validate(
        clf, X_train_raw, y_train_raw,
        cv=cv, scoring='roc_auc', return_train_score=True, n_jobs=-1,
    )
    fold_train_auc = scores['train_score']
    fold_valid_auc = scores['test_score']
    print('CV train AUC:  {:.5f} (+/- {:.5f})'.format(np.mean(fold_train_auc), np.std(fold_train_auc)))
    print('CV valid AUC:  {:.5f} (+/- {:.5f})'.format(np.mean(fold_valid_auc), np.std(fold_valid_auc)))

    clf.fit(X_train, y_train)
    positive_proba = clf.predict_proba(X_valid)[:, 1]  # probability of the second class
    valid_auc = roc_auc_score(y_valid, positive_proba)
    print('Holdout valid AUC:', round(valid_auc, 5))
    return clf


In [None]:
# === Cell 7: LightGBM Pipeline ===
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# Topic bag-of-words + numeric passthrough feeding LightGBM
lgbm_steps = [
    ('ct', trans_other),
    ('clf', LGBMClassifier(n_estimators=200, learning_rate=0.009, random_state=0)),
]
lgbm = training(Pipeline(steps=lgbm_steps))


In [None]:
# === Cell 8: RandomForest Pipeline ===
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Author + Topic bag-of-words + numeric passthrough feeding a random forest
forest_steps = [
    ('ct', trans_forest),
    ('clf', RandomForestClassifier(n_estimators=300, random_state=0, n_jobs=-1)),
]
forest = training(Pipeline(steps=forest_steps))


In [None]:
# === Cell 9: XGBoost Pipeline ===
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Topic bag-of-words + numeric passthrough feeding XGBoost
xgb_params = dict(
    n_estimators=300, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='auc', n_jobs=-1, verbosity=0, random_state=0,
)
xgboost = training(Pipeline(steps=[('ct', trans_other), ('clf', XGBClassifier(**xgb_params))]))


In [None]:
# === Cell 10: CatBoost Pipeline ===
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier

# Topic bag-of-words + numeric passthrough feeding CatBoost
catboost_params = dict(
    verbose=False, eval_metric='AUC',
    n_estimators=300, learning_rate=0.5, random_seed=0,
)
catboost = training(Pipeline(steps=[('ct', trans_other), ('clf', CatBoostClassifier(**catboost_params))]))


In [None]:
# === Cell 11: Soft Voting Ensemble ===
from sklearn.ensemble import VotingClassifier

# Soft-voting blend of the four pipelines; weights are listed in the same order as the members
ensemble_members = [('lgbm', lgbm), ('forest', forest), ('catboost', catboost), ('xgb', xgboost)]
ensemble_weights = [1.0, 0.2, 0.2, 0.6]
voting = training(
    VotingClassifier(estimators=ensemble_members, voting='soft',
                     weights=ensemble_weights, n_jobs=-1)
)


In [None]:
from sklearn.base import clone  # cell-local import; scikit-learn is already a notebook dependency

# training() leaves `voting` fitted on the 80% training split only. For the
# submission, refit an unfitted copy on every labelled row so the held-out 20%
# also contributes; `voting` itself is left untouched.
best_model = clone(voting).fit(X_train_raw, y_train_raw)

# Probability of the positive class (Popularity == 1) for each test row
y_score = best_model.predict_proba(X_test)[:, 1]
assert len(y_score) == len(df_test), "one prediction per test row expected"

df_pred = pd.DataFrame({'Id': df_test['Id'], 'Popularity': y_score})
df_pred.to_csv('submission_93.csv', index=False)