In [84]:
# Cell 1 – environment setup
import os, sys, subprocess, random, logging, requests
import numpy as np, pandas as pd, torch, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, classification_report)

def _require(mod_name: str, pip_name: str | None = None, attr: str | None = None):
    """
    Import `mod_name`; if missing, pip-install `pip_name` (or the same name) then import.
    Optionally return a symbol inside the module (`attr`).
    """
    import importlib
    try:
        mod = importlib.import_module(mod_name)
    except ModuleNotFoundError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-qU",
                               pip_name or mod_name])
        mod = importlib.import_module(mod_name)
    return getattr(mod, attr) if attr else mod

BM25Okapi = _require("rank_bm25", attr="BM25Okapi")   
_require("bert_score", "bert-score")                  

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s | %(levelname)-7s | %(message)s",
                    datefmt="%H:%M:%S")
logger = logging.getLogger(__name__)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

FAST_DEV = bool(int(os.getenv("FAST_DEV", "1")))
FAST_SAMPLE_TRAIN, FAST_SAMPLE_TEST, FAST_EPOCHS = 2_000, 500, 2
EPOCHS   = FAST_EPOCHS if FAST_DEV else 3

DATA_PATH   = "/kaggle/input/dataset/"
MODEL_NAME  = "bert-base-uncased"
BATCH_SIZE  = 16
MAX_LENGTH  = 512
LEARNING_RATE = 2e-5
TEST_SIZE  = 0.30
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Device: {DEVICE} | FAST_DEV: {FAST_DEV}")

from kaggle_secrets import UserSecretsClient
FACT_CHECK_API_KEY = UserSecretsClient().get_secret("FACT_CHECK_API_KEY")

def query_fact_check_api(claim: str, page_size: int = 5) -> dict | None:
    url = "https://factchecktools.googleapis.com/v1/claims:search"
    params = {"query": claim, "languageCode": "en",
              "pageSize": page_size, "key": FACT_CHECK_API_KEY}
    try:
        r = requests.get(url, params=params, timeout=10)
        r.raise_for_status()
        return r.json()
    except requests.RequestException as e:
        logger.warning(f"Fact-Check API error: {e}")
        return None


In [85]:
import os, json, pandas as pd
from sklearn.model_selection import train_test_split

LABEL2ID      = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}
FEVER_LABELS  = list(LABEL2ID)
FEVER_CLASSES = set(FEVER_LABELS)

def _map_to_fever(lbl: str):
    mapping = {
        # LIAR & FakeNewsNet
        "real": "SUPPORTS", "fake": "REFUTES",
        "false": "REFUTES", "pants_on_fire": "REFUTES", "barely_true": "REFUTES",
        "half_true": "SUPPORTS", "mostly_true": "SUPPORTS", "true": "SUPPORTS",
        # already-FEVER
        "SUPPORTS": "SUPPORTS", "REFUTES": "REFUTES", "NOT ENOUGH INFO": "NOT ENOUGH INFO",
    }
    return mapping.get(str(lbl).strip())

def _read_jsonl(fp):
    with open(fp, encoding="utf-8") as f:
        return [json.loads(line) for line in f]

def load_and_preprocess_data(path: str,
                             logger=None,
                             fast_dev: bool = False,
                             fast_train: int = 2000,
                             fast_test:  int = 500):
    # ---------- LIAR ----------
    liar_cols = ["id", "label", "statement", "subject", "speaker", "job", "state",
                 "party", "barely_true", "false", "half_true", "mostly_true",
                 "pants_on_fire", "context"]

    def _load_liar(split):
        df = pd.read_csv(os.path.join(path, f"{split}.tsv"), sep="\t", header=None)
        df.columns = liar_cols
        df["label"] = df["label"].apply(_map_to_fever)
        df["source"] = "liar"
        return df[["statement", "label", "source"]]

    liar_df = pd.concat([_load_liar(s) for s in ("train", "valid", "test")])

    # ---------- FEVER ----------
    def _load_fever(split):
        df = pd.DataFrame(_read_jsonl(os.path.join(path, f"{split}.jsonl")))
        df = df[["claim", "label"]].rename(columns={"claim": "statement"})
        df["label"] = df["label"].apply(_map_to_fever)
        df["source"] = "fever"
        return df

    fever_df = pd.concat([_load_fever(s) for s in ("train", "paper_dev", "paper_test")])

    # ---------- FakeNewsNet ----------
    def _load_fnn(csv_file, real_fake, src):
        df = pd.read_csv(os.path.join(path, csv_file))
        text_col = "title" if "title" in df.columns else "content" if "content" in df.columns else None
        if text_col is None:
            raise ValueError(f"No text column in {csv_file}")
        df = df.rename(columns={text_col: "statement"})
        df["label"] = _map_to_fever(real_fake)
        df["source"] = src
        return df[["statement", "label", "source"]]

    fnn_df = pd.concat([
        _load_fnn("politifact_real.csv",  "real", "politifact"),
        _load_fnn("politifact_fake.csv",  "fake", "politifact"),
        _load_fnn("gossipcop_real.csv",   "real", "gossipcop"),
        _load_fnn("gossipcop_fake.csv",   "fake", "gossipcop"),
    ])

    # ---------- combine & clean ----------
    data = pd.concat([liar_df, fever_df, fnn_df], ignore_index=True)
    data.dropna(subset=["statement", "label"], inplace=True)
    data["statement"] = data["statement"].astype(str).str.strip()
    data = data[data["statement"].str.len() > 10]
    data = data.drop_duplicates(subset=["statement", "label"])
    data = data[data["label"].isin(FEVER_CLASSES)].reset_index(drop=True)

    if logger:
        logger.info("Label distribution:\n%s", data["label"].value_counts())

    train_df, test_df = train_test_split(
        data, test_size=0.30, stratify=data["label"], random_state=42
    )

    if fast_dev:
        def _subsample(df, n):
            return df.groupby("label", group_keys=False)\
                     .apply(lambda x: x.sample(min(n, len(x)), random_state=42))
        train_df = _subsample(train_df, fast_train // 3).reset_index(drop=True)
        test_df  = _subsample(test_df,  fast_test  // 3).reset_index(drop=True)
        if logger:
            logger.info(f"FAST_DEV: train {len(train_df)}, test {len(test_df)}")

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)


In [86]:
# Cell 3 – dataset, dataloaders, CNN (BERT evidence-ready)
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd          # needed inside Dataset

LABEL2ID  = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}
ID2LABEL  = {v: k for k, v in LABEL2ID.items()}

# ── Dataset ──────────────────────────────────────────────────────────
class FeverDataset(Dataset):
    """
    If use_evidence=True the tokenizer is fed (claim, evidence) pairs.
    Evidence is taken from:
      • pre-computed df["evidence"] OR
      • evidence_fn(claim)  – if supplied
    """
    def __init__(self, df, tokenizer, max_len=512,
                 use_evidence=False, evidence_fn=None):
        self.df   = df.reset_index(drop=True)
        self.tok  = tokenizer
        self.max  = max_len
        self.ev   = use_evidence
        self.evfn = evidence_fn

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        row   = self.df.iloc[idx]
        claim = str(row["statement"])
        label = LABEL2ID.get(row["label"], 2)

        if self.ev:                                   # claim + evidence mode
            evidence = (row["evidence"]
                        if "evidence" in row and pd.notna(row["evidence"])
                        else (self.evfn(claim) if self.evfn else ""))
            enc = self.tok(
                claim, evidence,
                max_length=self.max, truncation=True,
                padding="max_length", return_tensors="pt"
            )
        else:                                         # claim-only mode
            enc = self.tok(
                claim,
                max_length=self.max, truncation=True,
                padding="max_length", return_tensors="pt"
            )

        return {
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label":          torch.tensor(label)
        }

# ── Dataloaders ─────────────────────────────────────────────────────
def create_dataloaders(train_df, test_df, tok,
                       bs=16, max_len=512,
                       use_evidence=False, evidence_fn=None,
                       workers=2, pin=True):
    defs = dict(tokenizer=tok, max_len=max_len,
                use_evidence=use_evidence, evidence_fn=evidence_fn)
    mk   = lambda df: FeverDataset(df, **defs)
    mkdl = lambda ds, shuf: DataLoader(ds, bs, shuffle=shuf,
                                       num_workers=workers, pin_memory=pin)
    return mkdl(mk(train_df), True), mkdl(mk(test_df), False)

# ── CNN (unchanged except default dropout 0.3) ─────────────────────
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, classes=3,
                 dropout=0.3, pretrained_weights=None, freeze_embed=False):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if pretrained_weights is not None:
            self.embed.weight.data[: pretrained_weights.size(0)] = pretrained_weights
            if freeze_embed:
                self.embed.weight.requires_grad = False

        ks = (3, 4, 5)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, 100, (k, embed_dim), padding=(k-1, 0)),
                nn.ReLU(),
                nn.BatchNorm2d(100, momentum=0.05)  # quicker BN updates
            ) for k in ks
        ])
        self.drop = nn.Dropout(dropout)
        self.fc   = nn.Linear(100 * len(ks), classes)

    def forward(self, ids):
        x = self.embed(ids).unsqueeze(1)                         # [B,1,L,D]
        pooled = [F.max_pool1d(c(x).squeeze(3), c(x).size(2)).squeeze(2)
                  for c in self.convs]                           # [B,100] × 3
        return self.fc(self.drop(torch.cat(pooled, 1)))


In [87]:
import numpy as np, torch
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, classification_report)

LABEL2ID     = {"SUPPORTS":0, "REFUTES":1, "NOT ENOUGH INFO":2}
FEVER_LABELS = ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]
DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── BERT loader ─────────────────────────────────────────────
def get_bert_model(model_name="bert-base-uncased", num_labels=3):
    from transformers import BertTokenizer, BertForSequenceClassification
    tok = BertTokenizer.from_pretrained(model_name)
    mdl = BertForSequenceClassification.from_pretrained(model_name,
                                                        num_labels=num_labels).to(DEVICE)
    return tok, mdl

# ── shared helpers ─────────────────────────────────────────
def _eval_epoch(model, dl):
    model.eval(); y_true, y_pred = [], []
    with torch.no_grad():
        for b in dl:
            ids  = b["input_ids"].to(DEVICE)
            mask = b["attention_mask"].to(DEVICE)
            labs = b["label"].to(DEVICE)
            logits = model(ids, attention_mask=mask).logits if hasattr(model,"config") else model(ids)
            y_true.extend(labs.cpu().numpy())
            y_pred.extend(torch.argmax(logits, 1).cpu().numpy())
    return np.array(y_true), np.array(y_pred)

def _report(split, y_true, y_pred):
    logger.info(f"{split} acc: {accuracy_score(y_true,y_pred):.3f}")
    logger.info("\n"+classification_report(
        y_true, y_pred, labels=[0,1,2], target_names=FEVER_LABELS,
        digits=3, zero_division=0))

# ── BERT training ──────────────────────────────────────────
def train_bert_model(model, train_dl, val_dl, optim,
                     epochs=3, use_amp=False, class_w=None):
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_w)
    scaler  = torch.cuda.amp.GradScaler() if use_amp else None

    for ep in range(1, epochs+1):
        model.train(); total = 0
        for b in train_dl:
            optim.zero_grad()
            ids = b["input_ids"].to(DEVICE); mask = b["attention_mask"].to(DEVICE)
            labs= b["label"].to(DEVICE)
            if use_amp:
                with torch.cuda.amp.autocast():
                    loss = loss_fn(model(ids,attention_mask=mask).logits, labs)
                scaler.scale(loss).backward(); scaler.step(optim); scaler.update()
            else:
                loss = loss_fn(model(ids,attention_mask=mask).logits, labs)
                loss.backward(); optim.step()
            total += loss.item()
        logger.info(f"[BERT] epoch {ep} loss {total/len(train_dl):.4f}")
        _report("val", *_eval_epoch(model, val_dl))

def evaluate_bert_model(model, test_dl):
    y_t, y_p = _eval_epoch(model, test_dl)
    _report("test", y_t, y_p)
    return {"Accuracy": accuracy_score(y_t,y_p),
            "Precision": precision_score(y_t,y_p,average="weighted",zero_division=0),
            "Recall":    recall_score  (y_t,y_p,average="weighted",zero_division=0),
            "F1 Score":  f1_score      (y_t,y_p,average="weighted",zero_division=0)}

# ── CNN training / eval ───────────────────────────────────
def train_cnn_model(model, train_dl, val_dl, optim,
                    epochs=3, class_w=None):
    model.to(DEVICE)
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_w)
    for ep in range(1, epochs+1):
        model.train(); total = 0
        for b in train_dl:
            optim.zero_grad()
            ids, labs = b["input_ids"].to(DEVICE), b["label"].to(DEVICE)
            loss = loss_fn(model(ids), labs)
            loss.backward(); optim.step()
            total += loss.item()
        logger.info(f"[CNN] epoch {ep} loss {total/len(train_dl):.4f}")
        _report("val", *_eval_epoch(model, val_dl))

def evaluate_cnn_model(model, test_dl):
    y_t, y_p = _eval_epoch(model, test_dl)
    _report("test", y_t, y_p)
    return {"Accuracy": accuracy_score(y_t,y_p),
            "Precision": precision_score(y_t,y_p,average="weighted",zero_division=0),
            "Recall":    recall_score  (y_t,y_p,average="weighted",zero_division=0),
            "F1 Score":  f1_score      (y_t,y_p,average="weighted",zero_division=0)}


In [88]:
import os, re, json, time, random, requests, nltk, numpy as np, pandas as pd
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from rank_bm25 import BM25Okapi
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

LABEL2ID = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}
FEVER_LABELS = ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]

# ── text prep
_stop = set(stopwords.words("english"))
_stem = PorterStemmer()
def preprocess(text: str, remove_stop=True, stem=True):
    toks = nltk.word_tokenize(re.sub(r"[^\w\s]", " ", text.lower()))
    if remove_stop:
        toks = [t for t in toks if t not in _stop]
    if stem:
        toks = [_stem.stem(t) for t in toks]
    return toks

# ── scraping helpers
BASE = {"politifact":"https://www.politifact.com", "snopes":"https://www.snopes.com"}
PAGES = {"politifact":"https://www.politifact.com/factchecks/list/?page={}",
         "snopes":"https://www.snopes.com/fact-check/page/{}/"}
UA = {"User-Agent":"Mozilla/5.0 (FactCheckBot)"}

def _allowed(site, url):
    rp = RobotFileParser(); rp.set_url(BASE[site] + "/robots.txt")
    try: rp.read(); return rp.can_fetch("FactCheckBot", url)
    except: return False

def _html(site, page, retries=3):
    url = PAGES[site].format(page)
    if not _allowed(site, url): return None
    for i in range(retries):
        try:
            r = requests.get(url, timeout=10, headers=UA); r.raise_for_status(); return r.text
        except requests.RequestException: time.sleep(2**i)
    return None

def _parse(html, site):
    soup = BeautifulSoup(html, "html.parser"); out=[]
    if site=="politifact":
        for li in soup.select("li.o-listicle__item"):
            a = li.select_one("a.m-statement__quote");  img = li.select_one("div.m-statement__meter img")
            if a: out.append({"claim":a.get_text(strip=True),
                              "label":"REFUTES" if img and "false" in img["alt"].lower() else "SUPPORTS"})
    else:
        for art in soup.select("article.media-block"):
            a = art.select_one("h2.media-block__title a"); 
            lbl = art.select_one("span.media-rating__link")
            if a: out.append({"claim":a.get_text(strip=True),
                              "label":"REFUTES" if lbl and "false" in lbl.get_text(strip=True).lower() else "SUPPORTS"})
    return out

def scrape_claims(site, pages=2, delay=(1,3)):
    collected=[]
    for p in range(1, pages+1):
        html=_html(site,p); 
        if not html: break
        claims=_parse(html,site)
        if not claims: break
        collected.extend(claims)
        time.sleep(random.uniform(*delay))
    return collected

# ── retrieval
def retrieve_evidence(claim, bm25, corpus_df, top_n=5):
    scores = bm25.get_scores(preprocess(claim))
    idxs = np.argsort(scores)[::-1][:top_n]
    hits = [(corpus_df.iloc[i]["label"], 1/(rank+1)) for rank,i in enumerate(idxs) if scores[i]>0]
    if not hits:                                        # fall back to web
        for site in ("politifact","snopes"):
            scrapes = scrape_claims(site, pages=1)
            if scrapes:
                hits = [(c["label"], 0.2) for c in scrapes[:top_n]]
                break
    return hits

def majority_vote(hits):
    if not hits: return "NOT ENOUGH INFO"
    score = {lab:0 for lab in FEVER_LABELS}
    for lab, w in hits: score[lab]+=w
    return max(score, key=score.get)

# ── BM25 tuning
def tune_bm25(train_df, val_texts, val_labels, tune=False):
    if not tune: return 1.5, 0.75
    best_k1, best_b, best = 1.5, 0.75, 0
    tokens = [preprocess(t) for t in train_df["statement"]]
    for k1 in [1.2,1.5,1.8,2.0]:
        for b in [0.6,0.75,0.9]:
            bm25 = BM25Okapi(tokens, k1=k1, b=b)
            preds = [majority_vote(retrieve_evidence(t,bm25,train_df)) for t in val_texts]
            acc = accuracy_score(val_labels, preds)
            if acc>best: best_k1, best_b, best= k1,b,acc
    logger.info(f"BM25 tuned: k1={best_k1}, b={best_b}, acc={best:.3f}")
    return best_k1, best_b

# ── evaluation
def evaluate_bm25(bm25, texts, labels, train_df, top_n=5):
    preds=[majority_vote(retrieve_evidence(t,bm25,train_df,top_n)) for t in texts]
    logger.info("\n"+classification_report(labels, preds, labels=[0,1,2], target_names=FEVER_LABELS, zero_division=0))
    return {"Accuracy": accuracy_score(labels,preds),
            "Precision": precision_score(labels,preds,average="weighted",zero_division=0),
            "Recall": recall_score(labels,preds,average="weighted",zero_division=0),
            "F1 Score": f1_score(labels,preds,average="weighted",zero_division=0)}


In [89]:
# Cell 6 – master pipeline (BERT sees evidence, 3 epochs)

import os, numpy as np, pandas as pd, torch, matplotlib.pyplot as plt
from torch.optim import AdamW, Adam
from rank_bm25 import BM25Okapi

try:
    import bert_score
except ImportError:
    os.system("pip install -q bert-score"); import bert_score

# ── wipe checkpoints only if retraining requested
for f in ("bert_model.pt", "cnn_model.pt"):
    if os.path.exists(f):
        os.remove(f); print(f"Deleted old {f}")

def plot_metrics(m, title, show=False):
    if show:
        plt.barh(list(m.keys()), list(m.values()), color="steelblue")
        plt.xlim(0, 1); plt.title(title); plt.show()

# ── tiny BM25 helper for evidence
def build_bm25(df):
    return BM25Okapi([preprocess(s) for s in df["statement"]])

def top_sentence(bm25, corpus_df, claim, min_score=0.30):
    scores = bm25.get_scores(preprocess(claim))
    idx    = int(np.argmax(scores))
    return corpus_df.iloc[idx]["statement"] if scores[idx] > min_score else ""

# ── main orchestrator
def main(force_bm25_tune=False,
         force_bert_retrain=False,
         force_cnn_retrain=False,
         fast_dev=True,
         show_plots=False):

    logger.info("🚀 pipeline start")

    # data ----------------------------------------------------
    train_df, test_df = load_and_preprocess_data(
        DATA_PATH,
        logger=logger,
        fast_dev=fast_dev,
        fast_train=FAST_SAMPLE_TRAIN,
        fast_test=FAST_SAMPLE_TEST
    )
    logger.info(f"train={len(train_df)} • test={len(test_df)}")

    # build BM25 for evidence retrieval
    bm25_ev     = build_bm25(train_df)
    evidence_fn = lambda c: top_sentence(bm25_ev, train_df, c, 0.30)

    # tokeniser & models
    tok, bert = get_bert_model(MODEL_NAME, num_labels=3)

    # dataloaders
    train_dl_bert, test_dl_bert = create_dataloaders(
        train_df, test_df, tok,
        bs=BATCH_SIZE, max_len=256,
        use_evidence=True,  evidence_fn=evidence_fn   # claim + evidence
    )
    train_dl_cnn,  test_dl_cnn = create_dataloaders(
        train_df, test_df, tok,
        bs=BATCH_SIZE, max_len=128,
        use_evidence=False                           # claim-only
    )

    # BERT ----------------------------------------------------
    opt_bert, EPOCHS_BERT = AdamW(bert.parameters(), lr=LEARNING_RATE), 3
    if not force_bert_retrain and os.path.exists("bert_model.pt"):
        bert.load_state_dict(torch.load("bert_model.pt", map_location=DEVICE))
        logger.info("BERT weights loaded")
    else:
        from sklearn.utils.class_weight import compute_class_weight
        cw = compute_class_weight("balanced", classes=[0,1,2],
                                  y=train_df["label"].map(LABEL2ID))
        train_bert_model(
            bert, train_dl_bert, test_dl_bert, opt_bert,
            epochs=EPOCHS_BERT,
            class_w=torch.tensor(cw, dtype=torch.float, device=DEVICE)
        )
        torch.save(bert.state_dict(), "bert_model.pt")
    bert_metrics = evaluate_bert_model(bert, test_dl_bert)
    plot_metrics(bert_metrics, "BERT", show_plots)

    # CNN -----------------------------------------------------
    bert_emb = bert.get_input_embeddings().weight.data.clone()
    cnn = CNNModel(
        tok.vocab_size, bert_emb.size(1), 3,
        dropout=0.3, pretrained_weights=bert_emb,
        freeze_embed=True
    ).to(DEVICE)
    for block in cnn.convs: block[2].momentum = 0.05

    opt_cnn, EPOCHS_CNN = Adam(cnn.parameters(), lr=LEARNING_RATE), 5
    if not force_cnn_retrain and os.path.exists("cnn_model.pt"):
        cnn.load_state_dict(torch.load("cnn_model.pt", map_location=DEVICE))
        logger.info("CNN weights loaded")
    else:
        from sklearn.utils.class_weight import compute_class_weight
        cw = compute_class_weight("balanced", classes=[0,1,2],
                                  y=train_df["label"].map(LABEL2ID))
        train_cnn_model(
            cnn, train_dl_cnn, test_dl_cnn, opt_cnn,
            epochs=EPOCHS_CNN,
            class_w=torch.tensor(cw, dtype=torch.float, device=DEVICE)
        )
        torch.save(cnn.state_dict(), "cnn_model.pt")
    cnn_metrics = evaluate_cnn_model(cnn, test_dl_cnn)
    plot_metrics(cnn_metrics, "CNN", show_plots)

    # BM25 baseline (unchanged)
    k1, b = tune_bm25(train_df, test_df["statement"], test_df["label"],
                      tune=force_bm25_tune)
    bm25 = BM25Okapi([preprocess(t) for t in train_df["statement"]], k1=k1, b=b)
    bm25_metrics = evaluate_bm25(
        bm25, test_df["statement"], test_df["label"], train_df
    )
    plot_metrics(bm25_metrics, "BM25", show_plots)

    # summary
    summary = pd.DataFrame(
        {"BERT": bert_metrics, "CNN": cnn_metrics, "BM25": bm25_metrics}
    ).T.round(3)
    print("\n===== Performance summary =====\n", summary)

    return bert, cnn, bm25, tok, train_df, test_df


bert_model, cnn_model, bm25, tokenizer, train_df, test_df = main(
    force_bm25_tune=False,
    force_bert_retrain=True,
    force_cnn_retrain=True,
    fast_dev=True,
    show_plots=False
)


Deleted old bert_model.pt
Deleted old cnn_model.pt


  .apply(lambda x: x.sample(min(n, len(x)), random_state=42))
  .apply(lambda x: x.sample(min(n, len(x)), random_state=42))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Performance summary =====
       Accuracy  Precision  Recall  F1 Score
BERT     0.476      0.492   0.476     0.470
CNN      0.450      0.452   0.450     0.437
BM25     0.396      0.397   0.396     0.396


In [90]:
# Cell 7 – 100 % BM25-free, OOF features, logistic meta-classifier
import warnings, numpy as np, torch
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
warnings.filterwarnings("ignore", category=UserWarning)

LABEL2ID   = {"SUPPORTS":0, "REFUTES":1, "NOT ENOUGH INFO":2}
FEVER      = ["SUPPORTS","REFUTES","NOT ENOUGH INFO"]
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── helper: base-model softmax
def _probs(model, tok, text):
    enc = tok(text, return_tensors="pt", truncation=True,
              padding="max_length", max_length=MAX_LENGTH).to(DEVICE)
    with torch.no_grad():
        logits = (model(**enc).logits
                  if hasattr(model, "config") else model(enc["input_ids"]))
    return torch.softmax(logits, 1).squeeze().cpu().numpy()

def _sharpen(p, g): p = p**g; return p/p.sum()

BERT_G = 1.4   # sharpen BERT only
CNN_G  = 1.0   # leave CNN raw
ALPHA, BETA = 0.75, 0.25

def _feat(text):
    p_bert = _sharpen(_probs(bert_model, tokenizer, text), BERT_G)
    p_cnn  = _probs  (cnn_model , tokenizer, text)        # no sharpening
    return np.concatenate([ALPHA*p_bert, BETA*p_cnn])     # 6-dim vector

# ── create OOF training features on the full train set (5-fold) ──────────
skf   = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_X, oof_y = [], []

for tr_idx, val_idx in skf.split(train_df, train_df["label"]):
    for i in val_idx:
        txt = train_df.iloc[i]["statement"]
        oof_X.append(_feat(txt))
        oof_y.append(LABEL2ID[train_df.iloc[i]["label"]])

X_train = np.vstack(oof_X)
y_train = np.array(oof_y)

# ── build meta-test features from held-out 40 % slice ────────────────────
from sklearn.model_selection import train_test_split
meta_train_df, meta_test_df = train_test_split(
    test_df, test_size=0.40, stratify=test_df["label"], random_state=42
)

X_test  = np.vstack([_feat(t) for t in meta_test_df["statement"]])
y_test  = meta_test_df["label"].map(LABEL2ID).values

# ── logistic regression meta-classifier (simple & robust) ────────────────
meta_clf = LogisticRegression(
    max_iter=2000, class_weight="balanced",
    multi_class="multinomial", solver="lbfgs"
)
meta_clf.fit(X_train, y_train)
y_pred = meta_clf.predict(X_test)

print("Stacking Meta-Classifier Report")
print(classification_report(
    y_test, y_pred, labels=[0,1,2], target_names=FEVER,
    digits=3, zero_division=0))


Stacking Meta-Classifier Report
                 precision    recall  f1-score   support

       SUPPORTS      0.500     0.567     0.531        67
        REFUTES      0.683     0.642     0.662        67
NOT ENOUGH INFO      0.475     0.439     0.457        66

       accuracy                          0.550       200
      macro avg      0.553     0.549     0.550       200
   weighted avg      0.553     0.550     0.550       200

