# Claim Verification with Ollama + Justification Metrics

This notebook:
1. Reads a **JSON** dataset (either a JSON array or a JSONL file) of claims, each with `claim`, `label`, and `justification`.
2. Calls a local **Ollama** model to predict a **label** (Supported/Refuted) **and** a **justification**.
3. Computes **ROUGE-L F1**, **BLEU**, **cosine similarity** (SBERT; fallback TF-IDF), and **BERTScore-F1** between model justification and ground-truth justification.
4. Produces a DataFrame with columns:
   - `claim`, `label`, `justification`
   - `label_pred`, `justification_pred`
   - `rougeL_f1`, `bleu`, `cosine`, `bertscore_f1`
5. Saves results to `runs/<model_slug>_claim_eval.csv`.


In [1]:
!pip install jupyter ipykernel requests pandas tqdm  rouge-score sacrebleu bert-score

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting sacrebleu
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.4.5-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Downloading nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting ipywidgets (from jupyter)
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.4.7-py3-none-any.whl.metadata (16 kB)
Collecting portalocker (from sacrebleu)
  Using cached portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collec

In [5]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
Using cached scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
Downloading scipy-1.16.2-cp312-cp312-win_amd64.whl (38.6 MB)
   ---------------------------------------- 0.0/38.6 MB ? eta -:--:--
   --- ------------------------------------ 3.1/38.6 MB 15.4 MB/s eta 0:00:03
   ------ --------------------------------- 6.3/38.6 MB 15.5 MB/s eta 0:00:03
   --------- ------------------------------ 9.4/38.6 MB 15.1 MB/s eta 0:00:02
   

In [None]:
# Default run settings. NOTE: the "Configuration" cell further down assigns
# DATASET_PATH, MODEL and LANG again; whichever cell ran last wins.
DATASET_PATH = "data/refuted_claims.json"   # your file (JSON array or JSONL)
MODEL = "gemma3:4b"            # or qwen3:4b / mistral:7b; keep the tag free of stray whitespace
LANG = "en"                    # for BERTScore baseline rescaling


In [15]:
from pathlib import Path
import json, re, time
from typing import Dict, Any, List, Tuple, Optional

import pandas as pd
import numpy as np

# HTTP (Ollama)
try:
    import requests
except Exception:
    requests = None
import urllib.request, urllib.error

# Metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer  # fallback if SBERT unavailable

# Optional libraries (guarded imports)
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    SentenceTransformer = None

try:
    from bert_score import score as bertscore_score
except Exception:
    bertscore_score = None

try:
    import sacrebleu
except Exception:
    sacrebleu = None

try:
    from rouge_score import rouge_scorer
except Exception:
    rouge_scorer = None

# Optional progress bar (if not installed, you can set USE_TQDM=False)
try:
    from tqdm import tqdm
    USE_TQDM = True
except Exception:
    tqdm = lambda x, **kwargs: x
    USE_TQDM = False

# Base URL (scheme://host:port, no trailing slash) of the Ollama HTTP server to query.
OLLAMA_URL = "http://127.0.0.1:11434"  # change if needed


In [16]:
def slugify(s: str) -> str:
    """Turn ``s`` into a lowercase slug that is safe to use in a file name.

    Each run of characters other than ASCII letters, digits, ``.``, ``_`` and
    ``-`` collapses into a single underscore; underscores at either end are
    then removed and the result is lower-cased.
    """
    collapsed = re.sub(r'[^a-zA-Z0-9._-]+', '_', s)
    trimmed = collapsed.strip('_')
    return trimmed.lower()

def read_json_dataset(path: str) -> List[Dict[str, Any]]:
    """
    Load records from either:
    - a JSON array of objects, or
    - a JSONL file (one JSON object per line; blank lines are skipped).

    The format is chosen by the first non-whitespace character: ``[`` means a
    JSON array, anything else is parsed line by line as JSONL.

    Args:
        path: Location of the dataset file (UTF-8, with or without a BOM).

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If the content is not valid JSON/JSONL (this
            includes an empty file). For JSONL the message names the
            offending line.
    """
    # 'utf-8-sig' drops a leading byte-order mark if one is present. With plain
    # 'utf-8' the BOM would be the first non-whitespace character, so a JSON
    # array saved by a BOM-writing editor would be misrouted to the JSONL path.
    txt = Path(path).read_text(encoding='utf-8-sig')
    # Heuristic: if first non-space char is '[', treat as JSON array.
    first_non_ws = next((c for c in txt if not c.isspace()), '[')
    if first_non_ws == '[':
        data = json.loads(txt)
        assert isinstance(data, list), "Top-level JSON must be a list of objects."
        return data
    # Else fall back to JSONL
    records = []
    for lineno, line in enumerate(txt.splitlines(), start=1):
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Same exception type, but pointing at the bad line of the file.
            raise json.JSONDecodeError(
                f"JSONL line {lineno}: {exc.msg}", exc.doc, exc.pos
            ) from exc
    return records

def normalize_label(label: str) -> str:
    """Map a free-form verdict onto ``"Supported"`` / ``"Refuted"``.

    Args:
        label: Raw label text. ``None`` is accepted, and non-string values
            (e.g. a JSON boolean) are converted with ``str()`` first.

    Returns:
        ``""`` for ``None``; ``"Supported"`` or ``"Refuted"`` when the
        stripped, case-folded text is one of the known aliases; otherwise the
        stripped text with only its first letter capitalized.
    """
    if label is None:
        return ""
    s = str(label).strip()
    key = s.lower()
    # Map various aliases to two classes
    if key in {"supported", "true", "yes", "accepted", "accurate", "correct"}:
        return "Supported"
    if key in {"refuted", "false", "no", "rejected", "inaccurate", "incorrect"}:
        return "Refuted"
    # Unknown label: capitalize the *stripped* text so surrounding whitespace
    # neither survives nor blocks the capitalization of the first letter.
    return s.capitalize()

def ensure_fields(obj: Dict[str, Any], required=("claim","label","justification")):
    """Check that ``obj`` contains every key listed in ``required``.

    Keys are checked in the order given; the first absent one is reported.

    Raises:
        ValueError: Naming the first required key that ``obj`` lacks.
    """
    nothing_missing = object()
    first_absent = next((name for name in required if name not in obj), nothing_missing)
    if first_absent is not nothing_missing:
        raise ValueError(f"Dataset record missing required field: {first_absent}")


In [None]:
import subprocess, json, shlex
def http_post_json(url: str, payload: Dict[str, Any], timeout: float = 300.0) -> Dict[str, Any]:
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply.

    Uses ``requests`` when that module was importable; otherwise falls back to
    ``urllib.request`` from the standard library.

    Args:
        url: Full endpoint URL.
        payload: JSON-serializable request body.
        timeout: Timeout in seconds handed to the HTTP client.

    Raises:
        Whatever the underlying client raises; on the ``requests`` path a
        non-2xx status is turned into an error by ``raise_for_status()``.
    """
    if requests is None:
        # Standard-library path: encode the body by hand and decode the reply.
        body = json.dumps(payload).encode('utf-8')
        request = urllib.request.Request(
            url,
            data=body,
            headers={'Content-Type': 'application/json'},
            method='POST',
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return json.loads(response.read().decode('utf-8'))

    response = requests.post(url, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

# System prompt for the fact-checking task: it asks the model to answer with a
# JSON object holding exactly two keys, "label" and "justification".
# The indentation of the continuation lines sits inside the triple-quoted
# literal, so those leading spaces are part of the text of this constant.
INSTRUCTION = """You are a fact-checking assistant.
            Given a political claim, return a JSON object with fields:
            - "label": one of ["Supported", "Refuted"]
            - "justification": a short, factual justification (1–3 sentences).

            IMPORTANT:
            - The 'label' must be exactly "Supported" or "Refuted".
            - Be concise, avoid speculation, and do not add extra keys.
            """



def _extract_json_block(text: str) -> Optional[Dict[str, Any]]:
    try:
        t = text
        if "```" in t:
            # choose the largest fenced block (heuristic)
            parts = t.split("```")
            t = max(parts, key=len)
        m = re.search(r"\{[\s\S]*\}", t)
        if m:
            return json.loads(m.group(0))
        return json.loads(t)
    except Exception:
        return None
    
def _parse_model_reply(raw: str) -> Tuple[str, str, str]:
    """Turn raw model text into ``(raw, normalized_label, justification)``."""
    parsed = _extract_json_block(raw)
    if not isinstance(parsed, dict):
        parsed = {}
    label = parsed.get("label")
    justification = parsed.get("justification")
    # Coerce to str so a non-string value in the reply cannot crash the run.
    label_text = "" if label is None else str(label)
    just_text = "" if justification is None else str(justification).strip()
    return raw, normalize_label(label_text), just_text


def ask_ollama(model: str, claim: str, system: str = INSTRUCTION,
               retries: int = 3, backoff: float = 2.0) -> Tuple[str, str, str]:
    """Ask the Ollama HTTP API (``/api/generate``) to verify one claim.

    Args:
        model: Ollama model tag; surrounding whitespace is removed.
        claim: The claim text to verify.
        system: System prompt describing the expected JSON reply.
        retries: Total number of attempts before giving up.
        backoff: Base pause in seconds; attempt ``k`` waits ``backoff * k``
            before the next try.

    Returns:
        ``(raw_response_text, predicted_label, predicted_justification)``.
        The label and justification are empty strings when the reply holds
        no usable JSON object.

    Raises:
        RuntimeError: If every attempt fails; the last error is chained.
    """
    payload = {
        "model": model.strip(),
        "system": system,
        "prompt": f"Claim: {claim}\nRespond with JSON as specified.",
        "format": "json",   # ask the server for JSON-only output
        "stream": False,    # one complete reply instead of streamed chunks
    }
    last_err: Optional[Exception] = None
    for attempt in range(1, retries + 1):
        try:
            reply = http_post_json(f"{OLLAMA_URL}/api/generate", payload)
            break
        except Exception as err:  # retried, then re-raised below with context
            last_err = err
            if attempt < retries:
                time.sleep(backoff * attempt)
    else:
        raise RuntimeError(f"Ollama call failed after {retries} retries: {last_err}") from last_err
    return _parse_model_reply(str(reply.get("response", "")))


def ask_ollama_cli(model: str, claim: str, system: str = INSTRUCTION) -> Tuple[str, str, str]:
    """Ask the ``ollama`` command-line client to verify one claim.

    The CLI exposes ``ollama run MODEL [PROMPT]``; there is no ``generate``
    subcommand and no ``--system`` flag, so the system prompt is placed in
    front of the user prompt.

    Returns:
        ``(raw_stdout, predicted_label, predicted_justification)``.

    Raises:
        subprocess.CalledProcessError: If the CLI exits with a non-zero status.
        FileNotFoundError: If the ``ollama`` executable is not on PATH.
    """
    prompt = f"{system}\n\nClaim: {claim}\nRespond with JSON as specified."
    cmd = ["ollama", "run", "--format", "json", model.strip(), prompt]
    # Decode explicitly as UTF-8: the platform default codec (e.g. cp1252 on
    # Windows) can fail on characters in the model output.
    raw = subprocess.check_output(cmd, text=True, encoding="utf-8", errors="replace")
    return _parse_model_reply(raw)


In [18]:
# Loaded SBERT models, keyed by model name, so each one is instantiated once.
_SBERT_MODELS: Dict[str, Any] = {}


def cosine_sbert(a: str, b: str, model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> Optional[float]:
    """
    SBERT cosine similarity. Returns None if sentence-transformers unavailable.

    Args:
        a: First text.
        b: Second text.
        model_name: Sentence-Transformers model to embed with. Models are
            cached per name, so passing a different name really uses that
            model instead of whichever one was loaded first.

    Returns:
        Dot product of the two L2-normalized embeddings as a float, or
        ``None`` when ``SentenceTransformer`` could not be imported.
    """
    if SentenceTransformer is None:
        return None
    model = _SBERT_MODELS.get(model_name)
    if model is None:
        model = _SBERT_MODELS[model_name] = SentenceTransformer(model_name)
    # normalize_embeddings=True makes the plain dot product a cosine.
    vecs = model.encode([a, b], convert_to_numpy=True, normalize_embeddings=True)
    return float((vecs[0] * vecs[1]).sum())

def cosine_tfidf(a: str, b: str) -> float:
    """TF-IDF cosine similarity between two texts (fallback when SBERT is unavailable).

    The vectorizer is fitted on just these two texts.

    Returns:
        The cosine similarity as a float, or ``0.0`` when the vectorizer finds
        no usable token in either text (for example, both are empty).
    """
    try:
        vec = TfidfVectorizer().fit_transform([a, b])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenization; treat that as "nothing in common".
        return 0.0
    return float(cosine_similarity(vec[0], vec[1])[0,0])

def rougeL_f1(pred: str, ref: str) -> Optional[float]:
    """ROUGE-L F-measure of ``pred`` against ``ref`` (stemming enabled).

    Returns ``None`` when the ``rouge_score`` package could not be imported.
    """
    if rouge_scorer is not None:
        rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        # Argument order is (reference, prediction).
        result = rouge.score(ref, pred)
        return float(result['rougeL'].fmeasure)
    return None

def bleu_sentence(pred: str, ref: str) -> Optional[float]:
    """Sentence-level BLEU of ``pred`` against the single reference ``ref``.

    Returns the ``score`` attribute of sacreBLEU's result as a float, or
    ``None`` when ``sacrebleu`` could not be imported.
    NOTE(review): sacreBLEU scores are presumably on a 0-100 scale, unlike the
    other metrics in this notebook; confirm before averaging across metrics.
    """
    if sacrebleu is not None:
        # sacreBLEU takes the hypothesis first, then a list of references.
        references = [ref]
        result = sacrebleu.sentence_bleu(pred, references)
        return float(result.score)
    return None

# One BERTScorer per language, so the underlying model is loaded once rather
# than on every call.
_BERT_SCORERS: Dict[str, Any] = {}


def bertscore_f1(pred: str, ref: str, lang: str = "en") -> Optional[float]:
    """BERTScore F1 (baseline-rescaled) of ``pred`` against ``ref``.

    Args:
        pred: Candidate text.
        ref: Reference text.
        lang: Language code; selects the default model and rescaling baseline.

    Returns:
        The F1 score as a float, or ``None`` when ``bert_score`` could not be
        imported.
    """
    if bertscore_score is None:
        return None
    import torch
    from bert_score import BERTScorer

    scorer = _BERT_SCORERS.get(lang)
    if scorer is None:
        # The functional bert_score.score() builds the model on each call;
        # a cached scorer object avoids paying that cost per claim.
        scorer = _BERT_SCORERS[lang] = BERTScorer(lang=lang, rescale_with_baseline=True)
    with torch.no_grad():
        _precision, _recall, f1 = scorer.score([pred], [ref])
    return float(f1.mean().item())


In [19]:
# ▶️ Configuration
DATASET_PATH = "data/refuted_claims.json"  # <-- set to your JSON file (array or JSONL)
MODEL = "gemma3:4b"          # e.g., "qwen3:4b", "mistral:7b", "gemma3:4b" (no stray whitespace in the tag)
OUTDIR = "runs"
LANG = "en"                   # language code for BERTScore baseline rescaling


In [20]:
records = read_json_dataset(DATASET_PATH)

# Check every raw record for the required keys, then keep a copy whose text
# fields are stringified and trimmed and whose label is normalized.
clean = []
for record in records:
    ensure_fields(record, required=("claim","label","justification"))
    clean.append({
        "claim": str(record["claim"]).strip(),
        "label": normalize_label(str(record["label"])),
        "justification": str(record["justification"]).strip(),
    })

print(f"Loaded {len(clean)} records.")
if len(clean) > 0:
    print("Sample record:", clean[0])


Loaded 1742 records.
Sample record: {'claim': 'Donald Trump delivered the largest tax cuts in American history.', 'label': 'Refuted', 'justification': 'Three tax bills have been larger than that of Donald Trump'}


In [21]:
# Number of leading records to evaluate; set to None to run the whole dataset.
MAX_ROWS = 100

rows = []
subset = clean[:MAX_ROWS]
iterator = tqdm(subset, desc="Evaluating") if USE_TQDM else subset
# A model tag with stray whitespace presumably matches no installed model, so
# trim it before it reaches Ollama.
model_name = MODEL.strip()

for row in iterator:
    claim = row["claim"]
    gt_label = row["label"]
    gt_just = row["justification"]

    # ask_ollama must already be defined in the kernel; it is unpacked here as
    # (raw reply text, predicted label, predicted justification).
    raw, label_pred, just_pred = ask_ollama(model_name, claim)

    # Metrics: justification_pred vs ground-truth justification
    rouge = rougeL_f1(just_pred, gt_just)
    bleu = bleu_sentence(just_pred, gt_just)
    # Cosine: prefer SBERT, fallback to TF-IDF
    cos = cosine_sbert(just_pred, gt_just)
    if cos is None:
        cos = cosine_tfidf(just_pred, gt_just)
    bert_f1 = bertscore_f1(just_pred, gt_just, lang=LANG)

    rows.append({
        "claim": claim,
        "label": gt_label,
        "justification": gt_just,
        "label_pred": label_pred,
        "justification_pred": just_pred,
        "rougeL_f1": rouge,
        "bleu": bleu,
        "cosine": cos,
        "bertscore_f1": bert_f1,
    })

df = pd.DataFrame(rows)
df.head()

Evaluating:   0%|          | 0/100 [00:09<?, ?it/s]


RuntimeError: Ollama call failed after 3 retries: 404 Client Error: Not Found for url: http://127.0.0.1:11434/api/generate

In [None]:
# Write the results table to <OUTDIR>/<model slug>_claim_eval.csv, creating
# the output folder first when it does not exist yet.
csv_name = f"{slugify(MODEL)}_claim_eval.csv"
outdir = Path(OUTDIR)
outdir.mkdir(parents=True, exist_ok=True)
out_csv = outdir.joinpath(csv_name)
df.to_csv(out_csv, index=False, encoding="utf-8")
print(f"Saved: {out_csv.resolve()}")
