In [3]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import json, re, uuid, random
import os, tempfile, hashlib
from tqdm import tqdm
import faiss
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import nltk
import string
from dataclasses import dataclass
from datasets import Dataset
from evaluate import load as hf_load
from typing import Dict, List, Any, Tuple
from nltk.tokenize import sent_tokenize
from rank_bm25 import BM25Okapi
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft import PeftModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sentence_transformers import SentenceTransformer
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForSequenceClassification
from typing import Dict, List
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate
from nltk.corpus import stopwords
import sacremoses
import rouge_score
import bert_score
import pickle
import sys

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

def free_cuda():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Using device: cuda
GPU: Tesla T4


In [None]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Read in train / val/ test datasets
train_df = pd.read_csv("./data/medquad_train.csv")
val_df = pd.read_csv(".data/medquad_val.csv")
test_df = pd.read_csv(".data/medquad_test.csv")

In [None]:
# Load any generator (tokenizer + model) by path

def load_generator(model_dir: str):
    """
    Loads a generator from a local directory.
    Works for both BASELINE2_DIR (base BioGPT) and FULLMODEL_DIR (merged LoRA).
    """
    tok = AutoTokenizer.from_pretrained(model_dir)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    gen = AutoModelForCausalLM.from_pretrained(model_dir)
    gen.config.pad_token_id = tok.pad_token_id
    gen = gen.to(device).eval()
    return tok, gen

In [None]:
BASELINE2_DIR = "./biogpt_base"                    
FULLMODEL_DIR  = "./finetuned_biogpt"   

tok_base, gen_base = load_generator(BASELINE2_DIR)
tok_full, gen_full = load_generator(FULLMODEL_DIR)


In [4]:
# =============================
# Helper: tokenize text
# =============================
_word_re = re.compile(r"[A-Za-z0-9]+(?:[-'][A-Za-z0-9]+)?")
def simple_tokenize(text: str):
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    return _word_re.findall(text.lower())

# =============================
# Build doc column (Q + A + meta)
# =============================
def build_doc_row(row):
    parts = []
    q = str(row.get("question", "")).strip()
    a = str(row.get("answer", "")).strip()
    if q: parts.append(f"Q: {q}")
    if a: parts.append(f"A: {a}")

    meta_bits = []
    for lab, key in [("Entity", "entity"), ("Type", "qtype"), ("Source", "source"), ("URL", "url")]:
        val = str(row.get(key, "") or "").strip()
        if val: meta_bits.append(f"{lab}: {val}")
    if meta_bits:
        parts.append("\n".join(meta_bits))
    return "\n".join(parts).strip()

# =============================
# Save & Load Helpers
# =============================
SAVE_DIR = "./retrieval_artifacts"
os.makedirs(SAVE_DIR, exist_ok=True)

def save_retrieval_artifacts(train_df, bm25, bm25_corpus_tokens, index, dense_emb=None):
    # BM25
    with open(f"{SAVE_DIR}/bm25.pkl", "wb") as f:
        pickle.dump(bm25, f)
    # BM25 tokens
    with open(f"{SAVE_DIR}/bm25_corpus_tokens.pkl", "wb") as f:
        pickle.dump(bm25_corpus_tokens, f)
    # Train df
    train_df.to_csv(f"{SAVE_DIR}/train_df.csv", index=False)
    # FAISS index
    faiss.write_index(index, f"{SAVE_DIR}/dense_index.faiss")
    # Dense embeddings
    if dense_emb is not None:
        np.save(f"{SAVE_DIR}/dense_emb.npy", dense_emb)

def load_retrieval_artifacts():
    # BM25
    with open(f"{SAVE_DIR}/bm25.pkl", "rb") as f:
        bm25 = pickle.load(f)
    with open(f"{SAVE_DIR}/bm25_corpus_tokens.pkl", "rb") as f:
        bm25_corpus_tokens = pickle.load(f)
    # Dataframe
    train_df = pd.read_csv(f"{SAVE_DIR}/train_df.csv")
    _dense_rows = train_df.reset_index(drop=True)
    # FAISS
    index = faiss.read_index(f"{SAVE_DIR}/dense_index.faiss")
    # Dense model
    dense = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO", device=device)
    return train_df, bm25, bm25_corpus_tokens, dense, index, _dense_rows

# =============================
# Build indexes (if first time)
# =============================
def build_retrieval(train_df):
    # Add 'doc' column
    train_df = train_df.copy()
    train_df["doc"] = train_df.apply(build_doc_row, axis=1)

    # BM25 corpus
    bm25_corpus_texts  = train_df["doc"].tolist()
    bm25_corpus_tokens = [simple_tokenize(t) for t in bm25_corpus_texts]
    bm25 = BM25Okapi(bm25_corpus_tokens)

    # Dense embeddings + FAISS
    dense = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO", device=device)
    corpus_texts = train_df["doc"].astype(str).tolist()
    dense_emb = dense.encode(corpus_texts, batch_size=128, normalize_embeddings=True,
                             show_progress_bar=True).astype("float32")
    dim = dense_emb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(dense_emb)
    _dense_rows = train_df.reset_index(drop=True)

    # Save artifacts
    save_retrieval_artifacts(train_df, bm25, bm25_corpus_tokens, index, dense_emb)
    print("Saved retrieval artifacts to", SAVE_DIR)
    return train_df, bm25, bm25_corpus_tokens, dense, index, _dense_rows

# =============================
# Search functions
# =============================
def bm25_search(query: str, k: int = 20):
    q_tokens = simple_tokenize(query)
    scores = bm25.get_scores(q_tokens)
    top_idx = np.argsort(scores)[::-1][:k]
    out = train_df.iloc[top_idx].copy()
    out["bm25_score"] = np.asarray(scores)[top_idx]
    return out

def dense_search(query: str, k: int = 200):
    qv = dense.encode([query], normalize_embeddings=True).astype("float32")
    scores, idx = index.search(qv, k)
    out = _dense_rows.iloc[idx[0]].copy()
    out["dense_score"] = scores[0]
    return out

def hybrid_search(query: str, k_bm25=200, k_dense=200, k_final=50, alpha=0.5):
    bm = bm25_search(query, k=k_bm25)[["question","answer","source","url","bm25_score"]].copy()
    de = dense_search(query, k=k_dense)[["question","answer","source","url","dense_score"]].copy()

    merged = bm.merge(de, how="outer", on=["question","answer","source","url"])
    merged["bm25_score"]  = merged["bm25_score"].fillna(0.0)
    merged["dense_score"] = merged["dense_score"].fillna(0.0)

    merged["bm25_s"]  = minmax_scale(merged["bm25_score"].to_numpy(dtype=float), copy=True)
    merged["dense_s"] = minmax_scale(merged["dense_score"].to_numpy(dtype=float), copy=True)
    merged["hybrid"]  = (1 - alpha) * merged["bm25_s"] + alpha * merged["dense_s"]

    return merged.sort_values("hybrid", ascending=False).head(k_final).reset_index(drop=True)

# =============================
# Cross-encoder re-ranker
# =============================
RERANK_NAME = "ncbi/MedCPT-Cross-Encoder"
ce_tok = AutoTokenizer.from_pretrained(RERANK_NAME, model_max_length=512, truncation_side="right")
ce_model = AutoModelForSequenceClassification.from_pretrained(RERANK_NAME).to(device).eval()

def clip_passage(text: str, max_passage_tokens: int = 400) -> str:
    ids = ce_tok.encode(text or "", add_special_tokens=False)[:max_passage_tokens]
    return ce_tok.decode(ids, skip_special_tokens=True)

@torch.inference_mode()
def rerank_cross_encoder(query: str, df_candidates: pd.DataFrame, top_n: int = 8,
                         batch_size: int = 16, max_length: int = 512, max_passage_tokens: int = 400):
    if df_candidates.empty:
        return df_candidates
    q = str(query or "")
    d_texts = [clip_passage(str(a), max_passage_tokens=max_passage_tokens)
               for a in df_candidates["answer"].astype(str).tolist()]
    scores = []
    for i in range(0, len(d_texts), batch_size):
        q_batch = [q]*len(d_texts[i:i+batch_size])
        d_batch = d_texts[i:i+batch_size]
        enc = ce_tok(q_batch, d_batch, padding="max_length", truncation="only_second",
                     max_length=max_length, return_tensors="pt").to(device)
        logits = ce_model(**enc).logits.squeeze(-1)
        scores.extend(logits.detach().cpu().tolist())
    out = df_candidates.copy()
    out["ce_score"] = scores
    return out.sort_values("ce_score", ascending=False).head(top_n).reset_index(drop=True)

# ADDITIONAL
print("Loading retrieval artifacts...")
train_df, bm25, bm25_corpus_tokens, dense, index, _dense_rows = load_retrieval_artifacts()
print("Retrieval artifacts loaded.")

Overwriting /content/drive/MyDrive/DSA4213/RAG/rag_retrieval.py


In [None]:
# To save the retrieval artifcacts
# train_df, bm25, bm25_corpus_tokens, dense, index, _dense_rows = build_retrieval(train_df)

Batches:   0%|          | 0/103 [00:00<?, ?it/s]

Saved retrieval artifacts to /content/drive/MyDrive/retrieval_artifacts


In [None]:
train_df, bm25, bm25_corpus_tokens, dense, index, _dense_rows = load_retrieval_artifacts()

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# =============================
# Finalized Generation Pipelines
# =============================

# ====== No RAG (Baseline 1) ======
@torch.inference_mode()
def generate_no_rag(user_q: str, tok, gen_model,
                    max_new_tokens: int = 150,
                    repetition_penalty: float = 1.1,
                    no_repeat_ngram_size: int = 3) -> dict:
    """
    Baseline 1: Finetuned BioGPT (no retrieval).
    The model relies only on what it learned during fine-tuning.
    """
    system = (
        "You are a careful medical information assistant for the general public.\n"
        "- Use short sentences and plain language.\n"
        "- Avoid diagnosis; give general guidance and next steps.\n"
        "- If you don't know, say you don't know.\n"
    )
    prompt = f"{system}\nQuestion: {user_q.strip()}\nAnswer:"

    # Tokenize safely
    inputs = tok(prompt, return_tensors="pt",
                 truncation=True, max_length=512).to(DEVICE)

    out_ids = gen_model.generate(
        **inputs,
        do_sample=False,  # greedy decoding
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
        use_cache=True
    )

    text = tok.decode(out_ids[0], skip_special_tokens=True)
    ans = text.split("Answer:", 1)
    ans = (ans[1] if len(ans) > 1 else text).strip()
    return {"answer": ans, "evidence": None}


# ====== RAG (Baseline 2 + Full Model) ======
SYSTEM_RAG_FINE = (
    "You are a careful medical information assistant for the general public.\n"
    "- Answer ONLY using the Context bullets; if missing, say you don't know.\n"
    "- Use short sentences and plain language. Avoid diagnosis; give general guidance and next steps.\n"
    "- Add bracketed numeric citations [1], [2] that refer to the bullets you used.\n"
)

SYSTEM_RAG_BASE = (
    "Answer the following medical question briefly using only the context below. "
    "If the context does not contain the answer, reply: 'I don't know.'\n"
)


def build_rag_prompt(user_q: str,
                     reranked_df: pd.DataFrame,
                     tok,
                     ctx_token_budget: int = 700,
                     system_text: str = SYSTEM_RAG_FINE) -> str:
    """
    Build a concise RAG prompt with a safe token budget.
    """
    bullets, used = [], 0
    for i, (_, r) in enumerate(reranked_df.iterrows(), start=1):
        block = f"[{i}] {str(r['answer']).strip()}\n" \
                f"(Source: {(r.get('source') or 'Unknown').strip()} {(r.get('url') or '').strip()})"
        ids = tok.encode(block, add_special_tokens=False)
        if used + len(ids) > ctx_token_budget:
            break
        bullets.append(block)
        used += len(ids)

    ctx = "\n".join(bullets)
    return (
        f"{system_text}\n"
        f"Context:\n{ctx}\n\n"
        f"Question: {user_q.strip()}\n"
        f"Answer:"
    )


@torch.inference_mode()
def generate_with_rag(
    user_q: str,
    tok,
    gen_model,
    use_base_prompt: bool = False,  # True if using BioGPT-base (zero-shot)
    k_bm25=200, k_dense=200, k_final=50, alpha=0.5,
    top_n=6,
    max_new_tokens=150,
    do_sample=False,
    top_p=1.0,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3
) -> dict:
    """
    Full RAG pipeline for Baseline 2 or Full Model.
    Returns: {"answer": str, "evidence": pd.DataFrame[answer, source, url, ce_score]}
    """
    # --- 1. Hybrid retrieval ---
    cand = hybrid_search(user_q, k_bm25=k_bm25,
                         k_dense=k_dense, k_final=k_final, alpha=alpha)

    # --- 2. Cross-encoder re-ranking ---
    rer = rerank_cross_encoder(
        user_q, cand, top_n=top_n,
        batch_size=16, max_length=512, max_passage_tokens=400
    )

    # --- 3. Build RAG prompt ---
    system_text = SYSTEM_RAG_BASE if use_base_prompt else SYSTEM_RAG_FINE
    prompt = build_rag_prompt(user_q, rer, tok=tok,
                              ctx_token_budget=700, system_text=system_text)

    def truncate_context_text(text: str, max_words: int = 400):
        """Trim very long context prompts before tokenization (prevents >512 token overflow)."""
        return " ".join(text.split()[:max_words])

    prompt = truncate_context_text(prompt)

    # --- 4. Generate ---
    inputs = tok(prompt, return_tensors="pt",
                 truncation=True, max_length=512).to(DEVICE)

    gen_kwargs = dict(
        do_sample=do_sample,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
        use_cache=True
    )

    out_ids = gen_model.generate(**inputs, **gen_kwargs)
    text = tok.decode(out_ids[0], skip_special_tokens=True)

    ans = text.split("Answer:", 1)
    ans = (ans[1] if len(ans) > 1 else text).strip()

    # --- 5. Extract evidence ---
    cols = [c for c in ["answer", "source", "url", "ce_score"] if c in rer.columns]
    used = rer[cols].copy() if cols else pd.DataFrame(
        columns=["answer", "source", "url", "ce_score"]
    )

    return {"answer": ans, "evidence": used}


Using device: cuda


In [4]:
# Edited RAG prompt and generation with RAG

SYSTEM_RAG_FINE = (
    "You are a careful medical information assistant for the general public.\n"
    "- Answer ONLY using the Context bullets; if missing, say you don't know.\n"
    "- Use short sentences and plain language. Avoid diagnosis; give general guidance and next steps.\n"
    "- Add bracketed numeric citations [1], [2] that refer to the bullets you used.\n"
    "If the context does not contain the answer, reply: 'I don't know.'\n"
)

SYSTEM_RAG_BASE = (
    "Answer the following medical question briefly using only the context below. "
    "If the context does not contain the answer, reply: 'I don't know.'\n"
)

def build_rag_prompt(user_q: str,
                     reranked_df: pd.DataFrame,
                     tok,
                     ctx_token_budget: int = 700,
                     system_text: str = SYSTEM_RAG_FINE) -> str:
    """Build a concise RAG prompt with a safe token budget (token-capped bullets)."""
    bullets, used = [], 0
    for i, (_, r) in enumerate(reranked_df.iterrows(), start=1):
        block = f"[{i}] {str(r['answer']).strip()}\n" \
                f"(Source: {(r.get('source') or 'Unknown').strip()} {(r.get('url') or '').strip()})"
        ids = tok.encode(block, add_special_tokens=False)
        if used + len(ids) > ctx_token_budget:
            break
        bullets.append(block)
        used += len(ids)

    ctx = "\n".join(bullets)

    # IMPORTANT: End with a hard delimiter the model can key off
    return (
        f"{system_text}\n"
        f"Context:\n{ctx}\n\n"
        f"Question: {user_q.strip()}\n"
        f"### Answer:\n"
    )

@torch.inference_mode()
def generate_with_rag(
    user_q: str,
    tok,
    gen_model,
    use_base_prompt: bool = False,  # True if using BioGPT-base (zero-shot)
    k_bm25=200, k_dense=200, k_final=50, alpha=0.5,
    top_n=6,
    max_new_tokens=160,
    repetition_penalty=1.05,
    no_repeat_ngram_size=3
) -> dict:
    """Full RAG pipeline for Baseline 2 or Full Model."""
    # 1) Retrieval
    cand = hybrid_search(user_q, k_bm25=k_bm25, k_dense=k_dense, k_final=k_final, alpha=alpha)

    # 2) Re-ranking
    rer = rerank_cross_encoder(
        user_q, cand, top_n=top_n,
        batch_size=16, max_length=512, max_passage_tokens=400
    )

    # 3) Prompt (with delimiter)
    system_text = SYSTEM_RAG_BASE if use_base_prompt else SYSTEM_RAG_FINE
    prompt = build_rag_prompt(user_q, rer, tok=tok, ctx_token_budget=700, system_text=system_text)

    # 4) Tokenize WITHOUT truncating the prompt tail
    inputs = tok(prompt, return_tensors="pt", truncation=False).to(DEVICE)

    # 5) Deterministic, stable decoding (beam search)
    gen_kwargs = dict(
        do_sample=False,
        num_beams=3,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
        use_cache=True,
        return_dict_in_generate=True
    )

    out = gen_model.generate(**inputs, **gen_kwargs)

    # 6) Decode ONLY continuation
    full_ids   = out.sequences[0]
    prompt_len = inputs["input_ids"].shape[1]
    gen_only   = full_ids[prompt_len:]
    raw = tok.decode(gen_only, skip_special_tokens=True)

    # 7) Minimal, non-destructive cleanup
    # If the model echoed the delimiter, cut after it
    if "### Answer:" in raw:
        raw = raw.split("### Answer:", 1)[-1]

    # Remove only obvious header echoes
    cleaned = raw.strip()
    cleaned = cleaned.replace("Answer:", "", 1).strip()  # case where it prints "Answer:" once

    # Final polish: collapse spaces
    ans = " ".join(cleaned.split())

    # 8) Fallbacks if empty (keep quality)
    if not ans:
        # Try without any cleaning
        ans = tok.decode(gen_only, skip_special_tokens=True).strip()
    if not ans:
        # One mild sampling retry to coax a sentence
        alt = gen_model.generate(
            **inputs, do_sample=True, top_p=0.92, temperature=0.7,
            max_new_tokens=max_new_tokens, repetition_penalty=1.05,
            no_repeat_ngram_size=no_repeat_ngram_size,
            pad_token_id=tok.pad_token_id, eos_token_id=tok.eos_token_id,
            return_dict_in_generate=True, use_cache=True
        )
        gen_only2 = alt.sequences[0][prompt_len:]
        ans = tok.decode(gen_only2, skip_special_tokens=True).strip()
        if "### Answer:" in ans:
            ans = ans.split("### Answer:", 1)[-1]
        ans = " ".join(ans.split())

    # 9) Evidence frame
    cols = [c for c in ["answer", "source", "url", "ce_score"] if c in rer.columns]
    used = rer[cols].copy() if cols else pd.DataFrame(columns=["answer", "source", "url", "ce_score"])

    return {"answer": ans, "evidence": used}


Overwriting /content/drive/MyDrive/DSA4213/RAG/rag_generation.py


In [None]:
# Evaluation Metrics
nltk.download('stopwords')
from nltk.corpus import stopwords

rouge = evaluate.load("rouge")          # ROUGE-1/2/L
stop_words = set(stopwords.words("english"))
smooth = SmoothingFunction().method3    # BLEU smoothing for short answers

def normalize_text(s: str) -> str:
    s = s.lower()
    s = s.translate(str.maketrans("", "", string.punctuation))
    s = " ".join(s.split())
    return s

def exact_match(pred: str, gold: str) -> int:
    return int(normalize_text(pred) == normalize_text(gold))

def token_f1(pred: str, gold: str) -> float:
    pt = normalize_text(pred).split()
    gt = normalize_text(gold).split()
    common = set(pt) & set(gt)
    num_same = sum(min(pt.count(w), gt.count(w)) for w in common)
    if not pt or not gt:
        return float(pt == gt)
    if num_same == 0:
        return 0.0
    precision = num_same / len(pt)
    recall    = num_same / len(gt)
    return 2 * precision * recall / (precision + recall)

# def bleu_score(pred: str, gold: str) -> float:
#     # sentence_bleu expects list of references (each is list of tokens)
#     ref = normalize_text(gold).split()
#     hyp = normalize_text(pred).split()
#     if not hyp:
#         return 0.0
#     return sentence_bleu([ref], hyp, smoothing_function=smooth)

def rouge_l(preds, refs) -> float:
    # returns ROUGE-L fmeasure
    out = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    return float(out["rougeL"])


def content_tokens(s: str):
    toks = normalize_text(s).split()
    return [t for t in toks if (t not in stop_words) and (t.isalpha())]

def support_ratio(pred: str, contexts: list[str]) -> float:
    pred_ct = content_tokens(pred)
    if not pred_ct:
        return 1.0  # empty prediction = not hallucinated
    ctx = " ".join(contexts)
    ctx_set = set(content_tokens(ctx))
    supported = sum(1 for t in pred_ct if t in ctx_set)
    return supported / max(1, len(pred_ct))

def hallucinated(pred: str, contexts: list[str], thresh: float = 0.6) -> int:
    return int(support_ratio(pred, contexts) < thresh)

bertscore = evaluate.load("bertscore")

def bert_score(preds, refs, lang="en", model_type="bert-base-uncased"):
    """
    preds: list[str] of candidate answers
    refs:  list[str] of reference answers
    returns: dict with keys 'precision', 'recall', 'f1'
    """
    result = bertscore.compute(
        predictions=preds,
        references=refs,
        lang=lang,
        model_type=model_type
    )
    return result

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# --- Unified evaluator for Baselines & Full Model (fixed) ---

def evaluate_model(
    eval_df: pd.DataFrame,
    generation_fn,
    name: str,
    save_path: str = None
) -> Tuple[Dict, pd.DataFrame]:
    
    preds, refs = [], []
    em_list, f1_list = [], []
    supp_list, hall_list = [], []
    rows_out = []

    print(f"\n=== Evaluating {name} on {len(eval_df)} samples ===")

    for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc=f"Eval {name}"):
        q = str(row["question"])
        gold = str(row["answer"])

        # --- Generate ---
        try:
            out = generation_fn(q)
            if not isinstance(out, dict):
                raise TypeError(f"generator_fn returned {type(out)}, expected dict")
            pred = str(out.get("answer", "") or "").strip()
            ev_df = out.get("evidence", None)
        except Exception as e:
            print(f"[WARN] Generation failed for: {q[:60]}... ({e})")
            pred, ev_df = "", None

        # --- Context from retrieved evidence (if any) ---
        ctxs = []
        if ev_df is not None:
            try:
                if hasattr(ev_df, "empty") and not ev_df.empty and "answer" in ev_df.columns:
                    ctxs = ev_df["answer"].astype(str).tolist()
            except Exception:
                ctxs = []

        # --- Metrics ---
        preds.append(pred)
        refs.append(gold)
        try:
            em_list.append(exact_match(pred, gold))
        except Exception:
            em_list.append(0.0)
        try:
            f1_list.append(token_f1(pred, gold))
        except Exception:
            f1_list.append(0.0)
        # try:
        #     bleu_list.append(bleu_score(pred, gold))
        # except Exception:
        #     bleu_list.append(0.0)
        try:
            supp_list.append(support_ratio(pred, ctxs))
        except Exception:
            supp_list.append(0.0)
        try:
            hall_list.append(hallucinated(pred, ctxs, thresh=0.6))
        except Exception:
            hall_list.append(1.0)  # pessimistic default if check fails

        # --- Record per sample (safe evidence extraction) ---
        def _safe_ev(i):
            try:
                return ev_df.iloc[i]["answer"] if ev_df is not None and len(ev_df) > i and "answer" in ev_df.columns else ""
            except Exception:
                return ""

        rows_out.append({
            "question": q,
            "gold_answer": gold,
            "prediction": pred,
            "EM": em_list[-1],
            "F1": f1_list[-1],
            # "BLEU": bleu_list[-1],
            "SupportRatio": supp_list[-1],
            "Hallucinated": hall_list[-1],
            "evidence_1": _safe_ev(0),
            "evidence_2": _safe_ev(1),
            "evidence_3": _safe_ev(2),
        })

    # --- Aggregate Metrics ---
    try:
        rouge_l_f = rouge_l(preds, refs)
    except Exception:
        rouge_l_f = 0.0

    # BERTScore (skip if all preds empty to avoid noisy warnings)
    try:
        any_pred_text = any(bool(p.strip()) for p in preds)
        if any_pred_text:
            bert_res = bert_score(preds, refs)
            bert_p = float(np.mean(bert_res.get("precision", [0.0])))
            bert_r = float(np.mean(bert_res.get("recall", [0.0])))
            bert_f = float(np.mean(bert_res.get("f1", [0.0])))
        else:
            bert_p = bert_r = bert_f = 0.0
    except Exception:
        bert_p = bert_r = bert_f = 0.0

    results = {
        "N": len(eval_df),
        "ExactMatch": float(np.mean(em_list)) if em_list else 0.0,
        "TokenF1": float(np.mean(f1_list)) if f1_list else 0.0,
        # "BLEU": float(np.mean(bleu_list)) if bleu_list else 0.0,
        "ROUGE-L": float(rouge_l_f),
        "SupportRatio(avg)": float(np.mean(supp_list)) if supp_list else 0.0,
        "HallucinationRate(<0.6 support)": float(np.mean(hall_list)) if hall_list else 0.0,
        "BERTScore_P": bert_p,
        "BERTScore_R": bert_r,
        "BERTScore_F1": bert_f,
    }

    # --- Save to CSV ---
    details = pd.DataFrame(rows_out)
    if save_path:
        try:
            details.to_csv(save_path, index=False)
            print(f"Saved per-sample details to {save_path}")
        except Exception as e:
            print(f"[WARN] Failed to save CSV to {save_path}: {e}")

    print(f"\n=== {name} Summary ===")
    for k, v in results.items():
        print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

    return results, details


In [None]:
# MODELS

# --- Baseline 1 (fine-tuned BioGPT only) ---
gen_fn_baseline1 = lambda q: generate_no_rag(q, tok_full, gen_full)

# --- Baseline 2 (RAG + zero-shot BioGPT) ---
gen_fn_baseline2 = lambda q: generate_with_rag(q, tok_base, gen_base)

# --- Full Model (RAG + fine-tuned BioGPT) ---
gen_fn_full = lambda q: generate_with_rag(q, tok_full, gen_full)


In [None]:
baseline1_results = evaluate_model(val_df, gen_fn_baseline1, "Baseline 1 – Finetuned BioGPT", "baseline1_val.csv")


=== Evaluating Baseline 1 – Finetuned BioGPT on 1636 samples ===


Eval Baseline 1 – Finetuned BioGPT: 100%|██████████| 1636/1636 [42:49<00:00,  1.57s/it]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Saved per-sample details to baseline1_val.csv

=== Baseline 1 – Finetuned BioGPT Summary ===
N: 1636
ExactMatch: 0.0000
TokenF1: 0.3464
ROUGE-L: 0.2649
SupportRatio(avg): 0.0000
HallucinationRate(<0.6 support): 1.0000
BERTScore_P: 0.6587
BERTScore_R: 0.6090
BERTScore_F1: 0.6302


In [None]:
SAVE_DIR_RESULTS = "./results"
os.makedirs(SAVE_DIR_RESULTS, exist_ok=True)

summary_df = pd.DataFrame([
    {"Model": "Baseline 1 (Fine-tuned only)", **baseline1_results}
])

# Drop "N" column if exists
summary_df.drop(columns=["N"], inplace=True, errors="ignore")

# Save to Drive
csv_path = os.path.join(SAVE_DIR_RESULTS, "baseline1_model.csv")
summary_df.to_csv(csv_path, index=False)

print(f"Saved summary CSV to {csv_path}")


Saved summary CSV to /content/drive/MyDrive/DSA4213/results/baseline1_model.csv


Unnamed: 0,Model,ExactMatch,TokenF1,ROUGE-L,SupportRatio(avg),HallucinationRate(<0.6 support),BERTScore_P,BERTScore_R,BERTScore_F1
0,Baseline 1 (Fine-tuned only),0.0,0.3464,0.264903,0.0,1.0,0.658748,0.609021,0.630159


In [None]:
baseline2_results = evaluate_model(val_df, gen_fn_baseline2, "Baseline 2 – RAG + Base BioGPT", "baseline2_val.csv")


=== Evaluating Baseline 2 – RAG + Base BioGPT on 1636 samples ===


Eval Baseline 2 – RAG + Base BioGPT:   0%|          | 1/1636 [00:03<1:22:19,  3.02s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (852 > 512). Running this sequence through the model will result in indexing errors
Eval Baseline 2 – RAG + Base BioGPT: 100%|██████████| 1636/1636 [54:38<00:00,  2.00s/it]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Saved per-sample details to baseline2_val.csv

=== Baseline 2 – RAG + Base BioGPT Summary ===
N: 1636
ExactMatch: 0.0000
TokenF1: 0.2250
ROUGE-L: 0.1407
SupportRatio(avg): 0.7124
HallucinationRate(<0.6 support): 0.1950
BERTScore_P: 0.4969
BERTScore_R: 0.5545
BERTScore_F1: 0.5142


In [None]:
SAVE_DIR_RESULTS = "./results"
os.makedirs(SAVE_DIR_RESULTS, exist_ok=True)

metrics_dict, detailed_df = baseline2_results
summary_df2 = pd.DataFrame([
    {"Model": "Baseline 2 (RAG + Base BioGPT)", **metrics_dict}
])

summary_df2.drop(columns=["N"], inplace=True, errors="ignore")

csv_path2 = os.path.join(SAVE_DIR_RESULTS, "baseline2_model.csv")
summary_df2.to_csv(csv_path2, index=False)
print(f"Saved Baseline 2 summary CSV to {csv_path2}")

# Save full validation results
full_csv_path = os.path.join(SAVE_DIR_RESULTS, "baseline2_val_results.csv")
baseline2_details.to_csv(full_csv_path, index=False)
print(f"Saved Baseline 2 full validation CSV to {full_csv_path}")

Saved Baseline 2 summary CSV to /content/drive/MyDrive/DSA4213/results/baseline2_model.csv


In [None]:
fullmodel_results  = evaluate_model(val_df, gen_fn_full, "Full Model – RAG + Finetuaned BioGPT", "fullmodel_val.csv")


=== Evaluating Full Model – RAG + Finetuaned BioGPT on 1636 samples ===


Eval Full Model – RAG + Finetuaned BioGPT: 100%|██████████| 1636/1636 [1:17:56<00:00,  2.86s/it]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Saved per-sample details to fullmodel_val.csv

=== Full Model – RAG + Finetuaned BioGPT Summary ===
N: 1636
ExactMatch: 0.0000
TokenF1: 0.2843
ROUGE-L: 0.1844
SupportRatio(avg): 0.7421
HallucinationRate(<0.6 support): 0.1278
BERTScore_P: 0.5428
BERTScore_R: 0.6205
BERTScore_F1: 0.5731


In [None]:
SAVE_DIR_RESULTS = "./results"
os.makedirs(SAVE_DIR_RESULTS, exist_ok=True)

fullmodel_metrics, fullmodel_details = fullmodel_results

summary_df = pd.DataFrame([
    {"Model": "Full Model (RAG + Finetuned BioGPT)", **fullmodel_metrics}
])

# Save to Drive
csv_path = os.path.join(SAVE_DIR_RESULTS, "fullmodel_model.csv")
summary_df.to_csv(csv_path, index=False)

print(f"Saved summary CSV to {csv_path}")

details_path = os.path.join(SAVE_DIR_RESULTS, "fullmodel_val_results.csv")
fullmodel_details.to_csv(details_path, index=False)

print(f"Saved full validation results to: {details_path}")

In [None]:
# Unified Interactive Chat
# To test out the models before deployment

BANNER = """\nMedical Q&A Demo
Commands:
  /quit       -> exit
  /evidence   -> toggle showing evidence
  /mode X     -> switch mode: 1=Baseline1, 2=RAG-zero-shot, 3=Full RAG
"""

show_evidence = True
current_mode = 3

def ask_once(q: str, max_new_tokens: int = 180):
    if current_mode == 1:
        # Baseline 1: finetuned BioGPT (no retrieval)
        out = generate_no_rag(q, tok_base, gen_base, max_new_tokens=max_new_tokens)
    elif current_mode == 2:
        # Baseline 2: zero-shot BioGPT (RAG prompt)
        out = generate_with_rag(q, tok_base, gen_base, max_new_tokens=max_new_tokens, use_base_prompt=True)
    elif current_mode == 3:
        # Full RAG model (finetuned LoRA)
        out = generate_with_rag(q, tok_full, gen_full, max_new_tokens=max_new_tokens, use_base_prompt=False)
    else:
        raise ValueError(f"Invalid mode: {current_mode}")
    return out["answer"], out.get("evidence", pd.DataFrame())

def pretty_print_answer(ans: str):
    print("\n================= ANSWER =================")
    print(ans)
    print("=========================================\n")

def pretty_print_evidence(ev: pd.DataFrame, k: int = 5):
    if ev is None or ev.empty:
        print("(No evidence returned)\n")
        return
    print("------------- Evidence used -------------")
    for i, (_, row) in enumerate(ev.head(k).iterrows(), start=1):
        snip = str(row.get("answer","")).strip()
        src  = str(row.get("source","")).strip()
        url  = str(row.get("url","")).strip()
        score = row.get("ce_score", None)
        print(f"[{i}] {snip}")
        print(f"    Source: {src}")
        if url: print(f"    URL   : {url}")
        if score is not None: print(f"    ReRank Score: {score:.3f}")
    print("-----------------------------------------\n")

def chat_loop():
    global show_evidence, current_mode
    print(BANNER)
    while True:
        try:
            q = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nBye!")
            break

        if not q:
            continue
        if q.lower() in ("/quit", "/exit"):
            print("Bye!")
            break
        if q.lower() == "/evidence":
            show_evidence = not show_evidence
            print(f"(Evidence display {'ON' if show_evidence else 'OFF'})")
            continue
        if q.lower().startswith("/mode"):
            try:
                m = int(q.split()[1])
                if m in [1,2,3]:
                    current_mode = m
                    print(f"(Switched to mode {m})")
                else:
                    print("(Invalid mode. Choose 1, 2, or 3)")
            except Exception:
                print("(Usage: /mode 1|2|3)")
            continue

        try:
            ans, ev = ask_once(q, max_new_tokens=180)
        except Exception as e:
            print(f"(Error during generation: {e})")
            continue

        pretty_print_answer(ans)
        if show_evidence:
            pretty_print_evidence(ev, k=6)

# Launch the chat
chat_loop()


Medical Q&A Demo
Commands:
  /quit       -> exit
  /evidence   -> toggle showing evidence
  /mode X     -> switch mode: 1=Baseline1, 2=RAG-zero-shot, 3=Full RAG

You: what is high blood sugar

High blood sugar, also known as hyperglycemia, is a condition in which the blood sugar level is higher than normal. The condition is caused by a shortage of insulin, which is a hormone that helps the body control blood sugar levels. In people with type 2 diabetes, the insulin level is lower than normal, and the body is unable to use it effectively. The body cannot use insulin properly, which leads to hyperglycemia. This condition is called diabetes type 2.

------------- Evidence used -------------
[1] Diabetes is a complex group of diseases with a variety of causes. People with diabetes have high blood glucose, also called high blood sugar or hyperglycemia. Diabetes is a disorder of metabolism the way the body uses digested food for energy. The digestive tract breaks down carbohydratessugars an