In [1]:
import pandas as pd

df = pd.read_excel("rag_dataset.xlsx")  # or "data/rag_dataset.xlsx" if inside /data
print(df.shape)
print(df.columns.tolist())
df.head()


(46610, 5)
['question_id', 'question', 'answer', 'passage', 'passage_id']


Unnamed: 0,question_id,question,answer,passage,passage_id
0,0,Is hidradenitis suppurativa a systemic disease...,Control subjects were not validated for absenc...,Hidradenitis suppurativa (HS) is a chronic inf...,0
1,0,Is hidradenitis suppurativa a systemic disease...,Control subjects were not validated for absenc...,"In this retrospective case-control study, we c...",1
2,0,Is hidradenitis suppurativa a systemic disease...,Control subjects were not validated for absenc...,A total of 2292 patients at Massachusetts Gene...,2
3,1,Is admission hyperglycemia associated with fai...,"In patients with STEMI who undergo FT, admissi...",Hyperglycemia on admission is associated with ...,3
4,1,Is admission hyperglycemia associated with fai...,"In patients with STEMI who undergo FT, admissi...",This is a retrospective study of 304 STEMI pat...,4


In [2]:
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import torch

# prepare lists
passages = df["passage"].astype(str).tolist()
questions = df["question"].astype(str).tolist()
answers = df["answer"].astype(str).tolist()

# build BM25 retriever
bm25 = BM25Okapi([p.split() for p in passages])

# load generator (FLAN-T5 base)
tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
gen = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to("cuda" if torch.cuda.is_available() else "cpu")

def retrieve_bm25(q, k=3):
    scores = bm25.get_scores(q.split())
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return " ".join(passages[i] for i in top)

def generate_summary(q, ctx, max_new_tokens=120):
    prompt = f"Question: {q}\nContext: {ctx}\nGenerate a short, readable summary:"
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512).to(gen.device)
    out = gen.generate(**inputs, max_new_tokens=max_new_tokens)
    return tok.decode(out[0], skip_special_tokens=True).strip()


In [3]:
# Run on dataset(full)
full_df = df.copy()
preds = []

for q in tqdm(full_df["question"].tolist(), desc="Generating summaries (full dataset)"):
    ctx = retrieve_bm25(q, k=3)  # you can also test k=1 or k=5 later
    preds.append(generate_summary(q, ctx))

full_df["rag_summary"] = preds


Generating summaries (full dataset):   0%|          | 0/46610 [00:00<?, ?it/s]

In [5]:
from evaluate import load
import textstat, json

rouge = load("rouge")
berts = load("bertscore")

pred = full_df["rag_summary"].tolist()
gold = full_df["answer"].astype(str).tolist()

rouge_res = rouge.compute(predictions=pred, references=gold, use_stemmer=True)
bert_res = berts.compute(predictions=pred, references=gold, lang="en")

fk_scores = [textstat.flesch_kincaid_grade(p) for p in pred]

metrics = {
    "ROUGE-1": rouge_res["rouge1"],
    "ROUGE-2": rouge_res["rouge2"],
    "ROUGE-L": rouge_res["rougeL"],
    "BERTScore_P": sum(bert_res["precision"]) / len(bert_res["precision"]),
    "BERTScore_R": sum(bert_res["recall"]) / len(bert_res["recall"]),
    "BERTScore_F1": sum(bert_res["f1"]) / len(bert_res["f1"]),
    "FK_grade_avg": sum(fk_scores) / len(fk_scores)
}
metrics


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'ROUGE-1': 0.30131846146533636,
 'ROUGE-2': 0.12413897958779373,
 'ROUGE-L': 0.2383342919558197,
 'BERTScore_P': 0.8833940183776204,
 'BERTScore_R': 0.8660610931609174,
 'BERTScore_F1': 0.8744216979560205,
 'FK_grade_avg': 16.343973060912198}

In [6]:
rows_to_check = [500, 1000, 2000]
for r in rows_to_check:
    if r < len(df):
        print(f"\n===== Row {r} =====")
        print("Question:", df.loc[r, "question"])
        print("\nTrue answer:", df.loc[r, "answer"])
        ctx = retrieve_bm25(df.loc[r, "question"], k=3)
        pred = generate_summary(df.loc[r, "question"], ctx)
        print("\nRAG summary:", pred)
        print("FK grade:", textstat.flesch_kincaid_grade(pred))



===== Row 500 =====
Question: Does pluronic L-81 ameliorate diabetic symptoms in db/db mice through transcriptional regulation of microsomal triglyceride transfer protein?

True answer: L-81 represents a promising candidate in the development of a selective insulin-mimetic molecule and an anti-diabetic agent.

RAG summary: L-81 ameliorates diabetic symptoms in mice with diabetes by inhibiting microsomal triglyceride transfer protein gene inactivation.
FK grade: 20.150000000000002

===== Row 1000 =====
Question: Does high plasma VEGF relate to low carbohydrate intake in patients with type 2 diabetes?

True answer: We conclude that high plasma VEGF concentrations are associated with less carbohydrate intake and lower body mass in type 2 diabetes. The role VEGF plays in facilitating glucose access to the brain represents a new aspect of food intake regulation and energy homeostasis, with relevance for diseases with body mass disturbances.

RAG summary: Plasma VEGF levels correlate with l

In [7]:
full_df["FK_grade"] = [textstat.flesch_kincaid_grade(p) for p in full_df["rag_summary"]]

# Save final metrics and outputs
import os, json
os.makedirs("results", exist_ok=True)

full_df.to_csv("results/rag_Summ_full_outputs.csv", index=False)
with open("results/rag_Summ_full_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

!ls -lh results


total 42M
-rw-rw-r-- 1 ubuntu ubuntu 253 Nov  6 03:40 rag_Summ_full_metrics.json
-rw-rw-r-- 1 ubuntu ubuntu 42M Nov  6 03:40 rag_Summ_full_outputs.csv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
!git status

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


On branch yunxiu-branch
Your branch is up to date with 'origin/yunxiu-branch'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mRAG.ipynb[m
	[31mresults/[m

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
# --- Extract and save required passage subset ---
import pandas as pd, os

df_full = pd.read_csv("results/rag_Summ_full_outputs.csv")

# Target passages
target_passages = [16771, 12220, 29568]
subset_df = df_full[df_full["passage_id"].isin(target_passages)].copy()

os.makedirs("results", exist_ok=True)
subset_df.to_csv("results/rag_Summ_required_passages.csv", index=False)

print("Saved required passage outputs to results/rag_Summ_required_passages.csv")
display(subset_df[["passage_id", "question", "rag_summary", "FK_grade"]])


Saved required passage outputs to results/rag_Summ_required_passages.csv


Unnamed: 0,passage_id,question,rag_summary,FK_grade
12220,12220,Is parathyroid hormone associated with biomark...,Vitamin D deficiency in childhood obesity coin...,20.94
16771,16771,Does iGF-2 mediate intestinal mucosal hyperpla...,IGF2 mediates intestinal mucosal hyperplasia i...,25.942222
29568,29568,Is insulin-like growth factor binding protein-...,IGFBP2 levels are elevated in blood of lung ca...,12.0375
