In [1]:
import pandas as pd

# Load the summarization outputs you generated earlier
df = pd.read_csv("results/rag_Summ_full_outputs.csv")  # or update path if different
print(df.shape)
df.head()


(46610, 7)


Unnamed: 0,question_id,question,answer,passage,passage_id,rag_summary,FK_grade
0,0,Is hidradenitis suppurativa a systemic disease...,Control subjects were not validated for absenc...,Hidradenitis suppurativa (HS) is a chronic inf...,0,Is glucagon secretion different after oral and...,16.63
1,0,Is hidradenitis suppurativa a systemic disease...,Control subjects were not validated for absenc...,"In this retrospective case-control study, we c...",1,Is glucagon secretion different after oral and...,16.63
2,0,Is hidradenitis suppurativa a systemic disease...,Control subjects were not validated for absenc...,A total of 2292 patients at Massachusetts Gene...,2,Is glucagon secretion different after oral and...,16.63
3,1,Is admission hyperglycemia associated with fai...,"In patients with STEMI who undergo FT, admissi...",Hyperglycemia on admission is associated with ...,3,Hyperglycemia on admission is associated with ...,24.658519
4,1,Is admission hyperglycemia associated with fai...,"In patients with STEMI who undergo FT, admissi...",This is a retrospective study of 304 STEMI pat...,4,Hyperglycemia on admission is associated with ...,24.658519


In [2]:
from sentence_transformers import SentenceTransformer, util

# Load your embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode summaries as retrievable context
embeddings = model.encode(df["rag_summary"].tolist(), convert_to_tensor=True)


In [5]:
from transformers import pipeline

# Load the FLAN-T5 model for question answering or generation
llm = pipeline("text2text-generation", model="google/flan-t5-large", device=0)  # or flan-t5-base if GPU memory limited

def retrieve_context(query, k=3):
    query_emb = model.encode(query, convert_to_tensor=True)
    scores = util.cos_sim(query_emb, embeddings)[0]
    topk_idx = scores.topk(k).indices.cpu().numpy()
    return [df["rag_summary"].iloc[i] for i in topk_idx]

def answer_query(query):
    context = retrieve_context(query, k=3)
    context_text = " ".join(context)
    prompt = f"Answer the question using the context below:\n\nContext: {context_text}\n\nQuestion: {query}"
    return llm(prompt)


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
query = "What are the main effects of diabetes on insulin resistance?"
print(answer_query(query))


In [12]:
from transformers import pipeline

qa_llm = pipeline("text2text-generation", model="google/flan-t5-base", device=0)

from tqdm.auto import tqdm

qa_preds = []
for q, ctx in tqdm(zip(df["question"], df["rag_summary"]), 
                   total=len(df), 
                   desc="Generating QA answers (full dataset)"):
    prompt = f"Answer the question using the context below:\n\nContext: {ctx}\n\nQuestion: {q}"
    answer = qa_llm(prompt, max_new_tokens=128)[0]["generated_text"]
    qa_preds.append(answer)

df["qa_pred"] = qa_preds



Device set to use cuda:0


Generating QA answers (full dataset):   0%|          | 0/46610 [00:00<?, ?it/s]

In [16]:
import os, json
os.makedirs("results", exist_ok=True)

df.to_csv("results/rag_QA_full_outputs.csv", index=False)
with open("results/rag_QA_full_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

!ls -lh results


total 84M
-rw-rw-r-- 1 ubuntu ubuntu   57 Nov  6 09:19 rag_QA_full_metrics.json
-rw-rw-r-- 1 ubuntu ubuntu  42M Nov  6 09:19 rag_QA_full_outputs.csv
-rw-rw-r-- 1 ubuntu ubuntu  253 Nov  6 03:40 rag_Summ_full_metrics.json
-rw-rw-r-- 1 ubuntu ubuntu  42M Nov  6 03:40 rag_Summ_full_outputs.csv
-rw-rw-r-- 1 ubuntu ubuntu 3.0K Nov  6 06:31 rag_Summ_required_passages.csv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
import pandas as pd
from evaluate import load

df = pd.read_csv("results/rag_QA_full_outputs.csv")
squad_metric = load("squad")

# Compute all at once instead of per-row
preds = [{"id": str(i), "prediction_text": str(p)} for i, p in enumerate(df["qa_pred"].astype(str))]
refs  = [{"id": str(i), "answers": {"text": [str(a)], "answer_start": [0]}} for i, a in enumerate(df["answer"].astype(str))]

results = squad_metric.compute(predictions=preds, references=refs)

df["EM"] = [int(str(p).strip().lower() == str(a).strip().lower()) for p, a in zip(df["qa_pred"], df["answer"])]

from difflib import SequenceMatcher
def token_f1(pred, ref):
    p_tokens, r_tokens = pred.lower().split(), ref.lower().split()
    overlap = len(set(p_tokens) & set(r_tokens))
    if not overlap: return 0
    precision, recall = overlap / len(p_tokens), overlap / len(r_tokens)
    return 2 * precision * recall / (precision + recall)

df["F1"] = [token_f1(p, a) for p, a in zip(df["qa_pred"], df["answer"])]

df.to_csv("results/rag_QA_full_new_outputs.csv", index=False)

print("Average EM:", df["EM"].mean())
print("Average F1:", df["F1"].mean())
print("SQuAD (official):", results)


Average EM: 0.0
Average F1: 0.019840641461379302
SQuAD (official): {'exact_match': 0.0, 'f1': 2.258700305494714}


In [18]:
for qid in [4953, 10863, 8722]:
    row = df.loc[df["question_id"] == qid].iloc[0]
    res = squad_metric.compute(
        predictions=[{"id": str(qid), "prediction_text": row["qa_pred"]}],
        references=[{"id": str(qid), "answers": {"text": [row["answer"]], "answer_start": [0]}}]
    )
    print(f"\nQuestion ID {qid}")
    print("Question:", row["question"])
    print("Reference Answer:", row["answer"])
    print("Predicted Answer:", row["qa_pred"])
    print(f"→ EM: {res['exact_match']:.2f}, F1: {res['f1']:.2f}")



Question ID 4953
Question: Does iGF-2 mediate intestinal mucosal hyperplasia in retinoblastoma protein ( Rb ) -deficient mice?
Reference Answer: IGF-2 expression is significantly elevated in villus enterocytes and is required for the hyperplastic intestinal mucosal phenotype of Rb-IKO mice. The trophic effects of IGF2 require intact IGF1R signaling within the intestinal epithelium. These findings reveal novel regulatory roles for Rb in expanding intestinal mucosal surface area.
Predicted Answer: yes
→ EM: 0.00, F1: 0.00

Question ID 10863
Question: Is parathyroid hormone associated with incident diabetes in white, but not black adults?
Reference Answer: In this large, population-based study, elevated PTH was independently associated with risk for diabetes among white, but not black adults. Further studies are needed to elucidate the mechanisms that may underlie this differential association of PTH with diabetes across race groups.
Predicted Answer: yes
→ EM: 0.00, F1: 0.00

Question I

In [22]:
for r in [4953, 10863, 8722]:
    if r < len(df):
        row = df.iloc[r]
        print(f"\nRow {r}")
        print("Question:", row["question"])
        print("Answer:", row["answer"])
        print("Predicted:", row["qa_pred"])
        print(f"EM={row['EM']:.2f}, F1={row['F1']:.2f}")



Row 4953
Question: Are dietary patterns associated with various vascular health markers and complications in type 1 diabetes?
Answer: Closer adherence to the dietary recommendations, and a diet high in fresh vegetables, fruits and berries, cooked vegetables, fish dishes, and yoghurt may be beneficial for the glycaemic control in type 1 diabetes. Moreover, a diet pattern with fish and eggs may have beneficial effects for blood pressure.
Predicted: yes
EM=0.00, F1=0.00

Row 10863
Question: Is up-regulation of the complement system in subcutaneous adipocytes from nonobese, hypertriglyceridemic subjects associated with adipocyte insulin resistance?
Answer: These findings point to an up-regulation of a complement-related transcriptome in sc adipocytes under metabolically stressed conditions, even in the absence of overt obesity. Such up-regulation may subsequently influence downstream processes, including macrophage infiltration into adipose tissue and adipocyte insulin resistance.
Predict