# Set up

In [3]:
import argparse
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import evaluate
import textstat
from textstat import flesch_kincaid_grade
from scipy.stats import bootstrap
from sklearn.metrics import f1_score, accuracy_score
from rouge_score import rouge_scorer
import bert_score
from bert_score import score, plot_example
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Read files
qa_val = pd.read_excel("qa_validation.xlsx")
qa_test = pd.read_excel("qa_test.xlsx")

summ_val = pd.read_excel("summarization_validation.xlsx")
summ_test = pd.read_excel("summarization_test.xlsx")

# Zero shot

In [18]:
# Load model
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")


# Build Prompt Function
def build_prompt(row, task="qa", prompt_type="plain"):
    qa_prompts = {
        "plain": (
            "You are a medical research assistant.\n"
            "Answer the question below with only 'Yes' or 'No'.\n"
            "Do not include explanations or extra words.\n\n"
            f"Question: {row['question']}\n\nAnswer (Yes or No):"
        ),
        "cite_source": (
            "You are a medical research assistant.\n"
            "Answer the question below while citing possible biomedical sources, answer with only 'Yes' or 'No'.\n"
            "Do not include explanations or extra words.\n\n"
            f"Question: {row['question']}\n\nAnswer (Yes or No):"
        ),
        "context": (
            "You are a medical research assistant.\n"
            "Based on the context given, answer the question below with only 'Yes' or 'No'.\n"
            "Do not include explanations or extra words.\n\n"
            f"Context: {row['context']}\n\nQuestion: {row['question']}\n\nAnswer (Yes or No):"
        )
    }

    summarisation_prompts = {
        "plain": (
            "You are a medical research assistant.\n"
            "Summarise the biomedical text below concisely:\n\n"
            f"{row['context']}"
        ),
        "cite_source": (
            "You are a medical research assistant.\n"
            "Summarise the biomedical text below concisely while citing possible biomedical sources:\n\n"
            f"{row['context']}"
        )
    }

    if task == "qa":
        return qa_prompts.get(prompt_type, qa_prompts["plain"])
    elif task == "summarisation":
        return summarisation_prompts.get(prompt_type, summarisation_prompts["plain"])
    else:
        raise ValueError(f"Unknown task type: {task}")


# Generate Outputs Function
def generate_outputs(df, task="qa", prompt_type="plain",
                     max_input_len=512, max_output_len=256, num_beams=4, temperature=0.0):
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        prompt = build_prompt(row, task, prompt_type)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_len).to(model.device)

        with torch.no_grad():
            if task == "qa":
                output_tokens = model.generate(
                    **inputs,
                    max_new_tokens=5, # short yes/no
                    num_beams=num_beams,
                    do_sample=True, # allow stochasticity
                    temperature=0.7, # some randomness to avoid collapsing
                )
            elif task == "summarisation":
                output_tokens = model.generate(
                    **inputs,
                    max_new_tokens=max_output_len,
                    num_beams=num_beams,
                    do_sample=False,
                    temperature=temperature
                )

        generated = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        results.append(generated.strip())

    return results


def add_preds_to_df(df, all_preds):
    df_copy = df.copy()
    for prompt_type, preds in all_preds.items():
        df_copy[f'pred_{prompt_type}'] = preds
    return df_copy

In [19]:
# QA
qa_results = {
    prompt_type: generate_outputs(qa_test, task="qa", prompt_type=prompt_type)
    for prompt_type in ["plain", "cite_source", "context"]
}

qa_test_with_preds = add_preds_to_df(qa_test, qa_results)

# Summarisation
summ_results = {
    prompt_type: generate_outputs(qa_test, task="summarisation", prompt_type=prompt_type)
    for prompt_type in ["plain", "cite_source"]
}

summ_test_with_preds = add_preds_to_df(summ_test, summ_results)

100%|██████████| 2061/2061 [02:54<00:00, 11.80it/s]
100%|██████████| 2061/2061 [02:10<00:00, 15.84it/s]
100%|██████████| 2061/2061 [02:25<00:00, 14.17it/s]
100%|██████████| 2061/2061 [25:05<00:00,  1.37it/s]
100%|██████████| 2061/2061 [27:55<00:00,  1.23it/s]


# Evaluation

In [20]:
# Decision-Based QA Metrics (Yes/No)
def compute_qa_decision_metrics(preds, golds):
    results = []
    for p, g in zip(preds, golds):
        p_clean, g_clean = str(p).lower().strip(), str(g).lower().strip()
        em = int(p_clean == g_clean)
        f1 = f1_score([g_clean], [p_clean], average='macro', labels=['yes','no'], zero_division=0)
        results.append({"Exact Match": em, "F1_macro": f1})
    return results


# Summarisation Metrics
def compute_text_generation_metrics(preds, golds):
    # Include all 3 ROUGE variants
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    smoothie = SmoothingFunction().method1

    # Pre-compute BERTScore
    P, R, F1 = bert_score.score(preds, golds, lang='en', rescale_with_baseline=True)
    bert_f1s = F1.tolist()

    results = []
    for p, g, bf1 in zip(preds, golds, bert_f1s):
        p, g = str(p).strip(), str(g).strip()
        if not p or not g:
            results.append({
                "ROUGE-1": 0, "ROUGE-2": 0, "ROUGE-L": 0,
                "BLEU": 0, "BERTScore_F1": 0
            })
            continue

        rouge_scores = scorer.score(g, p)
        rouge1 = rouge_scores['rouge1'].fmeasure
        rouge2 = rouge_scores['rouge2'].fmeasure
        rougeL = rouge_scores['rougeL'].fmeasure
        bleu = sentence_bleu([g.split()], p.split(), smoothing_function=smoothie)

        results.append({
            "ROUGE-1": rouge1,
            "ROUGE-2": rouge2,
            "ROUGE-L": rougeL,
            "BLEU": bleu,
            "BERTScore_F1": bf1
        })
    return results


# Readability (Flesch-Kincaid)
def compute_readability(preds):
    results = []
    for p in preds:
        p = str(p).strip()
        grade = textstat.flesch_kincaid_grade(p) if p else None
        results.append({"Flesch-Kincaid Grade": grade})
    return results


# Combined Evaluation Wrapper
def evaluate_all_prompts(task, df, all_preds):
    df_combined = df.copy()
    all_averages = {}

    if task == "qa":
        for pt, preds in all_preds.items():
            col_pred = f"pred_{pt}"
            df_combined[col_pred] = preds

            golds = df['final_decision'].tolist()
            row_metrics = compute_qa_decision_metrics(preds, golds)

            # Convert metrics to DataFrame and add suffix
            metrics_df = pd.DataFrame(row_metrics).add_suffix(f"_{pt}")
            df_combined = pd.concat([df_combined, metrics_df], axis=1)

            # Compute dataset-level averages
            avg_metrics = {k: np.nanmean([m[k] for m in row_metrics if m[k] is not None])
                           for k in row_metrics[0].keys()}
            all_averages[pt] = avg_metrics

    elif task == "summarisation":
        for pt, preds in all_preds.items():
            col_pred = f"pred_{pt}"
            df_combined[col_pred] = preds

            golds = df['summary'].tolist()
            gen_metrics = compute_text_generation_metrics(preds, golds)
            read_metrics = compute_readability(preds)

            row_metrics = [{**g, **r} for g, r in zip(gen_metrics, read_metrics)]

            # Convert metrics to DataFrame and add suffix
            metrics_df = pd.DataFrame(row_metrics).add_suffix(f"_{pt}")
            df_combined = pd.concat([df_combined, metrics_df], axis=1)

            # Compute dataset-level averages
            avg_metrics = {k: np.nanmean([m[k] for m in row_metrics if m[k] is not None])
                           for k in row_metrics[0].keys()}
            all_averages[pt] = avg_metrics

    else:
        raise ValueError("Invalid task. Choose 'qa' or 'summarisation'")

    return df_combined, all_averages


In [21]:
# For QA
qa_metrics, qa_average = evaluate_all_prompts(task="qa", df=qa_test, all_preds=qa_results)

# For summarisation
summ_metrics, summ_average = evaluate_all_prompts(task="summarisation", df=summ_test, all_preds=summ_results)

print("QA:")
print(qa_average)

print("\nSummarisation:")
print(summ_average)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


QA:
{'plain': {'Exact Match': np.float64(0.07520621057738962), 'F1_macro': np.float64(0.03760310528869481)}, 'cite_source': {'Exact Match': np.float64(0.07520621057738962), 'F1_macro': np.float64(0.03760310528869481)}, 'context': {'Exact Match': np.float64(0.07617661329451722), 'F1_macro': np.float64(0.03808830664725861)}}

Summarisation:
{'plain': {'ROUGE-1': np.float64(0.2916620149189143), 'ROUGE-2': np.float64(0.11484886584989305), 'ROUGE-L': np.float64(0.22580086254348133), 'BLEU': np.float64(0.03569938182084287), 'BERTScore_F1': np.float64(0.21141968472655367), 'Flesch-Kincaid Grade': np.float64(16.376579970744572)}, 'cite_source': {'ROUGE-1': np.float64(0.2921941730546203), 'ROUGE-2': np.float64(0.11450630995568949), 'ROUGE-L': np.float64(0.22578845253688423), 'BLEU': np.float64(0.03588950475081277), 'BERTScore_F1': np.float64(0.20567100878938824), 'Flesch-Kincaid Grade': np.float64(16.41293031936518)}}


# Few Shot

In [11]:
def build_few_shot_qa_prompt(row, qa_val, n_examples=2):
    examples = qa_val.sample(n=n_examples, random_state=int(row.name))  # random 3 examples
    prompt = "You are a medical research assistant.\nAnswer the questions below with only 'Yes' or 'No'.\nDo not include explanations or extra words.\n\n"

    for i, ex in enumerate(examples.itertuples(), start=1):
        prompt += f"Context: {ex.context}\nQuestion: {ex.question}\nAnswer: {ex.final_decision}\n\n"

    # Add target question
    prompt += f"Context: {row['context']}\nQuestion: {row['question']}\nAnswer:"
    return prompt


def build_few_shot_summ_prompt(row, summ_val, n_examples=2):
    examples = summ_val.sample(n=n_examples, random_state=int(row.name))  # random 3 examples
    prompt = "You are a medical research assistant.\nSummarise the biomedical texts:\n\n"

    for i, ex in enumerate(examples.itertuples(), start=1):
        prompt += f"Text: {ex.context}\nSummary: {ex.summary}\n\n"

    # Add target question
    #prompt += f"Now, summarise the following biomedical text concisely:\n"
    prompt += f"Text: {row['context']}\nSummary:"
    return prompt


def generate_outputs_fewshot(df, task="qa", max_input_len=1024, max_output_len=128,
                     num_beams=4, temperature=0.0):
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Build the prompt
        if task == "qa":
            prompt = build_few_shot_qa_prompt(row, qa_val)
        elif task == "summarisation":
            prompt = build_few_shot_summ_prompt(row, summ_val)
        else:
            raise ValueError("task must be 'qa' or 'summarisation'")

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_len).to(model.device)

        # Generate
        with torch.no_grad():
            output_tokens = model.generate(
                **inputs,
                max_new_tokens=max_output_len,
                num_beams=num_beams if task=="summarisation" else 1, #greedy for qa
                do_sample=(task=="summarisation"),          # sampling for summarisation only
                temperature=0.3 if task=="summarisation" else temperature
            )

        # Decode and store result
        generated = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        results.append(generated.strip())
    return results

In [13]:
# Generate few-shot outputs for QA
qa_few_shot_results = generate_outputs_fewshot(qa_test, task="qa")

# Generate few-shot outputs for Summarisation
summ_few_shot_results = generate_outputs_fewshot(summ_test, task="summarisation")


100%|██████████| 2061/2061 [02:15<00:00, 15.26it/s]
100%|██████████| 2061/2061 [25:32<00:00,  1.35it/s]


In [14]:
# Add few-shot predictions to the test dataframes
qa_test_with_few_shot_preds = add_preds_to_df(qa_test, {"few_shot": qa_few_shot_results})

summ_test_with_few_shot_preds = add_preds_to_df(summ_test, {"few_shot": summ_few_shot_results})

In [16]:
# For QA
qa_few_shot_metrics, qa_few_shot_average = evaluate_all_prompts(task="qa", df=qa_test, all_preds={"few_shot": qa_few_shot_results})

# For summarisation
summ_few_shot_metrics, summ_few_shot_average = evaluate_all_prompts(task="summarisation", df=summ_test, all_preds={"few_shot": summ_few_shot_results})

print("QA:")
print(qa_few_shot_average)

print("\nSummarisation:")
print(summ_few_shot_average)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


QA:
{'few_shot': {'Exact Match': np.float64(0.09073265405143134), 'F1_macro': np.float64(0.04536632702571567)}}

Summarisation:
{'few_shot': {'ROUGE-1': np.float64(0.2550663889283831), 'ROUGE-2': np.float64(0.09196817559485894), 'ROUGE-L': np.float64(0.1979929710463128), 'BLEU': np.float64(0.030386339533637285), 'BERTScore_F1': np.float64(0.18712245507883937), 'Flesch-Kincaid Grade': np.float64(17.069855971767698)}}
