# T5-Small Instruction Fine-Tuning for Clinical Queries

Fine-tune Hugging Face `t5-small` on the custom clinical QA dataset under `t5-small/data`.
The notebook also records zero-shot baselines, evaluates with ROUGE + BERTScore, and saves JSON predictions for later review.

In [None]:
!pip install -q transformers datasets evaluate accelerate bert-score rouge-score
print("Finished installing the project dependencies.")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Finished installing the project dependencies.


In [None]:
import json
import random
from pathlib import Path

import numpy as np
import torch
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import evaluate
from tqdm.auto import tqdm

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

BASE_DIR = Path('/content/drive/MyDrive/Colab/NLP/t5-small-project')
DATA_DIR = BASE_DIR / 'data'
OUTPUT_DIR = BASE_DIR / 'outputs'
PRED_DIR = OUTPUT_DIR / 'predictions'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
PRED_DIR.mkdir(parents=True, exist_ok=True)

MODEL_NAME = 't5-small'
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256
GEN_MAX_NEW_TOKENS = 160

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
print(f'Data directory: {DATA_DIR.resolve()}')

Using device: cuda
Data directory: /content/drive/MyDrive/Colab/NLP/t5-small-project/data


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json

def load_split(json_path):
    with open(json_path, 'r') as f:
        raw_data = json.load(f)

    records = []

    for qid, payload in raw_data.items():
        question = (payload.get('question') or '').strip()
        answers = payload.get('answers', {})

        for aid, answer_payload in answers.items():
            article = (answer_payload.get('article') or '').strip()
            summary = (answer_payload.get('answer_abs_summ') or '').strip()

            # skip empty rows
            if not article or not summary:
                continue

            prompt = (
                "Summarize the following medical article to answer the clinical question.\n"
                f"Question: {question}\n"
                f"Article: {article}"
            )

            records.append({
                'id': f'{qid}_{aid}',
                'question': question,
                'article': article,
                'prompt': prompt,
                'summary': summary
            })

    return records


In [None]:
train_records = load_split(DATA_DIR / 'train.json')
val_records = load_split(DATA_DIR / 'validation.json')
test_records = load_split(DATA_DIR / 'test.json')
print(f'Train/Val/Test sizes -> {len(train_records)} / {len(val_records)} / {len(test_records)}')

def peek(records, name):
    if not records:
        print(f'No records available for {name}.')
        return
    print(f"\n{name} sample prompt (first 300 chars):\n{records[0]['prompt'][:300]}...")
    print(f"{name} sample reference:\n{records[0]['summary']}\n")

peek(train_records, 'Train')

Train/Val/Test sizes -> 392 / 51 / 109

Train sample prompt (first 300 chars):
Summarize the following medical article to answer the clinical question.
Question: how much oxazepam could cause an overdose?
Article: Oxazepam overdose Benzodiazepine overdose Serax overdose Adumbran overdose Serenid Forte overdose Zapex overdose Novoxapam overdose Oxpam overdose Summary Oxazepam i...
Train sample reference:
Oxazepam is used to treat anxiety and symptoms of alcohol withdrawal. If you or some you are with overdoses, call your local emergency number, such as 911, or call your local poison center which can be reached at 1-800-222-1222.



In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "t5-small",
    use_fast=True,
    local_files_only=False
)
print("Tokenizer loaded successfully!")


hf_train = Dataset.from_list(train_records)
hf_val = Dataset.from_list(val_records)
hf_test = Dataset.from_list(test_records)

def tokenize_batch(batch):
    model_inputs = tokenizer(
        batch['prompt'],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding='max_length'
    )

    labels = tokenizer(
        batch['summary'],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding='max_length'
    )

    labels_ids = labels['input_ids']
    labels_ids = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels_ids
    ]

    model_inputs['labels'] = labels_ids
    model_inputs["decoder_attention_mask"] = labels["attention_mask"]
    return model_inputs


tokenized_train = hf_train.map(tokenize_batch, batched=True, remove_columns=hf_train.column_names, desc='Tokenizing train')
tokenized_val = hf_val.map(tokenize_batch, batched=True, remove_columns=hf_val.column_names, desc='Tokenizing val')
tokenized_test = hf_test.map(tokenize_batch, batched=True, remove_columns=hf_test.column_names, desc='Tokenizing test')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Tokenizer loaded successfully!


Tokenizing train:   0%|          | 0/392 [00:00<?, ? examples/s]

Tokenizing val:   0%|          | 0/51 [00:00<?, ? examples/s]

Tokenizing test:   0%|          | 0/109 [00:00<?, ? examples/s]

In [None]:
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')
print('Loaded ROUGE and BERTScore evaluators.')

def compute_text_metrics(preds, refs):
    rouge_result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    bert_result = bertscore.compute(predictions=preds, references=refs, lang='en')
    metrics = {f'rouge_{k}': round(v, 4) for k, v in rouge_result.items()}
    metrics['bertscore_f1'] = float(np.mean(bert_result['f1']))
    return metrics

def run_batch_generation(model, records, split_name, output_path, batch_size=4):
    model.eval()
    predictions = []
    iterator = range(0, len(records), batch_size)
    for start in tqdm(iterator, desc=f'Generating {split_name}', leave=False):
        batch = records[start:start + batch_size]
        if not batch:
            continue
        inputs = tokenizer(
            [row['prompt'] for row in batch],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding=True,
            return_tensors='pt'
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=GEN_MAX_NEW_TOKENS,
                num_beams=4,
                length_penalty=1.0,
            )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        for row, pred in zip(batch, decoded):
            predictions.append(
                {
                    'id': row['id'],
                    'question': row['question'],
                    'prediction': pred.strip(),
                    'reference': row['summary'],
                }
            )
    with open(output_path, 'w') as f:
        json.dump(predictions, f, indent=2)
    print(f'Saved {len(predictions)} predictions to {output_path}')
    return predictions

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Loaded ROUGE and BERTScore evaluators.


In [None]:
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
print('Running baseline generation on validation data...')
baseline_val_results = run_batch_generation(
    baseline_model,
    val_records,
    'baseline-val',
    PRED_DIR / 'baseline_val_predictions.json'
)
baseline_val_metrics = compute_text_metrics(
    [row['prediction'] for row in baseline_val_results],
    [row['reference'] for row in baseline_val_results]
)
print('Baseline validation metrics:', baseline_val_metrics)

print('Running baseline generation on test data...')
baseline_test_results = run_batch_generation(
    baseline_model,
    test_records,
    'baseline-test',
    PRED_DIR / 'baseline_test_predictions.json'
)
baseline_test_metrics = compute_text_metrics(
    [row['prediction'] for row in baseline_test_results],
    [row['reference'] for row in baseline_test_results]
)
print('Baseline test metrics:', baseline_test_metrics)

Running baseline generation on validation data...


Generating baseline-val:   0%|          | 0/13 [00:00<?, ?it/s]

Saved 51 predictions to /content/drive/MyDrive/Colab/NLP/t5-small-project/outputs/predictions/baseline_val_predictions.json


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline validation metrics: {'rouge_rouge1': np.float64(0.2011), 'rouge_rouge2': np.float64(0.0689), 'rouge_rougeL': np.float64(0.1415), 'rouge_rougeLsum': np.float64(0.1415), 'bertscore_f1': 0.8336615223510593}
Running baseline generation on test data...


Generating baseline-test:   0%|          | 0/28 [00:00<?, ?it/s]

Saved 109 predictions to /content/drive/MyDrive/Colab/NLP/t5-small-project/outputs/predictions/baseline_test_predictions.json
Baseline test metrics: {'rouge_rouge1': np.float64(0.1951), 'rouge_rouge2': np.float64(0.0547), 'rouge_rougeL': np.float64(0.1329), 'rouge_rougeLsum': np.float64(0.1325), 'bertscore_f1': 0.8337273586780654}


In [None]:
finetune_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

finetune_model.gradient_checkpointing_enable()

finetune_model.generation_config.max_new_tokens = GEN_MAX_NEW_TOKENS
finetune_model.generation_config.do_sample = False

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=finetune_model
)

training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR / 't5_small_finetune'),

    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,

    learning_rate=1e-4,

    num_train_epochs=5,

    eval_strategy='no',
    save_strategy='epoch',

    logging_steps=50,
    warmup_steps=50,

    load_best_model_at_end=False,

    fp16=torch.cuda.is_available(),

    label_smoothing_factor=0.1,

    report_to=[],
)

trainer = Trainer(
    model=finetune_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=None,
    data_collator=data_collator,
    compute_metrics=None
)

train_result = trainer.train()

trainer.save_model(OUTPUT_DIR / 't5_small_finetune')
tokenizer.save_pretrained(OUTPUT_DIR / 't5_small_finetune')

print("Training finished and saved.")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
50,4.7375
100,3.7037
150,3.554
200,3.4322


Training finished and saved.


In [None]:
def generate_summary(prompt):
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT_LENGTH
    ).to(device)

    output = finetune_model.generate(
        **inputs,
        max_new_tokens=GEN_MAX_NEW_TOKENS
    )

    return tokenizer.decode(output[0], skip_special_tokens=True).strip()


print("Running finetuned model generation on validation set...")

finetuned_val_preds = []
finetuned_val_refs = []

for item in val_records:
    pred = generate_summary(item["prompt"])
    finetuned_val_preds.append(pred)
    finetuned_val_refs.append(item["summary"])


val_save = []
for record, pred, ref in zip(val_records, finetuned_val_preds, finetuned_val_refs):
    val_save.append({
        "id": record["id"],
        "question": record["question"],
        "prediction": pred,
        "reference": ref
    })

with open(PRED_DIR / "finetuned_val_predictions.json", "w") as f:
    json.dump(val_save, f, indent=2)

print("Saved finetuned validation predictions.")


print("Running finetuned model generation on test set...")

finetuned_test_preds = []
finetuned_test_refs = []

for item in test_records:
    pred = generate_summary(item["prompt"])
    finetuned_test_preds.append(pred)
    finetuned_test_refs.append(item["summary"])

test_save = []
for record, pred, ref in zip(test_records, finetuned_test_preds, finetuned_test_refs):
    test_save.append({
        "id": record["id"],
        "question": record["question"],
        "prediction": pred,
        "reference": ref
    })

with open(PRED_DIR / "finetuned_test_predictions.json", "w") as f:
    json.dump(test_save, f, indent=2)

print("Saved finetuned test predictions.")

Caching is incompatible with gradient checkpointing in T5Block. Setting `past_key_values=None`.


Running finetuned model generation on validation set...
Saved finetuned validation predictions.
Running finetuned model generation on test set...
Saved finetuned test predictions.


In [None]:
rouge_val = rouge.compute(
    predictions=finetuned_val_preds,
    references=finetuned_val_refs
)

bert_val = bertscore.compute(
    predictions=finetuned_val_preds,
    references=finetuned_val_refs,
    model_type="bert-base-uncased",
    lang="en"
)

bert_val_f1 = float(np.mean(bert_val["f1"]))

print("\n===== FINETUNED VALIDATION METRICS =====")
print("ROUGE:", rouge_val)
print("BERTScore F1:", bert_val_f1)


rouge_test = rouge.compute(
    predictions=finetuned_test_preds,
    references=finetuned_test_refs
)

bert_test = bertscore.compute(
    predictions=finetuned_test_preds,
    references=finetuned_test_refs,
    model_type="bert-base-uncased",
    lang="en"
)

bert_test_f1 = float(np.mean(bert_test["f1"]))

print("\n===== FINETUNED TEST METRICS =====")
print("ROUGE:", rouge_test)
print("BERTScore F1:", bert_test_f1)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


===== FINETUNED VALIDATION METRICS =====
ROUGE: {'rouge1': np.float64(0.026929204144352646), 'rouge2': np.float64(0.0006033182503770738), 'rougeL': np.float64(0.027334965030827163), 'rougeLsum': np.float64(0.027383900560624397)}
BERTScore F1: 0.26054074571413155

===== FINETUNED TEST METRICS =====
ROUGE: {'rouge1': np.float64(0.0273203066729582), 'rouge2': np.float64(0.0016719168998941588), 'rougeL': np.float64(0.027012621915652187), 'rougeLsum': np.float64(0.027149554728786714)}
BERTScore F1: 0.2661833700236924


In [None]:
print(tokenizer)
print(tokenizer.decode([0, 1, 2, 3, 4]))


In [None]:
print(val_records[0]["prompt"][:500])
print(len(val_records[0]["prompt"]))


In [None]:
print(generate_summary(val_records[0]["prompt"]))


Di Di Di Di Di Digoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoegoego.


In [None]:
for i in range(10):
    print(len(tokenized_train[i]["labels"]), sum(tokenized_train[i]["labels"]))


In [None]:
def decode_sequences(sequences):
    decoded = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    return [text.strip() for text in decoded]

val_predictions = trainer.predict(tokenized_val)
val_decoded_preds = decode_sequences(val_predictions.predictions)
val_labels = np.where(val_predictions.label_ids == -100, tokenizer.pad_token_id, val_predictions.label_ids)
val_decoded_refs = decode_sequences(val_labels)
finetuned_val_metrics = compute_text_metrics(val_decoded_preds, val_decoded_refs)
print('Fine-tuned validation metrics:', finetuned_val_metrics)

finetuned_val_results = []
for record, pred, ref in zip(val_records, val_decoded_preds, val_decoded_refs):
    finetuned_val_results.append(
        {
            'id': record['id'],
            'question': record['question'],
            'prediction': pred,
            'reference': ref,
        }
    )
with open(PRED_DIR / 'finetuned_val_predictions.json', 'w') as f:
    json.dump(finetuned_val_results, f, indent=2)
print('Saved fine-tuned validation predictions.')

test_predictions = trainer.predict(tokenized_test)
test_decoded_preds = decode_sequences(test_predictions.predictions)
test_labels = np.where(test_predictions.label_ids == -100, tokenizer.pad_token_id, test_predictions.label_ids)
test_decoded_refs = decode_sequences(test_labels)
finetuned_test_metrics = compute_text_metrics(test_decoded_preds, test_decoded_refs)
print('Fine-tuned test metrics:', finetuned_test_metrics)

finetuned_test_results = []
for record, pred, ref in zip(test_records, test_decoded_preds, test_decoded_refs):
    finetuned_test_results.append(
        {
            'id': record['id'],
            'question': record['question'],
            'prediction': pred,
            'reference': ref,
        }
    )
with open(PRED_DIR / 'finetuned_test_predictions.json', 'w') as f:
    json.dump(finetuned_test_results, f, indent=2)
print('Saved fine-tuned test predictions.')

In [None]:
def summarize_split(name, baseline_metrics, finetuned_metrics):
    metric_keys = sorted(set(baseline_metrics) | set(finetuned_metrics))
    print(f"\n{name} metrics")
    print("Metric".ljust(20), "Baseline".ljust(12), "Finetuned".ljust(12), "Delta")
    print("-" * 60)
    for key in metric_keys:
        base_val = baseline_metrics.get(key)
        tune_val = finetuned_metrics.get(key)
        delta = None if (base_val is None or tune_val is None) else tune_val - base_val
        print(
            key.ljust(20),
            f"{base_val:.4f}".ljust(12) if base_val is not None else "--".ljust(12),
            f"{tune_val:.4f}".ljust(12) if tune_val is not None else "--".ljust(12),
            f"{delta:+.4f}" if delta is not None else "--",
        )

required_vars = [
    'baseline_val_metrics',
    'baseline_test_metrics',
    'finetuned_val_metrics',
    'finetuned_test_metrics',
]
missing = [var for var in required_vars if var not in globals()]
if missing:
    raise RuntimeError(
        "Please execute the baseline and fine-tuned evaluation cells before running this comparison block."
    )

summarize_split('Validation', baseline_val_metrics, finetuned_val_metrics)
summarize_split('Test', baseline_test_metrics, finetuned_test_metrics)
print("\nDone comparing baseline vs. fine-tuned performance.")
