# Long-T5 Clinical Query Summarization with LoRA Fine-tuning

This notebook fine-tunes the Long-T5 model for medical article summarization using LoRA (Low-Rank Adaptation) for parameter-efficient training.

## 1. Install Dependencies

In [4]:
!pip install -q torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118

!pip install -q "numpy<2"

!pip install -q transformers==4.36.0 datasets evaluate accelerate==0.25.0 peft==0.7.1
!pip install -q bert-score rouge-score sentencepiece

print("All packages installed!")
print("Note: Restart the kernel after installation if you haven't already.")

All packages installed!
Note: Restart the kernel after installation if you haven't already.


In [1]:
import torch
print("=" * 50)
print("PyTorch & CUDA Verification")
print("=" * 50)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    capability = torch.cuda.get_device_capability(0)
    print(f"Compute capability: {capability[0]}.{capability[1]}")
    
    if capability[0] >= 6:
        print("GPU is compatible with this notebook")
    else:
        print("GPU may have limited compatibility")
    
    try:
        x = torch.randn(100, 100).cuda()
        y = torch.matmul(x, x)
        print("CUDA matrix operations working")
    except Exception as e:
        print(f"CUDA error: {e}")
else:
    print("Running on CPU - training will be slow")
print("=" * 50)

PyTorch & CUDA Verification
PyTorch version: 2.2.0+cu118
CUDA available: True
CUDA version: 11.8
GPU: Tesla P100-PCIE-16GB
Compute capability: 6.0
GPU is compatible with this notebook
CUDA matrix operations working
CUDA matrix operations working


## 2. Import Libraries and Check GPU

In [6]:
import json
import os
import numpy as np
from pathlib import Path

import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import LoraConfig, TaskType, get_peft_model
import evaluate

print("=" * 50)
print("GPU Information")
print("=" * 50)
if torch.cuda.is_available():
    print(f"CUDA available: True")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Compute capability: {torch.cuda.get_device_capability(0)}")
    device = torch.device("cuda")
    
    try:
        test_tensor = torch.tensor([1.0, 2.0, 3.0]).cuda()
        _ = test_tensor * 2
        print("CUDA test: PASSED")
    except Exception as e:
        print(f"CUDA test: FAILED - {e}")
        print("Falling back to CPU")
        device = torch.device("cpu")
else:
    print("CUDA available: False - using CPU")
    device = torch.device("cpu")
print("=" * 50)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


GPU Information
CUDA available: True
PyTorch version: 2.2.0+cu118
CUDA version: 11.8
GPU device: Tesla P100-PCIE-16GB
GPU memory: 17.06 GB
Compute capability: (6, 0)
CUDA test: PASSED


## 3. Configuration

In [7]:
MODEL_NAME = "google/long-t5-tglobal-base"

BASE_DIR = Path("/home/cc/clinical-query-summarization/long-t5")
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / "outputs"

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

MAX_INPUT_LENGTH = 1536
MAX_TARGET_LENGTH = 256
GEN_MAX_NEW_TOKENS = 200

BATCH_SIZE = 2  
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 1e-4  
NUM_EPOCHS = 5  
WARMUP_RATIO = 0.1

LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.1

EARLY_STOPPING_PATIENCE = 3
EVAL_STEPS = 20

print("Configuration loaded!")
print(f"Model: {MODEL_NAME}")
print(f"Max input length: {MAX_INPUT_LENGTH}")
print(f"Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
print(f"LoRA rank: {LORA_R}, alpha: {LORA_ALPHA}")
print(f"Note: Settings optimized for P100 (16GB VRAM)")

Configuration loaded!
Model: google/long-t5-tglobal-base
Max input length: 1536
Batch size: 2 (effective: 8)
LoRA rank: 16, alpha: 32
Note: Settings optimized for P100 (16GB VRAM)


## 4. Load and Prepare Data

In [9]:
def load_data_from_json(json_path):
    # Input: article text + question
    # Target: answer_abs_summ ----> abstractive summary
    print(f"Loading data from {json_path}...")
    
    with open(json_path, 'r') as f:
        raw_data = json.load(f)
    
    records = []
    for qid, payload in raw_data.items():
        question = (payload.get('question') or '').strip()
        answers = payload.get('answers', {})
        
        for aid, answer_payload in answers.items():
            article = (answer_payload.get('article') or '').strip()
            summary = (answer_payload.get('answer_abs_summ') or '').strip()
            
            if not article or not summary:
                continue
            
            prompt = (
                "Summarize the following medical article to answer the clinical question.\n"
                f"Question: {question}\n"
                f"Article: {article}"
            )
            
            records.append({
                'id': f'{qid}_{aid}',
                'question': question,
                'article': article,
                'prompt': prompt,
                'summary': summary,
            })
    
    print(f"  Found {len(records)} samples")
    return records


print("Loading datasets...")
print("-" * 40)
train_records = load_data_from_json(DATA_DIR / "train.json")
val_records = load_data_from_json(DATA_DIR / "validation.json")
test_records = load_data_from_json(DATA_DIR / "test.json")
print("-" * 40)
print(f"Total: {len(train_records)} train, {len(val_records)} val, {len(test_records)} test")

Loading datasets...
----------------------------------------
Loading data from /home/cc/clinical-query-summarization/long-t5/data/train.json...
  Found 392 samples
Loading data from /home/cc/clinical-query-summarization/long-t5/data/validation.json...
  Found 51 samples
Loading data from /home/cc/clinical-query-summarization/long-t5/data/test.json...
  Found 109 samples
----------------------------------------
Total: 392 train, 51 val, 109 test


In [10]:
print("Sample training example:")
print("=" * 60)
sample = train_records[0]
print(f"ID: {sample['id']}")
print(f"Question: {sample['question'][:100]}...")
print(f"Article length: {len(sample['article'])} chars")
print(f"Summary: {sample['summary'][:200]}...")
print("=" * 60)

Sample training example:
ID: 133_133_Answer2
Question: how much oxazepam could cause an overdose?...
Article length: 3216 chars
Summary: Oxazepam is used to treat anxiety and symptoms of alcohol withdrawal. If you or some you are with overdoses, call your local emergency number, such as 911, or call your local poison center which can b...


## 5. Load Model and Tokenizer

In [11]:
print(f"Loading tokenizer from {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded! Vocab size: {tokenizer.vocab_size}")

print(f"\nLoading base model from {MODEL_NAME}...")
print("This may take a minute...")
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
print(f"Model loaded!")
print(f"Total parameters: {sum(p.numel() for p in base_model.parameters()):,}")

Loading tokenizer from google/long-t5-tglobal-base...




Tokenizer loaded! Vocab size: 32100

Loading base model from google/long-t5-tglobal-base...
This may take a minute...
Model loaded!
Total parameters: 247,587,456
Model loaded!
Total parameters: 247,587,456


## 6. Tokenize Dataset

In [12]:
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples['prompt'],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding='max_length'
    )
    
    labels = tokenizer(
        examples['summary'],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding='max_length'
    )
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


print("Creating HuggingFace datasets...")
train_dataset = Dataset.from_list(train_records)
val_dataset = Dataset.from_list(val_records)
test_dataset = Dataset.from_list(test_records)

print("Tokenizing datasets...")
print("  Tokenizing train set...")
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
print("  Tokenizing validation set...")
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)
print("  Tokenizing test set...")
tokenized_test = test_dataset.map(tokenize_function, batched=True, remove_columns=test_dataset.column_names)

print(f"\nTokenization complete!")
print(f"Train: {len(tokenized_train)}, Val: {len(tokenized_val)}, Test: {len(tokenized_test)}")

Creating HuggingFace datasets...
Tokenizing datasets...
  Tokenizing train set...


Map: 100%|██████████| 392/392 [00:00<00:00, 590.15 examples/s]
Map: 100%|██████████| 392/392 [00:00<00:00, 590.15 examples/s]


  Tokenizing validation set...


Map: 100%|██████████| 51/51 [00:00<00:00, 426.66 examples/s]



  Tokenizing test set...


Map: 100%|██████████| 109/109 [00:00<00:00, 493.36 examples/s]


Tokenization complete!
Train: 392, Val: 51, Test: 109





## 7. Baseline Inference (Before Fine-tuning)

Generate predictions with the pre-trained Long-T5 model (no fine-tuning) to establish a baseline.

In [13]:
def generate_predictions(model, records, batch_size=2):
    model.eval()
    model.to(device)
    predictions = []
    
    total_batches = (len(records) + batch_size - 1) // batch_size
    print(f"Generating predictions for {len(records)} samples...")
    
    for i in range(0, len(records), batch_size):
        batch = records[i:i + batch_size]
        batch_num = i // batch_size + 1
        
        if batch_num % 10 == 0 or batch_num == total_batches:
            print(f"  Processing batch {batch_num}/{total_batches}...")
        
        inputs = tokenizer(
            [r['prompt'] for r in batch],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=GEN_MAX_NEW_TOKENS,
                num_beams=4,
                length_penalty=1.0,
                early_stopping=True,
                do_sample=False
            )
        
        # Decode
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        
        for record, pred in zip(batch, decoded):
            predictions.append({
                'id': record['id'],
                'question': record['question'],
                'reference': record['summary'],
                'prediction': pred.strip()
            })
        
        if device.type == 'cuda' and batch_num % 20 == 0:
            torch.cuda.empty_cache()
    
    print(f"  Done! Generated {len(predictions)} predictions")
    return predictions


def save_predictions(predictions, output_path):
    """Save predictions to JSON file."""
    with open(output_path, 'w') as f:
        json.dump(predictions, f, indent=2)
    print(f"Saved predictions to {output_path}")

In [14]:
# Load existing baseline predictions
# baseline inference cells if predictions already generated and saved

baseline_preds_exist = (OUTPUT_DIR / "baseline_validation_predictions.json").exists() and \
                       (OUTPUT_DIR / "baseline_test_predictions.json").exists()

if baseline_preds_exist:
    print("Loading existing baseline predictions from disk...")
    with open(OUTPUT_DIR / "baseline_validation_predictions.json", 'r') as f:
        baseline_val_preds = json.load(f)
    with open(OUTPUT_DIR / "baseline_test_predictions.json", 'r') as f:
        baseline_test_preds = json.load(f)
    print(f"  Loaded {len(baseline_val_preds)} validation predictions")
    print(f"  Loaded {len(baseline_test_preds)} test predictions")
    print("Baseline predictions loaded! You can skip the baseline inference cells.")
else:
    print("No existing baseline predictions found.")
    print("Run the baseline inference cells below to generate them.")

No existing baseline predictions found.
Run the baseline inference cells below to generate them.


In [15]:
print("=" * 60)
print("BASELINE INFERENCE (Pre-trained Long-T5, no fine-tuning)")
print("=" * 60)

print("\n--- Validation Set ---")
baseline_val_preds = generate_predictions(base_model, val_records, batch_size=2)
save_predictions(baseline_val_preds, OUTPUT_DIR / "baseline_validation_predictions.json")

print("\n--- Test Set ---")
baseline_test_preds = generate_predictions(base_model, test_records, batch_size=2)
save_predictions(baseline_test_preds, OUTPUT_DIR / "baseline_test_predictions.json")

print("\nBaseline inference complete!")

BASELINE INFERENCE (Pre-trained Long-T5, no fine-tuning)

--- Validation Set ---
Generating predictions for 51 samples...
Generating predictions for 51 samples...




  Processing batch 10/26...
  Processing batch 20/26...
  Processing batch 20/26...
  Processing batch 26/26...
  Processing batch 26/26...
  Done! Generated 51 predictions
Saved predictions to /home/cc/clinical-query-summarization/long-t5/outputs/baseline_validation_predictions.json

--- Test Set ---
Generating predictions for 109 samples...
  Done! Generated 51 predictions
Saved predictions to /home/cc/clinical-query-summarization/long-t5/outputs/baseline_validation_predictions.json

--- Test Set ---
Generating predictions for 109 samples...
  Processing batch 10/55...
  Processing batch 10/55...
  Processing batch 20/55...
  Processing batch 20/55...
  Processing batch 30/55...
  Processing batch 30/55...
  Processing batch 40/55...
  Processing batch 40/55...
  Processing batch 50/55...
  Processing batch 50/55...
  Processing batch 55/55...
  Processing batch 55/55...
  Done! Generated 109 predictions
Saved predictions to /home/cc/clinical-query-summarization/long-t5/outputs/basel

In [None]:
print("Example baseline prediction:")
print("-" * 60)
print(f"Question: {baseline_val_preds[0]['question'][:100]}...")
print(f"\nReference: {baseline_val_preds[0]['reference'][:300]}...")
print(f"\nPrediction: {baseline_val_preds[0]['prediction'][:300]}...")
print("-" * 60)

Example baseline prediction:
------------------------------------------------------------
Question: subjective vertigo Can macular degeneration in only one eye cause dizziness?...

Reference: Dizziness has many possible causes, including inner ear disturbance, motion sickness, medication effects, and underlying health condition, such as poor circulation, infection or injury. How long the dizziness lasts and makes you feel, its triggers and other symptoms may help  determine its cause. Yo...

Prediction: Symptoms People experiencing dizziness may describe it as any of a number of sensations, such as: - A false sense of motion or spinning (vertigo) - Lightheadedness or feeling faint - Unsteadiness or a loss of balance - A feeling of floating, wooziness or heavy-headedness These feelings may be trigge...
------------------------------------------------------------


## 8. Apply LoRA for Parameter-Efficient Fine-tuning



In [16]:
print("Loading fresh model for LoRA fine-tuning...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print("\nConfiguring LoRA...")
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=["q", "v"],
)

model = get_peft_model(model, lora_config)

print("\n" + "=" * 60)
print("LoRA Configuration Applied!")
print("=" * 60)
model.print_trainable_parameters()
print("=" * 60)

Loading fresh model for LoRA fine-tuning...

Configuring LoRA...

Configuring LoRA...

LoRA Configuration Applied!
trainable params: 1,769,472 || all params: 249,356,928 || trainable%: 0.7096141319161583

LoRA Configuration Applied!
trainable params: 1,769,472 || all params: 249,356,928 || trainable%: 0.7096141319161583


## 9. Configure Training with Early Stopping

In [17]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100
)

use_fp16 = torch.cuda.is_available()
if use_fp16:
    print("FP16 training enabled for P100")
else:
    print("Using FP32 training (CPU mode)")

training_args = Seq2SeqTrainingArguments(
    output_dir=str(OUTPUT_DIR / "checkpoints"),
    
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=WARMUP_RATIO,
    
    num_train_epochs=NUM_EPOCHS,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=EVAL_STEPS,
    
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,
    
    predict_with_generate=True,
    generation_max_length=MAX_TARGET_LENGTH,
    
    fp16=use_fp16,
    fp16_full_eval=False, 
    
    logging_steps=10,
    logging_first_step=True,
    report_to="none", 
    
    dataloader_num_workers=0, 
    dataloader_pin_memory=False,  
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
    early_stopping_threshold=0.0
)

print("\nTraining configuration:")
print(f"  Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"  Max epochs: {NUM_EPOCHS}")
print(f"  Early stopping patience: {EARLY_STOPPING_PATIENCE} evaluations")
print(f"  Evaluation every {EVAL_STEPS} steps")
print(f"  FP16: {use_fp16}")

FP16 training enabled for P100

Training configuration:
  Effective batch size: 8
  Max epochs: 5
  Early stopping patience: 3 evaluations
  Evaluation every 20 steps
  FP16: True


## 10. Train the Model

In [18]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
)

print("Trainer initialized!")
print(f"Training samples: {len(tokenized_train)}")
print(f"Validation samples: {len(tokenized_val)}")
print(f"Steps per epoch: {len(tokenized_train) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)}")

Trainer initialized!
Training samples: 392
Validation samples: 51
Steps per epoch: 49


In [19]:
print("=" * 60)
print("STARTING TRAINING")
print("=" * 60)
print("Note: Early stopping will stop training if validation loss")
print(f"      doesn't improve for {EARLY_STOPPING_PATIENCE} consecutive evaluations.")
print("=" * 60)

train_result = trainer.train()

print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)
print(f"Total training time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Final training loss: {train_result.metrics['train_loss']:.4f}")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


STARTING TRAINING
Note: Early stopping will stop training if validation loss
      doesn't improve for 3 consecutive evaluations.




Step,Training Loss,Validation Loss
20,35.782,40.202431
40,33.7725,36.184742
60,30.5882,31.780287
80,27.4963,28.283424
100,26.8226,25.989681
120,24.9932,25.188524





TRAINING COMPLETE!
Total training time: 419.28 seconds
Final training loss: 30.3848


In [None]:
lora_save_path = OUTPUT_DIR / "lora_adapter"
model.save_pretrained(lora_save_path)
print(f"LoRA adapter saved to {lora_save_path}")

LoRA adapter saved to /home/cc/clinical-query-summarization/long-t5/outputs/lora_adapter


## 11. Fine-tuned Model Inference

In [21]:
print("=" * 60)
print("FINE-TUNED MODEL INFERENCE")
print("=" * 60)

print("\n--- Validation Set ---")
finetuned_val_preds = generate_predictions(model, val_records, batch_size=2)
save_predictions(finetuned_val_preds, OUTPUT_DIR / "finetuned_validation_predictions.json")

print("\n--- Test Set ---")
finetuned_test_preds = generate_predictions(model, test_records, batch_size=2)
save_predictions(finetuned_test_preds, OUTPUT_DIR / "finetuned_test_predictions.json")

print("\nFine-tuned inference complete!")

FINE-TUNED MODEL INFERENCE

--- Validation Set ---
Generating predictions for 51 samples...




  Processing batch 10/26...
  Processing batch 20/26...
  Processing batch 20/26...
  Processing batch 26/26...
  Processing batch 26/26...
  Done! Generated 51 predictions
Saved predictions to /home/cc/clinical-query-summarization/long-t5/outputs/finetuned_validation_predictions.json

--- Test Set ---
Generating predictions for 109 samples...
  Done! Generated 51 predictions
Saved predictions to /home/cc/clinical-query-summarization/long-t5/outputs/finetuned_validation_predictions.json

--- Test Set ---
Generating predictions for 109 samples...
  Processing batch 10/55...
  Processing batch 10/55...
  Processing batch 20/55...
  Processing batch 20/55...
  Processing batch 30/55...
  Processing batch 30/55...
  Processing batch 40/55...
  Processing batch 40/55...
  Processing batch 50/55...
  Processing batch 50/55...
  Processing batch 55/55...
  Processing batch 55/55...
  Done! Generated 109 predictions
Saved predictions to /home/cc/clinical-query-summarization/long-t5/outputs/fin

## 12. Evaluation with ROUGE and BERTScore

In [20]:
# Evaluation metrics
print("Loading evaluation metrics...")
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')
print("Metrics loaded!")


def compute_metrics(predictions):
    refs = [p['reference'] for p in predictions]
    preds = [p['prediction'] for p in predictions]
    
    # ROUGE
    rouge_result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    
    # BERTScore
    bert_result = bertscore.compute(predictions=preds, references=refs, lang='en')
    
    metrics = {
        'rouge1': round(rouge_result['rouge1'], 4),
        'rouge2': round(rouge_result['rouge2'], 4),
        'rougeL': round(rouge_result['rougeL'], 4),
        'rougeLsum': round(rouge_result['rougeLsum'], 4),
        'bertscore_precision': round(np.mean(bert_result['precision']), 4),
        'bertscore_recall': round(np.mean(bert_result['recall']), 4),
        'bertscore_f1': round(np.mean(bert_result['f1']), 4),
    }
    
    return metrics

Loading evaluation metrics...
Metrics loaded!
Metrics loaded!


In [22]:
# Evaluate baseline model
print("=" * 60)
print("EVALUATION RESULTS")
print("=" * 60)

print("\n--- Baseline Model (Validation Set) ---")
baseline_val_metrics = compute_metrics(baseline_val_preds)
for k, v in baseline_val_metrics.items():
    print(f"  {k}: {v}")

print("\n--- Baseline Model (Test Set) ---")
baseline_test_metrics = compute_metrics(baseline_test_preds)
for k, v in baseline_test_metrics.items():
    print(f"  {k}: {v}")

EVALUATION RESULTS

--- Baseline Model (Validation Set) ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  rouge1: 0.2595
  rouge2: 0.0986
  rougeL: 0.1775
  rougeLsum: 0.177
  bertscore_precision: 0.8208
  bertscore_recall: 0.8574
  bertscore_f1: 0.8385

--- Baseline Model (Test Set) ---
  rouge1: 0.2472
  rouge2: 0.0884
  rougeL: 0.1618
  rougeLsum: 0.1618
  bertscore_precision: 0.8244
  bertscore_recall: 0.8499
  bertscore_f1: 0.8366
  rouge1: 0.2472
  rouge2: 0.0884
  rougeL: 0.1618
  rougeLsum: 0.1618
  bertscore_precision: 0.8244
  bertscore_recall: 0.8499
  bertscore_f1: 0.8366


In [23]:
# Evaluation
print("\n--- Fine-tuned Model (Validation Set) ---")
finetuned_val_metrics = compute_metrics(finetuned_val_preds)
for k, v in finetuned_val_metrics.items():
    print(f"  {k}: {v}")

print("\n--- Fine-tuned Model (Test Set) ---")
finetuned_test_metrics = compute_metrics(finetuned_test_preds)
for k, v in finetuned_test_metrics.items():
    print(f"  {k}: {v}")


--- Fine-tuned Model (Validation Set) ---
  rouge1: 0.2667
  rouge2: 0.0969
  rougeL: 0.1786
  rougeLsum: 0.179
  bertscore_precision: 0.8281
  bertscore_recall: 0.8587
  bertscore_f1: 0.8429

--- Fine-tuned Model (Test Set) ---
  rouge1: 0.2667
  rouge2: 0.0969
  rougeL: 0.1786
  rougeLsum: 0.179
  bertscore_precision: 0.8281
  bertscore_recall: 0.8587
  bertscore_f1: 0.8429

--- Fine-tuned Model (Test Set) ---
  rouge1: 0.2712
  rouge2: 0.1053
  rougeL: 0.1829
  rougeLsum: 0.1835
  bertscore_precision: 0.8328
  bertscore_recall: 0.856
  bertscore_f1: 0.8438
  rouge1: 0.2712
  rouge2: 0.1053
  rougeL: 0.1829
  rougeLsum: 0.1835
  bertscore_precision: 0.8328
  bertscore_recall: 0.856
  bertscore_f1: 0.8438


## 13. Summary Comparison Table

In [26]:
# comparison table
print("\n" + "=" * 80)
print("FINAL RESULTS COMPARISON (Validation Set)")
print("=" * 80)
print(f"{'Metric':<25} {'Baseline':>15} {'Fine-tuned':>15} {'Improvement':>15}")
print("-" * 80)

for metric in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'bertscore_f1']:
    baseline_val = baseline_val_metrics[metric]
    finetuned_val = finetuned_val_metrics[metric]
    improvement = finetuned_val - baseline_val
    sign = "+" if improvement > 0 else ""
    print(f"{metric:<25} {baseline_val:>15.4f} {finetuned_val:>15.4f} {sign}{improvement:>14.4f}")

print("=" * 80)

print("\n" + "=" * 80)
print("FINAL RESULTS COMPARISON (Test Set)")
print("=" * 80)
print(f"{'Metric':<25} {'Baseline':>15} {'Fine-tuned':>15} {'Improvement':>15}")
print("-" * 80)

for metric in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'bertscore_f1']:
    baseline_val = baseline_test_metrics[metric]
    finetuned_val = finetuned_test_metrics[metric]
    improvement = finetuned_val - baseline_val
    sign = "+" if improvement > 0 else ""
    print(f"{metric:<25} {baseline_val:>15.4f} {finetuned_val:>15.4f} {sign}{improvement:>14.4f}")

print("=" * 80)


FINAL RESULTS COMPARISON (Validation Set)
Metric                           Baseline      Fine-tuned     Improvement
--------------------------------------------------------------------------------
rouge1                             0.2595          0.2667 +        0.0072
rouge2                             0.0986          0.0969        -0.0017
rougeL                             0.1775          0.1786 +        0.0011
rougeLsum                          0.1770          0.1790 +        0.0020
bertscore_f1                       0.8385          0.8429 +        0.0044

FINAL RESULTS COMPARISON (Test Set)
Metric                           Baseline      Fine-tuned     Improvement
--------------------------------------------------------------------------------
rouge1                             0.2472          0.2712 +        0.0240
rouge2                             0.0884          0.1053 +        0.0169
rougeL                             0.1618          0.1829 +        0.0211
rougeLsum          

In [25]:
all_metrics = {
    'baseline_validation': baseline_val_metrics,
    'baseline_test': baseline_test_metrics,
    'finetuned_validation': finetuned_val_metrics,
    'finetuned_test': finetuned_test_metrics,
}

with open(OUTPUT_DIR / "evaluation_metrics.json", 'w') as f:
    json.dump(all_metrics, f, indent=2)
print(f"\nAll metrics saved to {OUTPUT_DIR / 'evaluation_metrics.json'}")


All metrics saved to /home/cc/clinical-query-summarization/long-t5/outputs/evaluation_metrics.json
