In [None]:
# Install the Hugging Face datasets and other libraries
!pip install -U datasets
!pip install -U evaluate
!pip install -U rouge_score 

In [None]:
#Import necessary libraries
from datasets import load_dataset, Dataset
import pandas as pd
from itertools import islice
import os
import re
from bs4 import BeautifulSoup
import torch
import evaluate
import numpy as np

## Loading of the dataset
In this step, I loaded the grasool/breast-cancer-QAs-llama dataset from the Hugging Face Hub. To ensure smooth dataset caching, I defined a custom temporary cache directory. This ensures that the dataset is stored and accessed locally during multiple runs without needing to re-download it.

In [None]:
# Scratch location for the Hugging Face datasets cache, so repeated runs can
# reuse the local copy instead of downloading again
CACHE_DIR = "/tmp/huggingface_datasets_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
print(f"Using cache directory: {CACHE_DIR}")

# Fetch the train split of the breast cancer QA dataset from the Hub
dataset = load_dataset(
    "grasool/breast-cancer-QAs-llama",
    split="train",
    cache_dir=CACHE_DIR,
)
print("Dataset loaded successfully.")

# Show the first 5 rows as a DataFrame to check the structure
preview_data = [row for row in islice(dataset, 5)]
df = pd.DataFrame(preview_data)
df.head()


## Data cleaning and splitting
I defined a clean_text() function that removes HTML tags and unnecessary whitespace using BeautifulSoup and regex, to ensure input text is consistent and free from web formatting issues, which could mislead the model.
Upon inspection of the dataset, I found that it is structured in a prompt-response format using [INST]...[/INST]... so I created a function parse_llama_chat_format() to extract the QUESTION and ANSWER using regex.
The dataset was converted to a Pandas DataFrame to easily identify and drop rows with missing or empty values in either the question or the answer. The final cleaned dataset was then split into training and validation sets using a 90/10 ratio.

I printed a sample QA pair from the training set to verify that everything looked correct.


In [None]:
# Data Cleaning & Train/Validation Splits (Updated for FLAN-T5 and dataset format)

# Define the cleaning function
def clean_text(text):
    """Strip HTML markup from ``text`` and normalise its whitespace.

    The value is coerced to ``str`` and parsed with BeautifulSoup's
    ``html.parser``; the extracted text fragments are joined with single
    spaces. Every run of whitespace is then collapsed to one space and the
    ends are trimmed.
    """
    plain = BeautifulSoup(str(text), 'html.parser').get_text(separator=' ')
    return re.sub(r'\s+', ' ', plain).strip()

# Function to parse the specific dataset format
def parse_llama_chat_format(text_entry):
    """Extract one question/answer pair from a ``[INST] ... [/INST] ... </s>`` entry.

    Returns a dict with cleaned ``'QUESTION'`` and ``'ANSWER'`` strings, or
    ``None`` when the entry does not contain the expected markers.
    """
    chat_pattern = r'\[INST\]\s*(.*?)\s*\[/INST\]\s*(.*?)\s*</s>'
    found = re.search(chat_pattern, text_entry, re.DOTALL)
    if found is None:
        return None

    raw_question, raw_answer = found.groups()
    return {'QUESTION': clean_text(raw_question), 'ANSWER': clean_text(raw_answer)}

print("Parsing dataset entries into 'QUESTION' and 'ANSWER' columns...")

# Parse every raw entry once; entries that do not match the chat template come
# back as None and are discarded.
parsed_data = [
    parsed
    for parsed in (parse_llama_chat_format(entry['text']) for entry in dataset)
    if parsed is not None
]
if not parsed_data:
    # Fail here with a clear message rather than with a KeyError on the
    # missing 'QUESTION'/'ANSWER' columns further down.
    raise ValueError("No dataset entry matched the '[INST] ... [/INST] ... </s>' format.")

# Convert to Hugging Face Dataset format
qa_dataset = Dataset.from_list(parsed_data)
print(f"Parsed dataset with {len(qa_dataset)} QA pairs.")

# Handle missing or empty values (no in-place mutation, so re-running is safe)
temp_df = qa_dataset.to_pandas()
original_rows = len(temp_df)

temp_df = temp_df.dropna(subset=['QUESTION', 'ANSWER'])
non_empty = (temp_df['QUESTION'].str.strip() != '') & (temp_df['ANSWER'].str.strip() != '')
temp_df = temp_df[non_empty].reset_index(drop=True)

dropped_rows = original_rows - len(temp_df)
# preserve_index=False keeps the pandas index of the filtered frame from being
# stored as an extra '__index_level_0__' column in the dataset.
qa_dataset = Dataset.from_pandas(temp_df, preserve_index=False)

if dropped_rows > 0:
    print(f"Dropped {dropped_rows} rows due to missing or empty 'QUESTION' or 'ANSWER' after parsing/cleaning.")
else:
    print("No rows dropped due to missing or empty 'QUESTION' or 'ANSWER'.")

print(f"\nDataset after parsing and cleaning: {len(qa_dataset)} examples.")

# Train/Validation split (90/10, fixed seed for reproducibility)
train_test_split_result = qa_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset_hf = train_test_split_result['train']
val_dataset_hf = train_test_split_result['test']

print(f"\nTraining set size: {len(train_dataset_hf)}")
print(f"Validation set size: {len(val_dataset_hf)}")

print("\nExample of parsed and cleaned data from training set (first example):")
print(f"QUESTION: {train_dataset_hf[0]['QUESTION']}")
print(f"ANSWER: {train_dataset_hf[0]['ANSWER']}")


## Tokenization and Dataset Preparation
In this step, I prepared the dataset for training the FLAN-T5 model by carefully tokenizing both the questions and answers. I used the T5Tokenizer from Hugging Face’s Transformers library, specifically the "google/flan-t5-base" checkpoint, because this model is instruction-tuned and well-suited for question-answering tasks without needing extra prefixes like "question:".

I applied padding and truncation during tokenization to standardize sequence lengths, which is crucial for batching during training. For the target sequences (answers), I masked the padding tokens with a value of -100 so that the loss calculation ignores them — this is a standard technique to avoid penalizing the model for padded tokens.

After tokenization, I converted the datasets into PyTorch tensors, which allows smooth integration with the Hugging Face Trainer.

In [None]:
# Tokenization and Metric Setup
from transformers import T5Tokenizer

# Tokenizer of the "google/flan-t5-base" checkpoint; used by
# preprocess_function and compute_metrics below.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Tokenization config (adjustable based on length analysis)
# Passed as max_length when tokenizing the questions / the answers respectively.
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 512

def preprocess_function(examples):
    """Tokenize a batch of QA pairs into model inputs with loss-masked labels.

    Questions (``examples['QUESTION']``) are padded/truncated to
    ``MAX_INPUT_LENGTH`` tokens and answers (``examples['ANSWER']``) to
    ``MAX_TARGET_LENGTH`` tokens. The returned encoding carries the question
    tokens plus a ``"labels"`` entry holding the answer token ids.
    """
    shared_kwargs = dict(padding="max_length", truncation=True, return_tensors="pt")

    model_inputs = tokenizer(examples['QUESTION'], max_length=MAX_INPUT_LENGTH, **shared_kwargs)
    encoded_answers = tokenizer(examples['ANSWER'], max_length=MAX_TARGET_LENGTH, **shared_kwargs)

    # Positions where the answer's attention mask is 0 are padding; set them to
    # -100 so loss is not calculated on them
    model_inputs["labels"] = encoded_answers["input_ids"].masked_fill(
        encoded_answers["attention_mask"] == 0, -100
    )

    return model_inputs

# Tokenize both splits in batches
tokenized_train_dataset = train_dataset_hf.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset_hf.map(preprocess_function, batched=True)

# Return only the model-facing columns, as PyTorch tensors
for _tokenized_split in (tokenized_train_dataset, tokenized_val_dataset):
    _tokenized_split.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

# Raw token ids of the first training example
for _column, _caption in (
    ('input_ids', "\nExample of tokenized input_ids (first question):"),
    ('attention_mask', "\nExample of tokenized attention_mask (first question):"),
    ('labels', "\nExample of tokenized labels (first answer):"),
):
    print(_caption)
    print(tokenized_train_dataset[0][_column])

# The same example decoded back to text
print("\nDecoded input_ids (first question):")
print(tokenizer.decode(tokenized_train_dataset[0]['input_ids'], skip_special_tokens=True))

print("\nDecoded labels (first answer):")
# -100 marks masked padding and is not a valid token id, so drop it first
_kept_label_ids = [tok for tok in tokenized_train_dataset[0]['labels'].tolist() if tok != -100]
print(tokenizer.decode(_kept_label_ids, skip_special_tokens=True))


## Defining evaluation metrics
For evaluation, I chose three metrics to cover different aspects of answer quality:
- BLEU, which captures n-gram overlap and fluency, using smoothing to handle edge cases,
- SQuAD metrics (Exact Match and F1) that are standard for QA tasks measuring precise answer accuracy,
- ROUGE scores (1, 2, and L), which assess the overlap of key phrases and the overall structure between predictions and references.

Together, these metrics provide a comprehensive assessment of how well the model generates correct, fluent, and relevant answers in this specialized medical context.

In [None]:
# Evaluation metric setup
# Metric objects loaded once at cell level and reused by compute_metrics.
bleu_metric = evaluate.load("bleu")    # n-gram overlap
squad_metric = evaluate.load("squad")  # exact match and F1
rouge_metric = evaluate.load("rouge")  # rouge1 / rouge2 / rougeL

def compute_metrics(eval_pred):
    """Score decoded predictions against references with BLEU, SQuAD EM/F1 and ROUGE.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``.
            ``predictions`` may be a tuple (its first element is used). A
            3-D array is treated as logits and arg-maxed over the last axis;
            any other rank is treated as token ids already, with ``-100``
            entries replaced by the pad token. ``label_ids`` uses ``-100``
            for masked positions.

    Returns:
        dict with ``bleu``, ``squad_f1``, ``squad_exact_match``,
        ``rouge1_f1``, ``rouge2_f1`` and ``rougeL_f1``, each rounded to
        4 decimals.
    """
    predictions, labels = eval_pred.predictions, eval_pred.label_ids

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    pad_id = tokenizer.pad_token_id
    if predictions.ndim == 3:
        # (batch, sequence, vocabulary) logits -> most likely token per position
        predicted_token_ids = np.argmax(predictions, axis=-1)
    else:
        # Already token ids; -100 is not decodable, so map it to padding
        predicted_token_ids = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU (robust with smoothing). Blank entries are skipped, but predictions
    # and references are filtered TOGETHER so each prediction is still scored
    # against its own reference and both lists keep the same length.
    bleu_pairs = [
        (pred, label)
        for pred, label in zip(decoded_preds, decoded_labels)
        if pred.strip() and label.strip()
    ]

    bleu_score = 0.0
    if bleu_pairs:
        bleu_results = bleu_metric.compute(
            predictions=[pred for pred, _ in bleu_pairs],
            references=[[label] for _, label in bleu_pairs],
            max_order=4,
            smooth=True
        )
        bleu_score = bleu_results["bleu"]

    # SQuAD-style EM and F1 (ids are positional; answer_start is a placeholder)
    formatted_preds = [{"id": str(i), "prediction_text": pred} for i, pred in enumerate(decoded_preds)]
    formatted_refs = [{"id": str(i), "answers": {"answer_start": [0], "text": [label]}} for i, label in enumerate(decoded_labels)]

    squad_results = squad_metric.compute(predictions=formatted_preds, references=formatted_refs)

    # ROUGE
    rouge_results = rouge_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )

    return {
        "bleu": round(bleu_score, 4),
        "squad_f1": round(squad_results["f1"], 4),
        "squad_exact_match": round(squad_results["exact_match"], 4),
        "rouge1_f1": round(rouge_results["rouge1"], 4),
        "rouge2_f1": round(rouge_results["rouge2"], 4),
        "rougeL_f1": round(rouge_results["rougeL"], 4)
    }


To monitor training stability, I implemented a custom DebugCallback class that checks the model's gradients and parameters every 10 steps. This helped me detect issues like vanishing gradients, all-zero gradients (which indicate no learning), or NaN/Inf values that often signal numerical instability or divergence. By inspecting these values during training, I could proactively identify and fix issues like poor learning rates or batch sizes that were too small — making debugging more efficient and transparent.

In [None]:
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl

class DebugCallback(TrainerCallback):
    """
    Custom callback to inspect gradients and model parameters during training.
    Useful for debugging vanishing gradients, NaNs, or training divergence.

    Gradients are inspected in ``on_pre_optimizer_step`` (fired after gradient
    clipping, before ``optimizer.step()``), because the Trainer clears the
    gradients before ``on_step_end`` is dispatched; checking ``param.grad``
    there always finds nothing. The gradient findings are buffered and printed
    together with the parameter checks in ``on_step_end``.

    Args:
        check_every: Run the checks every this many optimizer steps.
        zero_threshold: A gradient whose absolute sum is at or below this
            value counts as zero.
    """

    def __init__(self, check_every: int = 10, zero_threshold: float = 1e-9):
        self.check_every = check_every
        self.zero_threshold = zero_threshold
        # (step, report lines) from the latest gradient inspection, if any
        self._grad_report = None

    def _is_check_step(self, step: int) -> bool:
        """Return True when ``step`` is a positive multiple of ``check_every``."""
        return step > 0 and step % self.check_every == 0

    def on_pre_optimizer_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Inspect gradients while they still exist and buffer the findings."""
        # global_step is incremented only after the optimizer step completes
        upcoming_step = state.global_step + 1
        if not self._is_check_step(upcoming_step):
            return
        model = kwargs.get("model")
        if model is None:
            return

        lines = []
        grads_found = False
        nan_grads = False
        inf_grads = False
        all_grads_zero = True

        for name, param in model.named_parameters():
            grad = param.grad
            if grad is None:
                continue
            grads_found = True
            if torch.isnan(grad).any():
                nan_grads = True
                lines.append(f"  WARNING: NaN gradient in {name}")
            if torch.isinf(grad).any():
                inf_grads = True
                lines.append(f"  WARNING: Inf gradient in {name}")
            if grad.abs().sum().item() > self.zero_threshold:
                all_grads_zero = False

        if not grads_found:
            lines.append("  CRITICAL: No gradients found! Check backward pass.")
        elif nan_grads:
            lines.append("  CRITICAL: NaNs detected in gradients.")
        elif inf_grads:
            lines.append("  CRITICAL: Infs detected in gradients.")
        elif all_grads_zero:
            lines.append("  CRITICAL: All gradients near zero — model may not be learning.")
        else:
            lines.append("  Good: Gradients appear non-zero.")

        self._grad_report = (upcoming_step, lines)

    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Print the buffered gradient report and check parameters for NaN/Inf."""
        if not self._is_check_step(state.global_step):
            return

        model = kwargs.get("model")
        if model is None:
            print("DEBUG_CALLBACK: Model not found in kwargs.")
            return

        print(f"\n--- DEBUG_CALLBACK (Step {state.global_step}) ---")

        report, self._grad_report = self._grad_report, None
        if report is not None and report[0] == state.global_step:
            for line in report[1]:
                print(line)
        else:
            # The installed transformers version may not fire on_pre_optimizer_step
            print("  NOTE: Gradients were not inspected for this step (on_pre_optimizer_step did not fire).")

        nan_params = False
        for name, param in model.named_parameters():
            if torch.isnan(param).any():
                print(f"  CRITICAL: NaN in parameter: {name}")
                nan_params = True
            if torch.isinf(param).any():
                print(f"  CRITICAL: Inf in parameter: {name}")
                nan_params = True

        if nan_params:
            print("  CRITICAL: Model parameters contain NaNs/Infs — training instability likely.")

        print("-----------------------------------\n")


# Model training and evaluation

## FLAN-T5-BASE Train 1
In this section, I fine-tuned the FLAN-T5 base model on my cleaned question–answer dataset using the Hugging Face Trainer API. I specified appropriate training parameters including a low batch size to avoid memory issues, a learning rate of 3e-5, and set the evaluation and checkpoint saving strategy to run after each epoch. I also set the Trainer to load the best model at the end based on the lowest evaluation loss.

After training, I evaluated the model on the validation set using the metrics set previously. I also calculated perplexity from the evaluation loss to assess how confidently the model generates predictions. The model and tokenizer were saved for future inference or deployment.

In [None]:
# Trainer Setup and Training

from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# --- Run 1 hyperparameters and output location ---
MODEL_NAME = "google/flan-t5-base"
OUTPUT_DIR = "./flan_t5_breast_cancer_qa"
NUM_EPOCHS = 7
BATCH_SIZE = 2
LEARNING_RATE = 3e-5

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Pretrained checkpoint to fine-tune ---
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# --- Training arguments ---
training_args = TrainingArguments(
    # output and duration
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    # batching
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    # optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # evaluate and checkpoint every epoch; restore the lowest-eval_loss model
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # logging
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# --- Trainer wiring: model, data splits, tokenizer and metric function ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Fine-tune ---
print("Starting training...")
trainer.train()

# --- Evaluate on the validation split ---
print("\nStarting final evaluation...")
eval_results = trainer.evaluate()

print("\n--- Evaluation Results ---")
for key, value in eval_results.items():
    # Build the report line first, then print it once
    if key == "eval_loss":
        line = f"Loss: {value:.4f}"
    elif key == "eval_bleu":
        line = f"BLEU Score: {value:.4f}"
    elif key == "eval_squad_f1":
        line = f"F1 Score (SQuAD): {value:.4f}"
    elif key == "eval_squad_exact_match":
        line = f"Exact Match (SQuAD): {value:.4f}"
    elif key.startswith("eval_"):
        line = f"{key.replace('eval_', '').replace('_', ' ').title()}: {value:.4f}"
    else:
        line = f"{key.replace('_', ' ').title()}: {value}"
    print(line)

# --- Perplexity is exp(eval_loss) ---
if "eval_loss" not in eval_results:
    print("\n'eval_loss' not found in evaluation results. Cannot calculate perplexity.")
else:
    perplexity = np.exp(eval_results["eval_loss"])
    print(f"Perplexity: {perplexity:.4f}")

print("\n--- Evaluation Complete ---")

# --- Persist model and tokenizer side by side ---
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"\n✅ Model and tokenizer saved to {OUTPUT_DIR}")


## FLAN-T5-BASE Train 2
In this run, I fine-tuned the FLAN-T5 base model using a smaller batch size (1 instead of 2) and a higher learning rate of 1e-4, compared to the earlier 3e-5, to speed up convergence. I also extended the training duration to 10 epochs (previously 7), giving the model more room to learn patterns from the data. One key change was increasing gradient_accumulation_steps to 8 to simulate a larger batch size and help stabilize training despite limited memory.

This configuration is more aggressive, but it comes with a higher risk of overfitting, which is why I preserved evaluation and saving after every epoch. I also introduced a custom callback for debugging and used AdamW explicitly for optimization control.

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.optim import AdamW

# --- Run 2 hyperparameters and output location ---
MODEL_NAME = 'google/flan-t5-base'
OUTPUT_DIR = "./flan_t5_breast_cancer_qa_test2"
NUM_EPOCHS = 10
BATCH_SIZE = 1
LEARNING_RATE = 1e-4

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pretrained checkpoint to fine-tune
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Training arguments
training_args = TrainingArguments(
    # output and duration
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    # batching
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=8,
    # optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.0,
    fp16=False,
    # evaluate and checkpoint every epoch; restore the lowest-eval_loss model
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # logging
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# Explicit AdamW optimizer; the scheduler slot is left as None
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

# Trainer wiring, with the debugging callback attached
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[DebugCallback()],
    optimizers=(optimizer, None),
)

# Fine-tune
print("Starting training...")
trainer.train()

# Evaluate on the validation split
print("\nStarting final evaluation...")
eval_results = trainer.evaluate()

print("\n--- Evaluation Results ---")
for key, value in eval_results.items():
    # Build the report line first, then print it once
    if key == 'eval_loss':
        line = f"Loss: {value:.4f}"
    elif key == 'eval_bleu':
        line = f"BLEU Score: {value:.4f}"
    elif key == 'eval_squad_f1':
        line = f"F1 Score (SQuAD): {value:.4f}"
    elif key == 'eval_squad_exact_match':
        line = f"Exact Match (SQuAD): {value:.4f}"
    elif key.startswith('eval_'):
        line = f"{key.replace('eval_', '').replace('_', ' ').title()}: {value:.4f}"
    else:
        line = f"{key.replace('_', ' ').title()}: {value}"
    print(line)

# Perplexity is exp(eval_loss)
if 'eval_loss' not in eval_results:
    print("\n'eval_loss' not found in evaluation results. Cannot calculate perplexity.")
else:
    perplexity = np.exp(eval_results['eval_loss'])
    print(f"Perplexity: {perplexity:.4f}")

print("\n--- Evaluation Complete ---")

# Persist model and tokenizer side by side
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\n✅ Model and tokenizer saved to {OUTPUT_DIR}")


## FLAN-T5-BASE Train 3 (best performing model)
For this final and best-performing model, I used google/flan-t5-base with a smaller batch size (1) and trained for 20 epochs to allow more learning cycles while monitoring performance after each epoch. I set the learning rate to 1e-4, which worked better than previous lower values by accelerating learning without destabilizing training. I also enabled gradient accumulation (8 steps) to simulate a larger effective batch size, as I was running out of memory.

To ensure generalization and avoid overfitting, I added early stopping with a patience of 5 epochs. Compared to earlier attempts, this configuration balanced depth (more epochs) with caution (early stopping), and it showed clear improvements across metrics like BLEU, ROUGE, and SQuAD F1, as well as lower perplexity.

In [None]:
# Trainer Setup and Training

from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.optim import AdamW

# --- Run 3 hyperparameters and output location ---
MODEL_NAME = 'google/flan-t5-base'
OUTPUT_DIR = "./flan_t5_breast_cancer_qa_3"
NUM_EPOCHS = 20 # more epochs
BATCH_SIZE = 1
LEARNING_RATE = 1e-4

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pretrained checkpoint to fine-tune
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Training arguments
training_args = TrainingArguments(
    # output and duration
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    # batching
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=8,
    # optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.05,
    fp16=False,
    # evaluate and checkpoint every epoch; restore the lowest-eval_loss model
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # logging
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# Explicit AdamW optimizer; the scheduler slot is left as None
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

# Trainer wiring, with early stopping (patience of 5 evaluations)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    optimizers=(optimizer, None),
)

# Fine-tune
print("Starting training...")
trainer.train()

# Evaluate on the validation split
print("\nStarting final evaluation...")
eval_results = trainer.evaluate()

print("\n--- Evaluation Results ---")
for key, value in eval_results.items():
    # Build the report line first, then print it once
    if key == 'eval_loss':
        line = f"Loss: {value:.4f}"
    elif key == 'eval_bleu':
        line = f"BLEU Score: {value:.4f}"
    elif key == 'eval_squad_f1':
        line = f"F1 Score (SQuAD): {value:.4f}"
    elif key == 'eval_squad_exact_match':
        line = f"Exact Match (SQuAD): {value:.4f}"
    elif key.startswith('eval_'):
        line = f"{key.replace('eval_', '').replace('_', ' ').title()}: {value:.4f}"
    else:
        line = f"{key.replace('_', ' ').title()}: {value}"
    print(line)

# Perplexity is exp(eval_loss)
if 'eval_loss' not in eval_results:
    print("\n'eval_loss' not found in evaluation results. Cannot calculate perplexity.")
else:
    perplexity = np.exp(eval_results['eval_loss'])
    print(f"Perplexity: {perplexity:.4f}")

print("\n--- Evaluation Complete ---")

# Persist the trainer's model and the tokenizer side by side
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\n✅ Model and tokenizer saved to {OUTPUT_DIR}")


In [None]:
# Show the contents of the Train 3 output directory (the path the
# qualitative-testing cell loads the model from).
print(os.listdir("./flan_t5_breast_cancer_qa_3"))

## Qualitative Testing
To evaluate the performance of my fine-tuned FLAN-T5 model, I loaded the best checkpoint and deployed it for qualitative testing. I implemented a question-answer generation function that mirrors the same input formatting and max token lengths used during training to maintain consistency. The model was configured to generate responses using beam search with a no_repeat_ngram_size to reduce repetition.

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# --- Configuration (match with training setup) ---
OUTPUT_DIR = "./flan_t5_breast_cancer_qa_3"
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 512

# --- Load Model and Tokenizer ---
print(f"Loading model and tokenizer from {OUTPUT_DIR}...")
try:
    model = T5ForConditionalGeneration.from_pretrained(OUTPUT_DIR)
    tokenizer = T5Tokenizer.from_pretrained(OUTPUT_DIR)
except Exception as e:
    # Re-raise rather than call exit(): inside a notebook exit() shuts the
    # kernel down and discards the traceback, whereas raising stops this cell
    # with the full error visible.
    print(f"Error loading model or tokenizer: {e}")
    raise
print("Model and tokenizer loaded successfully.")

# --- Setup Device ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode: disables dropout
print(f"Model moved to: {device}")

# --- Answer Generation Function ---
def generate_answer(question: str):
    """Generate an answer to ``question`` with the fine-tuned model.

    The question is tokenized as-is: the FLAN-T5 runs were fine-tuned on the
    bare question text (no ``"question: "`` task prefix), so adding a prefix
    here would give the model inputs that differ from its training data.

    Args:
        question: The user's question in plain text.

    Returns:
        The decoded answer string, with special tokens removed.
    """
    encoded = tokenizer(
        question,
        return_tensors="pt",
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    ).to(device)

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        # Pass the mask explicitly instead of relying on generate() to infer it
        attention_mask=encoded["attention_mask"],
        max_length=MAX_TARGET_LENGTH,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2 # to prevent repetitive phrases
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- Qualitative Testing Example ---
print("\n--- Starting Qualitative Testing ---")

test_question = "What are common treatments for early stage breast cancer?"
predicted_answer = generate_answer(test_question)
print(f"\nQuestion: {test_question}")
print(f"Answer: {predicted_answer}")
print("-" * 30)

# --- Interactive Testing ---
print("\n--- Interactive Testing (type 'exit' to quit) ---")
while True:
    try:
        # Strip once so that ' exit ' quits and blank input is detected
        user_question = input("Enter your question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the user interrupted: leave the loop cleanly
        break
    if user_question.lower() == 'exit':
        break
    if not user_question:
        print("Please enter a question.")
        continue

    answer = generate_answer(user_question)
    print(f"Bot: {answer}")
    print("-" * 30)

print("\nQualitative testing complete.")


## Initial training
In the cells below, I started by training the model using the t5-small checkpoint. However, I quickly realized that both my preprocessing pipeline and the model itself were not well-suited for the complexity of the task. The T5-small model lacked the capacity to generate high-quality answers consistently, especially given the domain-specific nature of the breast cancer QA dataset. This resulted in low performance and high loss values. These were early-stage tests and do not reflect the final results—so feel free to skip them.

In [None]:
# Parse Q&A Pairs, Clean Text, and Create Train/Validation Splits
import re

# Name of the dataset's first column; parse_qa_from_sft_format reads each raw
# entry from it. NOTE(review): presumably this is the 'text' column — confirm.
MAIN_TEXT_COLUMN = dataset.column_names[0]

# Define the parsing function
def parse_qa_from_sft_format(example):
    """
    Split one ``<s>[INST] Q [/INST] A </s>`` entry into question and answer.

    The raw text is read from ``example[MAIN_TEXT_COLUMN]`` and must match the
    template in full. On success the stripped ``question`` and ``answer`` are
    returned; otherwise a warning with the first 100 characters is printed and
    both fields are ``None`` so the row can be filtered out afterwards.
    """
    text = example[MAIN_TEXT_COLUMN]

    # re.DOTALL lets '.' match newlines, so Q/A may span multiple lines
    sft_pattern = r'<s>\[INST\] (.*?) \[/INST\] (.*?) </s>'
    match = re.fullmatch(sft_pattern, text, re.DOTALL)

    if not match:
        print(f"Warning: Could not parse example: {text[:100]}...")
        return {"question": None, "answer": None}

    question, answer = (part.strip() for part in match.groups())
    return {"question": question, "answer": answer}

# Replace the raw text column with parsed 'question' / 'answer' columns
print("\nParsing 'text' column into 'question' and 'answer' columns...")
parsed_dataset = dataset.map(
    parse_qa_from_sft_format,
    remove_columns=[MAIN_TEXT_COLUMN],
)
print("Parsing complete. Filtering out unparsed examples...")

# Rows that failed parsing carry None in either field; keep only complete rows
parsed_dataset = parsed_dataset.filter(
    lambda row: not (row['question'] is None or row['answer'] is None)
)
print(f"Dataset after parsing and filtering: {len(parsed_dataset)} examples.")

# Define the cleaning function (re-used from previous iterations)
def clean_text_for_qa(text):
    """Lower-case ``text`` (coerced to ``str``) and collapse whitespace.

    Every run of whitespace becomes a single space, and leading/trailing
    whitespace is removed.
    """
    return re.sub(r'\s+', ' ', str(text).lower()).strip()

# Normalise both text fields of every row
print("Applying cleaning function to 'question' and 'answer' columns...")
cleaned_dataset = parsed_dataset.map(
    lambda example: {
        field: clean_text_for_qa(example[field]) for field in ('question', 'answer')
    }
)
print("Cleaning complete.")

# 90/10 train/validation split with a fixed seed
train_test_split = cleaned_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset_hf, val_dataset_hf = (train_test_split[part] for part in ('train', 'test'))

print(f"\nTraining set size: {len(train_dataset_hf)}")
print(f"Validation set size: {len(val_dataset_hf)}")

# Spot-check the first training pair
print("\nExample of parsed and cleaned data from training set (first example):")
print(f"Question: {train_dataset_hf[0]['question']}")
print(f"Answer: {train_dataset_hf[0]['answer']}")

In [None]:
from transformers import T5Tokenizer
import torch

# Rebinds the notebook-level `tokenizer` to the "t5-small" tokenizer for the
# early experiments below.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    """Tokenize a batch of QA pairs for the t5-small experiments.

    Each question gets the ``"question: "`` task prefix and is padded/truncated
    to 128 tokens; answers are padded/truncated to 380 tokens. The returned
    encoding carries the question tokens plus a ``"labels"`` entry holding the
    answer token ids.
    """
    shared_kwargs = dict(padding="max_length", truncation=True, return_tensors="pt")

    prefixed_questions = ["question: " + q for q in examples['question']]
    model_inputs = tokenizer(prefixed_questions, max_length=128, **shared_kwargs)
    encoded_answers = tokenizer(examples['answer'], max_length=380, **shared_kwargs)

    # Padding positions (attention mask 0) are set to -100, the value the loss
    # function ignores
    model_inputs["labels"] = encoded_answers["input_ids"].masked_fill(
        encoded_answers["attention_mask"] == 0, -100
    )

    return model_inputs

# Tokenize both splits in batches using .map()
tokenized_train_dataset = train_dataset_hf.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset_hf.map(preprocess_function, batched=True)

# Return only the model-facing columns, as PyTorch tensors, so the Trainer
# receives tensors directly
for _tokenized_split in (tokenized_train_dataset, tokenized_val_dataset):
    _tokenized_split.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

# Raw token ids of the first training example
for _column, _caption in (
    ('input_ids', "\nExample of tokenized input_ids (first question):"),
    ('attention_mask', "\nExample of tokenized attention_mask (first question):"),
    ('labels', "\nExample of tokenized labels (first answer):"),
):
    print(_caption)
    print(tokenized_train_dataset[0][_column])

# The same example decoded back to text
print("\nDecoded input_ids (first question):")
print(tokenizer.decode(tokenized_train_dataset[0]['input_ids'], skip_special_tokens=True))
print("\nDecoded labels (first answer):")
# -100 marks masked padding and is not a valid token id, so drop it first
_kept_label_ids = [tok for tok in tokenized_train_dataset[0]['labels'].tolist() if tok != -100]
print(tokenizer.decode(_kept_label_ids, skip_special_tokens=True))


In [None]:
import evaluate
import numpy as np

# --- Define compute_metrics function for the Trainer ---
# Metric objects loaded once at cell level and reused by compute_metrics below.
bleu_metric = evaluate.load("bleu")    # n-gram overlap
squad_metric = evaluate.load("squad")  # exact match and F1


def compute_metrics(eval_pred):
    """Compute BLEU and SQuAD-style EM/F1 for the t5-small experiments.

    ``eval_pred.predictions`` (or its first element when it is a tuple) is
    arg-maxed over the last axis to obtain token ids; ``-100`` entries in
    ``eval_pred.label_ids`` are replaced by the pad token before decoding.
    Returns a dict with ``bleu``, ``squad_f1`` and ``squad_exact_match``,
    each rounded to 4 decimals.
    """
    logits, label_ids = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple):
        logits = logits[0]

    token_ids = np.argmax(logits, axis=-1)
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # BLEU takes a list of reference lists: one single-element list per prediction
    bleu_results = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels],
    )

    # SQuAD metric input: ids are positional, answer_start is a placeholder
    squad_results = squad_metric.compute(
        predictions=[
            {"id": str(idx), "prediction_text": text}
            for idx, text in enumerate(decoded_preds)
        ],
        references=[
            {"id": str(idx), "answers": {"answer_start": [0], "text": [text]}}
            for idx, text in enumerate(decoded_labels)
        ],
    )

    return {
        "bleu": round(bleu_results["bleu"], 4),
        "squad_f1": round(squad_results["f1"], 4),
        "squad_exact_match": round(squad_results["exact_match"], 4)
    }



In [None]:
# Trainer Setup and Training
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
import os

# --- Configuration ---
MODEL_NAME = 't5-small'
LEARNING_RATE = 2e-5
BATCH_SIZE = 8
NUM_EPOCHS = 3
# Output directory for saving model checkpoints
OUTPUT_DIR = "./t5_breast_cancer_chatbot"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no-op when the directory already exists

# --- Load model ---
# Reload the pretrained weights so this run starts from the base checkpoint.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Must match eval_strategy when load_best_model_at_end is on
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    save_total_limit=1,  # Bound disk usage; the best checkpoint is still retained
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Without load_best_model_at_end the metric_for_best_model setting is ignored
    # and the weights saved below are simply those of the last epoch.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # For eval_loss, smaller is better
)

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Train ---
print("Starting training...")
trainer.train()

# --- Evaluation ---
print("\nStarting evaluation...")
eval_results = trainer.evaluate()
print(eval_results)

# --- Save model and tokenizer ---
# Both go to the same directory so the pair can be reloaded from one path.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\n✅ Model and tokenizer saved to {OUTPUT_DIR}")

In [None]:
# Model Evaluation and Perplexity Calculation

# --- Evaluation ---
print("\nStarting final evaluation...")
eval_results = trainer.evaluate()

print("\n--- Evaluation Results ---")
# Build one report line per metric. The metrics returned by compute_metrics (plus
# the loss) get fixed labels; any other key gets a title-cased label derived from
# its name.
for metric_key, metric_value in eval_results.items():
    if metric_key == 'eval_loss':
        report_line = f"Loss: {metric_value:.4f}"
    elif metric_key == 'eval_bleu':
        report_line = f"BLEU Score: {metric_value:.4f}"
    elif metric_key == 'eval_squad_f1':
        report_line = f"F1 Score (SQuAD): {metric_value:.4f}"
    elif metric_key == 'eval_squad_exact_match':
        report_line = f"Exact Match (SQuAD): {metric_value:.4f}"
    elif metric_key.startswith('eval_'):
        # Remaining 'eval_' entries: drop the marker and print with 4 decimals.
        report_line = f"{metric_key.replace('eval_', '').replace('_', ' ').title()}: {metric_value:.4f}"
    else:
        # Keys without the 'eval_' prefix are printed with their value unformatted.
        report_line = f"{metric_key.replace('_', ' ').title()}: {metric_value}"
    print(report_line)


# --- Perplexity Calculation (from eval_loss) ---
# Perplexity is reported as exp(eval_loss); it cannot be derived without the loss.
if 'eval_loss' not in eval_results:
    print("\n'eval_loss' not found in evaluation results. Cannot calculate perplexity.")
else:
    perplexity = np.exp(eval_results['eval_loss'])
    print(f"Perplexity: {perplexity:.4f}")

print("\n--- Evaluation Complete ---")

In [None]:
# Trainer Setup and Training
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# --- Configuration (for tuning) ---
MODEL_NAME = 't5-small'
LEARNING_RATE = 1e-5
BATCH_SIZE = 16
NUM_EPOCHS = 5
# Output directory for saving model checkpoints
OUTPUT_DIR = "./t5_breast_cancer_chatbot_tuned_v1"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no-op when the directory already exists

# --- Load model ---
# Reload the pretrained weights so this run does not continue from an earlier fine-tune.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Must match eval_strategy when load_best_model_at_end is on
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,  # Warm up over the first 10% of total training steps
    save_total_limit=1,  # Bound disk usage; the best checkpoint is still retained
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Without load_best_model_at_end the two settings below are ignored and the
    # weights saved at the end are simply those of the last epoch.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # For eval_loss, smaller is better
)

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Train ---
print("Starting training...")
trainer.train()

# --- Evaluation ---
print("\nStarting evaluation...")
eval_results = trainer.evaluate()
print(eval_results)

# --- Save model and tokenizer ---
# Both go to the same directory so the pair can be reloaded from one path.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\n✅ Model and tokenizer saved to {OUTPUT_DIR}")

In [None]:
# Model Evaluation and Perplexity Calculation

# --- Evaluation ---
print("\nStarting final evaluation...")
eval_results = trainer.evaluate()

print("\n--- Evaluation Results ---")
# One line per entry of the evaluation dict: fixed labels for the loss and the
# BLEU / SQuAD metrics, a label derived from the key for everything else.
for name, score in eval_results.items():
    if name == 'eval_loss':
        text = f"Loss: {score:.4f}"
    elif name == 'eval_bleu':
        text = f"BLEU Score: {score:.4f}"
    elif name == 'eval_squad_f1':
        text = f"F1 Score (SQuAD): {score:.4f}"
    elif name == 'eval_squad_exact_match':
        text = f"Exact Match (SQuAD): {score:.4f}"
    elif name.startswith('eval_'):
        # Any other 'eval_' entry: strip the marker, title-case, 4 decimals.
        text = f"{name.replace('eval_', '').replace('_', ' ').title()}: {score:.4f}"
    else:
        # Entries without the 'eval_' prefix are shown with their raw value.
        text = f"{name.replace('_', ' ').title()}: {score}"
    print(text)


# --- Perplexity Calculation (from eval_loss) ---
# exp(eval_loss) is reported as perplexity; skip with a message if the loss is absent.
if 'eval_loss' not in eval_results:
    print("\n'eval_loss' not found in evaluation results. Cannot calculate perplexity.")
else:
    perplexity = np.exp(eval_results['eval_loss'])
    print(f"Perplexity: {perplexity:.4f}")

print("\n--- Evaluation Complete ---")

In [None]:
# --- Configuration (for tuning) ---
MODEL_NAME = 't5-small'
LEARNING_RATE = 3e-5
BATCH_SIZE = 16
NUM_EPOCHS = 10  # More epochs to give the model more time to converge
# Output directory for saving model checkpoints
OUTPUT_DIR = "./t5_chatbot_tuned_v2"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no-op when the directory already exists

# --- Load model ---
# Reload the pretrained weights so this run does not continue from an earlier fine-tune.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Must match eval_strategy when load_best_model_at_end is on
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,  # Warm up over the first 10% of total training steps
    save_total_limit=1,  # Bound disk usage; the best checkpoint is still retained
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    load_best_model_at_end=True,  # Restore the best checkpoint once training ends
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # For eval_loss, smaller is better
)

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Train ---
print("Starting training...")
trainer.train()

# --- Evaluation ---
print("\nStarting evaluation...")
eval_results = trainer.evaluate()
print(eval_results)

# --- Save model and tokenizer ---
# Both go to the same directory so the pair can be reloaded from one path.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\n✅ Model and tokenizer saved to {OUTPUT_DIR}")