In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
import os
import re
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
)

In [3]:
# Configuration
model_checkpoint = "t5-small"  # You can try "t5-base" if you have enough VRAM
output_dir = "./models/t5_task_b_model_v2"
train_file = "../dataset/task_b+c/data/challenge_data/train.csv"
val_file = "../dataset/task_b+c/data/challenge_data/valid.csv"

max_input_length = 768
max_target_length = 512
num_train_epochs = 5
batch_size = 2
learning_rate = 3e-5
logging_steps = 50

In [4]:
# Load and preprocess dataset
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path)
    df["input_text"] = "Dialogue: " + df["dialogue"].str.replace(r"\s+", " ", regex=True).str.strip()
    df["target_text"] = df["note"].str.replace(r"\s+", " ", regex=True).str.strip()
    return Dataset.from_pandas(df[["input_text", "target_text"]])

train_dataset = load_and_preprocess(train_file)
val_dataset = load_and_preprocess(val_file)

In [5]:
# Tokenization
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

def tokenize(example):
    model_inputs = tokenizer(
        example["input_text"], max_length=max_input_length, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        example["target_text"], max_length=max_target_length, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [6]:
# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_dir="./logs_taskb_v2",
    logging_strategy="steps",
    logging_steps=logging_steps,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,  # ✅ Keep best model
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    report_to="none"
)



In [7]:
# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

  trainer = Seq2SeqTrainer(


In [8]:
# Train and Save
trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Training complete. Model saved to:", output_dir)

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,4.816385
2,6.715100,4.166975
3,4.480200,3.87141
4,4.480200,3.713581
5,4.191700,3.656216


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Training complete. Model saved to: ./models/t5_task_b_model_v2
