In [4]:
import os
import re
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

In [5]:
model_checkpoint = "t5-base"
output_dir = "./models/t5_task_a_model"
train_file = "../dataset/task_a/MTS-Dialog-TrainingSet.csv"
val_file = "../dataset/task_a/MTS-Dialog-ValidationSet.csv"
max_input_length = 512
max_target_length = 128
num_train_epochs = 3
batch_size = 4
learning_rate = 3e-5
logging_steps = 100

In [6]:
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path)
    df["input_text"] = "Dialogue: " + df["dialogue"].str.replace(r"\s+", " ", regex=True).str.strip()
    df["target_text"] = (
        "Section: " + df["section_header"].str.strip() +
        " Summary: " + df["section_text"].str.replace(r"\s+", " ", regex=True).str.strip()
    )
    return Dataset.from_pandas(df[["input_text", "target_text"]])

train_dataset = load_and_preprocess(train_file)
val_dataset = load_and_preprocess(val_file)


In [7]:
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

def tokenize(example):
    model_inputs = tokenizer(
        example["input_text"], max_length=max_input_length, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        example["target_text"], max_length=max_target_length, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=logging_steps,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)


  trainer = Seq2SeqTrainer(


In [9]:
trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.8941,0.784557
2,0.8189,0.743147
3,0.7356,0.73324


('./models/t5_task_a_model\\tokenizer_config.json',
 './models/t5_task_a_model\\special_tokens_map.json',
 './models/t5_task_a_model\\spiece.model',
 './models/t5_task_a_model\\added_tokens.json')