In [None]:
pip install SentencePiece



In [None]:
pip install datasets



In [None]:
pip install transformers[torch]



In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict



# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

#data processing
def load_and_preprocess_dataset(train_fraction=0.95, seed=42):
    """Load DialogSum and attach the text columns used for T5 fine-tuning.

    Every dialogue gets the 'summarize: ' task prefix and is stored as
    ``input_text``; the reference summary is copied to ``target_text``.
    The training split is then randomly subsampled.

    Args:
        train_fraction: Share of the original training split to keep. The
            default of 0.95 matches what this code has always done (the
            old "10% of the data" comment did not match the 0.95 split).
            Values of 1.0 or more keep the whole training split.
        seed: Seed for the random subsample, so re-running the notebook
            selects the same examples. Pass None for an unseeded split.

    Returns:
        DatasetDict with the subsampled 'train' split and the full
        'validation' split.

    Raises:
        ValueError: If ``train_fraction`` is not positive.
    """
    if train_fraction <= 0:
        raise ValueError(f"train_fraction must be > 0, got {train_fraction}")

    dataset = load_dataset("knkarthick/dialogsum")

    def preprocess_function(examples):
        # Called with batched=True, so each column arrives as a list.
        return {
            'input_text': ['summarize: ' + dialog for dialog in examples['dialogue']],
            'target_text': examples['summary'],
        }

    # Only text columns are added here; tokenization is a separate step.
    prepared = dataset.map(preprocess_function, batched=True)

    train_split = prepared["train"]
    if train_fraction < 1.0:
        # train_test_split places `test_size` of the rows in its 'test'
        # output, so the 'test' side is the portion kept for training.
        train_split = train_split.train_test_split(
            test_size=train_fraction, seed=seed
        )['test']

    return DatasetDict({
        'train': train_split,
        'validation': prepared['validation']
    })

#tokenization
def tokenize_function(examples, tokenizer):
    """Tokenize a batch of prefixed dialogues and their reference summaries.

    Args:
        examples: Batch dict with 'input_text' and 'target_text' entries,
            each a list of strings.
        tokenizer: Tokenizer that is callable on a list of strings, accepts
            the ``text_target`` keyword, and exposes ``pad_token_id``.

    Returns:
        The encoded inputs (padded/truncated to 512 tokens) with an added
        'labels' entry holding the target token ids (padded/truncated to
        128 tokens). Padding positions in 'labels' are set to -100 so the
        loss ignores them instead of training the model to emit padding.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the decoder-side text.
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=128,
        truncation=True,
        padding='max_length',
    )

    pad_id = tokenizer.pad_token_id
    # -100 is the label value the loss skips; without this mask the padded
    # tail of each target would count towards the loss.
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Load the pretrained "t5-small" checkpoint: the tokenizer first, then the
# seq2seq model, which is moved onto the selected device in a second step.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

#preparing for training
def _tokenize_batch(batch):
    """Adapter that lets Dataset.map call tokenize_function with the global tokenizer."""
    return tokenize_function(batch, tokenizer)

text_datasets = load_and_preprocess_dataset()
tokenized_datasets = text_datasets.map(_tokenize_batch, batched=True)
# Return only the three listed columns, formatted as torch tensors.
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

#fine-tuning model
# The hyperparameters are gathered in one dict so they can be inspected or
# adjusted in a single place before TrainingArguments is constructed.
training_config = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": './logs',
    "logging_steps": 100,
}
training_args = TrainingArguments(**training_config)

# Hand the model, the training arguments and both dataset splits to a
# Trainer, then run the fine-tuning loop.
train_split = tokenized_datasets['train']
validation_split = tokenized_datasets['validation']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

trainer.train()

#saving fine-tuned model
# The tokenizer is saved next to the weights so the output directory can be
# reloaded with from_pretrained() on its own, without the original checkpoint.
output_model_dir = "./my_fine_tuned_model"
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

# Function to summarize conversation
def summarize(conversation):
    """Generate a summary of a dialogue with the module-level model.

    Args:
        conversation: Raw dialogue text. The 'summarize: ' task prefix is
            prepended here, matching the prefix used during preprocessing.

    Returns:
        The decoded summary string for the top beam, with special tokens
        (such as padding and end-of-sequence markers) stripped.
    """
    # The model may still be in training mode after fine-tuning; eval()
    # disables dropout so beam search yields a stable result.
    model.eval()
    inputs = tokenizer.encode(
        "summarize: " + conversation,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)
    outputs = model.generate(
        inputs,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # skip_special_tokens keeps markers like <pad> and </s> out of the text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


Using device: cuda


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/11837 [00:00<?, ? examples/s]



Step,Training Loss
100,9.2621
200,2.4645
300,0.8595
400,0.6971
500,0.7524
600,0.5946
700,0.5314
800,0.5442
900,0.5347
1000,0.5404


In [None]:
# Evaluate the model on the validation split that was passed to the Trainer
# as eval_dataset. As the last expression in the cell, the return value is
# shown as the cell output.
trainer.evaluate()

In [None]:
pip install rouge_score



In [None]:
# Ask PyTorch to release its cached GPU memory; skipped when CUDA is not
# available.
gpu_present = torch.cuda.is_available()
if gpu_present:
    torch.cuda.empty_cache()