In [2]:
# Mount Google Drive at /content/drive (Colab-only: relies on the google.colab package).

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Root folder on the mounted Drive; the dataset shard, base model, results and
# fine-tuned model paths used below are all built from it.

BASE_PATH = '/content/drive/MyDrive/AI/tcfase3'

In [4]:
# Necessary installations

%%bash
pip install torch
pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 471.6/471.6 kB 32.0 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 11.5 MB/s eta 0:00:00
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.9/39.9 MB 57.0 MB/s eta 0:00:00
Downloading multi

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.


In [5]:
# Necessary Imports

from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
import nltk
import evaluate
import numpy as np
import torch

In [6]:
# Load one JSON shard of the dataset and split it into train/test subsets,
# storing the result in the `dataset` variable.

DATASET_INDEX = 10     # which dataset_part_<N>.json shard to fine-tune on
TEST_FRACTION = 0.05   # share of the shard held out for evaluation
SPLIT_SEED = 42        # fixed seed so the train/test partition is the same on every run

raw_dataset = load_dataset('json', data_files=f"{BASE_PATH}/dataset_part_{DATASET_INDEX}.json")
# The loaded file is exposed under the "train" key; split that into train/test.
dataset = raw_dataset["train"].train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 19000
    })
    test: Dataset({
        features: ['title', 'content'],
        num_rows: 1000
    })
})


In [7]:
# Load the base T5 checkpoint (model + tokenizer) from Drive and build the
# seq2seq data collator that batches examples for it.

BASE_MODEL_PATH = f"{BASE_PATH}/base_model"
OUTPUT_MODEL_PATH = f"{BASE_PATH}/fine_tuned_model"  # destination for the fine-tuned weights

tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_PATH)
model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_PATH)
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)


In [8]:
# Prepare data for fine-tuning

def preprocess_function(examples, max_input_length=300, max_target_length=650,
                        prompt_prefix="Please describe this product: "):
    """Tokenize a batch of examples into model inputs and labels.

    Each title is prefixed with ``prompt_prefix`` and tokenized as the model
    input; the matching ``content`` text is tokenized as the target and stored
    under ``"labels"``. Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with parallel ``"title"`` and ``"content"``
            lists (the shape ``Dataset.map(..., batched=True)`` passes in).
        max_input_length: Token limit for the prompt; longer prompts are truncated.
        max_target_length: Token limit for the target; longer targets are truncated.
        prompt_prefix: Instruction text placed before every title.

    Returns:
        The tokenizer output for the prompts with an added ``"labels"`` entry
        holding the target token ids.
    """
    prompts = [f"{prompt_prefix}{title}" for title in examples["title"]]
    model_inputs = tokenizer(prompts, max_length=max_input_length, truncation=True)

    targets = tokenizer(text_target=examples["content"],
                        max_length=max_target_length,
                        truncation=True)

    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

# Tokenize both splits; batched=True hands preprocess_function lists of examples at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/19000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
# Sanity check: show the splits and their columns after tokenization.
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 19000
    })
    test: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})


In [10]:
# Define the function to evaluate the metrics, using rouge score

# Sentence-tokenizer data for nltk.sent_tokenize. Older NLTK releases read the
# "punkt" resource while newer ones look up "punkt_tab", so fetch both to keep
# the cell working whichever version ends up installed.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)

# ROUGE metric object used by compute_metrics.
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE scores for a batch of generated predictions.

    Uses the module-level ``tokenizer``, ``metric`` and ``nltk``.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated ids.

    Returns:
        The result of ``metric.compute`` on the decoded, sentence-split texts.
    """
    preds, labels = eval_preds

    # Some models hand back (generated_ids, extra_outputs); only the ids are decoded.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks padded/ignored positions and is not a valid token id, so it
    # cannot be decoded. It can show up in the predictions as well as in the
    # labels, so swap it for the pad token in both before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLsum expects one sentence per line.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
# Hyperparameters and bookkeeping options for the fine-tuning run.

RESULTS_DIR = f"{BASE_PATH}/results"

training_args = Seq2SeqTrainingArguments(
    # --- output / checkpoints ---
    output_dir=RESULTS_DIR,
    save_total_limit=2,              # number of model checkpoints kept from the training run
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=3,              # training epochs
    learning_rate=1e-4,              # learning rate
    weight_decay=0.01,               # regularisation: penalises the model weights to curb overfitting
    per_device_train_batch_size=8,   # training examples per batch
    gradient_accumulation_steps=4,   # simulates larger batches; useful when memory is limited
    # --- evaluation ---
    evaluation_strategy="epoch",
    per_device_eval_batch_size=8,    # evaluation examples per batch
    # NOTE(review): no generation length is configured here, while targets are
    # tokenized up to 650 tokens — confirm the generation settings in effect
    # produce outputs long enough for the ROUGE comparison to be meaningful.
    predict_with_generate=True,
)




In [12]:
# Wire the model, tokenized splits, collator and metric function into the trainer.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

In [13]:
# Fine-tune the model using the trainer configured above
trainer.train()

# Save the fine-tuned model to OUTPUT_MODEL_PATH (under BASE_PATH on the mounted Drive)
trainer.save_model(OUTPUT_MODEL_PATH)

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
0,3.3136,3.065063,0.106513,0.020318,0.083613,0.093346
1,3.2578,3.047984,0.105555,0.020653,0.08284,0.092829
2,3.2318,3.043123,0.105747,0.0198,0.082561,0.092483


