In [7]:
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Dataset previously written with `save_to_disk`
# (presumably already tokenized, judging by the directory name — confirm).
dataset = load_from_disk("../data/tokenized_squad_small")

# Tokenizer and seq2seq model are both loaded from the same checkpoint name.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

# Bare last expression: renders the module tree as the cell output.
model


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [8]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)


In [9]:
from transformers import TrainingArguments

# Fine-tuning hyperparameters. Checkpoints are written under ../model, and
# evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="../model",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    # Mixed-precision fp16 requires a CUDA device. The notebook falls back to
    # CPU when no GPU is present, so enable fp16 only when CUDA is available
    # instead of failing at argument validation on CPU-only machines.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",  # do not send metrics to any external tracker
)


In [10]:
from transformers import Trainer

# Wire the model, arguments, dataset splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # `tokenizer=` is deprecated in Trainer.__init__ (it emits a FutureWarning
    # on recent transformers releases); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [11]:
# Run the full training loop, then show the returned summary as the cell output.
train_output = trainer.train()
train_output


Epoch,Training Loss,Validation Loss
1,0.0392,0.040439
2,0.0337,0.041152
3,0.0273,0.041577


TrainOutput(global_step=3750, training_loss=0.09189107964833577, metrics={'train_runtime': 685.617, 'train_samples_per_second': 43.756, 'train_steps_per_second': 5.47, 'total_flos': 4060254044160000.0, 'train_loss': 0.09189107964833577, 'epoch': 3.0})

In [12]:
# Write the fine-tuned model and its tokenizer into the same directory.
final_dir = "../model/final"
trainer.save_model(final_dir)
# Last expression: the tuple of file paths written by the tokenizer.
tokenizer.save_pretrained(final_dir)


('../model/final\\tokenizer_config.json',
 '../model/final\\special_tokens_map.json',
 '../model/final\\spiece.model',
 '../model/final\\added_tokens.json',
 '../model/final\\tokenizer.json')