## Fine tuning T5 model to map english to reverse-english

* e.g., 'hello world' -> 'world hello'
* reverse-english: tokenized input reversed

In [46]:
!pip install transformers
!pip install datasets
!pip install nltk
!pip install numpy
!pip install rouge_score
!pip install torch torchvision torchaudio
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [47]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorWithPadding
from datasets import load_dataset, load_metric
import nltk
import torch

nltk.download('punkt')
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
dataset = load_dataset("multi_news")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/9507060efcd5189100109e25df8326eb07274a36/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_bucke

  0%|          | 0/3 [00:00<?, ?it/s]

## Loading and Preprocessing Dataset

In [48]:
train, val, test = dataset['train'], dataset['validation'], dataset['test']
train.shape, val.shape, test.shape

((44972, 2), (5622, 2), (5622, 2))

\</s> -> end of sequence token
\<unk> -> unknown token
\<pad> -> pad token

In [102]:
test = tokenizer("hello world it is cool outside and i like transformers.")
print(f"english: {tokenizer.decode(test['input_ids'])}")

rev_sent = tokenizer.decode(test['input_ids'][::-1][1:])
print(f"reverse-english:{rev_sent}")
tokenizer.encode(rev_sent)
tokenizer.decode(tokenizer.encode(rev_sent))

english: hello world it is cool outside and i like transformers.</s>
reverse-english:.s transformer likei  and outside cool is it world hello


'.s transformer likei and outside cool is it world hello</s>'

In [95]:
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

In [107]:
# create processed input and processed output columns
def process(row):
  """
  make input a question answer prompt
  question: What is this sentence in reverse?
  context: og sentence
  answer: sentence in reverse english
  (0) split by sentences, take first sentence per article
  (1) prepend question before sentence
  """
  # processs
  sentence = nltk.sent_tokenize(row['summary'][2:])[0]# each summary prefixed by '- '
  # rev_sentence = " ".join(reversed(sentence[:-1].split()))
  processed_input = f'question: What is this text in reverse?  context: {sentence}'
  # tokenize
  model_input = tokenizer(processed_input, max_length=128, truncation=True)
  og_input =  tokenizer(sentence, max_length=128, truncation=True)

  rev_sent = tokenizer.decode(og_input['input_ids'][::-1][1:]) # skip eos token
  model_target = tokenizer.encode(rev_sent) # encode reverse sentence
  model_input['labels'] = model_target
  return model_input

processed_dataset = dataset.map(process)

  0%|          | 0/44972 [00:00<?, ?ex/s]

  0%|          | 0/5622 [00:00<?, ?ex/s]

  0%|          | 0/5622 [00:00<?, ?ex/s]

In [109]:
# pre-processing sanity checks
processed_train = processed_dataset['train']
processed_val = processed_dataset['validation']
processed_test = processed_dataset['test']
print(tokenizer.decode(processed_test['input_ids'][0]) + '\n')
print(tokenizer.decode(processed_test['labels'][0]))

question: What is this text in reverse? context: It's a race for the governor's mansion in 11 states today, and the GOP could end the night at the helm of more than two-thirds of the 50 states.</s>

. states 50 the ofsthird- two than more ofhelm the at night the end could GOP the and, today states 11 insion mans' governor the for racea s' It</s>


In [110]:
token_lens = torch.tensor([len(row['input_ids']) for row in processed_train])
print(f'max tokenized input size: {torch.max(token_lens)}')

max tokenized input size: 128


In [111]:
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5622
    })
})

## Training Model

In [112]:
import numpy as np
import nltk
# rouge metric - https://towardsdatascience.com/the-ultimate-performance-metric-in-nlp-111df6c64460#:~:text=ROUGE%2DN%20measures%20the%20number,consist%20of%20a%20single%20word.

# for model evaluation
metric = load_metric('rouge')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


In [117]:
# train model
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

batch_size = 8
model_name = "t5-small-english-reverse-english-v2"
model_dir = f"{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=400,
    logging_strategy="steps",
    logging_steps=400,
    save_strategy="steps",
    save_steps=400,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [118]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

trainer = Seq2SeqTrainer(
    AutoModelForSeq2SeqLM.from_pretrained('t5-small'),
    args=args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/9507060efcd5189100109e25df8326eb07274a36/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
   

In [119]:
# finally train
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary. If document, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 44972
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5622
  Number of trainable parameters = 60506624


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
400,4.441,1.287731,50.9988,31.7717,49.1679,49.1906,18.3718
800,1.6065,0.390891,66.6111,60.4096,66.4276,66.4316,18.3947
1200,0.8256,0.247632,68.5366,65.0932,68.4872,68.4816,18.4112
1600,0.5726,0.205685,69.0664,66.3378,69.0569,69.042,18.4379
2000,0.4445,0.184036,69.2624,66.7852,69.2572,69.2488,18.4427
2400,0.3807,0.173495,69.3456,66.9235,69.3389,69.3347,18.4466
2800,0.3341,0.158841,69.4655,67.1873,69.4572,69.4525,18.4502
3200,0.3095,0.153117,69.5056,67.3087,69.4989,69.4918,18.4516


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary. If document, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5622
  Batch size = 8
Saving model checkpoint to t5-small-english-reverse-english-v2/checkpoint-400
Configuration saved in t5-small-english-reverse-english-v2/checkpoint-400/config.json
Model weights saved in t5-small-english-reverse-english-v2/checkpoint-400/pytorch_model.bin
tokenizer config file saved in t5-small-english-reverse-english-v2/checkpoint-400/tokenizer_config.json
Special tokens file saved in t5-small-english-reverse-english-v2/checkpoint-400/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary. If document, summary are n

KeyboardInterrupt: ignored

### stopped early to save compute as loss rate appears to be plateauing

In [120]:
from google.colab import drive
from google.colab import files

!zip -r t5-small-english-reverse-english-v2.zip t5-small-english-reverse-english-v2

  adding: t5-small-english-reverse-english-v2/ (stored 0%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/ (stored 0%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/tokenizer_config.json (deflated 83%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/tokenizer.json (deflated 74%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/pytorch_model.bin (deflated 9%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/trainer_state.json (deflated 76%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/rng_state.pth (deflated 28%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/special_tokens_map.json (deflated 86%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/training_args.bin (deflated 48%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/scheduler.pt (deflated 49%)
  adding: t5-small-english-reverse-english-v2/checkpoint-3200/config.json (deflated 62%)
  adding: t5-small-

In [122]:
!cp t5-small-english-reverse-english-v2.zip 'gdrive/MyDrive/Colab Notebooks/'