In [1]:
# environment setting 
# https://huggingface.co/course/chapter0/1?fw=pt

!pip install transformers sentencepiece datasets nltk evaluate torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from huggingface_hub import notebook_login
# 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

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
# !apt install git-lfs

In [3]:
import transformers

print(transformers.__version__)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

4.23.1


In [4]:
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("scan", "simple")
raw_datasets



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['commands', 'actions'],
        num_rows: 16728
    })
    test: Dataset({
        features: ['commands', 'actions'],
        num_rows: 4182
    })
})

In [5]:
# data preprocessing
'''
Result: 
DatasetDict({
    train: Dataset({
        features: ['commands', 'actions', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16728
    })
    test: Dataset({
        features: ['commands', 'actions', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4182
    })
})
'''
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq

model_checkpoint = "t5-small" 
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# encode feature/target and prepare dynamic padding 
max_input_length = 512
max_target_length = 512

def preprocess_function(examples):
    inputs = [doc for doc in examples["commands"]]
    model_inputs = tokenizer(inputs, padding="longest", max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["actions"], padding="longest", max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)  
data_collator = DataCollatorForSeq2Seq(tokenizer) # this is needed for dynamic padding

tokenized_datasets

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


DatasetDict({
    train: Dataset({
        features: ['commands', 'actions', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16728
    })
    test: Dataset({
        features: ['commands', 'actions', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4182
    })
})

In [None]:
summary_ids = model.generate(tokens_input, min_length=80,
                             max_length=150,
                             length_penalty=20, 
                             num_beams=2)

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [6]:
# fine tuning
from transformers import Seq2SeqTrainingArguments, T5ForConditionalGeneration, Seq2SeqTrainer

# args = TrainingArguments("t5-test-trainer-0929")
# define args for finetuning
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
print(model_name)
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-yoon_1014",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    # fp16=True,
    push_to_hub=True,
)

# suggestion: T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint) # need to check if this is correct model 

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

t5-small


/content/t5-small-finetuned-yoon_1014 is already a clone of https://huggingface.co/yk2678/t5-small-finetuned-yoon_1014. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: actions, commands. If actions, commands are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16728
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 20910


Epoch,Training Loss,Validation Loss


In [None]:
predictions = trainer.generate(tokenized_datasets["test"])
print(predictions)

The following columns in the test set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: commands, actions. If commands, actions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4182
  Batch size = 8


PredictionOutput(predictions=array([[    0,    27,   834, ...,   834, 27262,    27],
       [    0,    27,   834, ...,    27,   834, 12054],
       [    0,    27,   834, ...,    27,   834,  5017],
       ...,
       [    0,    27,   834, ...,    27,   834, 21164],
       [    0,    27,   834, ...,    27,   834,  5017],
       [    0,    27,   834, ...,   834, 27262,    27]]), label_ids=array([[   27,   834, 21164, ...,     0,     0,     0],
       [   27,   834, 21164, ...,     0,     0,     0],
       [   27,   834, 21164, ...,     0,     0,     0],
       ...,
       [   27,   834, 21164, ...,  -100,  -100,  -100],
       [   27,   834, 21164, ...,  -100,  -100,  -100],
       [   27,   834, 21164, ...,  -100,  -100,  -100]]), metrics={'test_loss': 0.004545229021459818, 'test_runtime': 111.8764, 'test_samples_per_second': 37.381, 'test_steps_per_second': 4.675})


In [None]:
predictions

'I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I'

In [None]:
# y_sample = [tokenizer.convert_ids_to_tokens(pred) for pred in predictions[1]]
# y_sample

In [None]:
# y_pred = [tokenizer.convert_ids_to_tokens(pred) for pred in predictions[0]]
y_pred = [tokenizer.convert_ids_to_tokens(pred) for pred in predictions]
y_pred

[['<pad>',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I'],
 ['<pad>',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'WA',
  'LK',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'WA'],
 ['<pad>',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'LO',
  'OK',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'LO'],
 ['<pad>',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'LE',
  'FT',
  '▁I',
  '_',
  'J',
  'UMP',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'LE',
  'FT',
  '▁I'],
 ['<pad>',
  '▁I',
  '_',
  'R',
  'UN',
  '▁I',
  '_',
  'R',
  'UN',
  '▁I',
  '_',
  'R',
  'UN',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'LE',
  'FT'],
 ['<pad>',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'WA',
  'LK',
  '▁I',
  '_',
  'TUR',
  'N',
  '_',
  'RIGHT',
  '▁I',
  '_',
  'WA'],
 ['<pad>',
  '▁I',
  '_',


In [None]:
y_pred_v2 = [''.join(t).replace('▁', ' ') for t in y_pred]
y_pred_v2

['<pad> I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 '<pad> I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WA',
 '<pad> I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LO',
 '<pad> I_TURN_LEFT I_JUMP I_TURN_LEFT I',
 '<pad> I_RUN I_RUN I_RUN I_TURN_LEFT',
 '<pad> I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WA',
 '<pad> I_TURN_LEFT I_JUMP I_TURN_LEFT I',
 '<pad> I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_TUR',
 '<pad> I_TURN_LEFT I_LOOK I_TURN_LEFT I',
 '<pad> I_TURN_LEFT I_TURN_LEFT I_LOOK I',
 '<pad> I_TURN_RIGHT I_TURN_RIGHT I_RUN I_TUR',
 '<pad> I_LOOK I_LOOK I_TURN_RIGHT I_TURN_',
 '<pad> I_TURN_LEFT I_TURN_LEFT I_WALK I',
 '<pad> I_TURN_LEFT I_LOOK I_TURN_LEFT I',
 '<pad> I_RUN I_RUN I_TURN_LEFT I_WALK',
 '<pad> I_TURN_RIGHT I_TURN_RIGHT I_JUMP I_TUR',
 '<pad> I_TURN_LEFT I_TURN_LEFT I_JUMP I',
 '<pad> I_TURN_LEFT I_TURN_LEFT I_RUN I',
 '<pad> I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 '<pad> I_LOOK I_TURN_LEFT I_TURN_LEFT I',
 '<pad> I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 '<pad> I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGH

In [None]:
y_pred_v3 = [''.join(t).replace('<pad>', '') for t in y_pred_v2]
y_pred_v3

[' I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 ' I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WA',
 ' I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LO',
 ' I_TURN_LEFT I_JUMP I_TURN_LEFT I',
 ' I_RUN I_RUN I_RUN I_TURN_LEFT',
 ' I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WA',
 ' I_TURN_LEFT I_JUMP I_TURN_LEFT I',
 ' I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_TUR',
 ' I_TURN_LEFT I_LOOK I_TURN_LEFT I',
 ' I_TURN_LEFT I_TURN_LEFT I_LOOK I',
 ' I_TURN_RIGHT I_TURN_RIGHT I_RUN I_TUR',
 ' I_LOOK I_LOOK I_TURN_RIGHT I_TURN_',
 ' I_TURN_LEFT I_TURN_LEFT I_WALK I',
 ' I_TURN_LEFT I_LOOK I_TURN_LEFT I',
 ' I_RUN I_RUN I_TURN_LEFT I_WALK',
 ' I_TURN_RIGHT I_TURN_RIGHT I_JUMP I_TUR',
 ' I_TURN_LEFT I_TURN_LEFT I_JUMP I',
 ' I_TURN_LEFT I_TURN_LEFT I_RUN I',
 ' I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 ' I_LOOK I_TURN_LEFT I_TURN_LEFT I',
 ' I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 ' I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 ' I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_LO',
 ' I_WALK I_WALK I_WALK I_TURN_LEFT',
 ' I_TURN_RIGHT I_TURN_

In [None]:
y_pred_v4 = [t[1:] for t in y_pred_v3]
y_pred_v4

['I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 'I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WA',
 'I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LO',
 'I_TURN_LEFT I_JUMP I_TURN_LEFT I',
 'I_RUN I_RUN I_RUN I_TURN_LEFT',
 'I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WA',
 'I_TURN_LEFT I_JUMP I_TURN_LEFT I',
 'I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_TUR',
 'I_TURN_LEFT I_LOOK I_TURN_LEFT I',
 'I_TURN_LEFT I_TURN_LEFT I_LOOK I',
 'I_TURN_RIGHT I_TURN_RIGHT I_RUN I_TUR',
 'I_LOOK I_LOOK I_TURN_RIGHT I_TURN_',
 'I_TURN_LEFT I_TURN_LEFT I_WALK I',
 'I_TURN_LEFT I_LOOK I_TURN_LEFT I',
 'I_RUN I_RUN I_TURN_LEFT I_WALK',
 'I_TURN_RIGHT I_TURN_RIGHT I_JUMP I_TUR',
 'I_TURN_LEFT I_TURN_LEFT I_JUMP I',
 'I_TURN_LEFT I_TURN_LEFT I_RUN I',
 'I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 'I_LOOK I_TURN_LEFT I_TURN_LEFT I',
 'I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 'I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 'I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_LO',
 'I_WALK I_WALK I_WALK I_TURN_LEFT',
 'I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I',
 '

In [None]:
# evaluate metric (EM)
# loading exact match metric
import numpy as np

from evaluate import load
exact_match_metric = load("exact_match")

wrong = []
res = 0.0
for i in range(len(tokenized_datasets["test"]['actions'])):
    predictions = y_pred_v4[i] 
    references = tokenized_datasets["test"]['actions'][i] 
    min_length = min(len(predictions), len(references))
    
    match = exact_match_metric.compute(predictions=predictions[:min_length], references=references[:min_length])['exact_match']
    
    if match != 1.0: wrong.append([i, match])
    res += exact_match_metric.compute(predictions=predictions[:min_length], references=references[:min_length])['exact_match']
    
print(res)

4179.529602338644


In [None]:
score = res / len(tokenized_datasets["test"]['actions'])
print('score: ', score)

score:  0.999409278416701


In [None]:
len(y_pred_v4[0]), len(tokenized_datasets["test"]['actions'][0])

(40, 101)

In [None]:
y_pred_v4[1], tokenized_datasets["test"]['actions'][1]

('I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WA',
 'I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN')

In [None]:
len(tokenized_datasets["test"]['actions'])

4182

In [None]:
wrong

[[84, 0.8709677419354839],
 [379, 0.7894736842105263],
 [1161, 0.8709677419354839],
 [1224, 0.8666666666666667],
 [1468, 0.8709677419354839],
 [1684, 0.7837837837837838],
 [2157, 0.5769230769230769],
 [2303, 0.8709677419354839],
 [2680, 0.8260869565217391],
 [2738, 0.717948717948718],
 [3317, 0.48484848484848486]]

In [None]:
for i, k in wrong:
    print('Wrong case:', i, 'Score: ', k)
    print(y_pred_v4[i])
    print(tokenized_datasets["test"]['actions'][i])
    print()

Wrong case: 84 Score:  0.8709677419354839
I_WALK I_WALK I_JUMP I_JUMP</s>
I_WALK I_WALK I_JUMP I_JUMP I_JUMP

Wrong case: 379 Score:  0.7894736842105263
I_TURN_RIGHT I_TURN_RIGHT I_LOOK</s>
I_TURN_RIGHT I_LOOK

Wrong case: 1161 Score:  0.8709677419354839
I_LOOK I_LOOK I_LOOK I_LOOK</s>
I_LOOK I_LOOK I_LOOK I_LOOK I_LOOK

Wrong case: 1224 Score:  0.8666666666666667
I_TURN_LEFT I_TURN_LEFT I_TURN_
I_TURN_LEFT I_TURN_LEFT I_LOOK

Wrong case: 1468 Score:  0.8709677419354839
I_JUMP I_JUMP I_JUMP I_JUMP</s>
I_JUMP I_JUMP I_JUMP I_JUMP I_JUMP

Wrong case: 1684 Score:  0.7837837837837838
I_TURN_RIGHT I_TURN_RIGHT I_TURN_LEFT
I_TURN_RIGHT I_TURN_RIGHT I_RUN I_RUN

Wrong case: 2157 Score:  0.5769230769230769
I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_LO
I_TURN_RIGHT I_LOOK I_LOOK

Wrong case: 2303 Score:  0.8709677419354839
I_TURN_LEFT I_TURN_LEFT I_TURN_
I_TURN_LEFT I_TURN_LEFT I_RUN I_RUN

Wrong case: 2680 Score:  0.8260869565217391
I_RUN I_LOOK I_LOOK</s>
I_RUN I_LOOK I_LOOK I_LOOK

Wrong case: 2738 