In [None]:
# environment setting 
# https://huggingface.co/course/chapter0/1?fw=pt

!pip install transformers sentencepiece datasets nltk evaluate torch


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 14.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 33.7 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 71.2 MB/s 
Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 37.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-n

In [None]:
import transformers
import torch
import numpy as np
import nltk
nltk.download('punkt')

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
from evaluate import load

# Preprocessing: tokenizes the commands (model inputs) and the actions (target labels).
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs plus target labels.

    Relies on the module-level `tokenizer`.

    Args:
        examples: mapping with a "commands" entry (source texts) and an
            "actions" entry (target texts), as passed by `Dataset.map`.

    Returns:
        The tokenizer output for the commands, with an added "labels" entry
        holding the `input_ids` of the tokenized actions.
    """
    # Both sides are truncated to the tokenizer's own maximum length.
    length_cap = tokenizer.model_max_length

    encoded_inputs = tokenizer(examples["commands"], max_length=length_cap, truncation=True)
    encoded_targets = tokenizer(examples["actions"], max_length=length_cap, truncation=True)

    encoded_inputs["labels"] = encoded_targets["input_ids"]
    return encoded_inputs

# Training function: fine-tunes the model and returns the trainer object that wraps it.
def model_train():
  """Fine-tune the module-level `model` and return the trainer that ran it.

  Reads the module-level `model_name`, `model`, `tokenizer` and
  `tokenized_datasets`. Training uses the "train" split and evaluates on the
  "test" split once per epoch.

  Returns:
    The `Seq2SeqTrainer` instance after `train()` has completed.
  """
  per_device_batch = 8

  # Fine-tuning hyperparameters. Mixed precision (fp16) and pushing to the
  # Hub are intentionally left disabled.
  training_args = Seq2SeqTrainingArguments(
      output_dir=f"{model_name}-scan-finetuned-yoon-1027",
      evaluation_strategy="epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=per_device_batch,
      per_device_eval_batch_size=per_device_batch,
      weight_decay=0.01,
      save_total_limit=3,  # keep at most three checkpoints on disk
      num_train_epochs=20,
      predict_with_generate=True,  # evaluate/predict by generating sequences
  )

  # Collator that assembles tokenized examples into batches for the model
  # (padding behaviour follows the collator's defaults; see the library docs).
  seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  seq2seq_trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["test"],
      data_collator=seq2seq_collator,
      tokenizer=tokenizer,
  )
  seq2seq_trainer.train()

  return seq2seq_trainer

# Decoding function: converts lists of token ids (integers) back into sentences.
def decode_data(prediction, label):
  """Decode generated token ids and reference label ids back into text.

  Relies on the module-level `tokenizer`.

  Args:
    prediction: array-like of generated token ids, one row per example.
    label: array-like of reference token ids, one row per example; entries
      equal to -100 are treated as padding.

  Returns:
    A `(decoded_preds, decoded_labels)` pair of string lists, where each text
    has been stripped and its sentences (per `nltk.sent_tokenize`) joined
    with newlines.
  """
  pad_id = tokenizer.pad_token_id

  # Work on the arguments themselves. The previous body read the notebook
  # globals `predictions` / `label_ids`, so it ignored whatever was passed in
  # and raised NameError when those globals did not exist.
  prediction = np.asarray(prediction)
  label = np.asarray(label)

  # -100 cannot be decoded, so map it to the pad id. Labels carry -100 as
  # padding; predictions are handled the same way defensively.
  pred_ids = np.where(prediction != -100, prediction, pad_id)
  label_ids = np.where(label != -100, label, pad_id)

  decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

  # Normalise to one sentence per line.
  decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(nltk.sent_tokenize(ref.strip())) for ref in decoded_labels]

  # Show the first pair as a sanity check; skipped for empty input so the
  # function does not fail on indexing.
  if decoded_preds and decoded_labels:
    print('##################')
    print('decoded prediction: ', decoded_preds[0])
    print('decoded label: ', decoded_labels[0])

  return decoded_preds, decoded_labels

######### truncate function: THIS FUNCTION SHOULD BE REMOVED
# This is methodologically wrong: it truncates each prediction to the length of its label,
# so a prediction that merely starts with the label is counted as an exact match.
# We do not want to keep this function, but it is still needed for now.
def truncate_data(decoded_preds, decoded_labels):
  """Cut each decoded prediction to the character length of its label.

  Args:
    decoded_preds: sequence of predicted strings.
    decoded_labels: sequence of reference strings; its length decides how
      many pairs are produced.

  Returns:
    A `(test_preds, test_labels)` pair of new lists: predictions sliced to
    `len(label)` characters, and a copy of the labels.

  Raises:
    IndexError: if there are fewer predictions than labels.
  """
  # Index by label position so that a missing prediction fails loudly.
  # (The original append line was missing its closing `]` and `)`.)
  test_preds = [
      decoded_preds[idx][:len(reference)]
      for idx, reference in enumerate(decoded_labels)
  ]
  test_labels = list(decoded_labels)

  return test_preds, test_labels

# evaluate function: this calculates exact match score 
def evaluate_exact_match(decoded_preds, decoded_labels):
  """Compute, print and return the exact-match score of predictions vs. labels.

  Args:
    decoded_preds: predicted strings.
    decoded_labels: reference strings.

  Returns:
    The value stored under 'exact_match' in the metric's result.
  """
  metric = load("exact_match")
  scores = metric.compute(predictions=decoded_preds, references=decoded_labels)

  match = scores['exact_match']
  print('Exact match: ', match)
  return match


if __name__ == "__main__":
  # Load the "simple" configuration of the SCAN dataset.
  raw_datasets = load_dataset("scan", "simple")

  # Smoke test: uncomment to run on a 500-example subset of each split.
  # raw_datasets['train'] = raw_datasets["train"].shuffle(seed=42).select(range(500))
  # raw_datasets['test'] = raw_datasets["test"].shuffle(seed=42).select(range(500))

  # Checkpoint to fine-tune (can be swapped); the last path component names
  # the output directory used by model_train().
  model_checkpoint = "t5-small"
  model_name = model_checkpoint.rsplit("/", 1)[-1]

  # Tokenizer and model are module-level on purpose: the helper functions
  # above read them as globals.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

  # Tokenize every split in batches.
  tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  # Fine-tune; the returned trainer is reused for prediction below.
  trainer = model_train()

  # Longest tokenized label in the test split (0 if the split is empty).
  max_length = max((len(ids) for ids in tokenized_datasets['test']['labels']), default=0)

  # Pass the generation length explicitly: without it the predictions were
  # observed to be cut off at 20 tokens (presumably a generation default --
  # TODO confirm against the model's generation settings).
  predictions, label_ids, _ = trainer.predict(tokenized_datasets["test"], max_length=max_length)

  print('prediction: ', predictions[0])
  print('label: ', label_ids[0])

  # Token ids -> text.
  decoded_preds, decoded_labels = decode_data(predictions, label_ids)

  #### IMPORTANT #####################################
  #### Predictions are still truncated to the label length before scoring;
  #### this step is a known workaround that should eventually be removed.
  test_preds, test_labels = truncate_data(decoded_preds, decoded_labels)
  print('truncated_prediction: ', test_preds[0])
  print('truncated_label: ', test_labels[0])

  # Exact-match score on the (truncated) predictions.
  match = evaluate_exact_match(test_preds, test_labels)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/2 [00:00<?, ?it/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/5 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: commands, actions. If commands, actions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16728
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 41820
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.1086,0.052165
2,0.058,0.030111
3,0.0389,0.031016
4,0.0304,0.012465
5,0.0256,0.012969
6,0.021,0.009573
7,0.0177,0.01162
8,0.0159,0.008016
9,0.0145,0.013507
10,0.0134,0.008969


Saving model checkpoint to t5-small-scan-finetuned-yoon-1027/checkpoint-500
Configuration saved in t5-small-scan-finetuned-yoon-1027/checkpoint-500/config.json
Model weights saved in t5-small-scan-finetuned-yoon-1027/checkpoint-500/pytorch_model.bin
tokenizer config file saved in t5-small-scan-finetuned-yoon-1027/checkpoint-500/tokenizer_config.json
Special tokens file saved in t5-small-scan-finetuned-yoon-1027/checkpoint-500/special_tokens_map.json
Copy vocab file to t5-small-scan-finetuned-yoon-1027/checkpoint-500/spiece.model
Saving model checkpoint to t5-small-scan-finetuned-yoon-1027/checkpoint-1000
Configuration saved in t5-small-scan-finetuned-yoon-1027/checkpoint-1000/config.json
Model weights saved in t5-small-scan-finetuned-yoon-1027/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-small-scan-finetuned-yoon-1027/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5-small-scan-finetuned-yoon-1027/checkpoint-1000/special_tokens_map.json
Copy v

Epoch,Training Loss,Validation Loss
1,0.1086,0.052165
2,0.058,0.030111
3,0.0389,0.031016
4,0.0304,0.012465
5,0.0256,0.012969
6,0.021,0.009573
7,0.0177,0.01162
8,0.0159,0.008016
9,0.0145,0.013507
10,0.0134,0.008969


Saving model checkpoint to t5-small-scan-finetuned-yoon-1027/checkpoint-24500
Configuration saved in t5-small-scan-finetuned-yoon-1027/checkpoint-24500/config.json
Model weights saved in t5-small-scan-finetuned-yoon-1027/checkpoint-24500/pytorch_model.bin
tokenizer config file saved in t5-small-scan-finetuned-yoon-1027/checkpoint-24500/tokenizer_config.json
Special tokens file saved in t5-small-scan-finetuned-yoon-1027/checkpoint-24500/special_tokens_map.json
Copy vocab file to t5-small-scan-finetuned-yoon-1027/checkpoint-24500/spiece.model
Deleting older checkpoint [t5-small-scan-finetuned-yoon-1027/checkpoint-23000] due to args.save_total_limit
Saving model checkpoint to t5-small-scan-finetuned-yoon-1027/checkpoint-25000
Configuration saved in t5-small-scan-finetuned-yoon-1027/checkpoint-25000/config.json
Model weights saved in t5-small-scan-finetuned-yoon-1027/checkpoint-25000/pytorch_model.bin
tokenizer config file saved in t5-small-scan-finetuned-yoon-1027/checkpoint-25000/tokeniz

prediction:  [    0    27   834 21164   567   834 27262    27   834 21164   567   834
 27262    27   834 21164   567   834 27262    27   834 21164   567   834
 27262    27   834 21164   567   834 27262    27   834 21164   567   834
 27262    27   834 21164   567   834  3765  6245    27   834 21164   567
   834  3765  6245     1     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0  

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

Exact match:  0.966284074605452


In [None]:
# Re-display the first truncated prediction/label pair. This relies on
# `test_preds` / `test_labels` still being in the kernel from the training cell.
print('truncated_prediction: ', test_preds[0])
print('truncated_label: ', test_labels[0])

truncated_prediction:  I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_LEFT I_TURN_LEFT
truncated_label:  I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_LEFT I_TURN_LEFT


In [None]:
# Character lengths of the first truncated prediction and of its label
# (the prediction was sliced to at most the label's length by truncate_data).
len(test_preds[0]), len(test_labels[0])

(101, 101)

In [None]:
# Force a garbage-collection pass to release unreferenced objects held by the
# kernel; the displayed value is whatever gc.collect() returns.
import gc
gc.collect()

1985