In [None]:
# environment setting 
# https://huggingface.co/course/chapter0/1?fw=pt

!pip install transformers sentencepiece datasets nltk evaluate torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 12.2 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 66.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 67.4 MB/s 
Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.1 MB/s 
[31mERROR: Operation cancelled by user[0m


In [None]:
import transformers
import torch
import numpy as np
import nltk
nltk.download('punkt')

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
from evaluate import load

# Preprocessing: tokenizes the commands (inputs) and the actions (targets).
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs plus labels.

    Reads the "commands" and "actions" entries of ``examples`` and tokenizes
    both with the module-level ``tokenizer``, truncating at
    ``tokenizer.model_max_length``.

    Returns:
        The encoding of the commands, with the token ids of the actions
        stored under the "labels" key.
    """
    length_cap = tokenizer.model_max_length

    # Source side: the commands.
    encoded = tokenizer(examples["commands"], max_length=length_cap, truncation=True)

    # Target side: the actions, encoded with the same tokenizer call.
    target_ids = tokenizer(examples["actions"], max_length=length_cap, truncation=True)["input_ids"]

    encoded["labels"] = target_ids
    return encoded

# Training function: fine-tunes the model and returns the trainer object.
def model_train():
  """Fine-tune the module-level ``model`` and return the trainer that ran.

  Relies on the globals ``model_name``, ``model``, ``tokenizer`` and
  ``tokenized_datasets``. The "train" split is used for training and the
  "test" split is passed as the evaluation set (evaluation strategy "epoch").

  Returns:
      The ``Seq2SeqTrainer`` instance, after ``train()`` has been called.
  """
  per_device_batch = 8

  # Fine-tuning hyper-parameters (fp16 and push_to_hub were left disabled).
  training_args = Seq2SeqTrainingArguments(
      f"{model_name}-scan-finetuned-yoon-1027",
      evaluation_strategy="epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=per_device_batch,
      per_device_eval_batch_size=per_device_batch,
      weight_decay=0.01,
      save_total_limit=3,
      num_train_epochs=10,  # or 20
      predict_with_generate=True,
  )

  # Collator that batches the tokenized examples for the seq2seq model.
  batch_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  seq2seq_trainer = Seq2SeqTrainer(
      model,
      training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["test"],
      data_collator=batch_collator,
      tokenizer=tokenizer,
  )
  seq2seq_trainer.train()
  return seq2seq_trainer

# Decoding function: converts lists of token ids (integers) back into sentences.
def decode_data(prediction, label):
  """Decode predicted and reference token ids into text.

  Uses the module-level ``tokenizer``. Earlier versions of this function
  ignored both arguments and read the globals ``predictions`` / ``label_ids``
  instead; it now decodes exactly what it is given.

  Args:
      prediction: array-like of predicted token ids, one row per example.
      label: array-like of reference token ids, one row per example.
          Entries equal to -100 are replaced by the pad token id before
          decoding, because -100 cannot be decoded.

  Returns:
      A tuple ``(decoded_preds, decoded_labels)`` of two lists of strings.
      Each string has its sentences (as split by ``nltk.sent_tokenize``)
      joined with newlines.
  """
  pad_id = tokenizer.pad_token_id

  # -100 is not a valid token id, so swap it for the pad id on both sides.
  # Predictions are sanitized too, as a precaution, in case they were padded with -100.
  pred_ids = np.asarray(prediction)
  pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
  label_ids_clean = np.asarray(label)
  label_ids_clean = np.where(label_ids_clean != -100, label_ids_clean, pad_id)

  decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(label_ids_clean, skip_special_tokens=True)

  # We convert back into the sentence
  decoded_preds = ["\n".join(nltk.sent_tokenize(text.strip())) for text in decoded_preds]
  decoded_labels = ["\n".join(nltk.sent_tokenize(text.strip())) for text in decoded_labels]

  # Show one sample; skipped for empty input so indexing [0] cannot fail.
  if decoded_preds and decoded_labels:
    print('##################')
    print('decoded prediction: ', decoded_preds[0])
    print('decoded label: ', decoded_labels[0])

  return decoded_preds, decoded_labels

######### truncate function: THIS FUNCTION SHOULD BE REMOVED 
# This is actually wrong: it truncates the prediction and the label to a common length before scoring.
# We don't want to use this function, but it is still needed for now.
def truncate_data(decoded_preds, decoded_labels):
  """Clip every prediction/label pair to the length of the shorter of the two.

  Iterates over the positions of ``decoded_labels``; the prediction at the
  same position is looked up by index, so extra predictions are ignored and
  a missing prediction raises ``IndexError``.

  Returns:
      A tuple ``(test_preds, test_labels)`` of the clipped sequences.
  """
  clipped_preds, clipped_labels = [], []

  for position, reference in enumerate(decoded_labels):
    candidate = decoded_preds[position]
    keep = min(len(candidate), len(reference))
    clipped_preds.append(candidate[:keep])
    clipped_labels.append(reference[:keep])

  return clipped_preds, clipped_labels

# evaluate function: this calculates exact match score 
def evaluate_exact_match(decoded_preds, decoded_labels):
  """Compute, print and return the exact-match score.

  Loads the "exact_match" metric via ``load`` and calls its ``compute`` with
  ``decoded_preds`` as predictions and ``decoded_labels`` as references.

  Returns:
      The value stored under the 'exact_match' key of the metric result.
  """
  metric = load("exact_match")
  result = metric.compute(predictions=decoded_preds, references=decoded_labels)

  match = result['exact_match']
  print('Exact match: ', match)
  return match


if __name__ == "__main__":
  # Load the SCAN dataset ("simple" configuration).
  raw_datasets = load_dataset("scan", "simple")

  # Optional smoke test: uncomment to run on 500 shuffled examples per split.
  # raw_datasets['train'] = raw_datasets["train"].shuffle(seed=42).select(range(500))
  # raw_datasets['test'] = raw_datasets["test"].shuffle(seed=42).select(range(500))

  # Checkpoint to fine-tune (can be changed); model_name is its last path component.
  model_checkpoint = "t5-small"
  model_name = model_checkpoint.split("/")[-1]

  # Tokenizer and model come from the same checkpoint.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

  # Tokenize every split in batches.
  tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  # Fine-tune; the returned trainer is reused for prediction below.
  trainer = model_train()

  # Length of the longest tokenized test label (0 when the split is empty).
  max_length = max((len(ids) for ids in tokenized_datasets['test']['labels']), default=0)

  # Predict on the test split with an explicit generation length.
  # NOTE(review): without max_length the predictions were observed to be cut at
  # 20 tokens; presumably that is the generation default - confirm against the
  # Seq2SeqTrainer.predict documentation.
  predictions, label_ids, _ = trainer.predict(tokenized_datasets["test"], max_length=max_length)

  print('prediction: ', predictions[0])
  print('label: ', label_ids[0])

  # Turn the token ids back into text.
  decoded_preds, decoded_labels = decode_data(predictions, label_ids)

  #### IMPORTANT #####################################
  #### Truncation is still applied for now; it should be removed once the
  #### length mismatch between predictions and labels is understood.
  test_preds, test_labels = truncate_data(decoded_preds, decoded_labels)
  print('truncated_prediction: ', test_preds[0])
  print('truncated_label: ', test_labels[0])

  # Exact-match score on the truncated pairs.
  match = evaluate_exact_match(test_preds, test_labels)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/2 [00:00<?, ?it/s]

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/d78aea13fa7ecd06c29e3e46195d6341255065d5/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
 

Epoch,Training Loss,Validation Loss
1,No log,1.181746
2,No log,0.614694
3,No log,0.370956
4,No log,0.287225
5,No log,0.262319
6,No log,0.243996
7,No log,0.232734
8,0.851700,0.225099
9,0.851700,0.229417
10,0.851700,0.223816


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: commands, actions. If commands, actions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: commands, actions. If commands, actions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: commands, actions. If commands, actions are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation ***

prediction:  [    0    27   834 21164   567   834  3765  6245    27   834   448  7443
    27   834 21164   567   834  3765  6245    27   834   448  7443    27
   834 21164   567   834  3765  6245    27   834   448  7443    27   834
 21164   567   834  3765  6245    27   834   448  7443    27   834 21164
   567   834  3765  6245    27   834   448  7443    27   834 21164   567
   834  3765  6245    27   834   448  7443    27   834 21164   567   834
  3765  6245    27   834   448  7443    27   834 21164   567   834  3765
  6245    27   834   448  7443    27   834 21164   567   834  3765  6245
    27   834   448  7443    27   834 21164   567   834  3765  6245    27
   834   448  7443    27   834 21164   567   834  3765  6245    27   834
 21164   567   834  3765  6245    27   834   448  7443    27   834 21164
   567   834  3765  6245    27   834   448  7443    27   834 21164   567
   834  3765  6245    27   834   448  7443    27   834 21164   567   834
  3765  6245    27   834   448  7443  

In [None]:
# Show the first truncated prediction next to its truncated reference label.
# Relies on test_preds / test_labels created by the training/evaluation cell.
print('truncated_prediction: ', test_preds[0])
print('truncated_label: ', test_labels[0])

truncated_prediction:  I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I
truncated_label:  I_WALK I_WALK I_TURN_LEFT I_TURN_LEFT I_RUN I_TURN_LEFT I_TURN_LEFT I_RUN I_TURN_LEFT I_TURN_LEFT I_RUN


In [None]:
# Lengths of the first truncated prediction and label
# (presumably character counts, assuming the entries are strings - TODO confirm).
len(test_preds[0]), len(test_labels[0])

In [None]:
# Force a garbage-collection pass; the cell displays gc.collect()'s return value.
import gc
gc.collect()

1985