### set device and data

In [1]:
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'using device: {DEVICE}')

using device: cuda


In [2]:
def file2list(file):
    lst = []
    for line in open(f'./{file}', 'r', encoding='utf-8-sig'):
      line = line.replace('\n', '')
      lst.append(line)
    return lst

# train, valid, test texts
en_train_lst = file2list('train.en')
en_valid_lst = file2list('val.en')
en_test_lst  = file2list('test.en')
print(f'en data: {len(en_train_lst), len(en_valid_lst), len(en_test_lst)}')

de_train_lst = file2list('train.de')
de_valid_lst = file2list('val.de')
de_test_lst  = file2list('test.de')
print(f'de data: {len(de_train_lst), len(de_valid_lst), len(de_test_lst)}')

en data: (29001, 1015, 1000)
de data: (29001, 1015, 1000)


### make dataset dictionary

In [3]:
train_dict = {'text': en_train_lst, 'label': de_train_lst}
valid_dict = {'text': en_valid_lst, 'label': de_valid_lst}
test_dict  = {'text': en_test_lst, 'label': de_test_lst}

In [4]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from datasets import Dataset
from datasets.dataset_dict import DatasetDict

dataset_dict = DatasetDict({'train': Dataset.from_dict(train_dict),
                            'valid': Dataset.from_dict(valid_dict),
                            'test' : Dataset.from_dict(test_dict)})
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 29001
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 1015
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})


In [6]:
dataset_dict['train'][0]

{'text': 'Two young, White males are outside near many bushes.',
 'label': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

### make tokenized dataset

In [7]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('t5-small')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
def preprocess_function(examples):
  inputs  = [text for text in examples['text']]
  targets = [label for label in examples['label']]

  # for inputs
  model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
  # for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=128, truncation=True)
  model_inputs['labels'] = labels['input_ids']

  return model_inputs

tokenized_dataset_dict = dataset_dict.map(preprocess_function, batched=True)
tokenized_dataset_dict = tokenized_dataset_dict.remove_columns(dataset_dict['train'].column_names)

  0%|          | 0/30 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
tokenized_dataset_dict['train'][0]

{'input_ids': [2759,
  1021,
  6,
  1945,
  5069,
  7,
  33,
  1067,
  1084,
  186,
  3,
  30271,
  5,
  1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [11280,
  16815,
  7838,
  15,
  16282,
  436,
  256,
  8671,
  35,
  16,
  74,
  13271,
  2221,
  49,
  21162,
  3992,
  5,
  1]}

### transformer

In [11]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

In [12]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

### training

In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

LR_RATE    = 0.0001
BATCH_SIZE = 64

training_args = Seq2SeqTrainingArguments(
    output_dir='./',
    evaluation_strategy='epoch',
    learning_rate=LR_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
)

In [14]:
!pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import numpy as np
from datasets import load_metric

def postprocess_text(preds, labels):
  preds  = [pred.strip() for pred in preds]
  labels = [[label.strip()] for label in labels]
  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds

  # decode prediction
  if isinstance(preds, tuple): preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  # replace -100 in the labels to pad token, decode labels
  labels = np.where(labels!=-100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # postprocessing texts
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  # metric
  metric = load_metric("sacrebleu")
  result = metric.compute(predictions=decoded_preds, references=decoded_labels)
  result = {'bleu': result['score']}

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result['gen_len'] = np.mean(prediction_lens)
  result = {k: round(v, 4) for k, v in result.items()}
  return result

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_dict['train'],
    eval_dataset=tokenized_dataset_dict['valid'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 29001
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1362
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,0.935897,34.0913,16.9113
2,1.287300,0.891357,34.704,16.9458
3,1.124200,0.881139,35.0002,16.9379


***** Running Evaluation *****
  Num examples = 1015
  Batch size = 64
Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1015
  Batch size = 64
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1015
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1362, training_loss=1.1721441462176487, metrics={'train_runtime': 238.9225, 'train_samples_per_second': 364.147, 'train_steps_per_second': 5.701, 'total_flos': 759083261657088.0, 'train_loss': 1.1721441462176487, 'epoch': 3.0})

In [16]:
# download saved model
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained('./')

Configuration saved in ./config.json
Model weights saved in ./pytorch_model.bin


### testing

In [17]:
test_dataset = tokenized_dataset_dict['test']
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


{'eval_loss': 0.9221218228340149,
 'eval_bleu': 35.2462,
 'eval_gen_len': 16.717,
 'eval_runtime': 5.475,
 'eval_samples_per_second': 182.65,
 'eval_steps_per_second': 2.922,
 'epoch': 3.0}