# Transformer Training (PyTorch)

Requirements:
 - python 3.7+
 - pytorch
 - transformers (Hugging Face)
 - datasets (Hugging Face)
 - tqdm
 - seaborn
 - matplotlib

In [None]:
# Import cell
import os
from datasets import load_dataset
import torch
import transformers
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig,
                          DataCollatorForSeq2Seq, Seq2SeqTrainingArguments,
                          Seq2SeqTrainer, pipeline)

In [None]:

# Set the TOKENIZERS_PARALLELISM environment variable to 'false'
# (presumably to silence the tokenizers fork/parallelism warning when worker
# processes are spawned - confirm).
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# Set the transformers logger verbosity to the warning level.
transformers.logging.set_verbosity_warning()


## English-to-French translation with T5

### Inference with a pretrained model - translation en-fr

In [None]:
# Ready-made English -> French translation pipeline backed by the
# pretrained 't5-small' checkpoint (no fine-tuning yet).
base_pipeline = pipeline(
    "translation_en_to_fr",
    model='t5-small',
)
outputs = base_pipeline([
    "I like Transformers.",
    "The movie right?",
])
# Keep only the translated string of each result.
output_text = [translation['translation_text'] for translation in outputs]
print(output_text)

### Fine-tuning a pretrained model

In [None]:
# Load the Opus Books English/French corpus. Only its 'train' split is
# selected here, so 20% of it is held out as the validation ('test') split.
translation_ds = load_dataset("opus_books", "en-fr")['train']
# A fixed seed keeps the train/validation partition identical between runs,
# so the evaluation loss and the example picked for the attention plots
# further down are reproducible.
translation_ds = translation_ds.train_test_split(test_size=0.2, seed=42)
# Display the first two training pairs.
translation_ds['train'][0:2]

In [None]:
# Pretrained T5-small sequence-to-sequence model
transformer = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# Tokenizer of the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("t5-small")
# Data collator (pads the sequences of each batch dynamically)
data_collator = DataCollatorForSeq2Seq(
    model=transformer,
    tokenizer=tokenizer,
)

In [None]:
# Tokenize two sample sentences; as the last expression of the cell, the
# result is displayed by the notebook.
tokenizer(["I like Transformers.", "The movie right?"])

In [None]:
# Pre-process dataset for T5
def preprocess_ds(tokenizer, dataset, input_key, output_key, prefix='',
                  max_length=128):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        tokenizer: Callable tokenizer that also provides the
            ``as_target_tokenizer()`` context manager.
        dataset: Batch mapping whose ``'translation'`` entry is a sequence
            of dicts holding the source and target sentences.
        input_key: Key of the source sentence in each translation dict.
        output_key: Key of the target sentence in each translation dict.
        prefix: Text prepended to every source sentence.
        max_length: Truncation length passed to the tokenizer for both the
            sources and the targets.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under ``'labels'``.
    """
    sources, targets = [], []
    for pair in dataset['translation']:
        sources.append(prefix + pair[input_key])
        targets.append(pair[output_key])

    encoded = tokenizer(sources, max_length=max_length, truncation=True)
    # Targets are encoded while the tokenizer is switched to target mode.
    # NOTE(review): newer transformers releases appear to deprecate this
    # context manager in favour of a `text_target` argument - confirm
    # against the installed version before upgrading.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(targets, max_length=max_length,
                                    truncation=True)
    encoded['labels'] = target_encoding['input_ids']
    return encoded

# Tokenize both splits. With batched=True, map() hands preprocess_ds a whole
# batch of examples per call (batches of size 1000) rather than one row.
translation_ds = translation_ds.map(
    lambda batch: preprocess_ds(
        tokenizer,
        batch,
        input_key='en',
        output_key='fr',
        prefix='translate English to French: ',
    ),
    batched=True,
)

# Show which fields a processed training example now carries.
translation_ds['train'][0].keys()


In [None]:
def train_model(model, output_dir="./results"):
    """Train ``model`` for one epoch on the tokenized translation dataset.

    Relies on the notebook-level ``translation_ds`` (with 'train' and 'test'
    splits), ``tokenizer`` and ``data_collator`` objects.

    Args:
        model: Sequence-to-sequence model handed to ``Seq2SeqTrainer``.
        output_dir: Directory passed to the training arguments as
            ``output_dir``. Give each model its own directory so that a
            second run does not reuse the first run's directory.
    """
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        logging_strategy="steps",
        logging_steps=200,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=1,
        warmup_ratio=0.1,
        # Half-precision training needs a CUDA device; requesting it on a
        # CPU-only machine makes the training arguments fail, so fall back
        # to full precision there (same CPU fallback as the inference cells).
        fp16=torch.cuda.is_available(),
        report_to="none",
        )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=translation_ds["train"],
        eval_dataset=translation_ds["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        )

    trainer.train()

In [None]:
# Fine-tune the pretrained T5-small model loaded earlier in the notebook.
train_model(transformer)

In [None]:
# checkpoint = torch.load('results/pytorch_model.bin')
# transformer.load_state_dict(checkpoint)

In [None]:
# Device index handed to the pipeline: 0 when CUDA is available, else -1.
device = -1
if torch.cuda.is_available():
    device = 0

# Same translation task as before, now served by the fine-tuned model.
pipeline_translation = pipeline(
    "translation_en_to_fr",
    model=transformer,
    tokenizer=tokenizer,
    device=device,
)
outputs = pipeline_translation(["I like Transformers.", "The movie right?"])
output_text = [translation['translation_text'] for translation in outputs]
print(output_text)

### Train a model from random initialization

In [None]:
# Start from the configuration published with the "t5-small" checkpoint
config = AutoConfig.from_pretrained("t5-small")
# Optionally change the configuration; here d_ff is set to 1024
# (presumably the feed-forward hidden size - confirm against the T5 config docs)
config.d_ff = 1024
# Build the model from the configuration only, i.e. without loading the
# pretrained weights (this section trains from random initialization)
new_transformer = AutoModelForSeq2SeqLM.from_config(config)
# Check that the model picked up the modified value
print(new_transformer.config.d_ff)


In [None]:
# Train the model built from the modified config with the same routine used
# for the pretrained one.
train_model(new_transformer)

### Attention visualization

In [None]:
# Index of the training example whose attention maps are visualized.
seq = 1

device = 'cuda' if torch.cuda.is_available() else 'cpu'
transformer = transformer.to(device)
# Inference only: switch to eval mode so dropout cannot perturb the
# generated sequence or the attention weights that get plotted.
transformer.eval()

# Slice the rows first. Indexing the column first
# (translation_ds['train']['input_ids']) would load every tokenized example
# of the training split just to keep one of them.
sample = translation_ds['train'][seq:seq + 1]
input_ids = torch.tensor(sample['input_ids'], device=device)
attention_mask = torch.tensor(sample['attention_mask'], device=device)

# Ask generate() for a structured output that also carries the attentions.
output_with_attention = transformer.generate(input_ids,
                                             attention_mask=attention_mask,
                                             output_attentions=True,
                                             return_dict_in_generate=True,
                                             max_length=100)

In [None]:
def draw(data, x, y, ax):
    """Render ``data`` as a heatmap with square cells on ``ax``.

    ``x`` and ``y`` are used as the tick labels of the horizontal and
    vertical axes. The colour scale is pinned to [0, 1], uses the "cool"
    colormap, and a colour bar is drawn.
    """
    heatmap_options = {
        "xticklabels": x,
        "yticklabels": y,
        "vmin": 0.0,
        "vmax": 1.0,
        "square": True,
        "cbar": True,
        "cmap": "cool",
        "ax": ax,
    }
    sns.heatmap(data, **heatmap_options)

def remove_underscore(tokens):
    """Return ``tokens`` with every U+2581 character stripped out.

    U+2581 (code point 9601) is the low-line-looking block character that
    appears inside the token strings; all of its occurrences are removed
    from each token. A new list is returned.
    """
    marker = "\u2581"
    cleaned = []
    for piece in tokens:
        cleaned.append(piece.replace(marker, ""))
    return cleaned

# Source side: ids -> token strings (special tokens skipped), then cleaned.
input_tokens = remove_underscore(
    tokenizer.convert_ids_to_tokens(input_ids[0], skip_special_tokens=True)
)

# Target side: same treatment for the generated sequence.
output_tokens = remove_underscore(
    tokenizer.convert_ids_to_tokens(
        output_with_attention.sequences[0], skip_special_tokens=True
    )
)

# Show the tokens and how many labels each axis of the plots will get.
print(input_tokens)
print(output_tokens)
print('input len:', len(input_tokens))
print('output len', len(output_tokens))

#### Cross attention

In [None]:
attention_raw = output_with_attention.cross_attentions
# attention_raw: (output_token, layer, sequence, head, 1, input_token)

# Stack the per-layer tensors of each generation step, then stack the steps.
attention = torch.stack([torch.stack(step_attention, dim=0)
                         for step_attention in attention_raw])
# attention: (output_token, layer, sequence, head, 1, input_token)
print(attention.shape)
# Drop the size-1 axis, move layer and head to the front, and merge the
# sequence axis into the output-token axis.
attention = torch.squeeze(attention, dim=-2)
attention = torch.permute(attention, (1, 3, 2, 0, 4))
attention = torch.reshape(attention, (attention.shape[0], attention.shape[1],
                                      -1, attention.shape[4]))
# attention: (layer, head, output_token, input_token)
print(attention.shape)

# mean attention over layers and heads
mean_attention = torch.mean(attention, dim=[0, 1])
print(mean_attention.shape)

# Rescale so the largest averaged weight is 1.0, the top of the colour scale
# used by draw().
normalized_mean_attention = mean_attention / torch.max(mean_attention)

_, ax = plt.subplots(1, 1, figsize=(15, 15))
draw(
    normalized_mean_attention.detach().cpu().numpy(),
    input_tokens,
    output_tokens,
    ax=ax,
    )
plt.title('Mean Cross Attention')
# savefig does not create missing directories; make sure the target exists.
os.makedirs("ressources", exist_ok=True)
plt.savefig("ressources/cross_attention.png", facecolor='white')
plt.show()

#### Input self attention

In [None]:
attention_raw = output_with_attention.encoder_attentions
# Stack the entries of attention_raw along a new leading axis, then drop the
# size-1 axis at position 1 (presumably the batch axis, since a single
# sequence was generated - confirm).
attention = torch.stack(attention_raw)
attention = torch.squeeze(attention, dim=1)
# Average over the two leading axes (layers and heads, by analogy with the
# cross-attention cell - confirm).
mean_attention = torch.mean(attention, dim=[0, 1])
# Rescale so the largest averaged weight is 1.0, the top of the colour scale
# used by draw().
normalized_mean_attention = mean_attention / torch.max(mean_attention)

_, ax = plt.subplots(1, 1, figsize=(15, 15))
draw(
    normalized_mean_attention.detach().cpu().numpy(),
    input_tokens,
    input_tokens,
    ax=ax,
    )
plt.title('Mean Self Attention Encoder')
# savefig does not create missing directories; make sure the target exists.
os.makedirs("ressources", exist_ok=True)
plt.savefig("ressources/input_attention.png", facecolor='white')
plt.show()

#### Output self attention

In [None]:
attention_raw = output_with_attention.decoder_attentions
# One entry per generation step.
n = len(attention_raw)

# Square (step, attended position) matrix; cells that are never written
# stay at zero.
attention = torch.zeros((n, n), device=device)
for i, step_attention in enumerate(attention_raw):
    # Stack the step's tensors, keep the first sequence and the single
    # query position of this step.
    att = torch.stack(step_attention)[:, 0, :, 0, :]
    mean_att = torch.mean(att, dim=[0, 1])
    # Step i yields i + 1 averaged weights, written to the first i + 1
    # columns of row i. A plain slice avoids building an index tensor on
    # the CPU for every row.
    attention[i, :i + 1] = mean_att

# Rescale so the largest averaged weight is 1.0, the top of the colour scale
# used by draw().
normalized_mean_attention = attention / torch.max(attention)

_, ax = plt.subplots(1, 1, figsize=(15, 15))
draw(
    normalized_mean_attention.detach().cpu().numpy(),
    ['<start>'] + output_tokens[:-1],
    output_tokens,
    ax=ax,
    )
plt.title('Mean Self Attention Decoder')
# savefig does not create missing directories; make sure the target exists.
os.makedirs("ressources", exist_ok=True)
plt.savefig("ressources/output_attention.png", facecolor='white')
plt.show()