In [1]:
import torch
from bertviz import head_view, model_view

from transformer_implementation import Transformer, Tokenizer, TransformerConfig

## Init

In [2]:
# Instantiate the project tokenizer. The config cell reads its vocabulary size and
# BOS/EOS/PAD indices, and the translation helpers use it to encode and decode text.
tokenizer = Tokenizer()

In [3]:
# Dataset sizes and batch size used to derive the per-epoch iteration counts.
# NOTE: BATCH_SIZE must match the batch size TransformerConfig uses
# (the printed config below reports batch_size = 12).
TRAIN_DATA_SIZE = 30_000
VAL_DATA_SIZE = 3_000
BATCH_SIZE = 12

# Build the model/training configuration; special-token ids come from the tokenizer.
config = TransformerConfig(
    vocab_size=tokenizer.vocab_size(),
    block_size=64,
    n_layer=6,
    n_head=8,
    n_embd=512,
    max_epochs=150,
    train_data_size=TRAIN_DATA_SIZE,
    # Integer division keeps the counts exact ints (no float round-trip):
    # dataset_size // batch_size - 1
    max_iters=TRAIN_DATA_SIZE // BATCH_SIZE - 1,
    # val_dataset_size // batch_size - 1
    eval_iters=VAL_DATA_SIZE // BATCH_SIZE - 1,
    BOS_IDX=tokenizer.BOS_IDX,
    EOS_IDX=tokenizer.EOS_IDX,
    PAD_IDX=tokenizer.PAD_IDX,
)
print(config)

TransformerConfig:

Tokenizer:
+---------------------+
| vocab_size : 100277 |
| BOS_IDX    : 100264 |
| EOS_IDX    : 100265 |
| PAD_IDX    : 100266 |
+---------------------+

Data:
+---------------------------------+
| block_size              : 64    |
| batch_size              : 12    |
| train_data_size         : 30000 |
| grad_accumulation_steps : 40    |
+---------------------------------+

Model:
+-----------------+
| n_layer : 6     |
| n_head  : 8     |
| n_embd  : 512   |
| dropout : 0.1   |
| bias    : 0     |
+-----------------+

Training Loop:
+-------------------+
| max_epochs : 150  |
| max_iters  : 2499 |
| eval_iters : 249  |
+-------------------+

AdamW Optimizer:
+------------------------+
| learning_rate : 0.0006 |
| beta1         : 0.9    |
| beta2         : 0.95   |
| weight_decay  : 0.1    |
| eps           : 1e-09  |
+------------------------+

System:
+-----------------------+
| device_type : cuda    |
| device      : cuda    |
| dtype       : float16 |
| compil

In [4]:
# Build the Transformer from the config, restore trained weights from disk,
# move it to the configured device and switch it to evaluation mode.
model = Transformer(config)
# Alternative checkpoint, currently disabled:
# model.load_model("./out/transformer-train2.pth")
model.load_model("./out/train_model.pth")
model = model.to(config.device)
model.eval()
# print() emits a blank line; being the last statement, it also keeps the cell
# from displaying the value returned by model.eval().
print()

Number of Encoder parameters: 70.22M
number of Decoder parameters: 76.52M
Total number of parameters: 146.74M



In [5]:
def translate(sentences, tokenizer, model, config):
    """
    Translate a batch of sentences and return the decoded text plus the model's attention data.

    Each sentence is encoded with ``tokenizer.encoder``, padded to ``config.block_size``
    and paired with its padding mask. The whole batch is passed to ``model.generate``
    together with one BOS start token per sentence, and every generated sequence is
    cleaned with ``tokenizer.sequence_cleaner`` and decoded back to a string.

    Args:
        - sentences (list[str] | str): Sentences to be translated. A single string is
          treated as a one-sentence batch.
        - tokenizer (Tokenizer): Tokenizer used for encoding and decoding sequences.
        - model (Transformer): The model used for translation.
        - config (Config): Configuration object providing ``block_size`` and ``device``.

    Returns:
        - decode_output (list[str]): List of translated sentences, one per input sentence.
        - attn: The second value returned by ``model.generate`` (attention data), passed
          through unchanged.

    Raises:
        - ValueError: If ``sentences`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(sentences, str):
        sentences = [sentences]
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one sentence")

    encoder = tokenizer.encoder

    # Encode and pad every sentence to a (1, block_size) row and build its padding mask.
    sequences = []
    masks = []
    for sentence in sentences:
        sequence = tokenizer.sequence_padding(encoder.encode(sentence), config.block_size).unsqueeze(dim=0)
        sequences.append(sequence)
        masks.append(tokenizer.generate_padding_mask(sequence, device=config.device))

    # Concatenate the rows into batch tensors on the target device.
    sequences = torch.cat(sequences, dim=0).to(config.device)
    masks = torch.cat(masks, dim=0).to(config.device)

    # One BOS start token per source sentence, so the decoder batch dimension matches
    # the encoder batch dimension (a fixed (1, 1) tensor only fits a single sentence).
    start_tokens = torch.full(
        (sequences.size(0), 1),
        tokenizer.BOS_IDX,
        dtype=torch.long,
        device=config.device,
    )

    # Inference only: evaluation mode and no autograd bookkeeping while generating.
    model.eval()
    with torch.no_grad():
        outputs, attn = model.generate(
            src=sequences,
            idx=start_tokens,
            src_mask=masks,
            max_new_tokens=config.block_size,
        )

    # Clean each generated sequence and decode it back to text.
    decode_output = [encoder.decode(tokenizer.sequence_cleaner(output)) for output in outputs]

    return decode_output, attn

In [6]:
# Source sentence(s) to translate, as a one-element batch.
# NOTE(review): the name `input` shadows Python's builtin input(); later cells
# reference it, so renaming has to be done across the whole notebook.
input = ["""This is an issue which, as you will understand, will have to be addressed at IGC level and is an issue that only a Special European Council can authorise the Portuguese Presidency to pursue, because this kind of issue falls outside the institutional framework in which the forthcoming Intergovernmental Conference will take place."""]
# NOTE(review): the disabled expected_output below does not correspond to the sentence above.
# expected_output = ['Je suis un professeur.']

In [7]:
# Translate the example batch; keep the returned attention data for the visualization cells.
outputs, attentions = translate(input, tokenizer, model, config)



In [8]:
# Display the decoded translation(s) returned by translate().
outputs

["Comme c'est un problème qui comprendra, vous le comprendrez, le niveau de la Conférence intergouvernementale est qu'il ne s'agit pas d'un problème extraordinaire de l'autorité européenne. Ceyra, puisse poursuivre dans ce domaine hab"]

Reference translation (for comparison with the model output above): C'est une question qui devra être étudiée, vous le comprendrez, au niveau de la Conférence intergouvernementale. Il s'agit là d'un thème que seul un Conseil européen extraordinaire peut autoriser car c'est une question située hors du cadre institutionnel dans lequel s'inscrit la prochaine Conférence intergouvernementale.

In [9]:
def format_attn(input, output, attentions, batch: int = 0, tokenizer=None):
    """
    Prepare tokens and attention tensors of one batch element for visualization.

    The per-layer attention tensors are stacked along a new leading dimension, the
    input and output sentences are tokenized, and all three attention tensors are cut
    to the length of the SHORTER of the two token lists on their last two dimensions.
    The token lists themselves are returned in full (neither padded nor truncated).

    Args:
        - input (list[str]): The original input sentences; ``input[batch]`` is used.
        - output (list[str]): The translated sentences; ``output[batch]`` is used.
        - attentions (dict): Attention data with the keys 'encoder_attn', 'cross_attn'
          and 'decoder_attn', each a sequence of tensors that can be stacked.
        - batch (int, optional): The batch index to format. Defaults to 0.
        - tokenizer (optional): Object providing ``tokenize_from_str``. Defaults to the
          notebook-level ``tokenizer`` variable when omitted.

    Returns:
        - tokens_input (list[str]): The tokenized input sentence.
        - tokens_output (list[str]): The tokenized output sentence.
        - tensor_encoder_attn (torch.Tensor): Stacked encoder attention, restricted to
          the selected batch element and trimmed.
        - tensor_cross_attn (torch.Tensor): Stacked cross-attention, same treatment.
        - tensor_decoder_attn (torch.Tensor): Stacked decoder attention, same treatment.
    """
    if tokenizer is None:
        # The parameter shadows the notebook global of the same name, so look the
        # global up explicitly to keep existing four-argument calls working.
        tokenizer = globals()["tokenizer"]

    # Stack the per-layer attention tensors along a new leading dimension
    stacked_encoder = torch.stack(attentions['encoder_attn'], dim=0)
    stacked_cross = torch.stack(attentions['cross_attn'], dim=0)
    stacked_decoder = torch.stack(attentions['decoder_attn'], dim=0)

    # Tokenize the input and output sentences
    tokens_input = tokenizer.tokenize_from_str(input[batch])
    tokens_output = tokenizer.tokenize_from_str(output[batch])

    # Length shared by both token lists. Because this is the minimum, neither list
    # ever needs padding, so no padding is performed.
    common_len = min(len(tokens_input), len(tokens_output))

    def _trim(stacked):
        # Keep only the selected batch element (as a size-1 dimension) and the first
        # `common_len` positions of the last two dimensions.
        # assumes layout (layers, batch, heads, seq_len, seq_len) - TODO confirm
        return stacked[:, batch:batch + 1, :, 0:common_len, 0:common_len]

    # Return the tokens and the trimmed attention tensors
    return (
        tokens_input,
        tokens_output,
        _trim(stacked_encoder),
        _trim(stacked_cross),
        _trim(stacked_decoder),
    )

In [10]:
# Tokens and trimmed attention tensors for the first batch element.
(
    tokens_input,
    tokens_output,
    tensor_encoder_attn,
    tensor_cross_attn,
    tensor_decoder_attn,
) = format_attn(input, outputs, attentions)

In [11]:
# Build the bertviz model_view for the encoder, decoder and cross attention and
# save the returned HTML to disk.
# Each token list is cut to the width of its OWN attention tensor so the labels
# line up with the attention entries even if the two tensors differ in size.
html_model_view = model_view(
    encoder_attention=tensor_encoder_attn,
    decoder_attention=tensor_decoder_attn,
    cross_attention=tensor_cross_attn,
    encoder_tokens=tokens_input[0:tensor_encoder_attn.size(-1)],
    decoder_tokens=tokens_output[0:tensor_decoder_attn.size(-1)],
    html_action='return'
)
# Write as UTF-8 explicitly: the tokens contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to represent them.
with open("./out/model_view.html", 'w', encoding='utf-8') as file:
    file.write(html_model_view.data)

In [12]:
# Build the bertviz head_view for the encoder, decoder and cross attention and
# save the returned HTML to disk.
# Each token list is cut to the width of its OWN attention tensor so the labels
# line up with the attention entries even if the two tensors differ in size.
html_head_view = head_view(
    encoder_attention=tensor_encoder_attn,
    decoder_attention=tensor_decoder_attn,
    cross_attention=tensor_cross_attn,
    encoder_tokens=tokens_input[0:tensor_encoder_attn.size(-1)],
    decoder_tokens=tokens_output[0:tensor_decoder_attn.size(-1)],
    html_action='return'
)
# Write as UTF-8 explicitly: the tokens contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to represent them.
with open("./out/head_view.html", 'w', encoding='utf-8') as file:
    file.write(html_head_view.data)