In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertConfig, BertTokenizer

In [5]:
def positional_encoding(positions, d):
    """
    Precompute the sinusoidal positional-encoding table.

    Arguments:
        positions (int) -- Maximum number of positions to be encoded
        d (int) -- Encoding size

    Returns:
        pos_encoding -- float tensor of shape (1, positions, d); even columns
                        hold sin(angle) and odd columns hold cos(angle)
    """
    # Column vector of positions and row vector of dimension indices, so that
    # get_angles broadcasts them into a (positions, d) grid of angles.
    pos = torch.arange(positions).unsqueeze(1)
    ks = torch.arange(d).unsqueeze(0)
    angle_rads = get_angles(pos, ks, d)

    # sin on even columns (2i), cos on odd columns (2i+1). Strided slices do
    # this in two tensor ops instead of one Python iteration per column.
    angle_rads[:, 0::2] = torch.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = torch.cos(angle_rads[:, 1::2])

    # Leading singleton axis lets the table broadcast over a batch dimension.
    pos_encoding = angle_rads.unsqueeze(0)

    return pos_encoding.float()

def get_angles(pos, ks, d):
    """
    Compute the raw angles fed to sin/cos by the positional encoding.

    Arguments:
        pos -- tensor of positions
        ks -- tensor of encoding-dimension indices
        d (int) -- Encoding size

    Returns:
        Float tensor pos / 10000^(2 * (k // 2) / d), broadcast over pos and ks
    """
    encoding_size = torch.tensor(d).float()
    # k // 2 pairs up neighbouring indices, so columns 2i and 2i+1 share one exponent.
    exponents = (2 * (ks // 2)) / encoding_size
    inverse_freq = 1 / torch.pow(10000, exponents)
    return pos.float() * inverse_freq

In [6]:
class DecoderLayer(nn.Module):
    """
    One decoder block: self-attention, encoder-decoder attention and a
    feed-forward network, each followed by a residual connection and
    layer normalization.

    Arguments:
        embedding_dim (int) -- Width of the decoder and encoder representations
        num_heads (int) -- Number of attention heads
        fully_connected_dim (int) -- Hidden width of the feed-forward network
        dropout_rate (float) -- Dropout used inside both attention modules and
                                after the feed-forward network
        layernorm_eps (float) -- Epsilon of the three LayerNorm modules
    """

    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()

        # batch_first=True: Decoder and Encoder in this file both produce
        # (batch, seq, dim) tensors, while nn.MultiheadAttention defaults to
        # (seq, batch, dim).
        self.mha1 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads,
                                          dropout=dropout_rate, batch_first=True)
        self.mha2 = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads,
                                          dropout=dropout_rate, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embedding_dim, fully_connected_dim),
            nn.ReLU(),
            nn.Linear(fully_connected_dim, embedding_dim)
        )
        self.layernorm1 = nn.LayerNorm(embedding_dim, eps=layernorm_eps)
        self.layernorm2 = nn.LayerNorm(embedding_dim, eps=layernorm_eps)
        self.layernorm3 = nn.LayerNorm(embedding_dim, eps=layernorm_eps)
        self.dropout_ffn = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output, padding_mask, look_ahead_mask=None):
        """
        Forward pass for one decoder block

        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
            padding_mask -- Passed as key_padding_mask to the encoder-decoder
                            attention: shape (batch_size, input_seq_len), True
                            marks encoder positions to ignore. May be None.
            look_ahead_mask -- Optional attn_mask for the self-attention, e.g. a
                               (target_seq_len, target_seq_len) boolean matrix
                               where True blocks attention. None (default)
                               leaves the self-attention unmasked.
        Returns:
            out3 -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
        """
        # nn.MultiheadAttention returns (attn_output, attn_weights); only the
        # output takes part in the residual sum, so skip the weights.
        mha_output1, _ = self.mha1(x, x, x, attn_mask=look_ahead_mask, need_weights=False)
        out1 = self.layernorm1(x + mha_output1)

        mha_output2, _ = self.mha2(out1, enc_output, enc_output,
                                   key_padding_mask=padding_mask, need_weights=False)
        out2 = self.layernorm2(out1 + mha_output2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout_ffn(ffn_output)

        out3 = self.layernorm3(out2 + ffn_output)

        return out3

class Decoder(nn.Module):
    """
    Stack of DecoderLayer blocks on top of a scaled token embedding plus
    sinusoidal positional encoding.

    Arguments:
        num_layers (int) -- Number of DecoderLayer blocks
        embedding_dim (int) -- Width of the token embeddings and of every block
        num_heads (int) -- Attention heads per block
        fully_connected_dim (int) -- Hidden width of each block's feed-forward network
        target_vocab_size (int) -- Number of rows in the embedding table
        maximum_position_encoding (int) -- Number of positions precomputed in
                                           the positional-encoding table
        dropout_rate (float) -- Dropout applied to the embedded input and
                                passed to every block
        layernorm_eps (float) -- Epsilon passed to every block
    """

    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size,
                 maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Decoder, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(target_vocab_size, embedding_dim)
        # Registered as a buffer so the table follows the module across
        # .to(device) / .cuda(); non-persistent keeps it out of state_dict,
        # since it is recomputed on construction anyway.
        self.register_buffer(
            "pos_encoding",
            positional_encoding(maximum_position_encoding, embedding_dim),
            persistent=False,
        )

        self.dec_layers = nn.ModuleList([
            DecoderLayer(embedding_dim, num_heads, fully_connected_dim, dropout_rate, layernorm_eps)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output, padding_mask):
        """
        Forward pass for the Decoder

        Arguments:
            x -- Tensor of token indices, shape (batch_size, target_seq_len)
            enc_output -- Encoder output handed to every block
            padding_mask -- Mask handed to every block together with enc_output
        Returns:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
        """
        seq_len = x.size(1)

        x = self.embedding(x)
        # Scale the embeddings first and add the positional encoding afterwards,
        # so that the encoding itself is not multiplied by sqrt(embedding_dim).
        x = x * (self.embedding_dim ** 0.5)
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x)

        for dec_layer in self.dec_layers:
            x = dec_layer(x, enc_output, padding_mask)

        return x

In [7]:
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig

class Encoder(nn.Module):
    """
    Wraps a BERT model and adds a sinusoidal positional encoding and dropout
    to its last hidden state.

    Arguments:
        bert_model -- Model exposing `.config.hidden_size`, called as
                      bert_model(input_ids, attention_mask=...)
        maximum_position_encoding (int) -- Number of positions precomputed in
                                           the positional-encoding table
        dropout_rate (float) -- Dropout applied to the output
    """

    def __init__(self, bert_model, maximum_position_encoding, dropout_rate=0.1):
        super(Encoder, self).__init__()

        self.bert = bert_model
        self.config = bert_model.config  # Extract BERT configuration

        # Registered as a buffer so the table follows the module across
        # .to(device) / .cuda(); non-persistent keeps it out of state_dict,
        # since it is recomputed on construction anyway.
        self.register_buffer(
            "pos_encoding",
            positional_encoding(maximum_position_encoding, self.config.hidden_size),
            persistent=False,
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, padding_mask):
        """
        Forward pass for the Encoder

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len)
                An array of the indexes of the words in the input sentence
            padding_mask -- Mask of shape (batch_size, input_seq_len) that is
                            True (non-zero) at padding positions, so that the
                            padding is not treated as part of the input
        Returns:
            out -- Tensor of shape (batch_size, input_seq_len, hidden_size)
        """
        # BERT's attention_mask uses the opposite convention (1 = attend,
        # 0 = ignore). Casting to bool first makes the inversion work for
        # 0/1 integer or float masks as well as boolean ones.
        attention_mask = (~padding_mask.bool()).long()

        # Get BERT embeddings
        bert_output = self.bert(x, attention_mask=attention_mask)[0]

        # Apply positional encoding (out of place: do not mutate BERT's output)
        seq_len = bert_output.size(1)
        out = bert_output + self.pos_encoding[:, :seq_len, :]

        # Apply dropout
        out = self.dropout(out)

        return out


In [19]:
class Transformer(nn.Module):
    """
    Encoder-decoder model: a BERT-based Encoder, a Decoder stack and a final
    projection onto the target vocabulary.

    Arguments:
        bert_model -- BERT model wrapped by the Encoder
        num_layers (int) -- Number of decoder blocks
        embedding_dim (int) -- Decoder width; must equal the BERT hidden size
                               because the decoder attends over the encoder output
        num_heads (int) -- Attention heads per decoder block
        fully_connected_dim (int) -- Hidden width of the decoder feed-forward networks
        target_vocab_size (int) -- Size of the output vocabulary
        maximum_position_encoding (int) -- Positions precomputed for the
                                           positional encodings
        dropout_rate (float) -- Dropout rate for encoder and decoder
        layernorm_eps (float) -- LayerNorm epsilon for the decoder

    Raises:
        ValueError -- if embedding_dim differs from bert_model.config.hidden_size
    """

    def __init__(self, bert_model, num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size,
                 maximum_position_encoding,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()

        # Fail at construction instead of with a shape error deep inside the
        # first forward pass.
        bert_hidden_size = bert_model.config.hidden_size
        if embedding_dim != bert_hidden_size:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must match the BERT hidden size ({bert_hidden_size})"
            )

        self.encoder = Encoder(bert_model, dropout_rate=dropout_rate, maximum_position_encoding=maximum_position_encoding)

        self.decoder = Decoder(num_layers=num_layers, 
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               target_vocab_size=target_vocab_size,
                               maximum_position_encoding=maximum_position_encoding,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        # The decoder emits embedding_dim features per position;
        # fully_connected_dim is only the hidden width inside its feed-forward
        # networks, so it is the wrong input size for this projection.
        self.final_layer = nn.Linear(embedding_dim, target_vocab_size)
        self.softmax = nn.Softmax(dim=-1)
    
    def forward(self, input_sentence, output_sentence, enc_padding_mask, dec_padding_mask):
        """
        Forward pass for the full model

        Arguments:
            input_sentence -- Source token indices, passed to the Encoder
            output_sentence -- Target token indices, passed to the Decoder
            enc_padding_mask -- Padding mask passed to the Encoder
            dec_padding_mask -- Padding mask passed to the Decoder, where it
                                masks the encoder output in the encoder-decoder
                                attention
        Returns:
            final_output -- Softmax probabilities over the target vocabulary
                            for every target position (last dimension sums to 1).
                            These are probabilities, not logits.
        """
        enc_output = self.encoder(input_sentence, enc_padding_mask)
        dec_output = self.decoder(output_sentence, enc_output, dec_padding_mask)
        
        final_output = self.final_layer(dec_output)
        final_output = self.softmax(final_output)

        return final_output

In [20]:

def create_transformer_model(bert_model_name, num_layers, fully_connected_dim, num_heads,
                             maximum_position_encoding=1000,
                             dropout_rate=0.1, layernorm_eps=1e-6):
    """
    Build a Transformer around a pretrained BERT checkpoint.

    Arguments:
        bert_model_name (str) -- Checkpoint name given to from_pretrained for
                                 both the model and its tokenizer
        num_layers (int) -- Number of decoder blocks
        fully_connected_dim (int) -- Hidden width of the decoder feed-forward networks
        num_heads (int) -- Attention heads per decoder block
        maximum_position_encoding (int) -- Positions precomputed for the
                                           positional encodings
        dropout_rate (float) -- Dropout rate
        layernorm_eps (float) -- LayerNorm epsilon

    Returns:
        (transformer_model, tokenizer) -- the assembled model and the tokenizer
                                          loaded for the same checkpoint
    """
    # Pretrained encoder and its matching tokenizer.
    pretrained_bert = BertModel.from_pretrained(bert_model_name)
    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)

    # Decoder width and output vocabulary are taken from the checkpoint itself.
    model_kwargs = dict(
        bert_model=pretrained_bert,
        num_layers=num_layers,
        embedding_dim=pretrained_bert.config.hidden_size,
        num_heads=num_heads,
        fully_connected_dim=fully_connected_dim,
        target_vocab_size=len(bert_tokenizer.get_vocab()),
        maximum_position_encoding=maximum_position_encoding,
        dropout_rate=dropout_rate,
        layernorm_eps=layernorm_eps,
    )

    return Transformer(**model_kwargs), bert_tokenizer



Some weights of BertModel were not initialized from the model checkpoint at Geotrend/bert-base-uk-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

bert_model_name = 'Geotrend/bert-base-uk-cased'
num_layers = 6  # Adjust as needed
fully_connected_dim = 512  # Adjust as needed
num_heads = 8

# create_transformer_model returns a (model, tokenizer) pair; unpack it so
# that transformer_model is the nn.Module itself rather than a tuple.
transformer_model, tokenizer = create_transformer_model(
    bert_model_name, num_layers, fully_connected_dim, num_heads
)


2023-11-25 22:23:34 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| mwt       | iu      |

2023-11-25 22:23:34 INFO: Using device: cuda
2023-11-25 22:23:34 INFO: Loading: tokenize
2023-11-25 22:23:34 INFO: Loading: mwt
2023-11-25 22:23:34 INFO: Done loading processors!


Complete Vocabulary: {'<pad>': 0, '<unk>': 1, 'Привіт': 2, ',': 3, 'світе': 4, '!': 5, 'Це': 6, 'приклад': 7, 'тексту': 8, 'для': 9, 'обробки': 10, '.': 11}
Processed Lines:
tensor([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11]])
Mask:
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
