# **Implementation of the Transformer architecture with PyTorch**

## **Importations**

In [None]:
!pip install transformers
!pip install sentencepiece

In [2]:
import torch
from torch import nn
from transformers import CamembertTokenizer

# **1) Model**

In [3]:
class EmbeddingLayer(nn.Module):
  def __init__(self, vocab_size=35000, embedding_dim=512, **kwargs):
    super(**kwargs).__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

  def forward(self, x):
    return self.embedding(x)

model = EmbeddingLayer(vocab_size=10000, embedding_dim=512)
x = torch.randint(0, 100, (32, 20))
res = model(x)
assert res.shape == (32, 20, 512)

In [4]:
class ScaledDotProductAttention(nn.Module):
  def __init__(self, embedding_dim=512, **kwargs):
    super(**kwargs).__init__()
    self.query_layer = nn.Linear(embedding_dim, embedding_dim)
    self.key_layer = nn.Linear(embedding_dim, embedding_dim)
    self.value_layer = nn.Linear(embedding_dim, embedding_dim)

  def forward(self, x, mask=torch.IntTensor([[]])):
    batch_size, nb_tokens, embedding_dim = x.shape
    Q = self.query_layer(x)
    K = self.key_layer(x)
    V = self.value_layer(x)
    K_transpose = torch.transpose(K, 1, 2)
    QK = torch.matmul(Q, K_transpose)
    
    QK_normalized = QK / (embedding_dim**(1/2))

    _, n = mask.shape
    if n != 0:
      for i in range(batch_size):
        for j in range(nb_tokens):
          if mask[i][j] == 0:
            QK_normalized[i, :, j] = 0
              
    softmax = nn.Softmax(dim=2)(QK_normalized)
    res = torch.matmul(softmax, V)

    return res

model = ScaledDotProductAttention(embedding_dim=512)
x = torch.rand(32, 20, 512)
mask = torch.randint(0, 2, (32, 20))
res = model(x, mask=mask)
assert res.shape == (32, 20, 512)

In [5]:
class MultiHeadAttentionLayer(nn.Module):
  def __init__(self, attention_heads=8, embedding_dim=512, **kwargs):
    super(**kwargs).__init__()
    self.attention_heads = attention_heads
    self.embedding_dim = embedding_dim
    self.attention_layers = [ScaledDotProductAttention(embedding_dim=embedding_dim//attention_heads) for _ in range(attention_heads)]

  def forward(self, x, mask=torch.IntTensor([[]])):
    batch_size, nb_tokens, embedding_dim = x.shape
    x = x.reshape(batch_size, nb_tokens, self.attention_heads, embedding_dim // self.attention_heads)
    
    concat = torch.Tensor()
    for i, attention_layer in enumerate(self.attention_layers):
      attention = attention_layer(x[:, :, i, :], mask=mask)
      concat = torch.concat([concat, attention], dim=2)
  
    return concat

model = MultiHeadAttentionLayer(embedding_dim=512)
x = torch.rand(32, 20, 512)
mask = torch.randint(0, 2, (32, 20))
res = model(x, mask=mask)
assert res.shape == (32, 20, 512)

In [6]:
class EncoderLayer(nn.Module):
  def __init__(self, embedding_dim=512, dropout_rate=0.1, **kwargs):
    super(**kwargs).__init__()
    self.multi_head_attention = MultiHeadAttentionLayer()
    self.dense1 = nn.Linear(embedding_dim, embedding_dim)
    self.dense2 = nn.Linear(embedding_dim, embedding_dim)
    self.layer_norm = nn.LayerNorm(embedding_dim)
    self.dropout = nn.Dropout(dropout_rate)
    self.relu = nn.ReLU()

  def forward(self, x, mask=torch.IntTensor([[]])):
    x = self.layer_norm(x + self.dropout(self.multi_head_attention(x, mask=mask)))
    x = self.layer_norm(x + self.dropout(self.dense2(self.relu(self.dense1(x)))))

    return x

model = EncoderLayer(embedding_dim=512)
x = torch.rand(32, 20, 512)
mask = torch.randint(0, 2, (32, 20))
res = model(x, mask=mask)
assert res.shape == (32, 20, 512)

In [7]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim=512, dropout_rate=0.1, **kwargs):
    super(**kwargs).__init__()
    self.embedding_layer = EmbeddingLayer(vocab_size, embedding_dim)
    self.encoder_layer1 = EncoderLayer(embedding_dim=embedding_dim, dropout_rate=dropout_rate)
    self.encoder_layer2 = EncoderLayer(embedding_dim=embedding_dim, dropout_rate=dropout_rate)
    self.encoder_layer3 = EncoderLayer(embedding_dim=embedding_dim, dropout_rate=dropout_rate)
    self.encoder_layer4 = EncoderLayer(embedding_dim=embedding_dim, dropout_rate=dropout_rate)
    self.encoder_layer5 = EncoderLayer(embedding_dim=embedding_dim, dropout_rate=dropout_rate)
    self.encoder_layer6 = EncoderLayer(embedding_dim=embedding_dim, dropout_rate=dropout_rate)

  def forward(self, x, mask=torch.IntTensor([[]])):
    x = self.embedding_layer(x)
    x = self.encoder_layer1(x, mask=mask)
    x = self.encoder_layer2(x, mask=mask)
    x = self.encoder_layer3(x, mask=mask)
    x = self.encoder_layer4(x, mask=mask)
    x = self.encoder_layer5(x, mask=mask)
    x = self.encoder_layer6(x, mask=mask)

    return x

model = Encoder(vocab_size=10000, embedding_dim=512)
x = torch.randint(0, 100, (32, 20))
mask = torch.randint(0, 2, (32, 20))
res = model(x, mask=mask)
assert res.shape == (32, 20, 512)

model

Encoder(
  (embedding_layer): EmbeddingLayer(
    (embedding): Embedding(10000, 512)
  )
  (encoder_layer1): EncoderLayer(
    (multi_head_attention): MultiHeadAttentionLayer()
    (dense1): Linear(in_features=512, out_features=512, bias=True)
    (dense2): Linear(in_features=512, out_features=512, bias=True)
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (relu): ReLU()
  )
  (encoder_layer2): EncoderLayer(
    (multi_head_attention): MultiHeadAttentionLayer()
    (dense1): Linear(in_features=512, out_features=512, bias=True)
    (dense2): Linear(in_features=512, out_features=512, bias=True)
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (relu): ReLU()
  )
  (encoder_layer3): EncoderLayer(
    (multi_head_attention): MultiHeadAttentionLayer()
    (dense1): Linear(in_features=512, out_features=512, bias=True)
    (dense2): Linear(in_features

# **2) Test**

## **a) Data**

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import pandas as pd

path = "drive/MyDrive/IA/Datasets/IMDB Movie Sentiment/data.csv"
df = pd.read_csv(path)
print(df.shape)
df.head(2)

(40000, 2)


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [11]:
df['label'].unique()

array([0, 1])

## **b) Preprocessing**

In [12]:
import re

def preprocess(s):
  s = s.lower()
  s = re.sub("(,|'|\"|-|/|\(|\)|#|:|;)", " ", s)
  s = re.sub("\.", " ", s)
  s = re.sub("[0-9]", "", s)
  s = re.sub(" +", " ", s)
  return s

preprocess("When I put this movie (Interstellar 1983) in my DVD player")
df['text_processed'] = df['text'].apply(preprocess)

## **c) Tokenizer**

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [14]:
def get_tokens_mask(tokenizer, text, max_len=50):
    """
    Prend en entrée un tokenizer et une chaine de caractères, et l'encode grâce au tokenizer
    
    Args
    ----
        tokenizer (CamembertTokenizer)
        text (str) : phrase à encoder
        max_len (int) : entier indiquant le nombre maximal de caractères
    
    Returns
    -------
        tokens (int list): liste de tokens encodés
        mask (int list): le mask à fournir au modèle CamemBERT
        size (int): le nombre de tokens utiles dans la phrase
    """

    tokens = []
    mask = []

    text = tokenizer.encode(text)
    text = text[:max_len-1] # On a besoin de ce -1 car on va rajouter le token 'pad' ensuite, et sans le -1 la longueur totale de text sera max_len+1

    size = len(text)
    pads = tokenizer.encode(['[PAD]']*(max(0, max_len - size)), is_split_into_words=True)

    tokens[:max(max_len, size)] = text[:max(max_len, size)]
    tokens += pads[1:-1]
    mask = [1]*size+[0]*len(pads[1:-1])

    return tokens, mask, size

vocab_size = len(tokenizer)
max_len = 30

## **d) Données**

In [15]:
from torch.utils.data import DataLoader, Dataset

class Data(Dataset):
    """
    Dataset qui contient deux listes de strings (text1 et text2) ainsi qu'une liste de similarités entre ces strings (target)
    """
    def __init__(self, df, tokenizer, max_len=50):      
        self.tokenizer = tokenizer
        self.text = df.text_processed
        self.target = df.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        tokens, mask, size = get_tokens_mask(self.tokenizer, self.text[index], max_len=self.max_len)
        target = self.target[index]

        return tokens, mask, target
  

dataset = Data(df, tokenizer=tokenizer, max_len=max_len)
test_size = 0.1
nb_rows = len(dataset)
nb_rows_train = int((1 - test_size) * nb_rows)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [nb_rows_train, nb_rows - nb_rows_train])

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

## **e) Entrainement d'un modèle**

In [43]:
class Model(nn.Module):
  def __init__(self, vocab_size=0, max_len=30, embedding_dim=512, **kwargs):
    super(**kwargs).__init__()
    self.encoder = Encoder(vocab_size=vocab_size, embedding_dim=embedding_dim)
    self.dense = nn.Linear(embedding_dim, 2)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, mask=torch.IntTensor([[]])):
    x = self.encoder(x, mask=mask)
    x = self.dense(x[:, 0])
    x = self.softmax(x)

    return x

model = Model(vocab_size=vocab_size, max_len=max_len, embedding_dim=512)
model

Model(
  (encoder): Encoder(
    (embedding_layer): EmbeddingLayer(
      (embedding): Embedding(28996, 512)
    )
    (encoder_layer1): EncoderLayer(
      (multi_head_attention): MultiHeadAttentionLayer()
      (dense1): Linear(in_features=512, out_features=512, bias=True)
      (dense2): Linear(in_features=512, out_features=512, bias=True)
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (relu): ReLU()
    )
    (encoder_layer2): EncoderLayer(
      (multi_head_attention): MultiHeadAttentionLayer()
      (dense1): Linear(in_features=512, out_features=512, bias=True)
      (dense2): Linear(in_features=512, out_features=512, bias=True)
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (relu): ReLU()
    )
    (encoder_layer3): EncoderLayer(
      (multi_head_attention): MultiHeadAttentionLayer()
      (dense1): Linear(in_features=51

In [44]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
cross_entropy = nn.CrossEntropyLoss()

In [45]:
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

EPOCHS = 5

losses_train = []
losses_test = []

for epoch in range(EPOCHS):
    running_loss_train = 0
    running_loss_test = 0
    print("Epoch {}/{}".format(epoch+1, EPOCHS))
    for tokens, masks, targets in tqdm(train_dataloader):
        optimizer.zero_grad() 

        tokens = pad_sequence(tokens).to(device)
        masks = pad_sequence(masks).to(device)
        targets = targets.to(device)

        targets_one_hot = nn.functional.one_hot(targets.long(), num_classes=2)

        outputs = model(tokens, mask=masks)

        loss = cross_entropy(outputs, targets)
        loss.backward()

        optimizer.step()

        running_loss_train += loss.item()

    running_loss_train /= len(train_dataloader)
    losses_train.append(running_loss_train)

    with torch.no_grad():
        for tokens, masks, targets in test_dataloader:
          tokens = pad_sequence(tokens).to(device)
          masks = pad_sequence(masks).to(device)
          targets = targets.to(device)

          targets_one_hot = nn.functional.one_hot(targets.long(), num_classes=2)

          outputs = model(tokens, mask=masks)

          loss = cross_entropy(outputs, targets)

          running_loss_test += loss.item()

    running_loss_test /= len(test_dataloader)
    losses_test.append(running_loss_test)

    print("Train Loss : {}, Test Loss : {}".format(running_loss_train, running_loss_test))

Epoch 1/5


100%|██████████| 57/57 [01:16<00:00,  1.34s/it]


Train Loss : 0.7967694778191415, Test Loss : 0.7150288053921291
Epoch 2/5


100%|██████████| 57/57 [01:15<00:00,  1.32s/it]


Train Loss : 0.7546201933894241, Test Loss : 0.7896267260823931
Epoch 3/5


100%|██████████| 57/57 [01:14<00:00,  1.31s/it]


Train Loss : 0.7119479127097548, Test Loss : 0.72144729750497
Epoch 4/5


100%|██████████| 57/57 [01:17<00:00,  1.36s/it]


Train Loss : 0.6984366450393409, Test Loss : 0.6947180884225028
Epoch 5/5


100%|██████████| 57/57 [01:16<00:00,  1.34s/it]


Train Loss : 0.6974164770360578, Test Loss : 0.6972350137574332
