# **Implementation of the Transformer architecture with PyTorch**

## **Imports**

In [28]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 55.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 38.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K  

In [29]:
import torch
from torch import nn
from transformers import CamembertTokenizer

# **1) Model**

In [30]:
class EmbeddingLayer(nn.Module):
  """Token-embedding lookup table.

  Maps integer token ids to dense vectors of size ``embedding_dim``.

  Args:
    vocab_size: Number of rows of the embedding table (distinct token ids).
    embedding_dim: Size of each embedding vector.
    **kwargs: Forwarded to ``nn.Module.__init__``.
  """

  def __init__(self, vocab_size=35000, embedding_dim=512, **kwargs):
    # Keyword arguments belong to ``__init__``: ``super()`` itself accepts none,
    # so ``super(**kwargs)`` failed as soon as a keyword argument was passed.
    super().__init__(**kwargs)
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

  def forward(self, x):
    """Return the embeddings of the ids in ``x``, shape ``(*x.shape, embedding_dim)``."""
    return self.embedding(x)

# Smoke test: a (32, 20) batch of token ids must come out as (32, 20, 512).
model = EmbeddingLayer(35000, 512)
x = torch.randint(low=0, high=100, size=(32, 20))
res = model(x)
assert tuple(res.shape) == (32, 20, 512)

In [31]:
class ScaledDotProductAttention(nn.Module):
  """Single-head scaled dot-product self-attention.

  Computes ``softmax(Q K^T / sqrt(d)) V`` where ``Q``, ``K`` and ``V`` are three
  learned linear projections of the same input and ``d`` is the size of the
  input's last dimension.

  Args:
    embedding_dim: Size of the last dimension of the input (and of the output).
    **kwargs: Forwarded to ``nn.Module.__init__``.
  """

  def __init__(self, embedding_dim=512, **kwargs):
    super().__init__(**kwargs)
    self.query_layer = nn.Linear(embedding_dim, embedding_dim)
    self.key_layer = nn.Linear(embedding_dim, embedding_dim)
    self.value_layer = nn.Linear(embedding_dim, embedding_dim)

  def forward(self, x):
    """Apply self-attention to ``x`` of shape ``(batch, tokens, embedding_dim)``.

    Returns:
      A tensor with the same shape as ``x``.
    """
    # Unpacking enforces a 3-D input, exactly as before.
    _, _, embedding_dim = x.shape
    Q = self.query_layer(x)
    K = self.key_layer(x)
    V = self.value_layer(x)

    # (batch, tokens, tokens): similarity of every query with every key.
    scores = torch.matmul(Q, torch.transpose(K, 1, 2)) / (embedding_dim ** 0.5)

    # Normalise over the key axis. ``nn.Softmax()`` without ``dim`` picks dim 0
    # for 3-D inputs, which mixed the samples of a batch together.
    weights = torch.softmax(scores, dim=-1)

    return torch.matmul(weights, V)

# Smoke test: self-attention must preserve the shape of its input.
model = ScaledDotProductAttention(512)
x = torch.rand((32, 20, 512))
res = model(x)
assert tuple(res.shape) == (32, 20, 512)



In [32]:
class MultiHeadAttentionLayer(nn.Module):
  """Multi-head self-attention built from independent single-head layers.

  The last dimension of the input is cut into ``attention_heads`` equal slices,
  each slice goes through its own ``ScaledDotProductAttention`` and the results
  are concatenated back on the last dimension. No output projection is applied.

  Args:
    attention_heads: Number of heads.
    embedding_dim: Size of the input's last dimension; must be a multiple of
      ``attention_heads``.
    **kwargs: Forwarded to ``nn.Module.__init__``.

  Raises:
    ValueError: If ``embedding_dim`` is not divisible by ``attention_heads``.
  """

  def __init__(self, attention_heads=8, embedding_dim=512, **kwargs):
    super().__init__(**kwargs)
    if embedding_dim % attention_heads != 0:
      raise ValueError(
        "embedding_dim ({}) must be divisible by attention_heads ({})".format(embedding_dim, attention_heads)
      )
    self.attention_heads = attention_heads
    self.embedding_dim = embedding_dim
    # nn.ModuleList registers the heads as sub-modules; with a plain Python list
    # their weights were missing from ``parameters()`` (never optimised) and
    # were not moved by ``.to(device)``.
    self.attention_layers = nn.ModuleList(
      [ScaledDotProductAttention(embedding_dim=embedding_dim // attention_heads) for _ in range(attention_heads)]
    )

  def forward(self, x):
    """Apply multi-head self-attention to ``x`` of shape ``(batch, tokens, embedding_dim)``.

    Returns:
      A tensor with the same shape as ``x``.
    """
    batch_size, nb_tokens, embedding_dim = x.shape
    x = x.reshape(batch_size, nb_tokens, self.attention_heads, embedding_dim // self.attention_heads)

    # Collect the heads and concatenate once; the outputs stay on the device of
    # ``x`` instead of being appended to an empty CPU tensor.
    head_outputs = [attention_layer(x[:, :, i, :]) for i, attention_layer in enumerate(self.attention_layers)]

    return torch.cat(head_outputs, dim=2)

# Smoke test: concatenating the heads must give back the input shape.
model = MultiHeadAttentionLayer(attention_heads=8, embedding_dim=512)
x = torch.rand((32, 20, 512))
res = model(x)
assert tuple(res.shape) == (32, 20, 512)



In [33]:
class EncoderLayer(nn.Module):
  """One encoder block: self-attention then a linear layer, each with a residual
  connection followed by layer normalisation.

  Args:
    embedding_dim: Size of the last dimension of the input and output.
    **kwargs: Forwarded to ``nn.Module.__init__``.
  """

  def __init__(self, embedding_dim=512, **kwargs):
    super().__init__(**kwargs)
    # Forward embedding_dim: the attention layer used to be built with its
    # default size, so any value other than 512 failed in ``forward``.
    self.multi_head_attention = MultiHeadAttentionLayer(embedding_dim=embedding_dim)
    self.dense = nn.Linear(embedding_dim, embedding_dim)
    # One normalisation per sub-layer, so that each learns its own scale and
    # shift; ``layer_norm`` keeps its name for the attention sub-layer.
    self.layer_norm = nn.LayerNorm(embedding_dim)
    self.layer_norm_dense = nn.LayerNorm(embedding_dim)

  def forward(self, x):
    """Transform ``x`` of shape ``(batch, tokens, embedding_dim)``; the shape is preserved."""
    x = self.layer_norm(x + self.multi_head_attention(x))
    x = self.layer_norm_dense(x + self.dense(x))

    return x

# Smoke test: an encoder block must preserve the shape of its input.
model = EncoderLayer(512)
x = torch.rand((32, 20, 512))
res = model(x)
assert tuple(res.shape) == (32, 20, 512)



In [34]:
class Encoder(nn.Module):
  """Token embedding followed by a stack of ``EncoderLayer`` blocks.

  Args:
    vocab_size: Number of distinct token ids.
    nb_encoder_layers: Number of stacked encoder blocks.
    embedding_dim: Size of the token embeddings and of every block.
    **kwargs: Forwarded to ``nn.Module.__init__``.
  """

  def __init__(self, vocab_size, nb_encoder_layers=6, embedding_dim=512, **kwargs):
    super().__init__(**kwargs)
    self.embedding_layer = EmbeddingLayer(vocab_size, embedding_dim)
    # nn.ModuleList registers the blocks (a plain list hid their parameters from
    # the optimizer and from ``.to(device)``); embedding_dim is forwarded so the
    # blocks match the embedding size instead of always using the default.
    self.encoder_layers = nn.ModuleList(
      [EncoderLayer(embedding_dim=embedding_dim) for _ in range(nb_encoder_layers)]
    )

  def forward(self, x):
    """Encode token ids ``x`` of shape ``(batch, tokens)`` into ``(batch, tokens, embedding_dim)``."""
    x = self.embedding_layer(x)
    for encoder_layer in self.encoder_layers:
      x = encoder_layer(x)

    return x

# Smoke test: token ids of shape (32, 20) must be encoded as (32, 20, 512).
model = Encoder(35000, embedding_dim=512)
x = torch.randint(low=0, high=100, size=(32, 20))
res = model(x)
assert tuple(res.shape) == (32, 20, 512)



# **2) Test**

In [None]:
from tqdm import tqdm

## **a) Data**

In [35]:
# Mount Google Drive under /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
import pandas as pd

# The path is relative to the current working directory — presumably /content on
# Colab, where Drive is mounted; verify when running elsewhere.
path = "drive/MyDrive/IA/Datasets/IMDB Movie Sentiment/data.csv"
df = pd.read_csv(path)
# Preview the first two rows.
df.head(2)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [56]:
# List the distinct values of the label column.
df['label'].unique()

array([0, 1])

## **b) Preprocessing**

In [57]:
import re

def preprocess(s):
  """Normalise a raw text string before tokenisation.

  Steps, in order: lower-case the text; replace each of the punctuation
  characters , ' " - / ( ) # : ; . by a space; delete digits; collapse runs
  of spaces into a single space. Leading and trailing spaces are not stripped.

  Args:
    s (str): Text to clean.

  Returns:
    str: The cleaned text.
  """
  s = s.lower()
  # Raw strings keep the regex escapes away from Python's own string-escape
  # rules (the previous "\(" and "\." literals were invalid string escapes).
  # One character class covers all the punctuation that is mapped to a space.
  s = re.sub(r"[,'\"\-/()#:;.]", " ", s)
  s = re.sub(r"[0-9]", "", s)
  s = re.sub(r" +", " ", s)
  return s

# Sanity check on a known example (a bare call would silently discard the result).
assert preprocess("When I put this movie (Interstellar 1983) in my DVD player") == "when i put this movie interstellar in my dvd player"
# Store the cleaned text next to the raw text.
df['text_processed'] = df['text'].apply(preprocess)

## **c) Tokenizer**

In [58]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer of 'bert-base-cased' (only the tokenizer is loaded here).
# NOTE(review): preprocess() lower-cases the text while this vocabulary is cased —
# an uncased vocabulary may be a better match; confirm before changing.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [63]:
def get_tokens_mask(tokenizer, text, max_len=50):
    """
    Encode ``text`` with ``tokenizer`` and pad the result to exactly ``max_len`` ids.

    Args
    ----
        tokenizer : tokenizer exposing ``encode(text)`` and ``pad_token_id``
        text (str) : sentence to encode
        max_len (int) : length, in tokens, of the returned lists (must be >= 1)

    Returns
    -------
        tokens (int list): ``max_len`` token ids, padding included
        mask (int list): attention mask, 1 for real tokens and 0 for padding
        size (int): number of real (non-padding) tokens

    Raises
    ------
        ValueError: if ``max_len`` is smaller than 1 or the tokenizer has no padding id
    """
    if max_len < 1:
        raise ValueError("max_len must be >= 1, got {}".format(max_len))

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        raise ValueError("the tokenizer does not define a padding token")

    # At most max_len - 1 ids are kept, so every sequence ends with at least one
    # padding id; longer texts lose their trailing ids (including whatever
    # closing id ``encode`` appended).
    ids = tokenizer.encode(text)[:max_len - 1]
    size = len(ids)
    nb_pads = max_len - size

    # Use the tokenizer's own padding id instead of encoding the literal string
    # '[PAD]', which is a single padding token only for BERT-style vocabularies.
    tokens = ids + [pad_id] * nb_pads
    mask = [1] * size + [0] * nb_pads

    return tokens, mask, size

# Size of the tokenizer, later used to dimension the model's embedding table.
vocab_size = len(tokenizer)
# Length, in tokens, of every encoded text handed to the dataset and the model.
max_len = 30

## **d) Dataset and DataLoaders**

In [64]:
from torch.utils.data import DataLoader, Dataset

class Data(Dataset):
    """
    Dataset of tokenised texts and their labels.

    Reads the ``text_processed`` and ``label`` columns of ``df``. Item ``i`` is the
    pair ``(tokens, target)`` built from the i-th row, where ``tokens`` is the list
    of ``max_len`` token ids returned by ``get_tokens_mask``.
    """
    def __init__(self, df, tokenizer, max_len=50):
        self.tokenizer = tokenizer
        self.text = df.text_processed
        self.target = df.label
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return ``(tokens, target)`` for the row at position ``index``."""
        # Positional access (.iloc): label-based ``series[index]`` only works
        # while the DataFrame keeps its default 0..n-1 index and breaks after
        # filtering or shuffling the frame.
        tokens, _mask, _size = get_tokens_mask(self.tokenizer, self.text.iloc[index], max_len=self.max_len)
        target = self.target.iloc[index]

        return tokens, target
  

dataset = Data(df, tokenizer=tokenizer, max_len=max_len)
test_size = 0.1
nb_rows = len(dataset)
nb_rows_train = int((1 - test_size) * nb_rows)
# A seeded generator makes the train/test split identical across kernel restarts.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [nb_rows_train, nb_rows - nb_rows_train], generator=split_generator
)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## **e) Training a model**

In [74]:
class Model(nn.Module):
  """Binary sequence classifier: ``Encoder`` followed by a linear head applied to
  the encoding of the first token.

  Args:
    vocab_size: Number of distinct token ids.
    max_len: Accepted for interface compatibility; not used by the model.
    embedding_dim: Size of the token encodings.
    **kwargs: Forwarded to ``nn.Module.__init__``.
  """

  def __init__(self, vocab_size=0, max_len=30, embedding_dim=512, **kwargs):
    super().__init__(**kwargs)
    self.encoder = Encoder(vocab_size=vocab_size, embedding_dim=embedding_dim)
    self.dense = nn.Linear(embedding_dim, 2)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    """Return the raw class scores (logits), shape ``(batch, 2)``, for token ids ``x``.

    ``nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it
    probabilities (softmax applied here as well) normalises twice and flattens
    the gradients. Use ``predict_proba`` to obtain probabilities.
    """
    x = self.encoder(x)
    # Only the first position of each sequence is classified.
    return self.dense(x[:, 0])

  def predict_proba(self, x):
    """Return class probabilities, shape ``(batch, 2)``; each row sums to 1."""
    return self.softmax(self(x))

# Smoke test: the classifier must return two class scores per input sequence.
model = Model(vocab_size, max_len, 512)
x = torch.randint(low=0, high=100, size=(32, 20))
res = model(x)
assert tuple(res.shape) == (32, 2)



In [69]:
import torch.optim as optim

# Training set-up: run on the GPU when one is available, use cross-entropy as
# the objective and optimise the model's parameters with Adam.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
cross_entropy = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [103]:
from torch.nn.utils.rnn import pad_sequence

EPOCHS = 5

# Mean loss per batch, one entry per epoch.
losses_train = []
losses_test = []

model.to(device)

for epoch in range(EPOCHS):
    running_loss_train = 0
    running_loss_test = 0
    print("Epoch {}/{}".format(epoch+1, EPOCHS))

    # --- Training pass ---
    model.train()
    for tokens, targets in tqdm(train_dataloader):
        optimizer.zero_grad()

        # The dataset yields plain lists of ids, which the default collate
        # function turns into one (batch,) tensor per token position; stacking
        # them on dim 1 rebuilds the (batch, max_len) matrix.
        tokens = torch.stack(tokens, dim=1).to(device)
        targets = targets.long().to(device)

        outputs = model(tokens)

        loss = cross_entropy(outputs, targets)
        loss.backward()

        optimizer.step()

        running_loss_train += loss.item()

    # Average over batches so that train and test losses are comparable even
    # though the two loaders have different numbers of batches.
    running_loss_train /= max(1, len(train_dataloader))
    losses_train.append(running_loss_train)

    # --- Evaluation pass on the held-out split (previously iterated the
    # training loader, so the reported "test" loss was a training loss) ---
    model.eval()
    with torch.no_grad():
        for tokens, targets in tqdm(test_dataloader):
            tokens = torch.stack(tokens, dim=1).to(device)
            targets = targets.long().to(device)

            outputs = model(tokens)

            loss = cross_entropy(outputs, targets)

            running_loss_test += loss.item()

    running_loss_test /= max(1, len(test_dataloader))
    losses_test.append(running_loss_test)

    print("Train Loss : {}, Test Loss : {}".format(running_loss_train, running_loss_test))