# TP Self-attention & architecture Transformer

In [None]:
import math
import click
from torch.utils.tensorboard import SummaryWriter
import logging
import re
from pathlib import Path
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import time
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from IPython.display import display, HTML


# Pick the fastest available backend: CUDA GPU first, then Apple-silicon MPS,
# and fall back to the CPU when neither accelerator is present.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
logging.basicConfig(level=logging.INFO)
MAX_LENGTH = 500

# Directory layout for the runs (pathlib takes care of the path arithmetic)
BASEPATH = Path("/tmp/runs/")
TB_PATH = BASEPATH.joinpath("logs")
TB_PATH.mkdir(parents=True, exist_ok=True)

# TensorBoard is used externally: (1) run the printed command in a console;
# (2) copy-paste the URL it reports into a browser
display(HTML("<h2>Informations</h2><div>Pour visualiser les logs, tapez la commande : </div>"))
print("tensorboard --logdir", TB_PATH.absolute())

## Classe de gestion des données textuelles (idem TP précédent)

1. Récupération d'embedding glove
    1. Téléchargement:
    ```wget http://nlp.stanford.edu/data/glove.6B.zip```
    2. Lecture des fichiers
2. Récupération des données imdb (classification d'opinion)
3. Traitement des données

In [None]:
def get_embeddings_glove(PATH, EMB_SIZE=50):
    """Read a GloVe 6B text file and return its words and vectors.

    Args:
        PATH: Directory containing ``glove.6B.<EMB_SIZE>d.txt`` (``str`` or
            path-like; a trailing separator is accepted but not required).
        EMB_SIZE: Dimension of the vectors, used to pick the file name.

    Returns:
        ``(vocab, embeddings)`` where ``vocab[i]`` is the i-th word of the
        file and ``embeddings[i]`` is its vector as a list of floats.

    Raises:
        FileNotFoundError: If the expected file is not in ``PATH``.
        ValueError: If a value on a line cannot be parsed as a float.
    """
    glove_file = Path(PATH) / f"glove.6B.{EMB_SIZE:d}d.txt"
    vocab, embeddings = [], []
    # Explicit UTF-8: the platform default encoding may not be able to decode
    # the non-ASCII tokens of the vocabulary.
    with open(glove_file, "rt", encoding="utf-8") as fi:
        # Stream line by line instead of loading the whole file in memory.
        for line in fi:
            line = line.rstrip("\n")
            if not line.strip():
                continue  # ignore blank lines (e.g. trailing newline, empty file)
            word, *values = line.split(" ")
            vocab.append(word)
            embeddings.append([float(val) for val in values])
    return vocab, embeddings

# Load the pre-trained GloVe embeddings
EMB_SIZE = 50 # or 100, 200, 300
PATH = "./data/glove/glove.6B/" # directory where the GloVe files were downloaded
vocab, embeddings = get_embeddings_glove(PATH, EMB_SIZE=EMB_SIZE)

In [None]:
# Preview the first n vocabulary entries: the word, then the size of its
# vector followed by the vector itself.
n = 5
for i in range(n):
    print(f"{vocab[i]}\n{len(embeddings[i])} {embeddings[i]}")

In [None]:
# Fetch the IMDB dataset through the Hugging Face `datasets` package
from datasets import load_dataset
dataset = load_dataset('imdb')

# Show the text and the label of the first training example
# dataset["train"][0]
print(dataset["train"][0]['text'])
print(dataset["train"][0]['label'])

In [None]:
class FolderText(Dataset):
    """Dataset that tokenizes its documents on the fly.

    Args:
        data: Indexable collection (``len(data)`` and ``data[i]``) whose rows
            are mappings with a ``"text"`` and a ``"label"`` entry.
        tokenizer: Callable applied to a raw text each time an item is read.
    """

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.txts = []
        self.labels = []
        # Index access is kept on purpose (only __len__/__getitem__ are
        # required from `data`), but each row is now fetched once instead of
        # once for the text and once more for the label.
        for i in range(len(data)):
            row = data[i]
            self.txts.append(row["text"])
            self.labels.append(row["label"])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, ix):
        """Return ``(tokenizer(text), label)`` for document ``ix``."""
        return self.tokenizer(self.txts[ix]), self.labels[ix]

    def get_txt(self, ix):
        """Return the raw ``(text, label)`` pair for document ``ix``."""
        return self.txts[ix], self.labels[ix]


In [None]:
def format_dataset(vocab, embeddings):
    """Build the tokenized IMDB datasets and the OOV-augmented embedding matrix.

    An ``"__OOV__"`` entry is added after the last word of the vocabulary and
    a matching all-zero row is stacked under ``embeddings``; every word that
    is not in the vocabulary is mapped to that index.

    The train/test splits are read from the module-level ``dataset`` object.

    Args:
        vocab: Sequence of words; it is copied, the caller's list is left
            untouched.
        embeddings: Sequence of vectors aligned with ``vocab``.

    Returns:
        ``(train_data, test_data, embeddings, tokenizer, id2word)`` where
        ``embeddings`` is the stacked numpy array, ``tokenizer`` maps a text
        to a list of word ids and ``id2word`` maps an id back to its word.
    """
    WORDS = re.compile(r"\S+")

    embedding_size = len(embeddings[0])
    # Work on a copy: appending to the argument would grow the caller's
    # vocabulary by one entry on every call.
    vocab = list(vocab)
    OOVID = len(vocab)
    vocab.append("__OOV__")
    word2id = {word: ix for ix, word in enumerate(vocab)}
    embeddings = np.vstack((embeddings, np.zeros(embedding_size)))

    def tokenizer(t):
        # Lower-case, split on whitespace, unknown words -> OOVID
        return [word2id.get(x, OOVID) for x in WORDS.findall(t.lower())]

    logging.info("Loading embeddings")
    logging.info("Get the IMDB dataset")

    train_data = FolderText(dataset["train"], tokenizer)
    test_data = FolderText(dataset["test"], tokenizer)
    id2word = {ix: word for word, ix in word2id.items()}
    return train_data, test_data, embeddings, tokenizer, id2word


# Build the train/test datasets, the tokenizer and the embedding matrix extended with the OOV row
train_data, test_data, embeddings, tokenizer, id2word = format_dataset(vocab, embeddings)

In [None]:
# Quick sanity check of the pieces above: tokenize a sentence, then map the
# ids back to words. The second sentence ends with an unknown word.
for sent in ("The movie was great", "this movie was qslkjgf"):
    ind = tokenizer(sent)
    print(ind)
    print("check reconstruction :", " ".join(id2word[i] for i in ind))

# Modélisation de l'attention propre

In [None]:
def masked_softmax(x, lens=None):
    """Softmax over the last dimension restricted to the valid positions.

    Args:
        x: Attention scores of shape B x N x N.
        lens: Number of valid positions per batch element, shape (B,) or
            (B, 1). ``None`` means that every sequence uses the full length N.

    Returns:
        Tensor of shape B x N x N. For batch element ``b``, entries whose row
        or column index is ``>= lens[b]`` are 0; every valid row sums to 1
        over its valid columns. ``x`` itself is not modified.
    """
    n = x.size(-1)
    if lens is None:
        lens = torch.full((x.size(0), 1), x.size(1), device=x.device)
    lens = lens.to(x.device).view(-1, 1)

    # valid[b, i] is True when position i belongs to sequence b
    valid = torch.arange(n, device=x.device).unsqueeze(0) < lens      # B x N
    # a score is kept only when both the query and the key are valid
    mask = valid.unsqueeze(1) & valid.unsqueeze(2)                     # B x N x N

    # Masked scores become -inf (weight 0 after softmax); rows that are
    # entirely masked produce NaN, which nan_to_num turns into 0.
    return x.masked_fill(~mask, float("-inf")).softmax(-1).nan_to_num(0)

In [None]:
# la fonction ci-dessus calcule le softmax pour vous (= normalise l'attention)... avec un masque d'attention !

# TODO :


In [None]:
class AttentionBasicLayer(nn.Module):
    """Single-head scaled dot-product self-attention followed by Linear + ReLU.

    Args:
        dim: Size of the input and output features.
        layer_norm: Accepted for interface compatibility; this layer does not
            use it.
    """

    def __init__(self, dim, layer_norm=True):
        super().__init__()
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)
        self.query = nn.Linear(dim, dim)
        self.final = nn.Linear(dim, dim)

    def forward(self, x, lens=None):  # x: B x L x Z
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape B x L x Z.
            lens: Valid length of each sequence, forwarded to
                ``masked_softmax`` (``None`` = no padding).

        Returns:
            Tensor of shape B x L x Z.
        """
        # 1. Projections
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        # 2. Scaling factor
        d_k = query.size(-1)
        # 3. Similarity between every pair of positions: B x L x L
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d_k)
        # 4. Normalised attention weights, padding masked out
        a = masked_softmax(scores, lens)
        # 5. Weighted sum of the values: B x L x Z
        out = torch.bmm(a, value)

        out = F.relu(self.final(out))
        return out
    

# compréhension:
1. Afficher les dimensions des différentes matrices
2. Afficher la matrice d'attention propre
3. Comparer les objets à leur entrée puis sortie du système

In [None]:
# Try the layer on random data
B = 2   # batch
L = 10  # length
Z = 50  # dim_embedding
x = torch.rand(B,L,Z)
net = AttentionBasicLayer(Z,layer_norm=True)
# forward returns a single B x L x Z tensor; unpacking it into two names
# would silently split it along the batch dimension.
out = net(x)

# Rebuild the attention matrix from the layer's own projections, following
# the recipe given in its forward pass (scaled dot product + masked_softmax).
with torch.no_grad():
    q, k = net.query(x), net.key(x)
    scores = masked_softmax(torch.bmm(q, k.transpose(1, 2)) / math.sqrt(q.size(-1)))

print(out.size(), scores.size())


In [None]:
# Self-attention matrix of the first instance of the batch
# - is it symmetric?
# - is the diagonal strong?

plt.figure()
plt.imshow(scores[0].to("cpu").detach().numpy())

In [None]:
# What is the difference between the input and the output of the layer?

plt.figure()
plt.subplot(2,1,1)
plt.imshow(x[0,:,:].to("cpu").numpy()) # input, first instance of the batch
plt.subplot(2,1,2)
plt.imshow(out[0,:,:].to("cpu").detach().numpy()) # output, first instance of the batch



## Reste du modèle

In [None]:
class AttentionResidualLayer(nn.Module):
    """Self-attention layer with input layer normalisation and a residual sum.

    Args:
        dim: Size of the input and output features.
    """

    def __init__(self, dim):
        super().__init__()
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)
        self.query = nn.Linear(dim, dim)
        self.final = nn.Linear(dim, dim)
        self.layer_norm = nn.LayerNorm(dim)

    def forward(self, x, lens=None):  # x: B x L x D
        """Apply normalisation, self-attention and the residual connection.

        Args:
            x: Input of shape B x L x D.
            lens: Valid length of each sequence, forwarded to
                ``masked_softmax`` (``None`` = no padding).

        Returns:
            Tensor of shape B x L x D.
        """
        x = self.layer_norm(x)
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        d_k = query.size(-1)
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d_k)
        a = masked_softmax(scores, lens)
        out = torch.bmm(a, value)
        # Difference with the basic layer: the (normalised) input is added to
        # the attention output before the final projection.
        out = F.relu(self.final(x + out))
        return out

class SelfAttentionModel(nn.Module):
    """Stack of attention layers, mean pooling and a linear classifier.

    Args:
        dim: Size of the token representations.
        attention: Layer factory called as ``attention(dim)``; the resulting
            module is called as ``layer(x, lens)``.
        nclasses: Number of output classes.
        numlayers: Number of attention layers.
        pos_emb: When true, learned position embeddings (one per position up
            to the module-level ``MAX_LENGTH``) are added to the input.
    """

    def __init__(self, dim, attention, nclasses=2, numlayers=3, pos_emb=False):
        super().__init__()
        self.pos_emb = nn.Embedding(MAX_LENGTH, dim) if pos_emb else None
        self.final = nn.Linear(dim, nclasses)
        self.attention = nn.ModuleList([attention(dim) for _ in range(numlayers)])

    def forward(self, x, lens=None):
        """Classify a batch of sequences.

        Args:
            x: Input of shape B x L x dim.
            lens: Valid length of each sequence, shape (B,) or (B, 1).
                ``None`` means that every sequence uses the full length L.

        Returns:
            Logits of shape B x nclasses.
        """
        batch, length = x.size(0), x.size(1)
        if lens is None:
            # No lengths given: every position is a real token
            lens = torch.full((batch,), length, dtype=x.dtype, device=x.device)
        positions = torch.arange(length, device=x.device)

        out = x
        if self.pos_emb is not None:
            # 1 x L x dim, broadcast over the batch
            out = x + self.pos_emb(positions).unsqueeze(0)
        for att in self.attention:
            out = att(out, lens)

        # Mean pooling over the real tokens only: the outputs computed at
        # padded positions must not leak into the sum that is divided by lens.
        denom = lens.view(-1, 1)
        valid = positions.unsqueeze(0) < denom                      # B x L
        pooled = out.masked_fill(~valid.unsqueeze(-1), 0).sum(1)
        # clamp guards against a division by zero for an empty sequence
        return self.final(pooled / denom.clamp(min=1))
        



In [None]:
class Learner:
    """Base class for supervised learning.

    Args:
        model: Module called as ``model(x, lens)`` and returning class logits.
        model_id: Identifier used as the name of the TensorBoard run.
    """

    def __init__(self, model, model_id: str):
        super().__init__()
        self.model = model
        self.optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        self.model_id = model_id
        self.iteration = 0

    def evaluate(self, loader):
        """Compute the mean loss and the accuracy of the model on ``loader``.

        Args:
            loader: Iterable of ``(x, y, lens)`` batches.

        Returns:
            ``(mean cross-entropy, accuracy)`` as floats, or ``(nan, nan)``
            when the loader yields no example. The train/eval mode of the
            model is restored on exit.
        """
        loss_nagg = nn.CrossEntropyLoss(reduction='sum')
        was_training = self.model.training
        self.model.eval()
        cumloss, cumcorrect, count = 0, 0, 0
        try:
            with torch.no_grad():
                for x, y, lens in loader:
                    yhat = self.model(x, lens)
                    cumloss += loss_nagg(yhat, y)
                    cumcorrect += (yhat.argmax(1) == y).sum()
                    count += x.shape[0]
        finally:
            self.model.train(was_training)
        if count == 0:
            return float("nan"), float("nan")
        return float(cumloss) / count, float(cumcorrect) / count

    def run(self, train_loader, test_loader, epochs, test_iterations):
        """Run a model during `epochs` epochs.

        The training loss is logged at every step; every ``test_iterations``
        steps the model is evaluated on ``test_loader`` and the test loss and
        accuracy are logged.
        """
        # NOTE(review): events are written under /tmp/runs/<model_id>, which is
        # not inside the "logs" sub-directory created at the top of the
        # notebook -- confirm which directory tensorboard should point to.
        writer = SummaryWriter(f"/tmp/runs/{self.model_id}")
        loss = nn.CrossEntropyLoss()

        self.model.train()
        try:
            for epoch in tqdm(range(epochs)):
                # Iterate over batches
                for x, y, lens in train_loader:
                    self.optim.zero_grad()
                    yhat = self.model(x, lens)
                    l = loss(yhat, y)
                    l.backward()
                    self.optim.step()
                    writer.add_scalar('loss/train', l.item(), self.iteration)
                    self.iteration += 1

                    if self.iteration % test_iterations == 0:
                        test_loss, test_acc = self.evaluate(test_loader)
                        # Nothing to report when the test loader is empty
                        if not math.isnan(test_loss):
                            writer.add_scalar('loss/test', test_loss, self.iteration)
                            writer.add_scalar('correct/test', test_acc, self.iteration)
        finally:
            # Flush pending events and release the file, even on interruption
            writer.close()


In [None]:
# def collate(batch):
#     """ Collate function for DataLoader """
#     data = [torch.LongTensor(item[0][:MAX_LENGTH]) for item in batch]
#     lens = [len(d) for d in data]
#     labels = [item[1] for item in batch]
#     return emb_layer(torch.nn.utils.rnn.pad_sequence(data, batch_first=True,padding_value = PAD)).to(device)\
#         , torch.LongTensor(labels).to(device), torch.Tensor(lens).to(device)

# Outputs of the data/embedding loading cells above.
# `word2id` only exists inside format_dataset, so it is rebuilt here from the
# id2word mapping that the function returns.
word2id = {word: ix for ix, word in id2word.items()}
PAD = word2id["__OOV__"] # global variable read by collate
embeddings = torch.Tensor(embeddings)
emb_layer = nn.Embedding.from_pretrained(embeddings)
MAX_LENGTH = 500 # every token after this position is dropped

def collate(batch):
    """Collate function for DataLoader.

    Truncates every document to MAX_LENGTH token ids, pads the batch with PAD,
    looks the ids up in emb_layer and moves everything to `device`.

    Returns:
        (embedded batch, labels as LongTensor, lengths as float Tensor)
    """
    sequences = [torch.LongTensor(item[0][:MAX_LENGTH]) for item in batch]
    targets = [item[1] for item in batch]
    lengths = [len(seq) for seq in sequences]

    padded = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=PAD
    )
    x = emb_layer(padded).to(device)
    y = torch.LongTensor(targets).to(device)
    return x, y, torch.Tensor(lengths).to(device)




In [None]:
# Experiment settings
batch_size, emb_size = 16, 50
modeltype = 2
epochs, test_iterations = 50, 1000

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate)
## [[STUDENT]]
# modeltype -> (attention layer class, number of layers, position embeddings)
model_specs = {
    0: (AttentionBasicLayer, 1, False),
    1: (AttentionResidualLayer, 3, False),
    2: (AttentionResidualLayer, 3, True),
}
if modeltype not in model_specs:
    print("No model of this type")
    exit(1)
attention_cls, depth, use_pos_emb = model_specs[modeltype]
model = SelfAttentionModel(emb_size, attention_cls, 2, depth, use_pos_emb).to(device)

learner = Learner(model, time.asctime())
learner.run(train_loader, test_loader, epochs, test_iterations)


In [None]:
import os

def save_model(model, fichier):  # the optimizer is not saved here
    """Save the parameters of ``model`` in ``fichier``.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model_state'``.
        fichier: Destination path (or any object accepted by ``torch.save``).
            Missing parent directories of a path are created.
    """
    if isinstance(fichier, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(fichier))
        if parent:
            # torch.save does not create directories by itself
            os.makedirs(parent, exist_ok=True)
    state = {'model_state': model.state_dict()}
    torch.save(state, fichier)  # no need to go through pickle explicitly
 
def load_model(fichier, model):
    """Load the parameters stored in ``fichier`` into ``model`` if the file exists.

    Args:
        fichier: Path of a checkpoint written by ``save_model``.
        model: Module with the same structure as the saved one.

    Returns:
        ``True`` when the parameters were loaded, ``False`` when ``fichier``
        does not exist (the model is then left untouched).
    """
    if not os.path.isfile(fichier):
        return False
    # NOTE: torch.load relies on pickle; only load checkpoints you trust.
    # map_location="cpu" lets a checkpoint saved on a GPU/MPS device be read
    # on a machine without it; load_state_dict then copies the values into
    # the parameters of `model`, whatever device they live on.
    state = torch.load(fichier, map_location="cpu")
    model.load_state_dict(state['model_state'])
    return True

In [None]:
# Save the network (spares about 30 minutes of training)
# WARNING: reloading only works when the networks are structurally identical,
# so the reference solution of the layers is needed.
# The class in use is in model.py, in the model directory.

path = "./model/"

model.name ="transfo-res-posemb-l3" # normalised name used as file name
fichier = path+f"{model.name}"
save_model(model,fichier)


# The symmetric calls can be used to load a model:

# transfo1 = SelfAttentionModel(emb_size,AttentionBasicLayer,2,1).to(device)
# transfo1.name ="transfo-base-l1"

# load_model(path+"transfo-base-l1", transfo1)


In [None]:
# évaluation des performances en test
#  TODO 

## Méthode optimisée pour générer des embeddings de position

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position embeddings."""

    def __init__(self, d_model: int, max_len: int = 5000):
        """Precompute the table of position embeddings.

        Even columns hold ``sin(position * w)`` and odd columns
        ``cos(position * w)``, with frequencies ``w`` decreasing
        geometrically from 1 towards 1/10000. The value at a given position
        does not depend on ``max_len``.

        Args:
            d_model (int): Dimension of the embeddings (even or odd).
            max_len (int, optional): Number of positions in the table, i.e.
                the longest sequence that ``forward`` accepts.
        """
        super().__init__()

        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one odd column less than even columns,
        # so the last frequency is dropped for the cosines.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Buffer: follows the module across devices and is saved in the
        # state_dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))  # 1 x max_len x d_model

    def forward(self, x):
        """Add the position embeddings to ``x`` (batch x length x d_model)."""
        return x + self.pe[:, :x.size(1)]



In [None]:
# Build the position-embedding table and display its shape
pe = PositionalEncoding(EMB_SIZE, max_len=MAX_LENGTH)
print(pe.pe.shape)

In [None]:
import seaborn as sns

length = 200
with torch.no_grad():
    # Position embeddings alone: they are added to an all-zero input
    pes = pe(torch.zeros(1, length, emb_size)).squeeze() # batch x length x emb, batch axis squeezed out

    # Inner product between the embeddings of every pair of positions
    inners = pes @ pes.t()
    f, ax = plt.subplots(figsize=(9, 6))
    # Heatmap of the inner products (cell values are not annotated)
    sns.heatmap(inners, annot=False, fmt="d", ax=ax, cmap="coolwarm")
    f.show()
    input("Press Enter to continue...")

In [None]:
###  TODO )"," TODO ",\
    txt, flags=re.DOTALL))
f2.close()

### </CORRECTION> ###