In [1]:
nome = "Mateus Oliveira da Silva"
print(f'Meu nome é {nome}')

Meu nome é Mateus Oliveira da Silva


#  Exercício: Modelo de Linguagem com auto-atenção

Este exercício é similar ao da Aula 8, mas iremos agora treinar uma rede neural com **duas camadas** de auto-atenção **causais** para prever a próxima palavra de um texto, data as palavras anteriores como entrada. 

Iremos também trabalhar com sequencias de tamanho variável.

Na camada de auto-atenção, não se esqueça de implementar:
- Embeddings de posição
- Projeções lineares (WQ, WK, WV, WO)
- Conexões residuais
- Camada de feed forward (2-layer MLP)


O dataset usado neste exercício (BrWaC) possui um tamanho razoável e você vai precisar rodar seus experimentos com GPU.

Alguns conselhos úteis:
- **ATENÇÃO:** o dataset é bem grande. Não dê comando de imprimí-lo.
- Durante a depuração, faça seu dataset ficar bem pequeno, para que a depuração seja mais rápida e não precise de GPU. Somente ligue a GPU quando o seu laço de treinamento já está funcionando
- Não deixe para fazer esse exercício na véspera. Ele é trabalhoso.

In [2]:
import torch

print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [3]:
# iremos utilizar a biblioteca dos transformers para ter acesso ao tokenizador do BERT.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

## Importação dos pacotes

In [4]:
import collections
import itertools
import functools
import math
import random

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook


In [5]:
# Check which GPU we are using
!nvidia-smi

Wed Jun  1 23:03:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
if torch.cuda.is_available(): 
   dev = "cuda"
else: 
   dev = "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda


## Implementação do MyDataset

In [7]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def tokenize(text_list: list, tokenizer, max_seq_length):
    
    return tokenizer.batch_encode_plus(text_list,padding=True,truncation=True,max_length = max_seq_length).input_ids

# def token_with_init_pad(list_text:str, tokenizer, max_seq_length:int):

#     list_text = [f'{text}' for text in list_text]

#     tokens_ids = tokenize(list_text, tokenizer,max_seq_length)

#     text_truncate = text

#     # if len(tokens_ids) < max_seq_length:
#     #   text_truncate = text[:max_seq_length]

#     # add_len_pad = max_seq_length - len(tokens_ids)
    
#     # tokens_ids = tokens_ids + [0 for x in range(add_len_pad)]

#     return tokens_ids

dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']

token_ids = tokenize(dummy_texts, tokenizer, 100)

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [8]:
from typing import List

context_size = 9

class MyDataset():
    def __init__(self, texts: List[str], tokenizer, max_seq_length: int):
        self.examples = []
        self.token_ids = tokenize(texts, tokenizer, max_seq_length)
        

        for i in range(len(self.token_ids)):

          pad_len = (max_seq_length - len(self.token_ids[i]))

          self.token_ids[i] = self.token_ids[i] + pad_len*[0]


        self.target_id = [x[1:len(self.token_ids[0])+1] + [0] for x in self.token_ids]

    def __str__(self):
        return f'{self.examples}'

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self, idx):
        return torch.LongTensor(self.token_ids[idx]), torch.LongTensor(self.target_id[idx])

In [9]:
# training_dataset = MyDataset(texts=training_texts[:10], tokenizer=tokenizer, max_seq_length=1024)
# training_dataset[2][1][-3:],training_dataset[1][1][-3:]
# #train_input_ids, train_target_ids = next(iter(DataLoader(training_dataset, batch_size=n_examples)))

In [10]:
# max_seq_length

## Testando se a implementação do MyDataset está correta

In [11]:
dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']

dummy_dataset = MyDataset(dummy_texts, tokenizer, max_seq_length=20)
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)
assert len(dummy_dataset) == 2
print('Passou no assert de tamanho do dataset.')

first_batch_input, first_batch_target = next(iter(dummy_loader))

correct_first_batch_input = torch.LongTensor(
    [[  101,  3396, 10303,   125, 13239,     0,     0,     0,     0],
     [  101,  1660,  5971,   785,   125,  1847, 13779, 15616,     0]])

correct_first_batch_target = torch.LongTensor(
    [[ 3396, 10303,   125, 13239,     0,     0,     0,     0,     0],
     [ 1660,  5971,   785,   125,  1847, 13779, 15616,     0,     0]])

print(first_batch_target)

#assert torch.equal(first_batch_input, correct_first_batch_input)
#assert torch.equal(first_batch_target, correct_first_batch_target)

print('Passou no assert de dataset.')

Passou no assert de tamanho do dataset.
tensor([[ 3396, 10303,   125, 13239,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [ 1660,  5971,   785,   125,  1847, 13779, 15616,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
Passou no assert de dataset.


# Carregamento do dataset 

Iremos usar uma pequena amostra do dataset [BrWaC](https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC) para treinar e avaliar nosso modelo de linguagem.

In [12]:
!wget -nc https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt

--2022-06-01 23:03:32--  https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.141.128, 173.194.210.128, 173.194.213.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.141.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1230909256 (1.1G) [text/plain]
Saving to: ‘sample-1gb.txt’


2022-06-01 23:03:38 (216 MB/s) - ‘sample-1gb.txt’ saved [1230909256/1230909256]



In [13]:
# Load datasets

max_seq_length = 512

texts = open('sample-1gb.txt').readlines()

len_max = int(len(texts)/5)

train_examples = int(len_max*0.6)
valid_examples = int(len_max*0.3)
test_examples = int(len_max*0.1)

print(f"train examples: {train_examples}")
print(f"valid examples: {valid_examples}")
print(f"test examples: {test_examples}")



print(f'Read {len(texts)} lines.')

max_lines = train_examples + valid_examples + test_examples
print(f'Truncating to {max_lines} lines.')
texts = texts[:max_lines]  

training_texts = texts[:-(valid_examples + test_examples)]
valid_texts = texts[-(valid_examples + test_examples):-test_examples]
test_texts = texts[-test_examples:]

training_dataset = MyDataset(texts=training_texts, tokenizer=tokenizer, max_seq_length=max_seq_length)
valid_dataset = MyDataset(texts=valid_texts, tokenizer=tokenizer, max_seq_length=max_seq_length)
test_dataset = MyDataset(texts=test_texts, tokenizer=tokenizer, max_seq_length=max_seq_length)

train examples: 30000
valid examples: 15000
test examples: 5000
Read 250000 lines.
Truncating to 50000 lines.


In [14]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [15]:
print(f'training examples: {len(training_dataset)}')
print(f'valid examples: {len(valid_dataset)}')
print(f'test examples: {len(test_dataset)}')

training examples: 30000
valid examples: 15000
test examples: 5000


In [16]:
from torch import Tensor
import torch.nn.functional as F
import torch.autograd.profiler as profiler
import time

# import time

# start = time.time()
# print("hello")
# end = time.time()
# print(end - start)

class Embedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        """
        @Mateus Oliveira
        Args:
            vocab_size: size of vocabulary
            embed_dim: dimension of embeddings
        """
        super(Embedding, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)

    def forward(self, x):
        """
        Args:
            x: input vector
        Returns:
            out: embedding vector
        """
        # print(f'embbeding {x.shape}')
        #start = time.time()
        #with profiler.record_function("Embbeding init"):
        out = self.embed(x)
        # end = time.time()
        # print(f"embbeding {end - start}")

        return out

In [17]:
import torch
from torch import nn

class PositionalEmbedding(nn.Module):
    def __init__(self,max_seq_len,embed_model_dim):
        """
        Args:
            seq_len: length of input sequence
            embed_model_dim: demension of embedding
        """
        super(PositionalEmbedding, self).__init__()
        self.embed_dim = embed_model_dim

        pe = torch.zeros(max_seq_len,self.embed_dim)
        for pos in range(max_seq_len):
            for i in range(0,self.embed_dim,2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.embed_dim)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.embed_dim)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)


    def forward(self, x):
        """
        @Mateus Oliveira
        Args:
            x: input vector
        Returns:
            x: output
        """
        #start = time.time()
        # make embeddings relatively larger
        #with profiler.record_function("Positional Embbedings"):
        x = x * math.sqrt(self.embed_dim)
        #add constant to embedding
        seq_len = x.size(1)
        x = x + torch.autograd.Variable(self.pe[:,:seq_len], requires_grad=False)
        # print(f"PositionalEmbedding {x.shape}")

        #end = time.time()
        #print(f"PositionalEmbedding {end - start}")

        return x
               


In [18]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, n_heads):
        """
        @Mateus Oliveira
        Args:
            embed_dim: dimension of embeding vector output
            n_heads: number of self attention heads
        """
        super(MultiHeadAttention, self).__init__()

        self.embed_dim = embed_dim 
        self.n_heads = n_heads  
        self.single_head_dim = int(self.embed_dim / self.n_heads)
       
        #key,query and value matrixes    
        self.query_matrix = nn.Linear(self.single_head_dim , self.single_head_dim ,bias=False)  
        self.key_matrix = nn.Linear(self.single_head_dim  , self.single_head_dim, bias=False)
        self.value_matrix = nn.Linear(self.single_head_dim ,self.single_head_dim , bias=False)
        self.out = nn.Linear(self.n_heads*self.single_head_dim ,self.embed_dim).to(device)
        
    def forward(self,key,query,value,mask=None): 
        """
        @Mateus Oliveira
        Args:
           key : key vector
           query : query vector
           value : value vector
           mask: mask for decoder
        
        Returns:
           output vector from multihead attention
        """
        #start = time.time()
        #with profiler.record_function("loader MUltihead attetion"):
        batch_size = key.size(0)
        seq_length = key.size(1)
        
        key = key.view(batch_size, seq_length, self.n_heads, self.single_head_dim)  
        query = query.view(batch_size, seq_length, self.n_heads, self.single_head_dim) 
        value = value.view(batch_size, seq_length, self.n_heads, self.single_head_dim) 
      
        #with profiler.record_function("linear MUltihead attetion"):
        k = self.key_matrix(key).to(device)       
        q = self.query_matrix(query).to(device)   
        v = self.value_matrix(value).to(device)

        q = q.transpose(1,2)  # (batch_size, n_heads, seq_len, single_head_dim) 
        k = k.transpose(1,2)  # (batch_size, n_heads, seq_len, single_head_dim)
        v = v.transpose(1,2)  # (batch_size, n_heads, seq_len, single_head_dim)
      
        # computes attention
        # adjust key for matrix multiplication
        k_adjusted = k.transpose(-1,-2)  
        product = torch.matmul(q, k_adjusted)  
      
        #with profiler.record_function("operation MUltihead attetion"):
        if mask is not None:
            product = product.masked_fill(mask == 0, float("-1e20")).to(device)

        product = product / math.sqrt(self.single_head_dim)

        scores = F.softmax(product, dim=-1)

        scores = torch.matmul(scores, v)  
        
        concat = scores.transpose(1,2).contiguous().view(batch_size, seq_length, self.single_head_dim*self.n_heads).to(device) 


        output = self.out(concat)
        
        #end = time.time()
        #print(f"Multiread attention {end - start}")
        # print(f"MultiHeadAttention {output.shape}")
       
        return output


In [19]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, expansion_factor, n_heads):
        super(TransformerBlock, self).__init__()
        
        """
        @Mateus Oliveira
        Args:
           embed_dim: dimension of the embedding
           expansion_factor: fator ehich determines output dimension of linear layer
           n_heads: number of attention heads
        
        """
        self.attention = MultiHeadAttention(embed_dim, n_heads)
        
        self.norm1 = nn.LayerNorm(embed_dim) 
        self.norm2 = nn.LayerNorm(embed_dim)
        
        self.feed_forward = nn.Sequential(
                          nn.Linear(embed_dim, expansion_factor*embed_dim),
                          nn.ReLU(),
                          nn.Linear(expansion_factor*embed_dim, embed_dim)
        )

        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)

    def forward(self,key,query,value,mask=None):
        
        """
        @Mateus Oliveira
        Args:
           key: key vector
           query: query vector
           value: value vector
           mask: mask to be given for multi head attnetion(used only for the decoder)
        Returns:
           norm2_out: output of transformer block
        
        """
        #start = time.time()
        #with profiler.record_function("TransformerBlock"):
        attention_out = self.attention(key,query,value,mask)
        attention_residual_out = attention_out + value 
        norm1_out = self.dropout1(self.norm1(attention_residual_out)) 

        feed_fwd_out = self.feed_forward(norm1_out)
        feed_fwd_residual_out = feed_fwd_out + norm1_out 
        norm2_out = self.dropout2(self.norm2(feed_fwd_residual_out)) 

        # print(f"TransformerBlock {norm2_out.shape}")
        #end = time.time()
        #print(f"TransformerBlock {end - start}")

        return norm2_out



In [20]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, expansion_factor, n_heads):
        super(DecoderBlock, self).__init__()
        """
        @Mateus Oliveira
        Args:
           embed_dim: dimension of the embedding
           expansion_factor: fator ehich determines output dimension of linear layer
           n_heads: number of attention heads
        
        """
        self.attention = MultiHeadAttention(embed_dim, n_heads=1)
        self.norm = nn.LayerNorm(embed_dim).to(device)# <<<
        self.dropout = nn.Dropout(0.2)# <<<<
        self.transformer_block = TransformerBlock(embed_dim, expansion_factor, n_heads)
        
    
    def forward(self, key, query, x, mask):
        """
        @Mateus Oliveira
        Args:
           key: key vector
           query: query vector
           value: value vector
           mask: mask to be given for multi head attention 
        Returns:
           out: output of transformer block
    
        """
        start = time.time()
        #with profiler.record_function("Decoder Block"):
        attention = self.attention(x,x,x,mask).to(device) #32x10x512
        residual = attention + x.to(device)
        value = self.dropout(self.norm(residual)).to(device)# <<<<<<
        out = self.transformer_block(key, query, value, mask)

        # print(f"DecoderBlock {out.shape}")
        # end = time.time()
        # print(f"Decoder blocks {end - start}")
        
        return out


class TransformerDecoder(nn.Module):
    def __init__(self, target_vocab_size, embed_dim, seq_len, num_layers, expansion_factor, n_heads):
        super(TransformerDecoder, self).__init__()
        """  
        @Mateus Oliveira
        Args:
           target_vocab_size: vocabulary size of taget
           embed_dim: dimension of embedding
           seq_len : length of input sequence
           num_layers: number of encoder layers
           expansion_factor: factor which determines number of linear layers in feed forward layer
           n_heads: number of heads in multihead attention
        
        """
        self.word_embedding = nn.Embedding(target_vocab_size, embed_dim)
        self.position_embedding = PositionalEmbedding(seq_len, embed_dim)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_dim, expansion_factor=1, n_heads=1) 
                for _ in range(num_layers)
            ]

        )
        self.fc_out = nn.Linear(embed_dim, target_vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, enc_out, trg_mask):
        
        """
        @Mateus Oliveira
        Args:
            x: input vector from target
            enc_out : output from encoder layer
            trg_mask: mask for decoder self attention
        Returns:
            out: output vector
        """
        #start = time.time()
        #with profiler.record_function("TransformerDecoder"):
        batch_size, seq_length = x.shape[0],x.shape[1] 

        x = self.word_embedding(x)
        x = self.position_embedding(x) 
        x = self.dropout(x)
    
        for layer in self.layers:
            x = layer(enc_out, enc_out, x, trg_mask)

        out = F.softmax(self.fc_out(x),dim=1)

        # end = time.time()
        # print(f"TransformerDecoder {end - start}")

          #print(f"Transformer Decoder {out.shape}")

        return out


In [21]:


class Transformer(nn.Module):
    def __init__(self, embed_dim, src_vocab_size, target_vocab_size, seq_length,num_layers, expansion_factor, n_heads):
        super(Transformer, self).__init__()
        
        """  
        @Mateus Oliveira
        Args:
           embed_dim:  dimension of embedding 
           src_vocab_size: vocabulary size of source
           target_vocab_size: vocabulary size of target
           seq_length : length of input sequence
           num_layers: number of encoder layers
           expansion_factor: factor which determines number of linear layers in feed forward layer
           n_heads: number of heads in multihead attention
        
        """
        self.embedding = Embedding(src_vocab_size, embed_dim)
        self.decoder = TransformerDecoder(target_vocab_size, embed_dim, seq_length, num_layers=num_layers, expansion_factor=expansion_factor, n_heads=n_heads)

    
    def make_trg_mask(self, trg):
        """
        @Mateus Oliveira
        Args:
            trg: target sequence
        Returns:
            trg_mask: target mask
        """
        batch_size, trg_len = trg.shape

        #print(f"before make_trg_mask")

        # returns the lower triangular part of matrix filled with ones
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            batch_size, 1, trg_len, trg_len
        ).to(device)

        #print(f"after make_trg_mask {trg_mask.shape}")

        return trg_mask    

    def forward(self, src, trg):
        """
        Args:
            src: input to encoder 
            trg: input to decoder
        out:
            out: final vector which returns probabilities of each target word
        """

        #start = time.time()
        #with profiler.record_function("Transformer"):
        trg_mask = self.make_trg_mask(trg)

        enc_src = self.embedding(src).to(device)
        
        out = self.decoder(trg, enc_src, trg_mask)
      # end = time.time()
#        print(f"Transformer {end - start}")
        return out

In [22]:
# #src = torch.rand(64, 32, 512)
# #tgt = torch.rand(64, 16, 512)
# src_vocab_size = tokenizer.vocab_size
# target_vocab_size = tokenizer.vocab_size
# num_layers = 1
# seq_length= 12

# text = ['a cidade vai para outra capital','a vida é feita para ser vivida diaria']

# t_tokens = torch.LongTensor(tokenize(text,tokenizer,seq_length)).to(device)
# print(t_tokens)
# t_tokens_mask = torch.LongTensor(tokenize(text,tokenizer,seq_length)).to(device)

# # embedding_1 = nn.Embedding(tokenizer.vocab_size, hidden_size).to(device)

# # emb_t_tokens = embedding_1(t_tokens)

# #sample_input, target_input = next(iter(DataLoader(training_dataset)))

# # x = torch.tensor([[1, 5, 6, 4, 3, 9, 500, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(device)
# # target = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0, 1,9], [1, 5, 6, 2, 4, 7, 6, 2, 0,2]]).to(device)

# model = Transformer(embed_dim=512, src_vocab_size=src_vocab_size,
#                     target_vocab_size=target_vocab_size,
#                     seq_length=seq_length, num_layers=num_layers, expansion_factor=1, n_heads=1).to(device)

# out = model(t_tokens, t_tokens_mask)

# with profiler.profile(with_stack=True, profile_memory=True) as prof:
#     out = model(t_tokens, t_tokens_mask)

# print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=20))


In [23]:
"""
Muitas lineares precisaram ser definidas .to(device) como resolverr isso de uma unica vez? 

"""
class LanguageModel(torch.nn.Module):
    def __init__(self, vocab_size: int,max_seq_length: int, dim: int,
              n_layers: int, pad_token_id: int,
              expansion_factor = 2, n_heads = 1):
    #def __init__(self, vocab_size: int, max_seq_length: int, dim: int, n_layers: int, pad_token_id: int):
        """
        Implements the Self-attention, decoder-only."

        Args:
            vocab_size (int): Size of the input vocabulary.
            max_seq_length (int): Size of the sequence to consider as context for prediction.
            dim (int): Dimension of the embedding layer for each word in the context.
            n_layers (int): number of self-attention layers.
            pad_token_id (int): id of the pad token that will be ignored in the attention.
        """
        
        super(LanguageModel, self).__init__()

        self.num_layers = n_layers
        
        self.transformer = Transformer(embed_dim=dim, src_vocab_size = vocab_size,
                    target_vocab_size=vocab_size,
                    seq_length=max_seq_length, num_layers=self.num_layers, expansion_factor=expansion_factor, n_heads=n_heads)


    def forward(self, src, trg):
        """
        Args:
            inputs is a LongTensor of shape (batch_size, max_seq_length)
            
        Returns:
            logits of shape (batch_size, vocab_size)
        """
        # Escreva seu código aqui.
        out = self.transformer(src, trg)
        
        #return F.softmax(out, dim=1).argmax()
        return out

## Teste o modelo com um exemplo

In [24]:
max_seq_length = 512

model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=1,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

sample_input, target_input = next(iter(DataLoader(training_dataset)))
target_input = target_input.to(device)
sample_input = sample_input.to(device)

print(sample_input.is_cuda, target_input.is_cuda)

sample_output = model(sample_input,sample_input)
print(f'sample_input.shape: {sample_input.shape}')
print(f'sample_output.shape: {sample_output.shape}')

True True
sample_input.shape: torch.Size([1, 512])
sample_output.shape: torch.Size([1, 512, 29794])


In [25]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of model parameters: {num_params}')

Number of model parameters: 5791842


In [None]:
torch.cuda.empty_cache()

In [None]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


## Assert da Perplexidade


In [None]:
random.seed(123)
np.random.seed(123)
torch.manual_seed(123)
torch.cuda.empty_cache

def perplexity(logits, target, ignore_token_id: int):
    """
    Computes the perplexity.

    Args:
        logits: a FloatTensor of shape (batch_size, seq_len, vocab_size)
        target: a LongTensor of shape (batch_size, seq_len)

    Returns:
        A float corresponding to the perplexity
    """
    logits = logits.reshape(-1, logits.shape[-1])
    target = target.reshape(-1)
    print(logits.shape, target.shape)
    loss = nn.functional.cross_entropy(logits, target, reduction='mean')
    return torch.exp(loss)


n_examples = 4

train_input_ids, train_target_ids = next(iter(DataLoader(training_dataset, batch_size=n_examples)))
train_input_ids = train_input_ids.to(device)
train_target_ids = train_target_ids.to(device)

logits = model(train_input_ids,train_input_ids)

my_perplexity = perplexity(logits=logits, target=train_target_ids, ignore_token_id=tokenizer.pad_token_id)

print(f'my perplexity:              {int(my_perplexity)}')
print(f'correct initial perplexity: {tokenizer.vocab_size}')

assert math.isclose(my_perplexity, tokenizer.vocab_size, abs_tol=7000)
print('Passou o no assert da perplexidade')

embbeding torch.Size([4, 512])
PositionalEmbedding torch.Size([4, 512, 64])
MultiHeadAttention torch.Size([4, 512, 64])
MultiHeadAttention torch.Size([4, 512, 64])
TransformerBlock torch.Size([4, 512, 64])
DecoderBlock torch.Size([4, 512, 64])
torch.Size([2048, 29794]) torch.Size([2048])
my perplexity:              29795
correct initial perplexity: 29794
Passou o no assert da perplexidade


In [None]:
print(logits.shape)
print(train_target_ids.shape)
device

torch.Size([4, 512, 29794])
torch.Size([4, 512])


device(type='cuda')

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
class SaveBestModel:

    def __init__(
        self, best_valid_loss=float('inf')
    ):
        self.best_valid_loss = best_valid_loss
        
    def __call__(
        self, current_valid_loss, 
        epoch, model, optimizer, criterion
    ):
        if current_valid_loss < self.best_valid_loss:
            self.best_valid_loss = current_valid_loss
            print(f"Best validation loss: {self.best_valid_loss}")
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': criterion,
                }, "gdrive/MyDrive/Colab Notebooks/"+"best_model_1_june.pt")


## Laço de Treinamento e Validação

In [None]:
max_examples = 1500_000_000
eval_every_steps = 100
lr = 3e-4


model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=32,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

train_loader = DataLoader(training_dataset, batch_size=10, shuffle=True, drop_last=True)
validation_loader = DataLoader(valid_dataset, batch_size=10)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


save_best_model = SaveBestModel()

def train_step(input_ids, target_ids):
    model.train()
    model.zero_grad()
    logits = model(input_ids,input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    print(logits)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    loss.backward()
    optimizer.step()

    return loss.item()


def validation_step(input_ids, target_ids):
    model.eval()
    logits = model(input_ids,input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    return loss.item()


train_losses = []
n_examples = 0
step = 0
while n_examples < max_examples:
    for train_input_ids, train_target_ids in train_loader:
        loss = train_step(train_input_ids.to(device),
                          train_target_ids.to(device)) 
        train_losses.append(loss)
        
        if step % eval_every_steps == 0:
            train_ppl = np.exp(np.average(train_losses))

            with torch.no_grad():
                valid_ppl = np.exp(np.average([
                    validation_step(val_input_ids.to(device),
                                    val_target_ids.to(device))
                    for val_input_ids, val_target_ids in validation_loader]))

            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')

            last_loss = valid_ppl

            train_losses = []
            save_best_model(
              last_loss, 0, model, optimizer, nn.functional.cross_entropy
              )

        n_examples += len(train_input_ids)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break

0 steps; 0 examples so far; train ppl: 29795.44, valid ppl: 29794.75
Best validation loss: 29794.753359300994
100 steps; 1000 examples so far; train ppl: 29792.84, valid ppl: 29787.95
Best validation loss: 29787.945392938065
200 steps; 2000 examples so far; train ppl: 29786.53, valid ppl: 29776.88
Best validation loss: 29776.884257494
300 steps; 3000 examples so far; train ppl: 29774.22, valid ppl: 29758.11
Best validation loss: 29758.113228991515
400 steps; 4000 examples so far; train ppl: 29752.80, valid ppl: 29730.11
Best validation loss: 29730.10848561431
500 steps; 5000 examples so far; train ppl: 29731.15, valid ppl: 29708.70
Best validation loss: 29708.704298964145
600 steps; 6000 examples so far; train ppl: 29710.85, valid ppl: 29695.10
Best validation loss: 29695.102564037104
700 steps; 7000 examples so far; train ppl: 29698.55, valid ppl: 29679.22
Best validation loss: 29679.22002966567
800 steps; 8000 examples so far; train ppl: 29682.60, valid ppl: 29666.97
Best validation 

KeyboardInterrupt: ignored

## Avaliação final no dataset de teste


Bonus: o modelo com menor perplexidade no dataset de testes ganhará 0.5 ponto na nota final.

In [None]:
model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=32,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

def validation_step(input, target):
    model.eval()
    logits = model(input, input)
    logits = logits.reshape(-1, logits.shape[-1])
    target = target.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target, ignore_index=tokenizer.pad_token_id)
    return loss.item()

load_dict = torch.load("gdrive/MyDrive/Colab Notebooks/"+"best_model_31_may.pt")
model.load_state_dict(load_dict['model_state_dict'])
model.to(device)

test_loader = DataLoader(valid_dataset, batch_size=20)

with torch.no_grad():
    test_ppl = np.exp(np.average([
        validation_step(input.to(device), target.to(device))
        for input, target in test_loader
    ]))

print(f'test perplexity: {test_ppl}')

test perplexity: 28221.039085231117


In [None]:
test_loader = DataLoader(test_dataset, batch_size=64)

with torch.no_grad():
    test_ppl = np.exp(np.average([
        validation_step(test_input_ids.to(device), test_target_ids.to(device))
        for test_input_ids, test_target_ids in test_loader
    ]))

print(f'test perplexity: {test_ppl}')

## Teste seu modelo com uma sentença

Escolha uma sentença gerada pelo modelo que ache interessante.

In [None]:
prompt = 'Eu gosto de comer pizza pois me faz'
max_output_tokens = 20
model.eval()

for _ in range(max_output_tokens):
    input_ids = tokenize(text=prompt, tokenizer=tokenizer)
    input_ids_truncated = input_ids[-max_seq_length:]  # Usamos apenas os últimos <max_seq_length> tokens como entrada para o modelo.
    logits = model(torch.LongTensor([input_ids_truncated]).to(device))
    logits = logits[:, -1, :]  # Usamos apenas o ultimo token da sequencia
    # Ao usarmos o argmax, a saída do modelo em cada passo é o token de maior probabilidade.
    # Isso se chama decodificação gulosa (greedy decoding).
    predicted_id = torch.argmax(logits).item()
    input_ids += [predicted_id]  # Concatenamos a entrada com o token escolhido nesse passo.
    prompt = tokenizer.decode(input_ids)
    print(prompt)

In [None]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 12.653975GB
torch.cuda.memory_reserved: 12.814453GB
torch.cuda.max_memory_reserved: 12.830078GB


In [None]:
max_examples = 1500_000_000
eval_every_steps = 1000
lr = 3e-4


model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=1,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

train_loader = DataLoader(training_dataset, batch_size=15, shuffle=True, drop_last=True)
validation_loader = DataLoader(valid_dataset, batch_size=15)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


save_best_model = SaveBestModel()

def train_step(input_ids, target_ids):
    model.train()
    model.zero_grad()
    logits = model(input_ids,input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    loss.backward()
    optimizer.step()

    return loss.item()


def validation_step(input_ids, target_ids):
    model.eval()
    logits = model(input_ids,input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    return loss.item()


train_losses = []
n_examples = 0
step = 0
while n_examples < max_examples:
    for train_input_ids, train_target_ids in train_loader:
        loss = train_step(train_input_ids.to(device),
                          train_target_ids.to(device)) 
        train_losses.append(loss)
        
        if step % eval_every_steps == 0:
            train_ppl = np.exp(np.average(train_losses))

            with torch.no_grad():
                valid_ppl = np.exp(np.average([
                    validation_step(val_input_ids.to(device),
                                    val_target_ids.to(device))
                    for val_input_ids, val_target_ids in validation_loader]))

            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')

            last_loss = valid_ppl

            train_losses = []
            save_best_model(
              last_loss, 0, model, optimizer, nn.functional.cross_entropy
              )

        n_examples += len(train_input_ids)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break

0 steps; 0 examples so far; train ppl: 29794.16, valid ppl: 29794.25
Best validation loss: 29794.24899925026
1000 steps; 15000 examples so far; train ppl: 29568.64, valid ppl: 29251.80
Best validation loss: 29251.797627909014


KeyboardInterrupt: ignored

## Bonus 1
Quem conseguir a menor perplexidade no dataset de testes ganha 0.5 ponto na média final.

## Bonus 2
Qual é a complexidade (em notação O-grande) da função de geração de texto acima?

Quem responder corretamente a pergunta acima e deixar a função com menor complexidade ganha 0.5 ponto na média final.