In [1]:
nome = "Mateus Oliveira da Silva"
print(f'Meu nome é {nome}')

Meu nome é Mateus Oliveira da Silva


#  Exercício: Modelo de Linguagem com auto-atenção

Este exercício é similar ao da Aula 8, mas iremos agora treinar uma rede neural com **duas camadas** de auto-atenção **causais** para prever a próxima palavra de um texto, data as palavras anteriores como entrada. 

Iremos também trabalhar com sequencias de tamanho variável.

Na camada de auto-atenção, não se esqueça de implementar:
- Embeddings de posição
- Projeções lineares (WQ, WK, WV, WO)
- Conexões residuais
- Camada de feed forward (2-layer MLP)


O dataset usado neste exercício (BrWaC) possui um tamanho razoável e você vai precisar rodar seus experimentos com GPU.

Alguns conselhos úteis:
- **ATENÇÃO:** o dataset é bem grande. Não dê comando de imprimí-lo.
- Durante a depuração, faça seu dataset ficar bem pequeno, para que a depuração seja mais rápida e não precise de GPU. Somente ligue a GPU quando o seu laço de treinamento já está funcionando
- Não deixe para fazer esse exercício na véspera. Ele é trabalhoso.

In [2]:
import torch

print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [3]:
# iremos utilizar a biblioteca dos transformers para ter acesso ao tokenizador do BERT.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 16.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unin

## Importação dos pacotes

In [4]:
import collections
import itertools
import functools
import math
import random

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook


In [5]:
# Check which GPU we are using
!nvidia-smi

Wed Jun  8 16:05:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
if torch.cuda.is_available(): 
   dev = "cuda"
else: 
   dev = "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda


## Implementação do MyDataset

In [7]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def tokenize(text_list: list, tokenizer):
    
    return tokenizer.batch_encode_plus(text_list,padding=True).input_ids
    #return tokenizer(text_list)

# def token_with_init_pad(list_text:str, tokenizer, max_seq_length:int):

#     list_text = [f'{text}' for text in list_text]

#     tokens_ids = tokenize(list_text, tokenizer,max_seq_length)

#     text_truncate = text

#     # if len(tokens_ids) < max_seq_length:
#     #   text_truncate = text[:max_seq_length]

#     # add_len_pad = max_seq_length - len(tokens_ids)
    
#     # tokens_ids = tokens_ids + [0 for x in range(add_len_pad)]

#     return tokens_ids

dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']

token_ids = tokenize(dummy_texts, tokenizer)

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [8]:
# shift = 4

# print(token_ids[0])
# print('-------')
# row_matrix = len(token_ids[0]) - shift + 1
# for i in range(row_matrix):
#     print(token_ids[0][i:i + shift])

# #Create case from vector highter
# #Os dados precisam estar um do lado do outro necessariamente

In [9]:
# seq_length = 11
# generate_zeros = (seq_length - len(token_ids[0]))
# token_ids[0] + generate_zeros*[0]

In [10]:
# tokenizer.batch_encode_plus()
# max_seq_length = 512
# dim = 64

# embbeding = nn.Embedding(max_seq_length, dim)
# embbeding(token_ids[0])

In [11]:
# from typing import List



# class MyDataset():
#     def __init__(self, texts: List[str], tokenizer, seq_length: int):
#         self.examples = []
#         self.token_ids = tokenize(texts, tokenizer)
#         self.tokens = []

#         shift = seq_length

#         for i in range(len(self.token_ids)):
            
#             generate_zeros = (seq_length - len(self.token_ids[i]))

#             self.token_ids[i] = self.token_ids[i] + generate_zeros*[0]

#             row_matrix = len(self.token_ids[i]) - shift + 1

#             for j in range(row_matrix):

#                 index_target = (j+1)
                
#                 #verify if vector only zeros.
#                 if np.array(self.token_ids[i][j:j + shift]).sum() == 0:
#                     continue
                
#                 #print('input: ',self.token_ids[i][j:j + shift])

#                 if len(self.token_ids[i][index_target:index_target + shift]) < shift:

#                     rest = shift - len(self.token_ids[i][index_target:index_target + shift])

#                     input = self.token_ids[i][j:j + shift]

#                     target = self.token_ids[i][index_target:index_target + shift] + rest*[0]

#                     self.tokens.append((input, target))

#                 #    print('R target: ',self.token_ids[i][index_target:index_target + shift] + rest*[0])

#                 else:

#                     input = self.token_ids[i][j:j + shift]
                    
#                     target = self.token_ids[i][index_target:index_target + shift]

#                     self.tokens.append((input, target))
                
#                 if j == 2: break #para nao quebrar o treino
#                 #    print('target: ',self.token_ids[i][index_target:index_target + shift])
                
#                 #print('')

#     def __str__(self):
#         return f'{self.examples}'

#     def __len__(self):
#         return len(self.tokens)

#     def __getitem__(self, idx):
#         return torch.LongTensor(self.tokens[idx][0]), torch.LongTensor(self.tokens[idx][1])
        
# dummy_dataset = MyDataset(dummy_texts, tokenizer, seq_length=3)

# # for i in range(len(dummy_dataset)):
# #     print(dummy_dataset[i])

In [12]:
from tqdm.auto import tqdm
from typing import List
import gc



def tokenize(text: str, tokenizer):
    """
    Tentei usar de todas as formas o batch_encode_plus, mas ele quebrava o meu code.
    - A forma Como eu estava usando, estar descrito na celula anterior.
    """
    # Recomenda-se usar o tokenizer.batch_encode_plus pois é mais rápido.
    return tokenizer(text, return_tensors=None, add_special_tokens=False).input_ids


class MyDataset:
    def __init__(self, texts: List[str], tokenizer, max_seq_length: int):
        self.tokens_ids = []

        for text in tqdm(texts):

          tokenized_text = tokenize(f'{tokenizer.cls_token} {text}', tokenizer)
          padding = [tokenizer.pad_token_id] * max(0,(max_seq_length - len(tokenized_text) + 1))
          tokenized_text += padding
          
        #   print(f'qual é o range {0}, {len(tokenized_text) - 1}, {max_seq_length}')
        #   print(list(range(0, len(tokenized_text) - 1, max_seq_length)))

          for i in range(0, len(tokenized_text) - 1, max_seq_length):
            
            #Verifica o tamanho, para que ele possa fazer uma janela deslizante pegando todos os tokens da frase.
            if i + max_seq_length < len(tokenized_text):
              self.tokens_ids.append(tokenized_text[i: i + max_seq_length + 1])
              continue

            self.tokens_ids.append(tokenized_text[-max_seq_length - 1:])

        """
        gc.collect()  method
        The free lists maintained for a number of built-in types are cleared
         whenever a full collection or collection of the highest generation (2) is run.
          Not all items in some free lists may be freed due to the particular
           implementation, in particular float.
        """
        self.tokens_ids = torch.LongTensor(self.tokens_ids)
        gc.collect()

    def __len__(self):
        return len(self.tokens_ids)

    def __getitem__(self, idx):

        token_id = self.tokens_ids[idx]
        
        return token_id[:-1], token_id[1:]

dummy_texts = ['Eu gosto de correr momento lazer vida', 'Ela gosta muito de comer pizza']

dummy_dataset = MyDataset(dummy_texts, tokenizer, 9)
#dummy_dataset[0]

  0%|          | 0/2 [00:00<?, ?it/s]

## Testando se a implementação do MyDataset está correta

In [13]:
dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']

dummy_dataset = MyDataset(texts=dummy_texts, tokenizer=tokenizer, max_seq_length=9)
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)
assert len(dummy_dataset) == 2
print('Passou no assert de tamanho do dataset.')

first_batch_input, first_batch_target = next(iter(dummy_loader))

correct_first_batch_input = torch.LongTensor(
    [[  101,  3396, 10303,   125, 13239,     0,     0,     0,     0],
     [  101,  1660,  5971,   785,   125,  1847, 13779, 15616,     0]])

correct_first_batch_target = torch.LongTensor(
    [[ 3396, 10303,   125, 13239,     0,     0,     0,     0,     0],
     [ 1660,  5971,   785,   125,  1847, 13779, 15616,     0,     0]])

assert torch.equal(first_batch_input, correct_first_batch_input)
assert torch.equal(first_batch_target, correct_first_batch_target)

print('Passou no assert de dataset.')

  0%|          | 0/2 [00:00<?, ?it/s]

Passou no assert de tamanho do dataset.
Passou no assert de dataset.


# Carregamento do dataset 

Iremos usar uma pequena amostra do dataset [BrWaC](https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC) para treinar e avaliar nosso modelo de linguagem.

In [14]:
!wget -nc https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt

--2022-06-08 16:05:15--  https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula9/sample-1gb.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.218.128, 142.251.31.128, 142.251.18.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.218.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1230909256 (1.1G) [text/plain]
Saving to: ‘sample-1gb.txt’


2022-06-08 16:05:24 (138 MB/s) - ‘sample-1gb.txt’ saved [1230909256/1230909256]



In [15]:
# Load datasets

seq_length = 127

max_seq_length = seq_length

texts = open('sample-1gb.txt').readlines()

len_max = int(len(texts)/2)

train_examples = int(len_max*0.6)
valid_examples = int(len_max*0.3)
test_examples = int(len_max*0.1)

print(f"train examples: {train_examples}")
print(f"valid examples: {valid_examples}")
print(f"test examples: {test_examples}")



print(f'Read {len(texts)} lines.')

max_lines = train_examples + valid_examples + test_examples
print(f'Truncating to {max_lines} lines.')
texts = texts[:max_lines]  

training_texts = texts[:-(valid_examples + test_examples)]
valid_texts = texts[-(valid_examples + test_examples):-test_examples]
test_texts = texts[-test_examples:]

training_dataset = MyDataset(texts=training_texts, tokenizer=tokenizer, max_seq_length = max_seq_length)
valid_dataset = MyDataset(texts=valid_texts, tokenizer=tokenizer, max_seq_length = max_seq_length)
test_dataset = MyDataset(texts=test_texts, tokenizer=tokenizer, max_seq_length = max_seq_length)

train examples: 75000
valid examples: 37500
test examples: 12500
Read 250000 lines.
Truncating to 125000 lines.


  0%|          | 0/75000 [00:00<?, ?it/s]

  0%|          | 0/37500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [16]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [17]:
print(f'training examples: {len(training_dataset)}')
print(f'valid examples: {len(valid_dataset)}')
print(f'test examples: {len(test_dataset)}')

training examples: 692798
valid examples: 355365
test examples: 112375


In [18]:
from torch import Tensor
import torch.nn.functional as F
import torch.autograd.profiler as profiler
import time
import torch
from torch import nn

class MultiHeadSelfAttentionLayer(nn.Module):
    def __init__(self, n_heads, dim, max_length):
        super().__init__()

        self.n_heads = n_heads
        self.dim = dim
        self.max_length = max_length

        self.W_q = nn.Linear(self.dim, self.dim, bias=False)
        self.W_k = nn.Linear(self.dim, self.dim, bias=False)
        self.W_v = nn.Linear(self.dim, self.dim, bias=False)
        self.W_o = nn.Linear(self.dim, self.dim, bias=False)

        self.feed_forward = nn.Sequential(
            nn.Linear(self.dim, self.dim),
            nn.ReLU(),
            nn.Linear(self.dim, self.dim),
        )

        self.layer_norm1 = nn.LayerNorm(self.dim, eps=1e-6)

        self.layer_norm2 = nn.LayerNorm(self.dim, eps=1e-6)

    def attention(self, q, k, v, mask=None):
        scores = torch.matmul(q, k.transpose(-1, -2))
        scores /= math.sqrt(self.dim // self.n_heads)

        #print(f"mask inside attention: {mask}")        
        if mask is not None:
          scores += mask[:, None, :]

        probs = F.softmax(scores, dim=-1)
        e = torch.matmul(probs, v)
        #print(f"scores inside attention: {scores}")

        return e

    def forward(self, inputs, attention_mask):
        batch_size = inputs.size(0)
        residual = inputs
        
        #print(self.W_q)
        q = self.W_q(inputs).reshape(batch_size, self.max_length, self.n_heads, self.dim // self.n_heads)
        k = self.W_k(inputs).reshape(batch_size, self.max_length, self.n_heads, self.dim // self.n_heads)
        v = self.W_v(inputs).reshape(batch_size, self.max_length, self.n_heads, self.dim // self.n_heads)

        #print(f"shape q before{q.shape}")

        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        #print(f"shape q {q.shape}")
        
        output = self.attention(q, k, v, attention_mask)

        #print(f"output forward:{output.shape}")
        #print(f"output transpose forward:{output.transpose(1,2).shape}")
        #print(f"output transpose contiguous forward:{output.transpose(1,2).contiguous().shape}")

        output = output.transpose(1, 2).contiguous()
        output = output.reshape(batch_size, self.max_length, self.dim)
        output = self.W_o(output)

        output = self.layer_norm1(residual + output)
        residual = output
        output = self.feed_forward(output)
        output = self.layer_norm2(residual + output)

        # output *= attention_mask[:, :, None]
        return output


class LanguageModel(nn.Module):
    def __init__(self, vocab_size: int, max_seq_length: int, dim: int, n_layers: int, pad_token_id: int):
        """
        Implements the Self-attention, decoder-only."

        Args:
            vocab_size (int): Size of the input vocabulary.
            max_seq_length (int): Size of the sequence to consider as context for prediction.
            dim (int): Dimension of the embedding layer for each word in the context.
            n_layers (int): number of self-attention layers.
            pad_token_id (int): id of the pad token that will be ignored in the attention.
        """
        super().__init__()

        self.pad_token_id = pad_token_id
        self.embeddings = nn.Embedding(vocab_size, dim, padding_idx=pad_token_id)
        self.positional_embeddings = nn.Embedding(max_seq_length, dim)

        self.layers = (
            [MultiHeadSelfAttentionLayer(n_heads=1, dim=dim, max_length=max_seq_length).to(device)] +
            [MultiHeadSelfAttentionLayer(n_heads=1, dim=dim, max_length=max_seq_length).to(device) for _ in range(n_layers - 1)]
        )

        self.lm_head = nn.Sequential(
            torch.nn.Linear(dim, dim),
            nn.ReLU(),
            torch.nn.Linear(dim, vocab_size, bias=False),
        )


    def forward(self, inputs):
        """
        Args:
            inputs is a LongTensor of shape (batch_size, max_seq_length)
            
        Returns:
            logits of shape (batch_size, max_seq_length, vocab_size)
        """

        seq_len = inputs.size(-1)
        #print(f"seq_len {seq_len}")

        pad_mask = (inputs == self.pad_token_id)
        #print(f"pad_mask {pad_mask}")

        repeated_examples = inputs.unsqueeze(1).repeat(1, seq_len, 1)
        #print(f"repeated_examples {repeated_examples}")

        attention_mask = torch.empty(repeated_examples.size(), device=inputs.device).fill_(float('-inf'))
        #print(f"attention_mask torch.empty{attention_mask}")

        attention_mask = attention_mask.triu(diagonal=1) # diagonal matrix
        #print(f"attention_mask.triu: {attention_mask}")

        attention_mask.masked_fill_(pad_mask.unsqueeze(1), float('-inf'))
        #print(f"attention_mask.masked_fill_ {attention_mask}")

        out = self.embeddings(inputs)
        pos_ids = torch.arange(inputs.size(1), dtype=torch.long, device=device)
        pos_embeddings = self.positional_embeddings(pos_ids)
        out += pos_embeddings

        for layer in self.layers:
          out = layer(out, attention_mask)

        return self.lm_head(out)

## Teste o modelo com um exemplo

In [19]:
max_seq_length = seq_length

model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=1,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

sample_input, target_input = next(iter(DataLoader(training_dataset)))
target_input = target_input.to(device)
sample_input = sample_input.to(device)

print(sample_input.is_cuda, target_input.is_cuda)

sample_output = model(sample_input)
print(f'sample_input.shape: {sample_input.shape}')
print(f'sample_output.shape: {sample_output.shape}')

True True
sample_input.shape: torch.Size([1, 127])
sample_output.shape: torch.Size([1, 127, 29794])


In [20]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of model parameters: {num_params}')

Number of model parameters: 3825920


In [21]:
torch.cuda.empty_cache()

In [22]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.028845GB
torch.cuda.memory_reserved: 0.037109GB
torch.cuda.max_memory_reserved: 0.037109GB


## Assert da Perplexidade


In [23]:
random.seed(123)
np.random.seed(123)
torch.manual_seed(123)
torch.cuda.empty_cache

def perplexity(logits, target, ignore_token_id: int):
    """
    Computes the perplexity.

    Args:
        logits: a FloatTensor of shape (batch_size, seq_len, vocab_size)
        target: a LongTensor of shape (batch_size, seq_len)

    Returns:
        A float corresponding to the perplexity
    """
    logits = logits.reshape(-1, logits.shape[-1])
    target = target.reshape(-1)
    print(logits.shape, target.shape)
    loss = nn.functional.cross_entropy(logits, target, reduction='mean')
    return torch.exp(loss)


n_examples = 4

train_input_ids, train_target_ids = next(iter(DataLoader(training_dataset, batch_size=n_examples)))
train_input_ids = train_input_ids.to(device)
train_target_ids = train_target_ids.to(device)

logits = model(train_input_ids)

my_perplexity = perplexity(logits=logits, target=train_target_ids, ignore_token_id=tokenizer.pad_token_id)

print(f'my perplexity:              {int(my_perplexity)}')
print(f'correct initial perplexity: {tokenizer.vocab_size}')

assert math.isclose(my_perplexity, tokenizer.vocab_size, abs_tol=7000)
print('Passou o no assert da perplexidade')

torch.Size([508, 29794]) torch.Size([508])
my perplexity:              30756
correct initial perplexity: 29794
Passou o no assert da perplexidade


In [24]:
print(logits.shape)
print(train_target_ids.shape)
device

torch.Size([4, 127, 29794])
torch.Size([4, 127])


device(type='cuda')

In [25]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [27]:
class SaveBestModel:

    def __init__(
        self, best_valid_loss=float('inf')
    ):
        self.best_valid_loss = best_valid_loss
        
    def __call__(
        self, current_valid_loss, 
        epoch, model, optimizer, criterion
    ):
        if current_valid_loss < self.best_valid_loss:
            self.best_valid_loss = current_valid_loss
            print(f"Best validation loss: {self.best_valid_loss}")
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': criterion,
                }, "gdrive/MyDrive/Colab Notebooks/"+"2_best_model_8_june.pt")


## Laço de Treinamento e Validação

In [None]:
max_examples = 1500_000_000
eval_every_steps = 1000
lr = 3e-4


model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

train_loader = DataLoader(training_dataset, batch_size=127, shuffle=True, drop_last=True)
validation_loader = DataLoader(valid_dataset, batch_size=127)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


save_best_model = SaveBestModel()

def train_step(input_ids, target_ids):
    model.train()
    model.zero_grad()
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    loss.backward()
    optimizer.step()

    return loss.item()


def validation_step(input_ids, target_ids):
    model.eval()
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    return loss.item()


train_losses = []
n_examples = 0
step = 0
while n_examples < max_examples:
    for train_input_ids, train_target_ids in train_loader:
        loss = train_step(train_input_ids.to(device),
                          train_target_ids.to(device)) 
        train_losses.append(loss)
        
        if step % eval_every_steps == 0:
            train_ppl = np.exp(np.average(train_losses))

            with torch.no_grad():
                valid_ppl = np.exp(np.average([
                    validation_step(val_input_ids.to(device),
                                    val_target_ids.to(device))
                    for val_input_ids, val_target_ids in validation_loader]))

            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')

            last_loss = valid_ppl

            train_losses = []
            save_best_model(
              last_loss, 0, model, optimizer, nn.functional.cross_entropy
              )
            
        #16hours
        n_examples += len(train_input_ids)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break

## Avaliação final no dataset de teste


Bonus: o modelo com menor perplexidade no dataset de testes ganhará 0.5 ponto na nota final.

In [None]:
model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

def validation_step(input, target):
    model.eval()
    logits = model(input)
    logits = logits.reshape(-1, logits.shape[-1])
    target = target.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target, ignore_index=tokenizer.pad_token_id)
    return loss.item()

load_dict = torch.load("gdrive/MyDrive/Colab Notebooks/"+"2_best_model_4_june.pt")
model.load_state_dict(load_dict['model_state_dict'])
model.to(device)

test_loader = DataLoader(valid_dataset, batch_size=20)

with torch.no_grad():
    test_ppl = np.exp(np.average([
        validation_step(input.to(device), target.to(device))
        for input, target in test_loader
    ]))

print(f'test perplexity: {test_ppl}')

In [None]:
test_loader = DataLoader(test_dataset, batch_size=64)

with torch.no_grad():
    test_ppl = np.exp(np.average([
        validation_step(test_input_ids.to(device), test_target_ids.to(device))
        for test_input_ids, test_target_ids in test_loader
    ]))

print(f'test perplexity: {test_ppl}')

## Teste seu modelo com uma sentença

Escolha uma sentença gerada pelo modelo que ache interessante.

In [None]:
prompt = 'Eu gosto de comer pizza pois me faz'
max_output_tokens = 20
model.eval()

for _ in range(max_output_tokens):
    input_ids = tokenize(text=prompt, tokenizer=tokenizer)
    input_ids_truncated = input_ids[-max_seq_length:]  # Usamos apenas os últimos <max_seq_length> tokens como entrada para o modelo.
    logits = model(torch.LongTensor([input_ids_truncated]).to(device))
    logits = logits[:, -1, :]  # Usamos apenas o ultimo token da sequencia
    # Ao usarmos o argmax, a saída do modelo em cada passo é o token de maior probabilidade.
    # Isso se chama decodificação gulosa (greedy decoding).
    predicted_id = torch.argmax(logits).item()
    input_ids += [predicted_id]  # Concatenamos a entrada com o token escolhido nesse passo.
    prompt = tokenizer.decode(input_ids)
    print(prompt)

In [None]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

In [22]:
max_examples = 1000_000_000
eval_every_steps = 10000
lr = 3e-4


model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)


load_dict = torch.load("gdrive/MyDrive/Colab Notebooks/"+"2_best_model_4_june.pt")
model.load_state_dict(load_dict['model_state_dict'])
model.to(device)


train_loader = DataLoader(training_dataset, batch_size=127, shuffle=True, drop_last=True)
validation_loader = DataLoader(valid_dataset, batch_size=127)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


def train_step(input_ids, target_ids):
    model.train()
    model.zero_grad()
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    loss.backward()
    optimizer.step()

    return loss.item()


def validation_step(input_ids, target_ids):
    model.eval()
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    return loss.item()

save_best_model = SaveBestModel()


train_losses = []
n_examples = 0
step = 0
while n_examples < max_examples:
    for input, target in train_loader:
        loss = train_step(input.to(device), target.to(device)) 
        train_losses.append(loss)
        
        if step % eval_every_steps == 0:
            train_ppl = np.exp(np.average(train_losses))

            with torch.no_grad():
                valid_ppl = np.exp(np.average([
                    validation_step(input.to(device), target.to(device))
                    for input, target in validation_loader]))
            
            last_loss = valid_ppl

            save_best_model(
              last_loss, 0, model, optimizer, nn.functional.cross_entropy
              )

            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')
            train_losses = []

        n_examples += len(input)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break

Best validation loss: 1102.5383686704638
0 steps; 0 examples so far; train ppl: 1070.90, valid ppl: 1102.54
Best validation loss: 215.76455550284257
10000 steps; 1270000 examples so far; train ppl: 228.52, valid ppl: 215.76
Best validation loss: 208.03912286986946
20000 steps; 2540000 examples so far; train ppl: 204.50, valid ppl: 208.04
Best validation loss: 203.52394826228493
30000 steps; 3810000 examples so far; train ppl: 197.09, valid ppl: 203.52
Best validation loss: 200.5853672107694
40000 steps; 5080000 examples so far; train ppl: 192.22, valid ppl: 200.59
Best validation loss: 198.52835081261318
50000 steps; 6350000 examples so far; train ppl: 188.85, valid ppl: 198.53
Best validation loss: 196.65283936310468
60000 steps; 7620000 examples so far; train ppl: 186.35, valid ppl: 196.65
Best validation loss: 195.41390432395696
70000 steps; 8890000 examples so far; train ppl: 184.19, valid ppl: 195.41
Best validation loss: 194.2745784113688
80000 steps; 10160000 examples so far; tr

KeyboardInterrupt: ignored

In [28]:
max_examples = 1000_000_000
eval_every_steps = 10000
lr = 3e-4


model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)


load_dict = torch.load("gdrive/MyDrive/Colab Notebooks/"+"2_best_model_4_june.pt")
model.load_state_dict(load_dict['model_state_dict'])
model.to(device)


train_loader = DataLoader(training_dataset, batch_size=127, shuffle=True, drop_last=True)
validation_loader = DataLoader(valid_dataset, batch_size=127)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


def train_step(input_ids, target_ids):
    model.train()
    model.zero_grad()
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    loss.backward()
    optimizer.step()

    return loss.item()


def validation_step(input_ids, target_ids):
    model.eval()
    logits = model(input_ids)
    logits = logits.reshape(-1, logits.shape[-1])
    target_ids = target_ids.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target_ids, ignore_index=tokenizer.pad_token_id)
    return loss.item()

save_best_model = SaveBestModel()


train_losses = []
n_examples = 0
step = 0
while n_examples < max_examples:
    for input, target in train_loader:
        loss = train_step(input.to(device), target.to(device)) 
        train_losses.append(loss)
        
        if step % eval_every_steps == 0:
            train_ppl = np.exp(np.average(train_losses))

            with torch.no_grad():
                valid_ppl = np.exp(np.average([
                    validation_step(input.to(device), target.to(device))
                    for input, target in validation_loader]))
            
            last_loss = valid_ppl

            save_best_model(
              last_loss, 0, model, optimizer, nn.functional.cross_entropy
              )

            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')
            train_losses = []

        n_examples += len(input)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break

Best validation loss: 33108.722496936614
0 steps; 0 examples so far; train ppl: 36778.95, valid ppl: 33108.72
Best validation loss: 197.3138675253675
10000 steps; 1270000 examples so far; train ppl: 219.12, valid ppl: 197.31
Best validation loss: 189.25555875271982
20000 steps; 2540000 examples so far; train ppl: 183.74, valid ppl: 189.26
Best validation loss: 185.25332105717283
30000 steps; 3810000 examples so far; train ppl: 177.97, valid ppl: 185.25
Best validation loss: 182.7443965159436
40000 steps; 5080000 examples so far; train ppl: 174.78, valid ppl: 182.74
Best validation loss: 180.7899573090807
50000 steps; 6350000 examples so far; train ppl: 172.47, valid ppl: 180.79
Best validation loss: 179.19358406780503
60000 steps; 7620000 examples so far; train ppl: 170.77, valid ppl: 179.19
Best validation loss: 177.97371351519112
70000 steps; 8890000 examples so far; train ppl: 169.22, valid ppl: 177.97
Best validation loss: 176.84438038844016
80000 steps; 10160000 examples so far; t

KeyboardInterrupt: ignored

In [29]:
model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_length,
    dim=64,
    n_layers=2,
    pad_token_id=tokenizer.pad_token_id,
).to(device)

def validation_step(input, target):
    model.eval()
    logits = model(input)
    logits = logits.reshape(-1, logits.shape[-1])
    target = target.reshape(-1)
    loss = nn.functional.cross_entropy(logits, target, ignore_index=tokenizer.pad_token_id)
    return loss.item()

load_dict = torch.load("gdrive/MyDrive/Colab Notebooks/"+"2_best_model_4_june.pt")
model.load_state_dict(load_dict['model_state_dict'])
model.to(device)

test_loader = DataLoader(valid_dataset, batch_size=20)

with torch.no_grad():
    test_ppl = np.exp(np.average([
        validation_step(input.to(device), target.to(device))
        for input, target in test_loader
    ]))

print(f'test perplexity: {test_ppl}')

test perplexity: 38399.125222548035


## Bonus 1
Quem conseguir a menor perplexidade no dataset de testes ganha 0.5 ponto na média final.

## Bonus 2
Qual é a complexidade (em notação O-grande) da função de geração de texto acima?

Quem responder corretamente a pergunta acima e deixar a função com menor complexidade ganha 0.5 ponto na média final.