<a href="https://colab.research.google.com/github/unicamp-dl/IA025_2022S1/blob/main/ex07/Alexander_Valle/IA025_Alexander_Valle_Aula_7_Exerc%C3%ADcio__190model3_per176.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
nome = "Rolan Alexander Valle Rey Sánchez"
print(f'Meu nome é {nome}')

Meu nome é Rolan Alexander Valle Rey Sánchez


#  Exercício: Modelo de Linguagem (Bengio 2003) - MLP + Embeddings

Neste exercício iremos treinar uma rede neural simples para prever a próxima palavra de um texto, dadas as palavras anteriores como entrada. Esta tarefa é chamada de "Modelagem da Língua".

Este dataset já possui um tamanho razoável e é bem provável que você vai precisar rodar seus experimentos com GPU.

Alguns conselhos úteis:
- **ATENÇÃO:** o dataset é bem grande. Não dê o comando para imprimi-lo.
- Durante a depuração, faça seu dataset ficar bem pequeno, para que a depuração seja mais rápida e não precise de GPU. Somente ligue a GPU quando o seu laço de treinamento já estiver funcionando.
- Não deixe para fazer esse exercício na véspera. Ele é trabalhoso.

In [None]:
# iremos utilizar a biblioteca dos transformers para ter acesso ao tokenizador do BERT.
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.

In [None]:
#from google.colab import drive
#drive.mount('/content/gdrive')

## Importação dos pacotes

In [None]:
import collections
import itertools
import functools
import math
import random

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook


In [None]:
# Check which GPU we are using
!nvidia-smi

Wed May 18 20:21:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Check which GPU we are using (2nd run)
!nvidia-smi

Wed May 18 20:21:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    13W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

Using cuda:0


In [None]:
# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with the same value so runs are repeatable.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(123)

## Implementação do MyDataset

In [None]:
from typing import List


def tokenize(text: str, tokenizer):
    """Return the ``input_ids`` that ``tokenizer`` produces for ``text``.

    The tokenizer is invoked with ``add_special_tokens=False`` and
    ``return_tensors=None``; whatever its result exposes as ``input_ids`` is
    handed back unchanged.
    """
    encoding = tokenizer(text, return_tensors=None, add_special_tokens=False)
    return encoding.input_ids

class MyDataset():
    """Map-style dataset of (context window, next token) pairs.

    Each text is tokenized with ``tokenize`` and a window of ``context_size``
    consecutive token ids is slid over it one position at a time; the token
    right after the window is the prediction target. Texts that have
    ``context_size`` tokens or fewer contribute no examples.
    """

    def __init__(self, texts: List[str], tokenizer, context_size: int):
        """
        Args:
            texts: raw strings, one document/sentence per element.
            tokenizer: callable passed through to ``tokenize``.
            context_size: number of tokens in every input window.
        """
        self.tokentexts = [tokenize(text, tokenizer) for text in texts]  # command line from Pedro Gengo
        self.context_size = context_size
        self.X, self.y = self.getXy(self.tokentexts, self.context_size)

    def getXy(self, corpus, n_gram):
        """Build the input and target tensors from already tokenized texts.

        Args:
            corpus: iterable of token-id lists.
            n_gram: window length (number of context tokens).

        Returns:
            ``(X, y)`` where ``X`` is a LongTensor of shape
            ``(num_examples, n_gram)`` and ``y`` a LongTensor of shape
            ``(num_examples,)``.
        """
        contexts, targets = [], []
        for sentence in corpus:
            # range() is empty for sentences that are not longer than the window.
            for start in range(len(sentence) - n_gram):
                contexts.append(sentence[start:start + n_gram])
                targets.append(sentence[start + n_gram])

        X = torch.LongTensor(contexts)
        y = torch.LongTensor(targets)
        if not contexts:
            # An empty list becomes a 1-D tensor of shape (0,); give it the
            # same (num_examples, n_gram) layout a non-empty result has.
            X = X.view(0, max(n_gram, 0))
        return X, y

    def __len__(self):
        """Number of (context, target) examples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

## Teste se sua implementação do MyDataset está correta

In [None]:
from transformers import BertTokenizer

# Portuguese BERT tokenizer; it is reused by every later cell of the notebook.
tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

# Two tiny sentences whose expected windows and targets are hard-coded below.
dummy_texts = ['Eu gosto de correr', 'Ela gosta muito de comer pizza']

dummy_dataset = MyDataset(texts=dummy_texts, tokenizer=tokenizer, context_size=3)
# batch_size=6 exceeds the 5 expected examples, so the first batch holds the whole dataset.
dummy_loader = DataLoader(dummy_dataset, batch_size=6, shuffle=False)
assert len(dummy_dataset) == 5
print('passou no assert de tamanho do dataset')

first_batch_input, first_batch_target = next(iter(dummy_loader))

# Expected context windows (one row per example, context_size=3 token ids each).
correct_first_batch_input = torch.LongTensor(
    [[ 3396, 10303,   125],
     [ 1660,  5971,   785],
     [ 5971,   785,   125],
     [  785,   125,  1847],
     [  125,  1847, 13779]])

# Expected next-token id for each of the rows above.
correct_first_batch_target = torch.LongTensor([13239,   125,  1847, 13779, 15616])

assert torch.equal(first_batch_input, correct_first_batch_input)
print('Passou no assert de input')
assert torch.equal(first_batch_target, correct_first_batch_target)
print('Passou no assert de target')

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

passou no assert de tamanho do dataset
Passou no assert de input
Passou no assert de target


In [None]:
first_batch_input

tensor([[ 3396, 10303,   125],
        [ 1660,  5971,   785],
        [ 5971,   785,   125],
        [  785,   125,  1847],
        [  125,  1847, 13779]])

In [None]:
first_batch_target

tensor([13239,   125,  1847, 13779, 15616])

# Carregamento do dataset 

Iremos usar uma pequena amostra do dataset [BrWaC](https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC) para treinar e avaliar nosso modelo de linguagem.

In [None]:
!wget -nc https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula7/sample_brwac.txt

--2022-05-18 20:21:23--  https://storage.googleapis.com/unicamp-dl/ia025a_2022s1/aula7/sample_brwac.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.202.128, 74.125.20.128, 108.177.98.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.202.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 123983611 (118M) [text/plain]
Saving to: ‘sample_brwac.txt’


2022-05-18 20:21:24 (179 MB/s) - ‘sample_brwac.txt’ saved [123983611/123983611]



In [None]:
# Load datasets
context_size = 9

valid_examples = 100
test_examples = 100
# Read the corpus inside a context manager so the file handle is closed, and
# with an explicit encoding so the result does not depend on the platform default.
with open('sample_brwac.txt', encoding='utf-8') as corpus_file:
    texts = corpus_file.readlines()

# For quick debugging, uncomment the next line to work on a small subset.
#texts = texts[:500]

# The last (valid_examples + test_examples) lines are held out: first the
# validation texts, then the test texts; everything before them is training data.
training_texts = texts[:-(valid_examples + test_examples)]
valid_texts = texts[-(valid_examples + test_examples):-test_examples]
test_texts = texts[-test_examples:]

training_dataset = MyDataset(texts=training_texts, tokenizer=tokenizer, context_size=context_size)
valid_dataset = MyDataset(texts=valid_texts, tokenizer=tokenizer, context_size=context_size)
test_dataset = MyDataset(texts=test_texts, tokenizer=tokenizer, context_size=context_size)

In [None]:
print(f'training examples: {len(training_dataset)}')
print(f'valid examples: {len(valid_dataset)}')
print(f'test examples: {len(test_dataset)}')

training examples: 27675945
valid examples: 82070
test examples: 166726


In [None]:
class LanguageModel(nn.Module):
  """Simplified neural language model after Bengio et al. (2003).

  The embeddings of the context tokens are concatenated into one vector ``x``
  and the logits are computed as ``y = U ReLU(d + H x)`` (ReLU replaces the
  tanh of the paper and there is no direct input-to-output connection).

  Inspiration:
    https://gist.github.com/naturale0/b0c15b0940c23e40d8775acfdb5a575e
    https://blog.krybot.com/a?ID=16e33a23-7668-4223-8068-774deb29fd5d
  """

  def __init__(self, vocab_size, context_size, embedding_dim, hidden_size):
    """
    Args: vocab_size (int): Size of the input vocabulary.
          context_size (int): Size of the sequence to consider as context for prediction.
          embedding_dim (int): Dimension of the embedding layer for each word in the context.
          hidden_size (int): Size of the hidden layer.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.n_gram = context_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_size
    # Token embedding table.
    self.C = nn.Embedding(vocab_size, embedding_dim)
    # Hidden affine layer d + Hx, applied to the concatenated embeddings.
    self.dH = nn.Linear(context_size * embedding_dim, hidden_size)
    # Output projection U (no bias) from the hidden layer to the vocabulary.
    self.U = nn.Linear(hidden_size, vocab_size, bias=False)
    self.relu = nn.ReLU()
    # y = U ReLU(d + Hx)
    self.model = nn.Sequential(self.dH, self.relu, self.U)

  def forward(self, inputs):
    """Compute next-token logits.

    Args:
      inputs: LongTensor of token ids of shape (batch_size, context_size).

    Returns:
      FloatTensor of shape (batch_size, vocab_size).

    Raises:
      ValueError: if the last dimension of ``inputs`` is not ``context_size``.
        Without this check ``view(-1, ...)`` could silently regroup the
        tokens into a different number of rows.
    """
    if inputs.dim() >= 1 and inputs.shape[-1] != self.n_gram:
      raise ValueError(
          f'expected {self.n_gram} context tokens per example, got {inputs.shape[-1]}')
    X = self.C(inputs)
    # Concatenate the embeddings of each example into a single row.
    X = X.view(-1, self.embedding_dim * self.n_gram)
    return self.model(X)


In [None]:
class LanguageModel(nn.Module):
  """Neural language model after Bengio et al. (2003) with an optional direct connection.

  With ``x`` the concatenated embeddings of the context tokens, the logits are
  ``y = U ReLU(d + H x)`` and, when ``use_direct_connection`` is true,
  ``y = b + W x + U ReLU(d + H x)`` (ReLU replaces the tanh of the paper).

  Inspiration:
    https://gist.github.com/naturale0/b0c15b0940c23e40d8775acfdb5a575e
    https://blog.krybot.com/a?ID=16e33a23-7668-4223-8068-774deb29fd5d
  """

  def __init__(self, vocab_size, context_size, embedding_dim, hidden_size,
               use_direct_connection=False):
    """
    Args: vocab_size (int): Size of the input vocabulary.
          context_size (int): Size of the sequence to consider as context for prediction.
          embedding_dim (int): Dimension of the embedding layer for each word in the context.
          hidden_size (int): Size of the hidden layer.
          use_direct_connection (bool): Add the ``b + W x`` term to the logits.
            Off by default, matching the forward pass that had this term
            disabled; the layer is only allocated when it is actually used.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.n_gram = context_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_size
    # Token embedding table.
    self.C = nn.Embedding(vocab_size, embedding_dim)
    # Hidden affine layer d + Hx, applied to the concatenated embeddings.
    self.dH = nn.Linear(context_size * embedding_dim, hidden_size)
    # Optional affine layer b + Wx for the direct input-to-output connection.
    self.bW = (nn.Linear(context_size * embedding_dim, vocab_size)
               if use_direct_connection else None)
    # Output projection U (no bias) from the hidden layer to the vocabulary.
    self.U = nn.Linear(hidden_size, vocab_size, bias=False)
    self.relu = nn.ReLU()
    # U ReLU(d + Hx)
    self.seq = nn.Sequential(self.dH, self.relu, self.U)

  def forward(self, inputs):
    """Compute next-token logits.

    Args:
      inputs: LongTensor of token ids of shape (batch_size, context_size).

    Returns:
      FloatTensor of shape (batch_size, vocab_size).

    Raises:
      ValueError: if the last dimension of ``inputs`` is not ``context_size``.
    """
    if inputs.dim() >= 1 and inputs.shape[-1] != self.n_gram:
      raise ValueError(
          f'expected {self.n_gram} context tokens per example, got {inputs.shape[-1]}')
    X = self.C(inputs)
    # Concatenate the embeddings of each example into a single row.
    X = X.view(-1, self.embedding_dim * self.n_gram)
    logits = self.seq(X)
    if self.bW is not None:
      logits = logits + self.bW(X)
    return logits


In [None]:
class LanguageModel(torch.nn.Module):
  """Neural language model after Bengio et al. (2003) with two extra hidden layers.

  The embeddings of the context tokens are concatenated and passed through
  three ReLU-activated affine layers of widths ``hidden_size``,
  ``2 * hidden_size`` and ``4 * hidden_size`` before the bias-free projection
  ``U`` onto the vocabulary.

  Inspiration:
    https://gist.github.com/naturale0/b0c15b0940c23e40d8775acfdb5a575e
    https://blog.krybot.com/a?ID=16e33a23-7668-4223-8068-774deb29fd5d
  """

  def __init__(self, vocab_size, context_size, embedding_dim, hidden_size):
    """
    Args: vocab_size (int): Size of the input vocabulary.
          context_size (int): Size of the sequence to consider as context for prediction.
          embedding_dim (int): Dimension of the embedding layer for each word in the context.
          hidden_size (int): Size of the first hidden layer.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.n_gram = context_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_size
    # Token embedding table.
    self.C = nn.Embedding(vocab_size, embedding_dim)
    # First hidden affine layer d + Hx, applied to the concatenated embeddings.
    self.dH = nn.Linear(context_size * embedding_dim, hidden_size)
    self.hl1 = nn.Linear(hidden_size, hidden_size*2)  # additional layer, inspired by Larisa
    self.hl2 = nn.Linear(hidden_size*2, hidden_size*4)  # second additional layer
    # Output projection U (no bias) from the last hidden layer to the vocabulary.
    self.U = nn.Linear(hidden_size*4, vocab_size, bias=False)
    # ReLU holds no parameters, so one instance is reused after each hidden layer.
    self.relu = nn.ReLU()
    self.model = nn.Sequential(
        self.dH, self.relu, self.hl1, self.relu, self.hl2, self.relu, self.U)

  def forward(self, inputs):
    """Compute next-token logits.

    Args:
      inputs: LongTensor of token ids of shape (batch_size, context_size).

    Returns:
      FloatTensor of shape (batch_size, vocab_size).

    Raises:
      ValueError: if the last dimension of ``inputs`` is not ``context_size``.
        Without this check ``view(-1, ...)`` could silently regroup the
        tokens into a different number of rows.
    """
    if inputs.dim() >= 1 and inputs.shape[-1] != self.n_gram:
      raise ValueError(
          f'expected {self.n_gram} context tokens per example, got {inputs.shape[-1]}')
    X = self.C(inputs)
    # Concatenate the embeddings of each example into a single row.
    X = X.view(-1, self.embedding_dim * self.n_gram)
    return self.model(X)

## Teste o modelo com um exemplo

In [None]:
# Small model used only to check that a forward pass runs end to end.
model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    context_size=context_size,
    embedding_dim=32,
    hidden_size=64,
).to(device)

# No batch_size is given, so the loader falls back to its default batching.
sample_train, _ = next(iter(DataLoader(training_dataset)))
sample_train_gpu = sample_train.to(device)
# Displayed as the cell output: the shape of the logits for this sample.
model(sample_train_gpu).shape

torch.Size([1, 29794])

In [None]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of model parameters: {num_params}')

Number of model parameters: 8640512


## Assert da Perplexidade


In [None]:
import torch.nn.functional as F

def perplexity(logits, target):
  """
  Perplexity of a batch of predictions: exp of the mean cross-entropy.
  Args:   logits: a FloatTensor of shape (batch_size, vocab_size)
          target: a LongTensor of shape (batch_size,)
  Returns: a zero-dimensional tensor holding exp(cross_entropy(logits, target)).
  """
  mean_nll = F.cross_entropy(logits, target)
  return mean_nll.exp()

n_examples = 1000

sample_train, target_token_ids = next(iter(DataLoader(training_dataset, batch_size=n_examples)))
sample_train_gpu = sample_train.to(device)
target_token_ids = target_token_ids.to(device)
logits = model(sample_train_gpu)
logits.shape,target_token_ids.shape



(torch.Size([1000, 29794]), torch.Size([1000]))

In [None]:
# Perplexity of the freshly initialised model on the sampled batch.
my_perplexity = perplexity(logits=logits, target=target_token_ids)

print(f'my perplexity:              {int(my_perplexity)}')
print(f'correct initial perplexity: {tokenizer.vocab_size}')

# The reference value is the vocabulary size — presumably because an untrained
# model spreads probability almost uniformly over the vocabulary (TODO confirm);
# abs_tol=2000 leaves room for the noise of random initialisation.
assert math.isclose(my_perplexity, tokenizer.vocab_size, abs_tol=2000)
print('Passou o no assert da perplexidade')

my perplexity:              29677
correct initial perplexity: 29794
Passou o no assert da perplexidade


## Laço de Treinamento e Validação

In [None]:
# Training budget: stop once this many training examples have been processed.
max_examples = 190_000_000
# Run a validation pass and print perplexities every this many optimisation steps.
eval_every_steps = 5000
# Learning rate handed to Adam below.
lr = 5e-5
# NOTE(review): `compare` is not referenced anywhere else in the visible notebook;
# it looks like a leftover for tracking the best validation score — confirm before removing.
compare=1e10

# Model that is actually trained (replaces the small one built for the smoke test).
model = LanguageModel(
    vocab_size=tokenizer.vocab_size,
    context_size=context_size,
    embedding_dim=128,
    hidden_size=128,
).to(device)

# drop_last=True keeps every training batch at exactly 1024 examples.
train_loader = DataLoader(training_dataset, batch_size=1024, shuffle=True, drop_last=True)
# No drop_last here, so the final validation batch may be smaller than 1024.
validation_loader = DataLoader(valid_dataset, batch_size=1024)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)


def train_step(input, target):
    """Run one optimisation step on a single batch and return its loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``device``: the batch
    is moved to ``device``, the model is put in training mode, gradients are
    reset, and the cross-entropy loss is back-propagated before the optimizer
    updates the parameters.

    Args:
        input: tensor of context token ids for the batch.
        target: tensor with the next-token id of every example.

    Returns:
        The batch loss as a Python float.
    """
    batch_inputs = input.to(device)
    batch_targets = target.to(device)

    model.train()
    model.zero_grad()

    batch_loss = nn.functional.cross_entropy(model(batch_inputs), batch_targets)
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()


def validation_step(input, target):
    """Return the mean cross-entropy loss of one batch without training on it.

    Uses the notebook-level ``model`` and ``device``. Like ``train_step``, the
    batch is moved to ``device`` here, and gradient tracking is disabled inside
    the function so it is safe to call even when the caller forgets
    ``torch.no_grad()``.

    Args:
        input: tensor of context token ids for the batch.
        target: tensor with the next-token id of every example.

    Returns:
        The batch loss as a Python float.
    """
    model.eval()
    with torch.no_grad():
        logits = model(input.to(device))
        loss = nn.functional.cross_entropy(logits, target.to(device))
    return loss.item()


def _validation_perplexity(loader):
    """Perplexity of the current model over every example yielded by ``loader``.

    ``validation_step`` returns the mean loss of a batch, so the batch means
    are weighted by batch size; otherwise a smaller final batch would count
    as much as a full one. Returns NaN when the loader yields nothing.
    """
    batch_losses, batch_sizes = [], []
    with torch.no_grad():
        for batch_input, batch_target in loader:
            batch_losses.append(
                validation_step(batch_input.to(device), batch_target.to(device)))
            batch_sizes.append(len(batch_target))
    if not batch_losses:
        return float('nan')
    return np.exp(np.average(batch_losses, weights=batch_sizes))


# A loader that yields no batches would never advance n_examples and the
# while-loop below would spin forever.
if max_examples > 0 and len(train_loader) == 0:
    raise ValueError('train_loader yields no batches; the training set is smaller than one batch.')

train_losses = []
n_examples = 0
step = 0
while n_examples < max_examples:
    # batch_input/batch_target instead of input/target: avoids shadowing the builtin input().
    for batch_input, batch_target in train_loader:
        loss = train_step(batch_input.to(device), batch_target.to(device))
        train_losses.append(loss)

        if step % eval_every_steps == 0:
            # Training perplexity is averaged over the steps since the last report.
            train_ppl = np.exp(np.average(train_losses))
            valid_ppl = _validation_perplexity(validation_loader)
            print(f'{step} steps; {n_examples} examples so far; train ppl: {train_ppl:.2f}, valid ppl: {valid_ppl:.2f}')
            train_losses = []

        n_examples += len(batch_input)  # Increment of batch size
        step += 1
        if n_examples >= max_examples:
            break

0 steps; 0 examples so far; train ppl: 29760.51, valid ppl: 29671.56
5000 steps; 5120000 examples so far; train ppl: 1520.59, valid ppl: 1079.24
10000 steps; 10240000 examples so far; train ppl: 888.15, valid ppl: 768.68
15000 steps; 15360000 examples so far; train ppl: 692.95, valid ppl: 634.49
20000 steps; 20480000 examples so far; train ppl: 586.70, valid ppl: 541.54
25000 steps; 25600000 examples so far; train ppl: 507.01, valid ppl: 472.24
30000 steps; 30720000 examples so far; train ppl: 443.36, valid ppl: 419.76
35000 steps; 35840000 examples so far; train ppl: 396.77, valid ppl: 381.04
40000 steps; 40960000 examples so far; train ppl: 363.70, valid ppl: 350.59
45000 steps; 46080000 examples so far; train ppl: 337.37, valid ppl: 326.05
50000 steps; 51200000 examples so far; train ppl: 316.63, valid ppl: 306.58
55000 steps; 56320000 examples so far; train ppl: 297.30, valid ppl: 291.97
60000 steps; 61440000 examples so far; train ppl: 277.84, valid ppl: 278.70
65000 steps; 665600

## Avaliação final no dataset de teste


Bônus: o modelo com menor perplexidade no dataset de teste ganhará 0.5 ponto na nota final.

In [None]:
# Load datasets
context_size = 9

valid_examples = 100
test_examples = 100
# Read the corpus inside a context manager so the file handle is closed, and
# with an explicit encoding so the result does not depend on the platform default.
with open('sample_brwac.txt', encoding='utf-8') as corpus_file:
    texts = corpus_file.readlines()

# The test split is the last `test_examples` lines of the file.
test_texts = texts[-test_examples:]

test_dataset = MyDataset(texts=test_texts, tokenizer=tokenizer, context_size=context_size)


In [None]:
test_loader = DataLoader(test_dataset, batch_size=64)

def validation_step(input, target):
    """Return the mean cross-entropy loss of one batch without training on it.

    Uses the notebook-level ``model`` and ``device``; the batch is moved to
    ``device`` and gradient tracking is disabled inside the function.
    """
    model.eval()
    with torch.no_grad():
        logits = model(input.to(device))
        loss = nn.functional.cross_entropy(logits, target.to(device))
    return loss.item()

# validation_step returns a per-batch mean, so each mean is weighted by its
# batch size: the last batch is usually smaller and must not count as a full one.
test_losses, test_batch_sizes = [], []
with torch.no_grad():
    for test_input, test_target in test_loader:
        test_losses.append(validation_step(test_input.to(device), test_target.to(device)))
        test_batch_sizes.append(len(test_target))

if test_losses:
    test_ppl = np.exp(np.average(test_losses, weights=test_batch_sizes))
else:
    # Nothing to evaluate; report NaN instead of dividing by a zero weight sum.
    test_ppl = float('nan')

print(f'test perplexity: {test_ppl}')

## Teste seu modelo com uma sentença

Escolha uma sentença gerada pelo modelo que ache interessante.

In [None]:
def printpromptpredic(prompt,max_output_tokens):
  """Greedily extend ``prompt`` one token at a time, printing the text after each step.

  Uses the notebook-level ``tokenizer``, ``model``, ``context_size`` and
  ``device``.

  Args:
    prompt: text to continue.
    max_output_tokens: number of tokens to generate.

  Raises:
    ValueError: if the text tokenizes to fewer than ``context_size`` tokens,
      since the model needs exactly ``context_size`` tokens as input.
  """
  # Generation needs no gradients; skipping them saves memory and time.
  with torch.no_grad():
    for _ in range(max_output_tokens):
      input_ids = tokenize(text=prompt, tokenizer=tokenizer)
      if len(input_ids) < context_size:
        raise ValueError(
            f'prompt has {len(input_ids)} tokens but the model needs at least {context_size}')
      # Only the last <context_size> tokens are used as model input.
      input_ids_truncated = input_ids[-context_size:]
      logits = model(torch.LongTensor([input_ids_truncated]).to(device)) # From Patric Ferreira
      # Taking the argmax picks the most probable token at every step;
      # this is called greedy decoding.
      predicted_id = torch.argmax(logits).item()
      input_ids += [predicted_id]  # Append the chosen token to the input.
      prompt = tokenizer.decode(input_ids)
      print(prompt)

In [None]:
# The prompt used to be followed directly by a second string literal ('# Ex: '),
# which Python concatenated onto it, so the model received "...me faz# Ex: ".
prompt = 'Eu gosto de comer pizza pois me faz'
max_output_tokens = 10

# Pass the configured limit instead of a second hard-coded 10.
printpromptpredic(prompt, max_output_tokens)

In [None]:
# NOTE(review): "da vizinho" and the repeated "é sempre" look like typos in the
# proverb; left untouched because editing the text changes the tokens fed to the model.
prompt = 'A grama da vizinho é sempre é sempre mais verde'
printpromptpredic(prompt,10)

In [None]:
# Portuguese proverb used as the generation prompt.
prompt = 'Gato escaldado tem medo de água fria'
printpromptpredic(prompt,10)

In [None]:
# Open-ended sentence for the model to continue.
prompt = 'Ontem fui ao restaurante comer um prato delicioso, pedi'
printpromptpredic(prompt,10)