Baseado em:
- https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
- https://github.com/bentrevett/pytorch-seq2seq

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!python -m spacy download en_core_web_sm
!python -m spacy download pt_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')


In [3]:
import os
from os import path
import datetime
import string
import random

import tarfile
import requests
import numpy as np
import spacy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.data import Dataset, Field, BucketIterator, Example

from tqdm.notebook import tqdm

In [4]:
gz_url = 'https://www.statmt.org/europarl/v7/pt-en.tgz'
gz_path = 'gdrive/MyDrive/NLP/seq2seq/pt-en.tgz'
extracted_dir = os.path.splitext(gz_path)[0]

if not path.isdir(extracted_dir):
    os.makedirs(path.dirname(extracted_dir), exist_ok=True)
    if not path.isfile(gz_path):
        resp = requests.get(gz_url, stream=True)
        resp.raise_for_status()

        progress_bar = tqdm(desc='Downloading data', total=int(resp.headers.get('content-length', 0)), unit='iB', unit_scale=True)
        with open(gz_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=4096):
                progress_bar.update(len(chunk))
                f.write(chunk)
        print('Downloaded data to:', gz_path)

    os.makedirs(extracted_dir, exist_ok=True)
    with tarfile.open(gz_path, 'r') as tar:
        dirs = [member for member in tar.getmembers()]
        tar.extractall(path=extracted_dir, members=dirs)
    print('Extracted data to:', extracted_dir)

In [5]:
nlp_pt = spacy.load('pt_core_news_sm')
field_pt = Field(tokenize=lambda text: [token.text for token in nlp_pt.tokenizer(text)],
                 init_token='<sos>',
                 eos_token='<eos>',
                 lower=True)

nlp_en = spacy.load('en_core_web_sm')
field_en = Field(tokenize=lambda text: [token.text for token in nlp_en.tokenizer(text)],
                 init_token='<sos>',
                 eos_token='<eos>',
                 lower=True)

fields = [('pt', field_pt), ('en', field_en)]

In [6]:
examples = []
with open(path.join(extracted_dir, 'europarl-v7.pt-en.pt')) as file_pt:
    with open(path.join(extracted_dir, 'europarl-v7.pt-en.en')) as file_en:
        for i, (line_pt, line_en) in enumerate(zip(file_pt, file_en)):
            if i >= 100_000: break
            if i % 1000 == 0: print('\rLine', i, end='')
            if line_pt != '' and line_en != '':
                examples.append(Example.fromlist([line_pt, line_en], fields))

Line 99000

In [7]:
data = Dataset(examples, fields)
random.seed(42)
train_data, val_data, test_data = data.split(split_ratio=[.6, .1, .3])
print('[pt]', train_data[0].pt)
print('[en]', train_data[0].en)
print(f'Sizes: train={len(train_data)}, validation={len(val_data)}, test={len(test_data)}')

field_pt.build_vocab(train_data, min_freq = 2)
field_en.build_vocab(train_data, min_freq = 2)
print(f'Vocabulary size: pt={len(field_pt.vocab)}, en={len(field_en.vocab)}')

[pt] ['estou', 'convencida', 'de', 'que', 'deve', 'ser', 'feito', 'um', 'esforço', 'especial', 'para', 'proteger', 'melhor', 'os', 'jovens', 'atletas', '.']
[en] ['and', 'i', 'am', 'convinced', 'that', 'we', 'must', 'make', 'a', 'special', 'effort', 'to', 'protect', 'young', 'athletes', 'more', 'effectively', '.']
Sizes: train=60000, validation=30000, test=10000
Vocabulary size: pt=24287, en=15718


In [8]:
print(field_pt.vocab.itos)
print(field_pt.vocab.stoi)

['<unk>', '<pad>', '<sos>', '<eos>', ',', 'a', 'de', '.', 'que', 'o', 'e', 'da', 'do', 'os', 'em', 'para', 'não', 'uma', 'é', 'um', 'no', 'dos', 'as', 'à', 'se', 'com', 'por', 'na', 'das', 'comissão', 'como', 'senhor', 'mais', 'este', 'esta', 'd', 'também', 'ser', 'presidente', 'mas', 'sobre', 'europeia', 'união', 'nos', 'n', 'parlamento', '-', 'muito', 'ou', '"', 'são', 'relatório', 'isso', 'pelo', 'conselho', 'sua', 'política', 'já', ')', 'todos', 'europeu', '\xad', ':', 'foi', 'países', 's', 'pela', 'entre', 'questão', '(', 'esse', 'há', 'senhora', 'está', 'forma', 'gostaria', 'seu', 'deputado', 'tem', 'europa', 'ainda', 'ter', '?', 'facto', 'essa', 'direitos', 'parte', 'seja', 'trabalho', 'fazer', 'proposta', 'nas', 'importante', 'temos', 'mesmo', 'lugar', 'vez', 'quando', 'deve', 'desenvolvimento', 'bem', 'pode', 'aqui', 'situação', 'só', 'estados\xadmembros', 'apenas', 'contra', 'anos', 'nossa', 'debate', 'nível', 'assim', 'matéria', 'caso', 'sem', 'comissário', 'grupo', 'acordo'

In [9]:
BATCH_SIZE = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_batches, val_batches, test_batches = BucketIterator.splits((train_data, val_data, test_data),
                                                                 shuffle=True,
                                                                 sort_key=lambda x: len(x.pt),
                                                                 batch_size=BATCH_SIZE,
                                                                 device=device)

next(iter(train_batches))


[torchtext.data.batch.Batch of size 128]
	[.pt]:[torch.cuda.LongTensor of size 93x128 (GPU 0)]
	[.en]:[torch.cuda.LongTensor of size 98x128 (GPU 0)]

In [10]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, lstm_layers=1):
        super().__init__()

        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, lstm_layers)

    def forward(self, input, hc):
        embedded = self.embedding(input)
        out, hc = self.lstm(embedded, hc)
        return out, hc

In [11]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, lstm_layers=1):
        super().__init__()

        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, lstm_layers)
        self.out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hc):
        embedded = self.embedding(input.unsqueeze(0))
        out, hc = self.lstm(embedded, hc)
        out = self.out(out.squeeze(0))
        return out, hc

In [12]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.device = device
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, input, target=None, teacher_forcing_ratio=0.5, maxlen=100):
        # dimension variables
        batch_size = input.shape[1] # input and target shapes: (len, batch_size)
        target_vocab_size = self.decoder.output_dim

        # decoder output
        decoded = torch.zeros(maxlen, batch_size, target_vocab_size).to(self.device)
        
        # decoder input state == encoder output state
        _, hc = self.encoder(input, hc=None)
        
        input = torch.as_tensor([field_pt.vocab.stoi['<sos>']] * input.shape[1], device=self.device)
        for t in range(1, maxlen):
            out, hc = self.decoder(input, hc)
            decoded[t] = out
            
            if random.random() < teacher_forcing_ratio:
                input = target[t]
            else:
                input = out.argmax(1)
        
        return decoded

In [13]:
def train(model, batches, optimizer):
    model.train()
    
    epoch_loss = 0
    for batch in tqdm(batches, desc='training'):
        optimizer.zero_grad()
        out = model(batch.pt, target=batch.en, maxlen=batch.en.shape[0])

        # remove the batch_size dimension from the data:
        #   trg = [trg len, batch size]
        #   output = [trg len, batch size, output dim]
        # and remove the first token (<sos>)
        out = out[1:].view(-1, out.shape[2])
        target = batch.en[1:].view(-1)
        
        loss = F.cross_entropy(out, target)
        loss.backward()  
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(batches)

In [14]:
def evaluate(model, batches):
    model.eval()
    
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(batches, desc='evaluation'):
            out = model(batch.pt, target=batch.en, maxlen=batch.en.shape[0])

            out = out[1:].view(-1, out.shape[2])
            target = batch.en[1:].view(-1)

            loss = F.cross_entropy(out, target)
            epoch_loss += loss.item()
        
    return epoch_loss / len(batches)

In [15]:
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
LSTM_LAYERS = 1
HID_DIM = 256

enc = Encoder(len(field_pt.vocab), ENC_EMB_DIM, HID_DIM, LSTM_LAYERS)
dec = Decoder(len(field_en.vocab), DEC_EMB_DIM, HID_DIM, LSTM_LAYERS)

model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters())

In [16]:
N_EPOCHS = 25

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_batches, optimizer)
    valid_loss = evaluate(model, val_batches)
    
    print(f'Epoch {epoch+1:2}',
          f'Train loss: {train_loss:6.3f}',
          f'Val. loss: {valid_loss:6.3f}',
          sep='   ')

os.makedirs('gdrive/MyDrive/NLP/seq2seq/', exist_ok=True)
model_file = "gdrive/MyDrive/NLP/seq2seq/" + datetime.datetime.now().strftime("%Y-%m-%d-%H%M")
torch.save(model.state_dict(), model_file)
print('Saved to', model_file)

HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  1   Train loss:  2.017   Val. loss:  3.435


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  2   Train loss:  1.734   Val. loss:  3.287


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  3   Train loss:  1.658   Val. loss:  3.204


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  4   Train loss:  1.638   Val. loss:  3.156


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  5   Train loss:  1.601   Val. loss:  3.101


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  6   Train loss:  1.577   Val. loss:  3.066


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  7   Train loss:  1.549   Val. loss:  3.034


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  8   Train loss:  1.522   Val. loss:  3.024


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch  9   Train loss:  1.502   Val. loss:  2.996


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 10   Train loss:  1.505   Val. loss:  2.988


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 11   Train loss:  1.462   Val. loss:  2.959


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 12   Train loss:  1.452   Val. loss:  2.934


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 13   Train loss:  1.435   Val. loss:  2.928


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 14   Train loss:  1.431   Val. loss:  2.937


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 15   Train loss:  1.411   Val. loss:  2.920


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 16   Train loss:  1.393   Val. loss:  2.935


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 17   Train loss:  1.367   Val. loss:  2.947


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 18   Train loss:  1.375   Val. loss:  2.913


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 19   Train loss:  1.370   Val. loss:  2.923


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 20   Train loss:  1.370   Val. loss:  2.929


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 21   Train loss:  1.341   Val. loss:  2.922


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 22   Train loss:  1.334   Val. loss:  2.926


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 23   Train loss:  1.319   Val. loss:  2.934


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 24   Train loss:  1.325   Val. loss:  2.934


HBox(children=(FloatProgress(value=0.0, description='training', max=469.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='evaluation', max=235.0, style=ProgressStyle(description_w…


Epoch 25   Train loss:  1.308   Val. loss:  2.943
Saved to gdrive/MyDrive/NLP/seq2seq/2021-02-13-0618


In [17]:
evaluate(model, test_batches)

HBox(children=(FloatProgress(value=0.0, description='evaluation', max=79.0, style=ProgressStyle(description_wi…




2.926042244404177

In [18]:
sos_num = field_pt.vocab.stoi['<sos>']
eos_num = field_pt.vocab.stoi['<eos>']

def predict(model, sentence):
    model.eval()
    
    with torch.no_grad():
        nums = [sos_num] + [field_pt.vocab.stoi[token] for token in field_pt.tokenize(sentence)] + [eos_num]
        nums = torch.as_tensor(nums).to(device).view(-1, 1)

        out = model(nums, teacher_forcing_ratio=0, maxlen=100)

    out_nums = out.argmax(2).squeeze(1)
    out_words = [field_en.vocab.itos[num] for num in out_nums[1:]]
    if '<eos>' in out_words:
        out_words = out_words[:out_words.index('<eos>')]
    return ' '.join(out_words)

In [19]:
print(predict(model, 'qual é o seu nome?'))
print(predict(model, 'bom dia'))
print(predict(model, 'eu estou em casa'))

this is the ?
moss election assembly
i will be to see .


In [20]:
for i in range(10):
    pt = ' '.join(train_data[i].pt)
    en = predict(model, pt)
    print('[PT]', pt)
    print('[EN]', en)
    print()

[PT] estou convencida de que deve ser feito um esforço especial para proteger melhor os jovens atletas .
[EN] i am convinced that we must be able to develop a young people in the region .

[PT] sei muito bem dar valor a isso .
[EN] you want to see it .

[PT] o grupo confederal da esquerda unitária / esquerda nórdica verde , a que pertenço , tem dificuldade em considerar que a gestão económica de 1997 tenha sido significativamente melhor do que a de 1996 .
[EN] the confederal group of the european united left / nordic green left , the the of the the of the the the the the the the of the .

[PT] quando se disponibilizam verbas e é prometida ajuda , mas esta não chega a o destino ou chega demasiado tarde devido a os obstáculos burocráticos , é a credibilidade da união europeia que é posta em causa , quer a nível interno , quer a nível externo nas relações com os seus parceiros .
[EN] as a member of the european union , the is not only the the but but it is not to the the , but also the th

In [21]:
for i in range(10):
    pt = ' '.join(test_data[i].pt)
    en = predict(model, pt)
    print('[PT]', pt)
    print('[EN]', en)
    print()

[PT] os artigos 87º e 88º não fazem qualquer referência à energia nuclear .
[EN] the have not been the the and and and the the of the .

[PT] os jovens médicos não escolhem estas condições e horários de trabalho , eles são­lhes impostos .
[EN] and and not not be and and and and and and to the .

[PT] essa a razão por que me congratulo com o facto de a maior parte d essas verbas virem a ser despendidas à taxa de 50 % em vez de o serem à taxa de 75 % .
[EN] my report , that , that the the of the % of the % of the member states should be able to the the of the . .

[PT] também não tive oportunidade de falar com o meu bom amigo , o senhor westendorp .
[EN] i would like to say that i am not to the the of the , i have already mentioned .

[PT] enquanto os dois anteriores se referiam mais à liberdade e à s possibilidades de crescimento d este sector , o presente relatório aborda o seu lado mais sombrio ­ e com razão , a meu ver .
[EN] as a the the the the of the the of the the debate , a deba