In [1]:
# German-to-English translation
# Encoder-decoder (seq2seq) model

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
#preprocessing
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
# for TensorBoard logging

from torch.utils.tensorboard import SummaryWriter
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

ModuleNotFoundError: ignored

In [31]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [7]:
# Load the spaCy pipelines whose tokenizers are used for German and English text.
spacy_ger, spacy_en = spacy.load('de'), spacy.load('en')

In [8]:
def tokenizer_ger(text):
  """Split `text` with the German spaCy tokenizer and return the token strings as a list."""
  words = []
  for token in spacy_ger.tokenizer(text):
    words.append(token.text)
  return words

def tokenizer_en(text):
  """Split `text` with the English spaCy tokenizer and return the token strings as a list."""
  words = []
  for token in spacy_en.tokenizer(text):
    words.append(token.text)
  return words

In [9]:
# Source (German) field: lower-cased tokens wrapped in <sos> ... <eos>.
german = Field(
  tokenize=tokenizer_ger,
  lower=True,
  init_token='<sos>',
  eos_token='<eos>',
)
# Target (English) field, configured the same way.
english = Field(
  tokenize=tokenizer_en,
  lower=True,
  init_token='<sos>',
  eos_token='<eos>',
)

In [10]:
# Load the Multi30k splits; '.de' files are processed by `german`, '.en' files by `english`.
train_data, valid_data, test_data = Multi30k.splits(
  exts=('.de', '.en'),
  fields=(german, english),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 867kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 243kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 237kB/s]


In [11]:
# Build both vocabularies from the training split only: keep words that occur
# at least twice (min_freq=2), capped at max_size=10000 entries.
german.build_vocab(
  train_data,
  max_size=10000,
  min_freq=2,
)
english.build_vocab(
  train_data,
  max_size=10000,
  min_freq=2,
)

In [17]:
class Encoder(nn.Module):
  """LSTM encoder: embeds a source sequence and returns the LSTM's final (hidden, cell) state."""

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
    """Create the embedding table, dropout layer and stacked LSTM.

    Args:
      input_size: number of entries in the embedding table.
      embedding_size: width of each embedding vector.
      hidden_size: width of the LSTM hidden and cell states.
      num_layers: number of stacked LSTM layers.
      dropout: probability applied to the embeddings and between LSTM layers.
    """
    super().__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.dropout = nn.Dropout(p=dropout)
    self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
    self.rnn = nn.LSTM(
      input_size=embedding_size,
      hidden_size=hidden_size,
      num_layers=num_layers,
      dropout=dropout,
    )

  def forward(self, x):
    """Encode token indices `x` of shape (seq_len, N).

    Returns:
      (hidden, cell): the final LSTM states; the per-step outputs are discarded.
    """
    # (seq_len, N) -> (seq_len, N, embedding_size)
    embedded = self.embedding(x)
    embedded = self.dropout(embedded)

    _, final_state = self.rnn(embedded)
    hidden, cell = final_state
    return hidden, cell

class Decoder(nn.Module):
  """LSTM decoder that advances one time step per call and scores the next token."""

  def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout):
    """Create the embedding table, dropout layer, stacked LSTM and output projection.

    Args:
      input_size: number of entries in the embedding table.
      embedding_size: width of each embedding vector.
      hidden_size: width of the LSTM hidden and cell states.
      output_size: number of scores produced per example by the final linear layer.
      num_layers: number of stacked LSTM layers.
      dropout: probability applied to the embeddings and between LSTM layers.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = nn.Dropout(p=dropout)
    self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
    self.rnn = nn.LSTM(
      input_size=embedding_size,
      hidden_size=hidden_size,
      num_layers=num_layers,
      dropout=dropout,
    )
    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

  def forward(self, x, hidden, cell):
    """Run a single decoding step.

    Args:
      x: token indices of shape (N,), one token per batch element.
      hidden: LSTM hidden state carried over from the previous step.
      cell: LSTM cell state carried over from the previous step.

    Returns:
      (predictions, hidden, cell): predictions has shape (N, output_size); the
      updated states are returned so the caller can feed them to the next step.
    """
    # The encoder consumes a whole sentence, but the decoder handles one word
    # at a time, so add a length-1 sequence axis: (N,) -> (1, N).
    step_input = x.unsqueeze(0)
    # (1, N) -> (1, N, embedding_size)
    embedded = self.dropout(self.embedding(step_input))
    # outputs: (1, N, hidden_size)
    outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    # (1, N, output_size) -> (N, output_size)
    predictions = self.fc(outputs).squeeze(0)
    return predictions, hidden, cell


class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target sequence step by step."""

  def __init__(self, encoder, decoder):
    """Store the two sub-networks.

    Args:
      encoder: module called as `encoder(source)` returning `(hidden, cell)`.
      decoder: module called as `decoder(x, hidden, cell)` returning
        `(predictions, hidden, cell)`; its output layer must be exposed as `fc`.
    """
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=0.5):
    """Produce per-step vocabulary scores for `target` given `source`.

    Args:
      source: source token indices, shape (src_len, N).
      target: target token indices, shape (target_len, N); row 0 is the start token.
      teacher_force_ratio: probability, per time step, of feeding the ground-truth
        token to the decoder instead of its own best guess.

    Returns:
      Tensor of shape (target_len, N, target_vocab_size). Row 0 is left as zeros
      because the start token is never predicted.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Read the vocabulary size from the decoder's output layer instead of a
    # notebook-level global, so the model does not depend on outside state.
    target_vocab_size = self.decoder.fc.out_features

    # Allocate on the same device as the input batch.
    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

    hidden, cell = self.encoder(source)

    # The first decoder input is the start token.
    x = target[0]
    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)
      outputs[t] = output
      # output: (N, target_vocab_size) -> index of the highest-scoring word
      best_guess = output.argmax(1)
      # Teacher forcing: sometimes feed the true token, otherwise the prediction.
      x = target[t] if random.random() < teacher_force_ratio else best_guess

    return outputs

In [30]:


# Training hyperparameters
num_epochs = 20
learning_rate = 0.001
batch_size = 64

# Model configuration
load_model = False
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024

num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# TensorBoard writer; `step` counts optimizer updates across all epochs.
writer = SummaryWriter('runs/loss_plot')
step = 0

# Bucket examples by source length so each batch needs little padding.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
  (train_data, valid_data, test_data), batch_size = batch_size, sort_within_batch = True,
  sort_key = lambda x : len(x.src),
  device = device
)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)


model = Seq2Seq(encoder_net, decoder_net).to(device)

# Padding positions in the target do not contribute to the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

if load_model:
  # NOTE(review): '.pth.ptar' looks like a typo for '.pth.tar' -- confirm it matches
  # the filename written by utils.save_checkpoint.
  # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
  load_checkpoint(torch.load('my_checkpoint.pth.ptar', map_location=device), model, optimizer)

# Fixed example sentence translated at the start of every epoch to watch progress.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
for epoch in range(num_epochs):
  print(f'Epoch [{epoch} / {num_epochs}]')
  checkpoint = {'state_dict' : model.state_dict(), 'optimizer':optimizer.state_dict()}
  save_checkpoint(checkpoint)

  # Translate the example in eval mode (dropout off) and show the result.
  model.eval()
  translated_sentence = translate_sentence(model, sentence,german, english, device , max_length=50)
  print(f'Translated example sentence: \n {translated_sentence}')

  # Switch back to training mode; without this, dropout stays disabled for
  # every training batch after the first model.eval() call.
  model.train()

  for batch_idx, batch in enumerate(train_iterator):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)

    # output: (target_len, batch_size, output_dim)
    output = model(inp_data, target)

    # Drop position 0 (the start token, never predicted) and flatten so that
    # CrossEntropyLoss receives (num_tokens, output_dim) and (num_tokens,).
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, target)
    loss.backward()

    # Clip the gradient norm to guard against exploding gradients.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    # Log a plain Python float rather than a tensor attached to the graph.
    writer.add_scalar('Training loss', loss.item(), global_step=step)
    step += 1
    

Epoch [0 / 20]


NameError: ignored