# Домашнее задание №9

Переписать загрузку данных с python функций на Dataset и Dataloader и применить сеть с attention

In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import re

In [56]:
n_epochs = 5
num_samples = 10000
data_path = 'data/fra-eng/fra.txt'
print_loss_n = 100

In [59]:
class TranslateDataset(torch.utils.data.Dataset):
    def __init__(self, file_name=data_path, num_samples=num_samples):

        texts = []
        text_words = []

        input_vocab = set()
        output_vocab = set()
        
        print('Загружаем ', file_name)
        
        with open(file_name, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')

        for line in lines[:num_samples]:
            input_text, output_text, _ = line.split('\t')

            texts.append((input_text, output_text))
            
            input_words = re.findall(r'\w+', input_text) 
            output_words = re.findall(r'\w+', output_text) 
            text_words.append((input_words, output_words))
            
            for word in input_words:
                input_vocab.add(word)
            for word in output_words:
                output_vocab.add(word)
    
        input_vocab2index = {word: i+2 for i, word in enumerate(input_vocab)}
        output_vocab2index = {word: i+2 for i, word in enumerate(output_vocab)}        

        def ws2i(words, vocab):
            indexes = [vocab.get(word, 0) for word in words] + [1]
            return torch.tensor(indexes, dtype=torch.long).view(-1, 1)
        
        self.texts = texts
        self.encoded_texts = [ (ws2i(p[0], input_vocab2index), ws2i(p[1], output_vocab2index)) for p in text_words ]
        self.input_vocab2index = input_vocab2index
        self.output_vocab2index = output_vocab2index
        self.input_vocabulary_size = len(self.input_vocab2index) + 2
        self.output_vocabulary_size = len(self.output_vocab2index) + 2
        
        print('Загружен ', file_name)
        
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        return self.encoded_texts[index] + self.texts[index]
    


In [60]:
ds = TranslateDataset()
ds[1]

Загружаем  data/fra-eng/fra.txt
Загружен  data/fra-eng/fra.txt


(tensor([[1022],
         [   1]]),
 tensor([[3922],
         [   1]]),
 'Hi.',
 'Salut !')

In [61]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        embedded_hidden = torch.cat((embedded[0], hidden[0]), 1)

        attn_weights = self.attn(embedded_hidden)
        
        attn_weights = F.softmax(attn_weights, dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)

        output = self.attn_combine(output).unsqueeze(0)

        #output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [62]:
def train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[0]])

    decoder_hidden = encoder_hidden

    for di in range(target_length):
        
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        
        topv, topi = decoder_output.topk(1)
        
        decoder_input = topi.squeeze().detach()  # detach from history as input
        
        loss += criterion(decoder_output, target_tensor[di])
        if decoder_input.item() == 1:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [70]:
encoder = EncoderRNN(ds.input_vocabulary_size, 30)
attn_decoder1 = AttnDecoderRNN(30, ds.output_vocabulary_size, dropout_p=0.1)

#attn_decoder1 = DecoderRNN(len(output_vocab2index)+2, 30)

encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.SGD(attn_decoder1.parameters(), lr=0.01)
criterion = nn.NLLLoss()

print_loss_total = 0

dl = torch.utils.data.DataLoader(ds, shuffle=True, batch_size=1)

for epoch in range(n_epochs):
    print ('===================== Эпоха %d ====================' % epoch)
    for i, ([input_tensor], [target_tensor], _, _) in enumerate(dl):
        loss = train_step(input_tensor, target_tensor, encoder,
                   attn_decoder1, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if (i + 1) % print_loss_n == 0:
            print_loss_avg = print_loss_total / print_loss_n
            print_loss_total = 0
            print('(%d) %.4f' % (i + 1, print_loss_avg))


(100) 5.4154
(200) 5.1222
(300) 5.6370
(400) 4.0456
(500) 3.5234
(600) 4.1145
(700) 3.6378
(800) 3.8981
(900) 4.2480
(1000) 3.6808
(1100) 4.1150
(1200) 4.0673
(1300) 4.0941
(1400) 3.8393
(1500) 3.8872
(1600) 3.9260
(1700) 3.6480
(1800) 3.9480
(1900) 3.6985
(2000) 3.9410
(2100) 3.5867
(2200) 4.1149
(2300) 3.8778
(2400) 4.0312
(2500) 3.9037
(2600) 3.9700
(2700) 3.8139
(2800) 3.8772
(2900) 3.7586
(3000) 3.9776
(3100) 3.5799
(3200) 3.8271
(3300) 3.9695
(3400) 3.7598
(3500) 3.6860
(3600) 3.8502
(3700) 3.7590
(3800) 3.8219
(3900) 3.9608
(4000) 3.5760
(4100) 3.6564
(4200) 3.5464
(4300) 3.6801
(4400) 3.6075
(4500) 3.6561
(4600) 3.8032
(4700) 3.5211
(4800) 3.4753
(4900) 3.6683
(5000) 3.6893
(5100) 3.7497
(5200) 3.6908
(5300) 3.5174
(5400) 3.6350
(5500) 3.7080
(5600) 3.7867
(5700) 3.4593
(5800) 3.6339
(5900) 3.6768
(6000) 3.7534
(6100) 3.5430
(6200) 3.4145
(6300) 3.7565
(6400) 3.6783
(6500) 3.6395
(6600) 3.5592
(6700) 3.7854
(6800) 3.5755
(6900) 3.7337
(7000) 3.5576
(7100) 3.8092
(7200) 3.5965
(