In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
from torch.optim import Adam

In [None]:
# Seed the PyTorch, Python and NumPy random number generators from one
# constant so that every library imported above starts from a known state.
SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [None]:
!gdown --fuzzy https://drive.google.com/file/d/1k1quangHHsq25rJOTLShK3H0hN1mVqp9/view?usp=sharing -O train.csv
!gdown --fuzzy https://drive.google.com/file/d/1NsJXT6eBrXDKXKggfWjwcYuoU84CII3g/view?usp=drive_link -O test.csv

Downloading...
From: https://drive.google.com/uc?id=1k1quangHHsq25rJOTLShK3H0hN1mVqp9
To: /content/train.csv
100% 38.0k/38.0k [00:00<00:00, 106MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NsJXT6eBrXDKXKggfWjwcYuoU84CII3g
To: /content/test.csv
100% 134k/134k [00:00<00:00, 43.0MB/s]


In [None]:
import pandas as pd


# The training rows are kept as a NumPy array (indexed positionally below);
# the test set stays a DataFrame because its columns are accessed by name.
train_dataset = pd.read_csv('train.csv').to_numpy()
test_dataset = pd.read_csv('test.csv')

In [None]:
# Length of the longest first-column entry in the training set, plus one
# (presumably the slot for the EOS token that sentence2tensor appends).
MAX_LENGTH = max(len(row[0]) for row in train_dataset) + 1

MAX_LENGTH

41

In [None]:
# Indices reserved for the start-of-sequence and end-of-sequence markers.
SOS_token = 0
EOS_token = 1


class Lang:
    """Two-way vocabulary mapping symbols to integer indices and back.

    A fresh instance already contains the 'SOS' and 'EOS' markers at
    indices 0 and 1; every new symbol receives the next free index.
    """

    def __init__(self, name):
        self.name = name
        # Both tables start with the two special markers.
        self.word2index = {'SOS': SOS_token, 'EOS': EOS_token}
        self.index2word = {SOS_token: 'SOS', EOS_token: 'EOS'}

    @property
    def n_words(self) -> int:
        """Number of entries currently stored (special markers included)."""
        return len(self.index2word)

    def add_sentence(self, sentence):
        """Register every element obtained by iterating over ``sentence``."""
        for symbol in sentence:
            self.add_word(symbol)

    def add_word(self, word):
        """Assign the next free index to ``word`` unless it is already known."""
        if word in self.word2index:
            return
        next_index = self.n_words
        self.word2index[word] = next_index
        self.index2word[next_index] = word

In [None]:
# One vocabulary per column of the training data: column 0 feeds the
# 'human' side, column 1 feeds the 'iso' side.
input_lang = Lang('human')
output_lang = Lang('iso')

for row in train_dataset:
    source_text, target_text = row[0], row[1]
    input_lang.add_sentence(source_text)
    output_lang.add_sentence(target_text)

# Report the size of each vocabulary (special markers included).
for vocab in (input_lang, output_lang):
    print(vocab.name, vocab.n_words)

human 82
iso 13


In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
class Encoder(nn.Module):
    """GRU encoder that consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of the embeddings and of the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, hidden):
        """Run a single GRU step.

        Args:
            x: Tensor holding one token index.
            hidden: Previous hidden state of shape (1, 1, hidden_size).

        Returns:
            Tuple ``(output, hidden)`` as produced by ``nn.GRU``.
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size).
        embedded = self.embedding(x).view(1, 1, -1)
        return self.gru(embedded, hidden)

    def init_hidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The tensor is allocated with the device and dtype of the module's
        own weights, so it always matches wherever the module was moved
        instead of depending on a module-level ``device`` variable.
        """
        return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

In [None]:
class Decoder(nn.Module):
    """GRU decoder that emits log-probabilities for one output step per call.

    Args:
        hidden_size: Width of the embeddings and of the GRU hidden state.
        output_size: Number of output classes (rows of the embedding table
            and width of the final linear layer).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Run a single decoding step.

        Args:
            x: Tensor holding one token index (the previous output symbol).
            hidden: Previous hidden state of shape (1, 1, hidden_size).

        Returns:
            Tuple ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` is the updated GRU state.
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size).
        embedded = self.embedding(x).view(1, 1, -1)
        gru_out, hidden = self.gru(self.relu(embedded), hidden)
        # gru_out[0] drops the sequence axis, leaving (1, hidden_size).
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The tensor is allocated with the device and dtype of the module's
        own weights, so it always matches wherever the module was moved
        instead of depending on a module-level ``device`` variable.
        """
        return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

In [None]:
def sentence2idx(lang, sentence):
    """Look up the index of every element of ``sentence`` in ``lang.word2index``.

    Returns a list of indices in the order of iteration. A ``KeyError``
    propagates for any element missing from the vocabulary.
    """
    indices = []
    for symbol in sentence:
        indices.append(lang.word2index[symbol])
    return indices


def sentence2tensor(lang, sentence):
    """Encode ``sentence`` as a column tensor of indices ending with EOS.

    Returns a ``torch.long`` tensor of shape (len(sentence) + 1, 1) on the
    module-level ``device``.
    """
    token_ids = sentence2idx(lang, sentence) + [EOS_token]
    encoded = torch.tensor(token_ids, dtype=torch.long, device=device)
    # Add a trailing axis of size one: one row per token.
    return encoded.unsqueeze(1)


def pair2tensor(x):
    """Turn a pair into ``(input_tensor, target_tensor)``.

    ``x[0]`` is encoded with ``input_lang`` and ``x[1]`` with ``output_lang``.
    """
    source_text, target_text = x[0], x[1]
    return (
        sentence2tensor(input_lang, source_text),
        sentence2tensor(output_lang, target_text),
    )

In [None]:
def train_single(
        input_tensor, target_tensor,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer,
        criterion,
        teacher_forcing_ratio=0.0
):
    """Run one optimisation step on a single (input, target) pair.

    Args:
        input_tensor: Column tensor of input indices; iterated row by row.
        target_tensor: Column tensor of target indices; iterated row by row.
        encoder: Module exposing ``init_hidden()`` and called as
            ``encoder(token, hidden) -> (output, hidden)``.
        decoder: Module called as ``decoder(token, hidden) -> (log_probs, hidden)``.
        encoder_optimizer: Optimizer for the encoder's parameters.
        decoder_optimizer: Optimizer for the decoder's parameters.
        criterion: Loss called as ``criterion(log_probs, target_row)``.
        teacher_forcing_ratio: Probability of feeding the ground-truth
            symbol (instead of the decoder's own prediction) as the next
            decoder input for this pair. The default of 0.0 never uses
            teacher forcing.

    Returns:
        The summed loss of the decoded steps divided by ``len(target_tensor)``.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Fold the whole input sequence into the final encoder hidden state.
    encoder_hidden = encoder.init_hidden()
    for elem in input_tensor:
        _, encoder_hidden = encoder(elem, encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Only draw from the RNG when forcing is enabled, so the default
    # setting leaves the random state untouched.
    use_teacher_forcing = (
        teacher_forcing_ratio > 0 and random.random() < teacher_forcing_ratio
    )

    loss = 0
    for elem in target_tensor:
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, elem)

        if use_teacher_forcing:
            # Feed the ground-truth symbol as the next input.
            decoder_input = elem
        else:
            # Feed the model's own best guess, detached from the graph.
            decoder_input = decoder_output.detach().topk(1).indices.squeeze()
            # Stop decoding as soon as the model predicts end-of-sequence.
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / len(target_tensor)

In [None]:
def train(encoder, decoder, n_epochs=5, print_every=100, lr=1e-3):
    """Train the encoder/decoder pair on every row of ``train_dataset``.

    Args:
        encoder: Encoder module; put into training mode.
        decoder: Decoder module; put into training mode.
        n_epochs: Number of full passes over the training pairs.
        print_every: Print the mean loss after this many pairs.
        lr: Learning rate for both Adam optimizers.
    """
    encoder.train()
    decoder.train()

    encoder_optimizer = Adam(encoder.parameters(), lr=lr)
    decoder_optimizer = Adam(decoder.parameters(), lr=lr)

    criterion = nn.NLLLoss()

    # The encoded pairs are identical in every epoch, so build them once.
    training_pairs = [pair2tensor(x) for x in train_dataset]
    n_pairs = len(training_pairs)

    for epoch in range(n_epochs):
        print_loss_total = 0

        print(f'Epoch [{epoch + 1:02d}/{n_epochs:02d}]')

        for i, (input_tensor, target_tensor) in enumerate(training_pairs):
            loss = train_single(
                input_tensor, target_tensor,
                encoder, decoder,
                encoder_optimizer, decoder_optimizer,
                criterion
            )
            print_loss_total += loss

            if (i + 1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                # i is zero-based, so i + 1 pairs have been processed.
                print(f'Training ({(i + 1) / n_pairs * 100:.1f}%) loss: {print_loss_avg:.4f}')

In [None]:
# Encoder and decoder share one hidden width so the encoder's final state
# can be used directly as the decoder's initial state.
HIDDEN_SIZE = 128

encoder_model = Encoder(input_lang.n_words, HIDDEN_SIZE).to(device)
decoder_model = Decoder(HIDDEN_SIZE, output_lang.n_words).to(device)

train(encoder_model, decoder_model, n_epochs=5)

Epoch [01/05]
Training (9.0%) loss: 1.8857
Training (18.2%) loss: 1.3371
Training (27.3%) loss: 0.7938
Training (36.4%) loss: 0.6853
Training (45.6%) loss: 0.6303
Training (54.7%) loss: 0.6166
Training (63.8%) loss: 0.5897
Training (73.0%) loss: 0.5801
Training (82.1%) loss: 0.5567
Training (91.2%) loss: 0.5281
Epoch [02/05]
Training (9.0%) loss: 0.5171
Training (18.2%) loss: 0.5142
Training (27.3%) loss: 0.5084
Training (36.4%) loss: 0.4830
Training (45.6%) loss: 0.4673
Training (54.7%) loss: 0.4715
Training (63.8%) loss: 0.4440
Training (73.0%) loss: 0.4389
Training (82.1%) loss: 0.4135
Training (91.2%) loss: 0.3978
Epoch [03/05]
Training (9.0%) loss: 0.3883
Training (18.2%) loss: 0.3977
Training (27.3%) loss: 0.3787
Training (36.4%) loss: 0.3604
Training (45.6%) loss: 0.3322
Training (54.7%) loss: 0.3425
Training (63.8%) loss: 0.3195
Training (73.0%) loss: 0.2974
Training (82.1%) loss: 0.2920
Training (91.2%) loss: 0.2986
Epoch [04/05]
Training (9.0%) loss: 0.2889
Training (18.2%) l

In [None]:
@torch.no_grad()
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` and return the predicted output symbols.

    Both modules are switched to eval mode (and left that way). Decoding
    stops after ``max_length`` steps or right after 'EOS' is predicted; in
    the latter case the 'EOS' entry is the last element of the result.

    Returns:
        List of entries from ``output_lang.index2word``, one per step.
    """
    encoder.eval()
    decoder.eval()

    source = sentence2tensor(input_lang, sentence)
    hidden = encoder.init_hidden()

    # Fold the whole input sequence into the final encoder hidden state.
    for token in source:
        _, hidden = encoder(token, hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoded_words = []

    for _ in range(max_length):
        log_probs, hidden = decoder(decoder_input, hidden)
        best = log_probs.topk(1)[1]
        best_index = best.item()

        decoded_words.append(output_lang.index2word[best_index])
        if best_index == EOS_token:
            break

        # Feed the chosen symbol back in as the next decoder input.
        decoder_input = best.squeeze()

    return decoded_words


def predict_(encoder, decoder, dataset, max_chars=10):
    """Decode every entry of ``dataset`` and return the truncated predictions.

    Args:
        encoder: Trained encoder passed through to ``evaluate``.
        decoder: Trained decoder passed through to ``evaluate``.
        dataset: Iterable of input sentences.
        max_chars: Maximum number of symbols kept per prediction
            (10 by default, presumably the length of a YYYY-MM-DD date).

    Returns:
        List with one list of decoded symbols per input sentence.
    """
    eos_word = output_lang.index2word[EOS_token]
    result = []

    for sentence in dataset:
        words = evaluate(encoder, decoder, sentence)
        # evaluate() keeps the end-of-sequence marker it stopped on; drop it
        # so the literal 'EOS' text never ends up inside a short prediction.
        if words and words[-1] == eos_word:
            words = words[:-1]
        result.append(words[:max_chars])

    return result

In [None]:
# Re-read test.csv into a fresh DataFrame (a later cell adds a 'label'
# column to this object).
test_dataset = pd.read_csv('test.csv')

In [None]:
# Decode every entry of the 'data' column with the trained models.
test_prediction = predict_(
    encoder=encoder_model,
    decoder=decoder_model,
    dataset=test_dataset['data'],
)

In [None]:
# Concatenate each prediction's symbols into a single string.
test_prediction = list(map(''.join, test_prediction))

In [None]:
# Store the predictions as the 'label' column; they were produced by
# iterating over test_dataset['data'].
test_dataset['label'] = test_prediction

In [None]:
# Write only the id/label columns; index=False keeps the DataFrame's row
# index out of the CSV.
test_dataset[['id', 'label']].to_csv('submission.csv', index=False)