In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
from torch.optim import Adam

In [2]:
# Seed PyTorch and the stdlib RNG from one constant so weight initialisation
# (and any later use of `random`) repeats across kernel restarts.
SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)

In [3]:
import pandas as pd


# First load of the competition CSVs: training rows as a bare ndarray, test rows
# as a DataFrame. NOTE(review): the next cell reads both files again and rebinds
# both names, so this load looks redundant — confirm before removing.
train_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/train.csv').values
test_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/test.csv')

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR

# Ids reserved for the start-of-sequence and end-of-sequence markers; they match
# the first two entries every `Lang` vocabulary is created with.
SOS_token = 0
EOS_token = 1

# Rows are consumed positionally: column 0 is the source sentence and column 1
# the target sentence.
train_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/train.csv').values
test_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/test.csv').values

# No hold-out split is made: validation runs on the training rows themselves,
# so the reported validation metrics measure fit, not generalisation. To get a
# real hold-out, split `train_dataset` with sklearn's `train_test_split` here.
valid_dataset = train_dataset

# Longest source sentence (in characters) plus one slot for the EOS token that
# `sentence2tensor` appends.
MAX_LENGTH = max(len(row[0]) for row in train_dataset) + 1

class Lang:
    """Character-level vocabulary mapping symbols to integer ids and back.

    Ids 0 and 1 are pre-assigned to the 'SOS' and 'EOS' markers; every new
    symbol receives the next free id in order of first appearance.
    """

    def __init__(self, name):
        self.name = name
        self.index2word = {0: 'SOS', 1: 'EOS'}
        # Inverse view of the table above, kept in sync by `add_word`.
        self.word2index = {token: idx for idx, token in self.index2word.items()}

    @property
    def n_words(self) -> int:
        """Number of entries in the vocabulary, special markers included."""
        return len(self.index2word)

    def add_sentence(self, sentence):
        """Register every element (character) of `sentence`."""
        for symbol in sentence:
            self.add_word(symbol)

    def add_word(self, word):
        """Assign the next free id to `word` unless it is already known."""
        if word in self.word2index:
            return
        next_id = self.n_words
        self.word2index[word] = next_id
        self.index2word[next_id] = word

# One character-level vocabulary per side of the translation pair.
input_lang = Lang('human')
output_lang = Lang('iso')

# Column 0 of each training row feeds the source vocabulary, column 1 the
# target vocabulary.
for pair in train_dataset:
    source_text, target_text = pair[0], pair[1]
    input_lang.add_sentence(source_text)
    output_lang.add_sentence(target_text)

# Report vocabulary sizes (special markers included).
for lang in (input_lang, output_lang):
    print(lang.name, lang.n_words)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Encoder(nn.Module):
    """GRU encoder that consumes a sequence one token id per call."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: number of rows in the embedding table.
            hidden_size: width of both the embedding vectors and the GRU state.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # The embedding width equals the GRU input width, so embeddings go
        # straight into the recurrent layer.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size)

    def forward(self, x, hidden):
        """Run one GRU step.

        Args:
            x: tensor holding a single token id.
            hidden: previous GRU state.

        Returns:
            The `(output, hidden)` pair produced by the GRU.
        """
        step_input = self.embedding(x)
        # Reshape to the (seq_len=1, batch=1, features) layout the GRU is fed.
        step_input = step_input.view(1, 1, -1)
        return self.gru(step_input, hidden)

    def init_hidden(self):
        """Return an all-zero initial state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

class Decoder(nn.Module):
    """GRU decoder with attention over a fixed-size buffer of encoder outputs.

    The attention layer emits `max_length` weights, so `encoder_outputs` passed
    to `forward` must have exactly `max_length` rows.
    """

    def __init__(self, hidden_size, output_size, max_length):
        """
        Args:
            hidden_size: width of the embeddings and of the GRU state.
            output_size: number of entries in the target vocabulary.
            max_length: number of encoder positions the attention spans.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.max_length = max_length
        self.embedding = nn.Embedding(output_size, hidden_size)
        # Attention scores are computed from [token embedding ; hidden state].
        self.attn = nn.Linear(2 * hidden_size, max_length)
        # Projects [token embedding ; attention context] back to GRU width.
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, x, hidden, encoder_outputs):
        """Decode one step.

        Args:
            x: tensor holding the previous output token id.
            hidden: previous GRU state.
            encoder_outputs: (max_length, hidden_size) buffer of encoder outputs.

        Returns:
            `(log_probs, hidden, attn_weights)` — log-probabilities over the
            target vocabulary, the new GRU state, and the attention weights.
        """
        token_vec = self.embedding(x).view(1, 1, -1)[0]

        # Attention distribution over encoder positions.
        attn_logits = self.attn(torch.cat([token_vec, hidden[0]], dim=1))
        attn_weights = F.softmax(attn_logits, dim=1)

        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))[0]

        merged = self.attn_combine(torch.cat([token_vec, context], dim=1))
        gru_input = self.relu(merged.unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, hidden, attn_weights

    def init_hidden(self):
        """Return an all-zero initial state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

def sentence2idx(lang, sentence):
    """Translate a sentence into a list of vocabulary ids, one per character.

    Characters missing from `lang.word2index` are skipped instead of raising
    `KeyError`. The vocabulary is built from the training rows only, so unseen
    text may contain symbols it has never registered; a single such symbol
    would otherwise abort an entire prediction run. Sentences made only of
    known characters are encoded exactly as before.

    Args:
        lang: object exposing a `word2index` mapping (e.g. a `Lang`).
        sentence: string to encode.

    Returns:
        List of integer ids, in sentence order, for the known characters.
    """
    vocab = lang.word2index
    return [vocab[symbol] for symbol in sentence if symbol in vocab]

def sentence2tensor(lang, sentence):
    """Encode `sentence` as an (n + 1, 1) long tensor terminated by EOS.

    The ids come from `sentence2idx`; the EOS id is appended so the recurrent
    models see an explicit end marker. The tensor is created on `device`.
    """
    token_ids = sentence2idx(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    # One row per token so that iterating the tensor yields single-id tensors.
    return flat.view(-1, 1)

def pair2tensor(x):
    """Convert a (source, target) row into its pair of id tensors.

    `x[0]` is encoded with the source vocabulary and `x[1]` with the target
    vocabulary; the result is `(input_tensor, target_tensor)`.
    """
    source_text, target_text = x[0], x[1]
    return (
        sentence2tensor(input_lang, source_text),
        sentence2tensor(output_lang, target_text),
    )

def train_single(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio=0.0):
    """Run one optimisation step on a single (source, target) pair.

    Args:
        input_tensor: source token ids, one id per row.
        target_tensor: target token ids, one id per row (EOS-terminated).
        encoder, decoder: the models being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each decoder step.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            (instead of the model's own prediction) as the next decoder input
            for this pair. The default 0.0 never uses teacher forcing, which is
            what the previously hard-coded flag did, and does not touch the
            `random` generator.

    Returns:
        `(loss_per_target_token, num_correct, num_total)` where the loss is the
        summed step loss divided by `len(target_tensor)`.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0
    num_correct = 0
    num_total = len(target_tensor)

    # Encode the source one token at a time; positions past MAX_LENGTH still
    # update the hidden state but are not stored for attention.
    encoder_hidden = encoder.init_hidden()
    encoder_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)
    for ei, elem in enumerate(input_tensor):
        encoder_output, encoder_hidden = encoder(elem, encoder_hidden)
        if ei < MAX_LENGTH:
            encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Short-circuit so the default ratio never consumes a random number.
    use_teacher_forcing = teacher_forcing_ratio > 0 and random.random() < teacher_forcing_ratio

    for elem in target_tensor:
        decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, elem)
        _, topi = decoder_output.data.topk(1)
        if topi.item() == elem.item():
            num_correct += 1

        if use_teacher_forcing:
            # Feed the ground truth; decoding always runs the full target length.
            decoder_input = elem
        else:
            # Feed the model's own greedy choice and stop once it emits EOS.
            decoder_input = topi.squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / num_total, num_correct, num_total

@torch.no_grad()
def validate_single(input_tensor, target_tensor, encoder, decoder, criterion):
    """Score one (source, target) pair with greedy decoding, without gradients.

    Both models are switched to eval mode. Decoding feeds the model's own
    prediction back in and stops early when it predicts EOS.

    Returns:
        `(loss_per_target_token, num_correct, num_total)` where the loss is the
        summed step loss divided by `len(target_tensor)`.
    """
    encoder.eval()
    decoder.eval()

    num_total = len(target_tensor)

    # Encode the source; only the first MAX_LENGTH positions are kept for attention.
    state = encoder.init_hidden()
    encoder_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)
    for position, token in enumerate(input_tensor):
        step_output, state = encoder(token, state)
        if position >= MAX_LENGTH:
            continue
        encoder_outputs[position] = step_output[0, 0]

    loss = 0
    num_correct = 0
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = state
    for expected in target_tensor:
        log_probs, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(log_probs, expected)
        best = log_probs.data.topk(1)[1]
        num_correct += int(best.item() == expected.item())
        decoder_input = best.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    return loss.item() / num_total, num_correct, num_total

@torch.no_grad()
def validate(encoder, decoder):
    """Evaluate both models on `valid_dataset` and print loss and accuracy.

    The printed loss is the mean of the per-pair losses; the accuracy is the
    share of correctly predicted target tokens over all pairs. Nothing is
    returned, and the models are left in eval mode.
    """
    encoder.eval()
    decoder.eval()
    criterion = nn.NLLLoss()

    validation_pairs = [pair2tensor(row) for row in valid_dataset]
    per_pair = [
        validate_single(source, target, encoder, decoder, criterion)
        for source, target in validation_pairs
    ]

    total_loss = sum(stats[0] for stats in per_pair)
    total_correct = sum(stats[1] for stats in per_pair)
    total_count = sum(stats[2] for stats in per_pair)

    avg_loss = total_loss / len(validation_pairs)
    if total_count > 0:
        accuracy = 100 * total_correct / total_count
    else:
        accuracy = 0
    print(f'Validation loss: {avg_loss:.4f}, accuracy: {accuracy:.2f}%')

def train(encoder, decoder, n_epochs=5, print_every=100):
    """Train the encoder/decoder pair on `train_dataset`.

    Each epoch visits every training pair once in dataset order, prints a
    running loss/accuracy every `print_every` pairs, validates, and then steps
    the learning-rate schedulers (Adam at 1e-3, halved every 5 epochs).

    Args:
        encoder, decoder: models to optimise in place.
        n_epochs: number of passes over the training data.
        print_every: number of pairs between progress lines.
    """
    encoder.train()
    decoder.train()
    encoder_optimizer = Adam(encoder.parameters(), lr=1e-3)
    decoder_optimizer = Adam(decoder.parameters(), lr=1e-3)
    encoder_scheduler = StepLR(encoder_optimizer, step_size=5, gamma=0.5)
    decoder_scheduler = StepLR(decoder_optimizer, step_size=5, gamma=0.5)
    criterion = nn.NLLLoss(ignore_index=0)

    # The id tensors do not change between epochs, so build them once instead
    # of re-encoding the whole dataset at the start of every epoch.
    training_pairs = [pair2tensor(x) for x in train_dataset]
    n_pairs = len(training_pairs)

    for epoch in range(n_epochs):
        encoder.train()
        decoder.train()
        print(f'Epoch [{epoch + 1:02d}/{n_epochs:02d}]')
        print_loss_total = 0
        correct = 0
        total = 0

        for i, (input_tensor, target_tensor) in enumerate(training_pairs):
            loss, num_correct, num_total = train_single(
                input_tensor, target_tensor, encoder, decoder,
                encoder_optimizer, decoder_optimizer, criterion
            )
            print_loss_total += loss
            correct += num_correct
            total += num_total

            if (i + 1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                accuracy = 100 * correct / total if total > 0 else 0
                print_loss_total = 0
                correct, total = 0, 0
                # i + 1 pairs have been processed at this point.
                progress = (i + 1) / n_pairs * 100
                print(f'Training ({progress:.1f}%) loss: {print_loss_avg:.4f}, accuracy: {accuracy:.2f}%')

        validate(encoder, decoder)
        encoder_scheduler.step()
        decoder_scheduler.step()
        # validate() leaves the models in eval mode; restore training mode.
        encoder.train()
        decoder.train()

# Shared width of the embeddings and GRU states in both models.
HIDDEN_SIZE = 256

encoder_model = Encoder(input_lang.n_words, HIDDEN_SIZE).to(device)
decoder_model = Decoder(HIDDEN_SIZE, output_lang.n_words, MAX_LENGTH).to(device)

# Runs the full training loop (with per-epoch validation) in place.
train(encoder_model, decoder_model, n_epochs=15)

@torch.no_grad()
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedy-decode the translation of a single source sentence.

    Args:
        encoder, decoder: trained models (switched to eval mode here).
        sentence: source string, encoded with the source vocabulary.
        max_length: upper bound on the number of decoding steps.

    Returns:
        List of decoded output tokens. When the model emits EOS, the 'EOS'
        marker is included as the last element.
    """
    encoder.eval()
    decoder.eval()
    input_tensor = sentence2tensor(input_lang, sentence)

    # The attention layer emits `decoder.max_length` weights, so the buffer of
    # encoder outputs must have exactly that many rows. Sizing it from the
    # `max_length` argument made any non-default value fail inside torch.bmm.
    attn_span = decoder.max_length
    encoder_hidden = encoder.init_hidden()
    encoder_outputs = torch.zeros(attn_span, encoder.hidden_size, device=device)
    for ei, elem in enumerate(input_tensor):
        encoder_output, encoder_hidden = encoder(elem, encoder_hidden)
        # Positions beyond the attention span update the state but are not stored.
        if ei < attn_span:
            encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    decoded_words = []
    for di in range(max_length):
        decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
        _, topi = decoder_output.data.topk(1)
        decoded_words.append(output_lang.index2word[topi.item()])
        if topi.item() == EOS_token:
            break
        decoder_input = topi.squeeze().detach()
    return decoded_words

def predict_(encoder, decoder, dataset):
    """Decode every sentence in `dataset` into a list of output characters.

    `evaluate` appends the end-of-sequence marker to its result. It is removed
    here before truncating, so a decode that stops before 10 characters no
    longer carries the literal marker text into the joined prediction. Decodes
    of 10 or more characters produce the same result as before.

    Args:
        encoder, decoder: trained models passed through to `evaluate`.
        dataset: iterable of source sentences.

    Returns:
        One list of at most 10 output tokens per input sentence.
    """
    eos_word = output_lang.index2word[EOS_token]
    result = []
    for sentence in dataset:
        tokens = evaluate(encoder, decoder, sentence)
        if tokens and tokens[-1] == eos_word:
            tokens = tokens[:-1]
        # Keep at most 10 tokens per prediction, as the original slice did.
        result.append(tokens[:10])
    return result

human 82
iso 13
Epoch [01/15]
Training (9.0%) loss: 1.7726, accuracy: 28.27%
Training (18.2%) loss: 1.0414, accuracy: 61.73%
Training (27.3%) loss: 0.7140, accuracy: 71.73%
Training (36.4%) loss: 0.6584, accuracy: 75.45%
Training (45.6%) loss: 0.6103, accuracy: 76.82%
Training (54.7%) loss: 0.6043, accuracy: 77.64%
Training (63.8%) loss: 0.5824, accuracy: 77.55%
Training (73.0%) loss: 0.5466, accuracy: 79.09%
Training (82.1%) loss: 0.5309, accuracy: 80.45%
Training (91.2%) loss: 0.4942, accuracy: 82.45%
Validation loss: 0.4567, accuracy: 84.25%
Epoch [02/15]
Training (9.0%) loss: 0.4589, accuracy: 83.82%
Training (18.2%) loss: 0.4588, accuracy: 83.64%
Training (27.3%) loss: 0.4295, accuracy: 85.91%
Training (36.4%) loss: 0.4240, accuracy: 85.82%
Training (45.6%) loss: 0.3734, accuracy: 87.18%
Training (54.7%) loss: 0.3743, accuracy: 87.91%
Training (63.8%) loss: 0.3548, accuracy: 88.09%
Training (73.0%) loss: 0.3207, accuracy: 88.73%
Training (82.1%) loss: 0.3031, accuracy: 90.27%
Trai

# Test

In [5]:
# Re-read the test set as a DataFrame: the training cell bound `test_dataset`
# to a bare ndarray, while the cells below address columns by name.
test_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/test.csv')

In [6]:
# Decode every sentence in the 'data' column; each prediction is a list of output tokens.
test_prediction = predict_(encoder_model, decoder_model, test_dataset['data'])

In [7]:
# Collapse each list of predicted tokens into a single string.
test_prediction = list(map(''.join, test_prediction))

In [8]:
# Attach the predictions as a new 'label' column (mutates `test_dataset` in place).
test_dataset['label'] = test_prediction

In [9]:
# Write only the id/label columns. `index` is documented as a bool, so pass
# False explicitly rather than relying on None being falsy.
test_dataset[['id', 'label']].to_csv('submission.csv', index=False)

In [10]:
# Display the frame, now including the 'label' column, as a visual sanity check.
test_dataset

Unnamed: 0,id,data,label
0,0,24 января 2007,24-01-2007
1,1,le six mars 2049,18-03-2049
2,2,le dix 05 2077,10-05-2077
3,3,27 июня 2049,27-06-2049
4,4,08 гыйнварда 2077,08-01-2077
...,...,...,...
4671,4671,am fünfzehnten januar 2049,15-01-2049
4672,4672,тугызынчы 05 2049,09-05-2049
4673,4673,der achzehnte 02 2007,18-02-2007
4674,4674,vierzehnter 12 2049,14-12-2049
