In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW

In [2]:
torch.manual_seed(0)
random.seed(0)

In [3]:
!gdown --fuzzy https://drive.google.com/file/d/1k1quangHHsq25rJOTLShK3H0hN1mVqp9/view?usp=sharing -O train.csv
!gdown --fuzzy https://drive.google.com/file/d/1NsJXT6eBrXDKXKggfWjwcYuoU84CII3g/view?usp=drive_link -O test.csv

zsh:1: no matches found: https://drive.google.com/file/d/1k1quangHHsq25rJOTLShK3H0hN1mVqp9/view?usp=sharing
zsh:1: no matches found: https://drive.google.com/file/d/1NsJXT6eBrXDKXKggfWjwcYuoU84CII3g/view?usp=drive_link


In [4]:
import pandas as pd


train_dataset = pd.read_csv('train.csv').values
test_dataset = pd.read_csv('test.csv')

In [5]:
MAX_LENGTH = max(map(lambda x: len(x[0]), train_dataset)) + 1

MAX_LENGTH

41

In [6]:
SOS_token = 0
EOS_token = 1


class Lang:

    def __init__(self, name):
        self.name = name
        self.word2index = {
            'SOS': 0,
            'EOS': 1
        }
        self.index2word = {
            0: 'SOS',
            1: 'EOS'
        }

    @property
    def n_words(self) -> int:
        return len(self.index2word)

    def add_sentence(self, sentence):
        for word in list(sentence):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word

In [7]:
input_lang = Lang('human')
output_lang = Lang('iso')

for pair in train_dataset:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)

human 82
iso 13


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.batchnorm_out = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)

    def init_hidden(self, batch_size):
        return (
            torch.zeros(2, batch_size, self.hidden_size, device=device),
            torch.zeros(2, batch_size, self.hidden_size, device=device)
        )

    def forward(self, x, lengths):
        # x: (seq_len, batch)
        embedded = self.embedding(x)                  # (seq_len, batch, hidden)
        embedded = self.dropout(embedded)

        # LSTM
        packed = pack_padded_sequence(embedded, lengths, enforce_sorted=False)
        outputs, hidden = self.lstm(packed, self.init_hidden(batch_size=x.size(1)))
        outputs, _ = pad_packed_sequence(outputs)     # (seq_len, batch, hidden*2)

        # Apply BatchNorm to LSTM outputs
        seq_len, batch, feat = outputs.shape
        outputs = self.batchnorm_out(outputs.view(-1, feat))
        outputs = outputs.view(seq_len, batch, feat)

        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, hidden_size, dropout=0.1):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 3, hidden_size)
        self.v = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, hidden, encoder_outputs):
        # hidden: (1, batch, hidden_size)
        # encoder_outputs: (seq_len, batch, hidden_size * 2)
        seq_len, batch_size, _ = encoder_outputs.size()

        hidden = hidden.repeat(seq_len, 1, 1)  # (seq_len, batch, hidden)
        energy = torch.cat((hidden, encoder_outputs), dim=2)  # (seq_len, batch, hidden*3)
        energy = torch.tanh(self.attn(energy))  # (seq_len, batch, hidden)
        energy = self.dropout(energy)
        attention = self.v(energy).squeeze(2)  # (seq_len, batch)

        return F.softmax(attention, dim=0)  # (seq_len, batch)

class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_size, dropout)
        self.lstm = nn.LSTM(hidden_size + hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, encoder_outputs):
        if x.dim() == 0:
            x = x.unsqueeze(0)  # защита от скаляра

        embedded = self.embedding(x).unsqueeze(0)  # (1, batch, hidden_size)
        embedded = self.dropout(embedded)

        attn_weights = self.attention(hidden[0], encoder_outputs)  # (seq_len, batch)
        attn_weights = attn_weights.transpose(0, 1).unsqueeze(1)   # (batch, 1, seq_len)

        encoder_outputs = encoder_outputs.transpose(0, 1)          # (batch, seq_len, hidden*2)
        context = torch.bmm(attn_weights, encoder_outputs)         # (batch, 1, hidden*2)
        context = context.transpose(0, 1)                          # (1, batch, hidden*2)

        lstm_input = torch.cat((embedded, context), dim=2)         # (1, batch, hidden*3)
        output, hidden = self.lstm(lstm_input, hidden)

        output = output.squeeze(0)             # (batch, hidden)
        output = self.out(output)              # (batch, vocab_size)
        output = F.log_softmax(output, dim=1)

        return output, hidden, attn_weights.squeeze(1)

In [11]:
def sentence2idx(lang, sentence):
    return [lang.word2index[word] for word in list(sentence)]


def sentence2tensor(lang, sentence):
    indexes = sentence2idx(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def pair2tensor(x):
    input_tensor = sentence2tensor(input_lang, x[0])
    target_tensor = sentence2tensor(output_lang, x[1])
    return input_tensor, target_tensor

In [12]:
def train_single(inputs, input_lengths, targets, target_lengths,
                 encoder, decoder, encoder_opt, decoder_opt, criterion,
                 teacher_forcing_ratio=0.5):
    encoder_opt.zero_grad()
    decoder_opt.zero_grad()
    loss = 0

    batch_size = inputs.size(1)

    # Encoder
    encoder_outputs, encoder_hidden = encoder(inputs, input_lengths)

    # Decoder init
    decoder_input = torch.tensor([SOS_token] * batch_size, device=device)
    decoder_hidden = (encoder_hidden[0][:1], encoder_hidden[1][:1])  # только forward

    max_target_len = targets.size(0)
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for t in range(max_target_len):
        decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

        # print(decoder_output)

        # ⛑️ ВАЖНО: правильная форма для NLLLoss
        loss += criterion(decoder_output, targets[t].view(-1))

        if use_teacher_forcing:
            decoder_input = targets[t].view(-1)
        else:
            decoder_input = decoder_output.argmax(1).detach()

    loss.backward()
    encoder_opt.step()
    decoder_opt.step()
    return loss.item() / max_target_len

In [13]:
from torch.utils.data import DataLoader

def collate_fn(pairs):
    input_seqs, target_seqs = zip(*[pair2tensor(pair) for pair in pairs])

    input_seqs = [seq.squeeze(1).long() for seq in input_seqs]
    target_seqs = [seq.squeeze(1).long() for seq in target_seqs]

    input_lengths = [len(seq) for seq in input_seqs]
    target_lengths = [len(seq) for seq in target_seqs]

    padded_inputs = pad_sequence(input_seqs, padding_value=EOS_token)
    padded_targets = pad_sequence(target_seqs, padding_value=EOS_token)

    return padded_inputs, input_lengths, padded_targets, target_lengths

def train(encoder, decoder, n_epochs=5, batch_size=32, print_every=8):
    encoder.train()
    decoder.train()

    encoder_opt = AdamW(encoder.parameters(), lr=1e-3, weight_decay=0.01)
    decoder_opt = AdamW(decoder.parameters(), lr=1e-3, weight_decay=0.01)
    criterion = nn.NLLLoss()

    train_data = train_dataset
    dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    scheduler_enc = torch.optim.lr_scheduler.CosineAnnealingLR(encoder_opt, len(dataloader) * n_epochs, 1e-7)
    scheduler_dec = torch.optim.lr_scheduler.CosineAnnealingLR(decoder_opt, len(dataloader) * n_epochs, 1e-7)

    for epoch in range(n_epochs):
        total_loss = 0
        print(f"\nEpoch [{epoch + 1}/{n_epochs}]")

        for i, (inputs, input_lengths, targets, target_lengths) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)

            loss = train_single(
                inputs, input_lengths, targets, target_lengths,
                encoder, decoder, encoder_opt, decoder_opt, criterion
            )

            total_loss += loss

            scheduler_enc.step()
            scheduler_dec.step()

            if (i + 1) % print_every == 0:
                avg_loss = total_loss / print_every
                print(f"[{i + 1}/{len(dataloader)}] Loss: {avg_loss:.4f}")
                total_loss = 0

In [14]:
encoder_model = Encoder(input_lang.n_words, 128).to(device)
decoder_model = Decoder(128, output_lang.n_words).to(device)

train(encoder_model, decoder_model, n_epochs=256)


Epoch [1/256]
[8/35] Loss: 2.3436
[16/35] Loss: 2.0273
[24/35] Loss: 1.8642
[32/35] Loss: 1.6397

Epoch [2/256]
[8/35] Loss: 1.4867
[16/35] Loss: 1.3741
[24/35] Loss: 1.3759
[32/35] Loss: 1.3447

Epoch [3/256]
[8/35] Loss: 1.0168
[16/35] Loss: 0.8429
[24/35] Loss: 0.7430
[32/35] Loss: 0.6715

Epoch [4/256]
[8/35] Loss: 0.6064
[16/35] Loss: 0.5774
[24/35] Loss: 0.5537
[32/35] Loss: 0.5244

Epoch [5/256]
[8/35] Loss: 0.4863
[16/35] Loss: 0.4578
[24/35] Loss: 0.4383
[32/35] Loss: 0.4010

Epoch [6/256]
[8/35] Loss: 0.3359
[16/35] Loss: 0.2930
[24/35] Loss: 0.2616
[32/35] Loss: 0.2355

Epoch [7/256]
[8/35] Loss: 0.1907
[16/35] Loss: 0.1728
[24/35] Loss: 0.1298
[32/35] Loss: 0.1134

Epoch [8/256]
[8/35] Loss: 0.0962
[16/35] Loss: 0.1001
[24/35] Loss: 0.0762
[32/35] Loss: 0.0842

Epoch [9/256]
[8/35] Loss: 0.0753
[16/35] Loss: 0.0724
[24/35] Loss: 0.0588
[32/35] Loss: 0.0418

Epoch [10/256]
[8/35] Loss: 0.0379
[16/35] Loss: 0.0313
[24/35] Loss: 0.0358
[32/35] Loss: 0.0537

Epoch [11/256]
[8/

In [15]:
@torch.no_grad()
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    encoder.eval()
    decoder.eval()

    # Подготовка входа
    input_tensor = sentence2tensor(input_lang, sentence).squeeze(1)  # (seq_len,)
    input_length = [input_tensor.size(0)]

    input_tensor = input_tensor.unsqueeze(1)  # (seq_len, 1) — батч из 1
    encoder_outputs, encoder_hidden = encoder(input_tensor, input_length)

    # Начало декодирования
    decoder_input = torch.tensor([SOS_token], device=device)  # (1,)
    decoder_hidden = (encoder_hidden[0][:1], encoder_hidden[1][:1])  # только forward слой
    decoded_indices = []

    for _ in range(max_length):
        decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)  # topi: (1,)
        predicted_index = topi.item()
        if predicted_index == EOS_token:
            break
        decoded_indices.append(predicted_index)
        decoder_input = topi.squeeze().detach()

    # Преобразуем индексы в символы
    decoded_words = [output_lang.index2word[i] for i in decoded_indices]
    return decoded_words

def predict_(encoder, decoder, dataset):
    result = []
    for sentence in dataset:
        output = evaluate(encoder, decoder, sentence)
        result.append(output[:10])  # ограничим 10 символами
    return result

In [16]:
test_dataset = pd.read_csv('test.csv')

In [17]:
test_prediction = predict_(encoder_model, decoder_model, test_dataset['data'])

In [18]:
test_prediction = [''.join(x) for x in test_prediction]

In [19]:
test_dataset['label'] = test_prediction

In [20]:
test_dataset[['id', 'label']].to_csv('submission.csv', index=None)