In [1]:
%%capture
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil

In [2]:
import os
os.environ['TORCHDYNAMO_DISABLE'] = '1' 

In [3]:
import os
import time
import math
import copy
import spacy
import GPUtil
import pandas as pd
from typing import *
from itertools import chain

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, Dataset

import altair as alt
from altair import Chart

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используемое устройство: {device}")

Используемое устройство: cuda


In [5]:
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta

In [6]:
class ResidualConnection(nn.Module):
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [7]:
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
        
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
        
    return torch.matmul(p_attn, value), p_attn

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(4)])
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
            
        batch_size = query.size(0)
        
        query, key, value = [
            lin(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]
        
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        
        x = x.transpose(1, 2).contiguous() \
             .view(batch_size, -1, self.h * self.d_k)
             
        return self.linears[-1](x)

In [9]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w2(self.dropout(F.relu(self.w1(x))))

In [10]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].detach()
        return self.dropout(x)

In [12]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = nn.ModuleList([ResidualConnection(size, dropout) for _ in range(2)])
        self.size = size

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

class Encoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(N)])
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [13]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = nn.ModuleList([ResidualConnection(size, dropout) for _ in range(3)])

    def forward(self, x, memory, src_mask, tgt_mask):
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

class Decoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(N)])
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [14]:
class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
    
    def forward(self, src, tgt, src_mask, tgt_mask):
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    
    model = Transformer(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        nn.Linear(d_model, tgt_vocab)
    )
    
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [15]:
class Batch:
    def __init__(self, src, tgt=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is not None:
            self.tgt = tgt[:, :-1]
            self.tgt_y = tgt[:, 1:]
            self.tgt_mask = self.make_std_mask(self.tgt, pad)
            self.ntokens = (self.tgt_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        tgt_mask = (tgt != pad).unsqueeze(-2)
        seq_len = tgt.size(-1)
        look_ahead_mask = torch.tril(torch.ones(seq_len, seq_len)).bool()
        return tgt_mask & look_ahead_mask.to(tgt.device)

def data_gen(V, batch, nbatches):
    for _ in range(nbatches):
        data = torch.randint(1, V, (batch, 10))
        data[:, 0] = 1
        src = data.clone()
        tgt = data.clone()
        yield Batch(src, tgt, 0)

In [16]:
class SimpleLossCompute:
    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion
        
    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.reshape(-1, x.size(-1)), 
                             y.reshape(-1)) / norm
        return loss.data * norm, loss

def run_epoch(data_iter, model, loss_compute, optimizer=None, scheduler=None):
    total_loss = 0
    total_tokens = 0
    
    for i, batch in enumerate(data_iter):
        out = model(batch.src, batch.tgt, 
                   batch.src_mask, batch.tgt_mask)
                   
        loss, loss_node = loss_compute(out, batch.tgt_y, batch.ntokens)
        
        if optimizer is not None:
            optimizer.zero_grad()
            loss_node.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

        total_loss += loss
        total_tokens += batch.ntokens
        
    return total_loss / total_tokens

In [17]:
def example_synthetic():
    V = 11
    model = make_model(V, V, N=2)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9)
    
    batch_size = 80
    for epoch in range(20):
        model.train()
        run_epoch(data_gen(V, batch_size, 20), model, 
                 SimpleLossCompute(model.generator, criterion), 
                 optimizer)
        
        model.eval()
        loss = run_epoch(data_gen(V, batch_size, 5), model, 
                        SimpleLossCompute(model.generator, criterion), 
                        None)
        print(f"Epoch {epoch+1} | Loss: {loss:.2f}")
    
    model.eval()
    src = torch.tensor([[1,2,3,4,5,6,7,8,9,10]], dtype=torch.long)
    src_mask = torch.ones(1, 1, 10, dtype=torch.bool)
    memory = model.encode(src, src_mask)
    ys = torch.zeros(1, 1).type_as(src)
    
    for i in range(9):
        out = model.decode(memory, src_mask, 
                          ys, 
                          Batch.make_std_mask(ys, 0))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        ys = torch.cat([ys, next_word.unsqueeze(1)], dim=1)
    
    print("Example Output:", ys)

In [18]:
#example_synthetic()

In [22]:
import os
import time
import math
import copy
from itertools import chain
from typing import Optional, Tuple

import spacy
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset

import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

import altair as alt
from altair import Chart

In [23]:
print("🚀 Загрузка Tatoeba en-ru...")
try:
    dataset = load_dataset(
        "Helsinki-NLP/tatoeba",
        lang1="en", 
        lang2="ru",
        trust_remote_code=True
    )
except Exception as e:
    print(f"❌ Ошибка: {e}")
    raise

print("\n🔍 Структура датасета:")
print(dataset)
print("\nПример данных:")
for i in range(2):
    print(f"EN: {dataset['train'][i]['translation']['en']}")
    print(f"RU: {dataset['train'][i]['translation']['ru']}\n")

🚀 Загрузка Tatoeba en-ru...


README.md:   0%|          | 0.00/8.93k [00:00<?, ?B/s]

tatoeba.py:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


🔍 Структура датасета:
DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 523656
    })
})

Пример данных:
EN: For once in my life I'm doing a good deed... And it is useless.
RU: Один раз в жизни я делаю хорошее дело... И оно бесполезно.

EN: Let's try something.
RU: Давайте что-нибудь попробуем!



In [24]:
spacy_en = spacy.blank('en')
spacy_ru = spacy.blank('ru')

def tokenize(text, lang):
    try:
        return [tok.text.lower() for tok in lang.tokenizer(text) if tok.text.strip()]
    except:
        return []

In [25]:
MAX_LENGTH = 40
SAMPLE_SIZE = 5000 

print(f"\n🔧 Обработка первых {SAMPLE_SIZE} примеров...")

def process_batch(batch):
    # Извлекаем тексты из структуры translation с ключами 'en' и 'ru'
    en_texts = [item['en'] for item in batch['translation']]
    ru_texts = [item['ru'] for item in batch['translation']]
    
    processed = {'en_tokens': [], 'ru_tokens': []}
    
    for en, ru in zip(en_texts, ru_texts):
        try:
            en_toks = [tok.text.lower() for tok in spacy_en.tokenizer(en) if tok.text.strip()][:MAX_LENGTH]
            ru_toks = [tok.text.lower() for tok in spacy_ru.tokenizer(ru) if tok.text.strip()][:MAX_LENGTH]
        except Exception as e:
            print(f"Ошибка токенизации: {e}")
            continue
        
        if 3 <= len(en_toks) <= MAX_LENGTH and 3 <= len(ru_toks) <= MAX_LENGTH:
            processed['en_tokens'].append(en_toks)
            processed['ru_tokens'].append(ru_toks)
    
    return processed


🔧 Обработка первых 5000 примеров...


In [26]:
processed_data = dataset['train'].map(
    process_batch,
    batched=True,
    batch_size=1000,
    remove_columns=dataset['train'].column_names
).filter(lambda x: len(x['en_tokens']) > 0)

final_data = {
    'en_tokens': processed_data['en_tokens'][:SAMPLE_SIZE],
    'ru_tokens': processed_data['ru_tokens'][:SAMPLE_SIZE]
}

print(f"✅ Осталось примеров: {len(final_data['en_tokens'])}")

Map:   0%|          | 0/523656 [00:00<?, ? examples/s]

Filter:   0%|          | 0/521963 [00:00<?, ? examples/s]

✅ Осталось примеров: 5000


In [47]:
from collections import defaultdict

def build_compact_vocab(tokens_list, max_vocab=8000):
    counter = Counter()
    for tokens in tokens_list:
        counter.update(tokens)
    
    vocab = ['<pad>', '<sos>', '<eos>', '<unk>'] + [tok for tok, cnt in counter.most_common(max_vocab-4)]
    token_to_idx = defaultdict(lambda: 3, {tok:i for i, tok in enumerate(vocab)})
    return token_to_idx, vocab  # Теперь возвращаем и словарь, и список

print("\n📘 Создаем словари...")
en_vocab, en_vocab_list = build_compact_vocab(final_data['en_tokens'])
ru_vocab, ru_vocab_list = build_compact_vocab(final_data['ru_tokens'])

print(f"Размеры словарей:")
print(f"Английский: {len(en_vocab)}")
print(f"Русский: {len(ru_vocab)}")


📘 Создаем словари...
Размеры словарей:
Английский: 3135
Русский: 6096


In [29]:
print("Топ-10 частых токенов:")
print("EN:", list(en_vocab.keys())[4:14])  # Пропускаем служебные
print("RU:", list(ru_vocab.keys())[4:14])


def coverage(vocab, tokens_list):
    total = sum(len(t) for t in tokens_list)
    covered = sum(len([t for t in tokens if t in vocab]) for tokens in tokens_list)
    return covered / total

print(f"EN Coverage: {coverage(en_vocab, final_data['en_tokens']):.2%}")
print(f"RU Coverage: {coverage(ru_vocab, final_data['ru_tokens']):.2%}")

Топ-10 частых токенов:
EN: ['.', 'you', 'i', 'the', 'to', '?', 'a', 'your', 'is', 'do']
RU: ['.', ',', '?', 'ты', 'я', 'не', 'в', 'что', 'вы', 'на']
EN Coverage: 100.00%
RU Coverage: 100.00%


In [55]:
class TranslationDataset(Dataset):
    def __init__(self, en_tokens, ru_tokens, en_vocab, ru_vocab):
        self.en_tokens = en_tokens
        self.ru_tokens = ru_tokens
        self.en_vocab = en_vocab
        self.ru_vocab = ru_vocab
        self.pad_idx = en_vocab['<pad>']

    def __len__(self):
        return len(self.en_tokens)

    def __getitem__(self, idx):
        en = [self.en_vocab['<sos>']] + \
             [self.en_vocab.get(tok, self.en_vocab['<unk>']) for tok in self.en_tokens[idx]] + \
             [self.en_vocab['<eos>']]
        
        ru = [self.ru_vocab['<sos>']] + \
             [self.ru_vocab.get(tok, self.ru_vocab['<unk>']) for tok in self.ru_tokens[idx]] + \
             [self.ru_vocab['<eos>']]
        
        return torch.LongTensor(en), torch.LongTensor(ru)

def collate_fn(batch):
    en_batch, ru_batch = zip(*batch)
    
    en_padded = pad_sequence(en_batch, padding_value=en_vocab['<pad>'], batch_first=True)
    ru_padded = pad_sequence(ru_batch, padding_value=ru_vocab['<pad>'], batch_first=True)
    
    return Batch(
        src=en_padded,
        tgt=ru_padded,
        pad=en_vocab['<pad>']
    )

model = make_model(
    src_vocab=len(en_vocab),
    tgt_vocab=len(ru_vocab),
    N=6,
    d_model=512,
    d_ff=2048,
    h=8,
    dropout=0.1).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=ru_vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
scheduler = LambdaLR(optimizer, lr_lambda=lambda step: min(
    (step + 1e-8)**-0.5,  # Добавляем epsilon чтобы избежать деления на ноль
    (step + 1) * (4000**-1.5)  # Начинаем с шага 1
))

train_dataset = TranslationDataset(
    final_data['en_tokens'],
    final_data['ru_tokens'],
    en_vocab,
    ru_vocab
)

train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    collate_fn=collate_fn,
    shuffle=True,
    num_workers=4
)

def train_real_data(model, train_loader, epochs=20):
    model.train()
    best_loss = float('inf')
    
    for epoch in range(epochs):
        start_time = time.time()
        total_loss = 0
        total_tokens = 0
        
        for i, batch in enumerate(train_loader):
            batch = Batch(
                src=batch.src.to(device),
                tgt=batch.tgt.to(device),
                pad=en_vocab['<pad>']
            )
            
            optimizer.zero_grad()
            output = model(batch.src, batch.tgt, batch.src_mask, batch.tgt_mask)
            
            loss, loss_node = SimpleLossCompute(model.generator, criterion)(
                output, batch.tgt_y, batch.ntokens
            )
            
            loss_node.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            total_loss += loss
            total_tokens += batch.ntokens
            
            if i % 50 == 0:
                avg_loss = total_loss / total_tokens
                print(f"Epoch {epoch+1} | Batch {i} | Loss: {avg_loss:.4f}")
        
        avg_loss = total_loss / total_tokens
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), "best_model.pth")
        
        print(f"Epoch {epoch+1} завершена | Loss: {avg_loss:.4f} | Время: {time.time()-start_time:.1f}s")
        
        test_sentence = "Hello world"
        translate(
            model, 
            test_sentence, 
            en_vocab, 
            ru_vocab, 
            ru_vocab_list,  # Список должен быть создан заранее
            spacy_en        # Токенизатор
        )

def translate(model, sentence, en_vocab, ru_vocab, ru_vocab_list, spacy_tokenizer, max_len=50):
    model.eval()
    tokens = [tok.text.lower() for tok in spacy_tokenizer(sentence)]
    
    src = torch.LongTensor([
        [en_vocab['<sos>']] + 
        [en_vocab.get(tok, en_vocab['<unk>']) for tok in tokens] + 
        [en_vocab['<eos>']
    ]]).to(device)
    
    src_mask = (src != en_vocab['<pad>']).unsqueeze(-2)
    
    with torch.no_grad():
        memory = model.encode(src, src_mask)
    
    ys = torch.ones(1, 1, dtype=torch.long).fill_(ru_vocab['<sos>']).to(device)
    
    for _ in range(max_len-1):
        seq_len = ys.size(1)
        look_ahead_mask = torch.tril(
            torch.ones((seq_len, seq_len), dtype=torch.bool, device=device
        ))
        tgt_mask = (ys != ru_vocab['<pad>']).unsqueeze(-2) & look_ahead_mask
        
        with torch.no_grad():
            out = model.decode(memory, src_mask, ys, tgt_mask)
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
        
        ys = torch.cat([ys, next_word.unsqueeze(1)], dim=1)
        
        if next_word == ru_vocab['<eos>']:
            break
    
    translation = ' '.join([ru_vocab_list[idx] for idx in ys[0].cpu().numpy() 
                          if idx not in [ru_vocab['<sos>'], ru_vocab['<eos>']]])
    print(f"Перевод: '{translation}'")

In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using dvice: {device}")

train_real_data(model.to(device), train_loader, epochs=20)

Используемое устройство: cuda
Epoch 1 | Batch 0 | Loss: 0.0085
Epoch 1 завершена | Loss: 0.0087 | Время: 14.6s
Перевод: 'взгляда взгляда взгляда взгляда взгляда неотразима подходят придёшь неотразима взгляда неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима подходят подходят подходят подходят подходят придёшь тоже подходят придёшь неотразима частично ну́жно подходят неотразима подходят подходят подходят подходят придёшь неотразима подходят придёшь неотразима частично частично частично'
Epoch 2 | Batch 0 | Loss: 0.0085
Epoch 2 завершена | Loss: 0.0087 | Время: 14.1s
Перевод: 'взгляда взгляда взгляда взгляда взгляда неотразима подходят придёшь неотразима взгляда неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима неотразима подходят подходят подходят подходят подходят придёшь тоже подходят придёшь неотразима частично 