## Загрузка данных

In [1]:
import re
import spacy
import numpy as np
import pandas as pd
from datasets import load_dataset

from dataclasses import dataclass
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import chain
from typing import List, Dict, Tuple
from collections import Counter, defaultdict

In [2]:
def load_translation_dataset():
    print("Loading Tatoeba en-ru...")
    try:
        dataset = load_dataset("Helsinki-NLP/tatoeba", lang1="en", lang2="ru", trust_remote_code=True)
        
    except Exception as e:
        print(f"Error while loading dataset: {e}")
        raise
    
    print("\nDataset structure:")
    print(dataset)
    
    print("\nData sample:")
    for i in range(2):
        print(f"EN: {dataset['train'][i]['translation']['en']}")
        print(f"RU: {dataset['train'][i]['translation']['ru']}\n")

    return dataset

In [3]:
dataset = load_translation_dataset()

Loading Tatoeba en-ru...


README.md:   0%|          | 0.00/8.93k [00:00<?, ?B/s]

tatoeba.py:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 523656
    })
})

Data sample:
EN: For once in my life I'm doing a good deed... And it is useless.
RU: Один раз в жизни я делаю хорошее дело... И оно бесполезно.

EN: Let's try something.
RU: Давайте что-нибудь попробуем!



In [4]:
from transformers import AutoTokenizer

def prepare_data_with_hf(
    dataset,
    model_name: str = "Helsinki-NLP/opus-mt-en-ru",
    max_length: int = 128,
    batch_size: int = 32
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Убедимся, что все специальные токены определены
    special_tokens = {
        'bos_token': '<s>',
        'eos_token': '</s>',
        'pad_token': '<pad>',
        'unk_token': '<unk>'
    }
    
    # Добавляем отсутствующие токены
    tokenizer.add_special_tokens(special_tokens)
    
    def preprocess_function(examples):
        source_texts = [item['en'] for item in examples['translation']]
        
        # Убедимся, что целевые тексты начинаются с BOS и заканчиваются EOS
        target_texts = [
            f"{item['ru']}"  # Убираем явное добавление BOS/EOS
            for item in examples['translation']
        ]
        
        # Токенизация исходных текстов
        source_encoding = tokenizer(
            source_texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='np'
        )
        
        # Токенизация целевых текстов
        target_encoding = tokenizer(
            target_texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='np'
        )
        
        return {
            'input_ids': source_encoding['input_ids'],
            'attention_mask': source_encoding['attention_mask'],
            'labels': target_encoding['input_ids'],
            'decoder_attention_mask': target_encoding['attention_mask']
        }
    
    processed_dataset = dataset['train'].map(
        preprocess_function,
        batched=True,
        batch_size=batch_size,
        remove_columns=dataset['train'].column_names
    )
    
    return processed_dataset, tokenizer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [5]:
processed_data, hf_tokenizer = prepare_data_with_hf(dataset)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]



Map:   0%|          | 0/523656 [00:00<?, ? examples/s]

## Архитектура модели

In [6]:
%%capture
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

In [7]:
import os
import time
import math
import copy
import spacy
import GPUtil
import pandas as pd
from typing import *
from itertools import chain

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, Dataset

import altair as alt
from altair import Chart

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].detach()
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        
        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Q: [batch_size, num_heads, seq_len, head_dim]
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        attn_probs = F.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        
        # Линейные преобразования
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Вычисление внимания
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        # Объединение голов
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        # Финальное линейное преобразование
        output = self.W_o(attn_output)
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x):
        x = self.dropout(F.relu(self.linear1(x)))
        x = self.linear2(x)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x, mask=None):
        # Self attention
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Feed forward
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        # Self attention (маскированное)
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Cross attention (с выходом энкодера)
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        
        # Feed forward
        ffn_output = self.ffn(x)
        x = self.norm3(x + self.dropout(ffn_output))
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8, num_layers=6):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout=0.1)
        
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads) for _ in range(num_layers)])
        
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
            
    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # Проверка размерностей входных данных
        batch_size = src.size(0)
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        # Энкодинг
        src_emb = self.positional_encoding(self.encoder_embedding(src))
        enc_output = src_emb
        
        for layer in self.encoder_layers:
            enc_output = layer(enc_output, src_mask)
        
        # Декодинг
        tgt_emb = self.positional_encoding(self.decoder_embedding(tgt))
        dec_output = tgt_emb
        
        for layer in self.decoder_layers:
            dec_output = layer(dec_output, enc_output, src_mask, tgt_mask)
        
        # Финальный слой
        output = self.fc_out(dec_output)
        return output

In [9]:
def test_transformer():
    # Проверяем доступность CUDA и инициализируем устройство
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Установка seed для воспроизводимости
    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
    torch.manual_seed(42)
    
    # Конфигурация
    batch_size = 2
    seq_len = 10
    d_model = 512
    num_heads = 8
    src_vocab_size = 100
    tgt_vocab_size = 100
    num_layers = 2

    # Генерация синтетических данных
    src = torch.randint(0, src_vocab_size, (batch_size, seq_len)).to(device)
    tgt = torch.randint(0, tgt_vocab_size, (batch_size, seq_len)).to(device)
    
    # Генерация масок
    src_mask = torch.ones(batch_size, 1, 1, seq_len).to(device)
    tgt_mask = torch.tril(torch.ones(seq_len, seq_len)).expand(batch_size, 1, seq_len, seq_len).to(device)

    # Инициализация модели
    transformer = Transformer(
        src_vocab_size=src_vocab_size,
        tgt_vocab_size=tgt_vocab_size,
        d_model=d_model,
        num_heads=num_heads,
        num_layers=num_layers
    ).to(device)

    print("="*50)
    print("1. Тест Positional Encoding")
    pe = PositionalEncoding(d_model, dropout=0.1).to(device)
    x = torch.randn(1, seq_len, d_model).to(device)
    print(f"До PE: mean={x.mean().item():.4f}, std={x.std().item():.4f}")
    x_pe = pe(x)
    print(f"После PE: mean={x_pe.mean().item():.4f}, std={x_pe.std().item():.4f}")
    print(f"Форма PE: {x_pe.shape} (должна быть [1, {seq_len}, {d_model}])")
    
    print("\n2. Тест Multi-Head Attention")
    mha = MultiHeadAttention(d_model, num_heads).to(device)
    q = k = v = torch.randn(batch_size, seq_len, d_model).to(device)
    attn_output = mha(q, k, v)
    print(f"Форма выхода внимания: {attn_output.shape} (должна быть {q.shape})")
    print(f"Максимальное значение: {attn_output.max().item():.4f}")
    print(f"Минимальное значение: {attn_output.min().item():.4f}")

    print("\n3. Тест Encoder Layer")
    encoder_layer = EncoderLayer(d_model, num_heads).to(device)
    enc_input = torch.randn(batch_size, seq_len, d_model).to(device)
    enc_output = encoder_layer(enc_input)
    print(f"Форма выхода энкодера: {enc_output.shape} (должна быть {enc_input.shape})")
    print(f"Изменение данных: {torch.allclose(enc_input, enc_output, atol=1e-4)} (должно быть False)")

    print("\n4. Тест Decoder Layer")
    decoder_layer = DecoderLayer(d_model, num_heads).to(device)
    dec_input = torch.randn(batch_size, seq_len, d_model).to(device)
    dec_output = decoder_layer(dec_input, enc_output, src_mask, tgt_mask)
    print(f"Форма выхода декодера: {dec_output.shape} (должна быть {dec_input.shape})")
    print(f"Норма выходных данных: {dec_output.norm().item():.4f}")

    print("\n5. Полный тест Transformer")
    print("Входные данные:")
    print(f"src: {src.shape} (max={src.max().item()}, min={src.min().item()})")
    print(f"tgt: {tgt.shape} (max={tgt.max().item()}, min={tgt.min().item()})")
    
    output = transformer(src, tgt, src_mask, tgt_mask)
    print("\nПроверка формы выхода:")
    print(f"Ожидаемая форма: ({batch_size}, {seq_len}, {tgt_vocab_size})")
    print(f"Реальная форма:   {output.shape}")
    
    print("\nПроверка градиентов:")
    dummy_loss = output.sum()
    dummy_loss.backward()
    has_gradients = any(p.grad is not None for p in transformer.parameters())
    print(f"Градиенты вычислены: {has_gradients} (должно быть True)")

    print("\n6. Проверка параметров модели:")
    total_params = sum(p.numel() for p in transformer.parameters())
    print(f"Всего параметров: {total_params}")
    print(f"Параметры энкодера: {sum(p.numel() for p in transformer.encoder_embedding.parameters())}")
    print(f"Параметры декодера: {sum(p.numel() for p in transformer.decoder_embedding.parameters())}")

    print("\nТест завершен!")

In [10]:
test_transformer()

Using device: cuda
1. Тест Positional Encoding
До PE: mean=-0.0316, std=1.0065
После PE: mean=0.4401, std=1.2013
Форма PE: torch.Size([1, 10, 512]) (должна быть [1, 10, 512])

2. Тест Multi-Head Attention
Форма выхода внимания: torch.Size([2, 10, 512]) (должна быть torch.Size([2, 10, 512]))
Максимальное значение: 0.3949
Минимальное значение: -0.4440

3. Тест Encoder Layer
Форма выхода энкодера: torch.Size([2, 10, 512]) (должна быть torch.Size([2, 10, 512]))
Изменение данных: False (должно быть False)

4. Тест Decoder Layer
Форма выхода декодера: torch.Size([2, 10, 512]) (должна быть torch.Size([2, 10, 512]))
Норма выходных данных: 101.1924

5. Полный тест Transformer
Входные данные:
src: torch.Size([2, 10]) (max=95, min=6)
tgt: torch.Size([2, 10]) (max=99, min=10)

Проверка формы выхода:
Ожидаемая форма: (2, 10, 100)
Реальная форма:   torch.Size([2, 10, 100])

Проверка градиентов:
Градиенты вычислены: True (должно быть True)

6. Проверка параметров модели:
Всего параметров: 14866532
Па

## Обучение

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm.auto import tqdm

In [21]:
class TranslationDataset(Dataset):
    def __init__(self, processed_data):
        self.input_ids = processed_data['input_ids']
        self.attention_mask = processed_data['attention_mask']
        self.labels = processed_data['labels']
        self.decoder_attention_mask = processed_data['decoder_attention_mask']
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            'decoder_attention_mask': torch.tensor(self.decoder_attention_mask[idx], dtype=torch.long)
        }

def create_masks(src_mask, tgt_mask):
    src_mask = src_mask.unsqueeze(1).unsqueeze(2)
    
    device = tgt_mask.device
    seq_len = tgt_mask.shape[1]
    causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()
    causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, seq_len]
    
    tgt_mask = tgt_mask.unsqueeze(1).unsqueeze(2) & ~causal_mask
    
    return src_mask, tgt_mask

def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(dataloader, desc='Training')
    for batch_idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_attention_mask = batch['decoder_attention_mask'].to(device)
        
        # Вывод отладочной информации для первого батча
        if batch_idx == 0:
            print("\nДиагностика первого батча:")
            print(f"input_ids shape: {input_ids.shape}")
            print(f"labels shape: {labels.shape}")
            print(f"input_ids range: [{input_ids.min()}, {input_ids.max()}]")
            print(f"labels range: [{labels.min()}, {labels.max()}]")
            print(f"Vocab size: {model.fc_out.out_features}")
        
        # Создаем маски
        src_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        tgt_mask = decoder_attention_mask[:, :-1].unsqueeze(1).unsqueeze(2)
        
        # Forward pass с проверкой размерностей
        outputs = model(
            input_ids,
            labels[:, :-1],
            src_mask,
            tgt_mask
        )
        
        if batch_idx == 0:
            print(f"\nПосле forward pass:")
            print(f"outputs shape before view: {outputs.shape}")
        
        # Подготовка выходов и меток
        outputs = outputs.contiguous().view(-1, outputs.size(-1))
        target = labels[:, 1:].contiguous().view(-1)
        
        if batch_idx == 0:
            print(f"\nПеред loss:")
            print(f"outputs shape after view: {outputs.shape}")
            print(f"target shape: {target.shape}")
            print(f"target range: [{target.min()}, {target.max()}]")
            
            # Проверка на некорректные индексы
            invalid_indices = (target >= outputs.size(-1)).sum()
            print(f"Number of invalid indices in target: {invalid_indices}")
        
        # Исключаем padding tokens
        valid_targets = target != tokenizer.pad_token_id
        outputs = outputs[valid_targets]
        target = target[valid_targets]
        
        # Вычисляем loss
        try:
            loss = criterion(outputs, target)
        except RuntimeError as e:
            print(f"\nОшибка при вычислении loss:")
            print(f"outputs shape: {outputs.shape}")
            print(f"target shape: {target.shape}")
            print(f"target unique values: {torch.unique(target)}")
            raise e
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    return total_loss / len(dataloader)


def translate_sentence(model, tokenizer, sentence, device, max_length=128):
    model.eval()
    
    # Токенизация с учетом всех специальных токенов
    inputs = tokenizer(sentence, 
                      return_tensors="pt", 
                      max_length=max_length,
                      padding='max_length',
                      truncation=True)
    
    src = inputs['input_ids'].to(device)
    src_mask = inputs['attention_mask'].to(device)
    
    # Инициализация декодера
    decoder_input = torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long, device=device)
    
    with torch.no_grad():
        for i in range(max_length):
            try:
                # Создаем маски
                src_mask_4d = src_mask.unsqueeze(1).unsqueeze(2)  # [1, 1, 1, src_len]
                
                # Каузальная маска для декодера
                tgt_seq_len = decoder_input.size(1)
                tgt_mask = torch.triu(torch.ones(tgt_seq_len, tgt_seq_len, device=device), diagonal=1).bool()
                tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(0)  # [1, 1, tgt_len, tgt_len]
                
                # Проверка размерностей перед forward pass
                print(f"Step {i}:")
                print(f"src shape: {src.shape}")  # Должно быть [1, seq_len]
                print(f"decoder_input shape: {decoder_input.shape}")
                print(f"src_mask shape: {src_mask_4d.shape}")
                print(f"tgt_mask shape: {tgt_mask.shape}")
                
                # Forward pass
                output = model(src, decoder_input, src_mask_4d, tgt_mask)
                
                # Получаем следующий токен
                next_token = output[:, -1].argmax(-1).unsqueeze(1)
                decoder_input = torch.cat([decoder_input, next_token], dim=1)
                
                if next_token.item() == tokenizer.eos_token_id:
                    break
                    
            except Exception as e:
                print(f"Error at step {i}:")
                print(f"Shapes - src: {src.shape}, decoder: {decoder_input.shape}")
                print(f"Masks - src: {src_mask_4d.shape}, tgt: {tgt_mask.shape}")
                raise e
                
    return tokenizer.decode(decoder_input[0].cpu().numpy(), skip_special_tokens=True)

In [22]:
translation = translate_sentence(model, 
                                tokenizer, 
                                "Hello, how are you?", 
                                device='cuda')
print(translation)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [13]:
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 0.0001
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processed_data, tokenizer = prepare_data_with_hf(dataset)
train_dataset = TranslationDataset(processed_data)

print(f"BOS token: {tokenizer.bos_token} (id: {tokenizer.bos_token_id})")
print(f"EOS token: {tokenizer.eos_token} (id: {tokenizer.eos_token_id}")
print(f"PAD token: {tokenizer.pad_token} (id: {tokenizer.pad_token_id})")

sample = tokenizer.decode(train_dataset[0]['input_ids'])
print("\nSample input:", sample)

BOS token: <s> (id: 62518)
EOS token: </s> (id: 0
PAD token: <pad> (id: 62517)

Sample input: For once in my life I'm doing a good deed... And it is useless.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [14]:
## Full dataset
"""
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

model = Transformer(
    src_vocab_size=tokenizer.vocab_size,
    tgt_vocab_size=tokenizer.vocab_size,
    d_model=512,
    num_heads=8,
    num_layers=6
).to(DEVICE)

optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

test_sentence = "Hello, how are you today?"
"""

'\ntrain_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)\n\nmodel = Transformer(\n    src_vocab_size=tokenizer.vocab_size,\n    tgt_vocab_size=tokenizer.vocab_size,\n    d_model=512,\n    num_heads=8,\n    num_layers=6\n).to(DEVICE)\n\noptimizer = Adam(model.parameters(), lr=LEARNING_RATE)\ncriterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)\n\ntest_sentence = "Hello, how are you today?"\n'

In [15]:
## Subset

SUBSET_SIZE = 100

train_subset = torch.utils.data.Subset(train_dataset, indices=range(SUBSET_SIZE))

train_dataloader = DataLoader(
    train_subset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    num_workers=2,
    pin_memory=True 
)

model = Transformer(
    src_vocab_size=tokenizer.vocab_size,
    tgt_vocab_size=tokenizer.vocab_size,
    d_model=512,
    num_heads=8,
    num_layers=6
).to(DEVICE)

optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, reduction='mean')

test_sentence = "Hello, how are you today?"

In [16]:
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion, DEVICE)
    print(f"Training loss: {train_loss:.4f}")

    test_sentence = "Hello, how are you today?"
    translation = translate_sentence(model, tokenizer, test_sentence, DEVICE)

    print(f"\nTest translation:")
    print(f"Input: {test_sentence}")
    print(f"Output: {translation}")


Epoch 1/5


Training:   0%|          | 0/4 [00:00<?, ?it/s]


Диагностика первого батча:
input_ids shape: torch.Size([32, 128])
labels shape: torch.Size([32, 128])
input_ids range: [0, 62517]
labels range: [0, 62517]
Vocab size: 62518

После forward pass:
outputs shape before view: torch.Size([32, 127, 62518])

Перед loss:
outputs shape after view: torch.Size([4064, 62518])
target shape: torch.Size([4064])
target range: [0, 62517]
Number of invalid indices in target: 0
Training loss: 9.5407


NameError: name 'tgt_seq_len' is not defined