# Load Data

In [None]:
import kagglehub
import os
import shutil
import pandas as pd
import re
from string import punctuation

In [None]:
# Kaggle dataset cache path (provided automatically) and the working copy location
cached_path = "/kaggle/input/englishrussian-dictionary-for-machine-translate"
target_dir = "/content/englishrussian-dictionary"

# Reuse the cached dataset when it exists; download through kagglehub otherwise
if os.path.exists(cached_path):
    print("Dataset already available at:", cached_path)
    path = cached_path
else:
    print("Downloading dataset via kagglehub...")
    path = kagglehub.dataset_download("hijest/englishrussian-dictionary-for-machine-translate")
    print("Downloaded and cached at:", path)

# Copy the dataset to /content so it can be modified (/kaggle/input is read-only);
# the copy is skipped when the target directory already exists
target_missing = not os.path.exists(target_dir)
if target_missing:
    shutil.copytree(path, target_dir)

print("Dataset copied to working directory:", target_dir)


Downloading dataset via kagglehub...
Downloading from https://www.kaggle.com/api/v1/datasets/download/hijest/englishrussian-dictionary-for-machine-translate?dataset_version_number=1...


100%|██████████| 11.9M/11.9M [00:02<00:00, 5.51MB/s]

Extracting files...





Downloaded and cached at: /root/.cache/kagglehub/datasets/hijest/englishrussian-dictionary-for-machine-translate/versions/1
Dataset copied to working directory: /content/englishrussian-dictionary


In [None]:
# Read the tab-separated sentence file from the working copy of the dataset.
# The path is built from `target_dir` (set in the download cell) rather than the
# cwd-relative '../content/...' string, which only resolves correctly when the
# notebook runs from a directory directly below '/'.
data = pd.read_csv(os.path.join(target_dir, 'rus.txt'), delimiter='\t', header=None)

In [None]:
# Preview the first rows of the freshly loaded frame
data.head()

Unnamed: 0,0,1,2
0,Go.,Марш!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Иди.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Идите.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Hi.,Здравствуйте.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
4,Hi.,Привет!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [None]:
# Keep only the first two columns; the third (attribution) column is dropped
data = data.iloc[:, :2]

In [None]:
# Preview again now that only the first two columns remain
data.head()

Unnamed: 0,0,1
0,Go.,Марш!
1,Go.,Иди.
2,Go.,Идите.
3,Hi.,Здравствуйте.
4,Hi.,Привет!


In [None]:
# Give the two positional columns readable names (reassignment instead of inplace=True)
data = data.rename(columns={0: "English", 1: "Russian"})

In [None]:
# Show the column labels
data.columns

Index(['English', 'Russian'], dtype='object')

## Text Preprocessing - Data Preparation

### Fungsi `clean_string()`
Fungsi ini melakukan pembersihan teks dengan beberapa langkah:
1. **Normalisasi karakter khusus**: Mengganti narrow no-break space (`\u202f`) dengan spasi biasa
2. **Konversi ke huruf kecil**: Semua karakter diubah menjadi lowercase untuk konsistensi
3. **Penghapusan tanda baca dan angka**: Mengganti setiap karakter punctuation, guillemets (« »), dan digit (0-9) dengan spasi
4. **Penghapusan spasi ganda**: Menggunakan regex untuk mengganti multiple whitespace menjadi single space
5. **Trimming**: Menghapus spasi di awal dan akhir string

### Penerapan Preprocessing
Setelah fungsi dibuat, preprocessing diterapkan pada dataset:
- Kolom `English` dan `Russian` dikonversi ke tipe string
- Fungsi `clean_string()` diaplikasikan ke setiap baris menggunakan `apply()` dan lambda function
- Hasilnya adalah teks yang bersih, uniform, dan siap untuk tokenisasi

In [None]:
# Clean the string
# Translation table mapping every ASCII punctuation mark, the guillemets and the
# digits to a single space. Built once so each call does one translate() pass
# instead of one replace() scan per character.
_CLEAN_TABLE = str.maketrans({ch: " " for ch in punctuation + "«»" + "0123456789"})


def clean_string(string):
    """Return a lowercase copy of `string` without punctuation or digits.

    Steps, in order:
      1. replace the narrow no-break space (U+202F) with a regular space;
      2. lowercase the text;
      3. replace ASCII punctuation, « », and the digits 0-9 with spaces;
      4. collapse every run of whitespace into one space;
      5. strip leading and trailing spaces.

    Args:
        string: the text to clean (must be a ``str``).

    Returns:
        The cleaned text; an empty string when nothing but punctuation,
        digits and whitespace was present.
    """
    string = string.replace("\u202f", " ").lower()
    string = string.translate(_CLEAN_TABLE)

    # Raw string: "\s" in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    string = re.sub(r"\s+", " ", string)
    return string.strip()
#-------------------------------------------------------------------------------
# Cast each text column to str, then run every sentence through clean_string
for col in ("English", "Russian"):
    data[col] = data[col].astype(str).apply(clean_string)

data.tail()

  string = re.sub("\s+"," ", string)


Unnamed: 0,English,Russian
363381,in today s world we have to equip all our kids...,в современном мире перед нами стоит задача дат...
363382,death is something that we re often discourage...,смерть это зачастую то разговоры или даже мысл...
363383,at a moment when our economy is growing our bu...,в тот момент когда наша экономика растёт наши ...
363384,since there are usually multiple websites on a...,поскольку сайтов посвящённых какой либо теме к...
363385,doubtless there exists in this world precisely...,несомненно для каждого мужчины в этом мире где...


In [None]:
import random

import numpy as np
import torch

# === Save the cleaned dataset to CSV ===
save_path = "/content/englishrussian-dictionary/eng_rus_cleaned.csv"
data.to_csv(save_path, index=False)
print(f"Cleaned dataset saved to: {save_path}")
print(data.head())

# === Base configuration ===
# Data
DATA_PATH = save_path
SRC_COL, TGT_COL = "English", "Russian"
MAX_LEN = 100
MIN_FREQ = 2

# Training
BATCH_SIZE = 100  # maximum, per the instructions
NUM_EPOCHS = 1    # a single epoch only
LR = 1e-4
SEED = 42
SAVE_MODEL = False
MODEL_SAVE_PATH = "transformer_eng_ru.pt"

# Model size
EMB_SIZE = 128
NHEAD = 4
FFN_HID_DIM = 256
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1
DROPOUT = 0.1

# Use the GPU when one is available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator used by the notebook
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

print("Device in use:", DEVICE)


Cleaned dataset saved to: /content/englishrussian-dictionary/eng_rus_cleaned.csv
  English       Russian
0      go          марш
1      go           иди
2      go         идите
3      hi  здравствуйте
4      hi        привет
Device in use: cuda


## Tokenisasi dan Vocabulary Building

Setelah pembersihan data, tahap selanjutnya adalah mempersiapkan sistem tokenisasi dan vocabulary untuk mengubah teks menjadi representasi numerik.

### Fungsi `normalize_text()` dan `tokenize()`
- **`normalize_text()`**: Melakukan normalisasi tingkat lanjut dengan:
  - Konversi ke lowercase dan trimming
  - Penghapusan multiple whitespace
  - Pemisahan tanda baca dengan spasi (contoh: "hello!" → "hello !")
  
- **`tokenize()`**: Fungsi sederhana untuk memecah teks menjadi token berdasarkan spasi setelah dinormalisasi

### Class `Vocab`
Class ini bertanggung jawab untuk membangun dan mengelola vocabulary:

**Atribut utama:**
- `itos` (int-to-string): List yang memetakan index ke token
- `stoi` (string-to-int): Dictionary yang memetakan token ke index
- `freqs`: Dictionary untuk menghitung frekuensi kemunculan setiap token
- `min_freq`: Threshold minimum frekuensi agar token dimasukkan ke vocabulary

**Special tokens:**
- `<pad>`: Padding token untuk menyamakan panjang sequence
- `<sos>`: Start-of-sequence token
- `<eos>`: End-of-sequence token
- `<unk>`: Unknown token untuk kata yang tidak ada di vocabulary

**Method penting:**
- `add_sentence()`: Menambahkan token dari kalimat dan menghitung frekuensinya
- `build()`: Membangun vocabulary final dengan filter berdasarkan `min_freq`
- `tokens_to_ids()`: Konversi token menjadi ID numerik
- `ids_to_tokens()`: Konversi ID numerik kembali menjadi token

Vocabulary dibangun untuk source (English) dan target (Russian) secara terpisah karena keduanya memiliki kosakata yang berbeda.

In [None]:
# -----------------------------
# Tokenisasi dan Normalisasi
# -----------------------------
import re
from typing import List

def normalize_text(s: str) -> str:
    """Lowercase `s`, collapse whitespace runs, and pad punctuation with spaces.

    The characters . , ! ? ; : ( ) " « » - each end up surrounded by single
    spaces so that a later whitespace split yields them as separate tokens.
    """
    collapsed = re.sub(r"\s+", " ", s.lower().strip())
    spaced = re.sub(r"([.,!?;:()\"«»-])", r" \1 ", collapsed)
    # Padding the punctuation can create double spaces again, so collapse once more
    return re.sub(r"\s+", " ", spaced).strip()

def tokenize(s: str) -> List[str]:
    """Normalize `s` with normalize_text and split the result on whitespace."""
    return normalize_text(s).split()


# -----------------------------
# Vocabulary Builder
# -----------------------------
class Vocab:
    """Token <-> id mapping built from token frequency counts.

    Attributes:
        min_freq: minimum count a token needs to be added by ``build()``.
        itos: list mapping an id to its token; starts with the special tokens.
        stoi: dict mapping a token to its id.
        freqs: dict of raw token counts accumulated by ``add_sentence()``.
    """

    def __init__(self, min_freq=1, specials=("<pad>", "<sos>", "<eos>", "<unk>")):
        # `specials` defaults to a tuple rather than a list so the default
        # object can never be mutated and shared between instances.
        self.min_freq = min_freq
        self.itos = list(specials)
        self.stoi = {tok: i for i, tok in enumerate(self.itos)}
        self.freqs = {}

    def add_sentence(self, tokens: List[str]):
        """Add the tokens of one sentence to the frequency counts."""
        for t in tokens:
            self.freqs[t] = self.freqs.get(t, 0) + 1

    def build(self):
        """Append every token with count >= min_freq to the vocabulary.

        Tokens are added by descending count, ties broken alphabetically.
        Tokens already present are skipped, so calling this again is harmless.
        """
        items = [(tok, cnt) for tok, cnt in self.freqs.items() if cnt >= self.min_freq]
        items.sort(key=lambda x: (-x[1], x[0]))
        for tok, _ in items:
            if tok not in self.stoi:
                self.stoi[tok] = len(self.itos)
                self.itos.append(tok)

    def __len__(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.itos)

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Convert tokens to ids; unknown tokens map to the id of "<unk>"."""
        return [self.stoi.get(t, self.stoi["<unk>"]) for t in tokens]

    def ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Convert ids back to tokens; out-of-range ids become "<unk>".

        Negative ids are treated as out of range; a plain ``i < len`` check
        would let them index from the end of ``itos`` and return a wrong token.
        """
        return [self.itos[i] if 0 <= i < len(self.itos) else "<unk>" for i in ids]


In [None]:
# Build the source and target vocabularies from the dataset
src_vocab = Vocab(min_freq=MIN_FREQ)
tgt_vocab = Vocab(min_freq=MIN_FREQ)

# Count tokens per language; target sentences are cut two tokens shorter
# (-2 for <sos> & <eos>)
for src_sentence in data[SRC_COL]:
    src_vocab.add_sentence(tokenize(src_sentence)[:MAX_LEN])
src_vocab.build()

for tgt_sentence in data[TGT_COL]:
    tgt_vocab.add_sentence(tokenize(tgt_sentence)[:(MAX_LEN - 2)])
tgt_vocab.build()

print(f"Source vocab size (English): {len(src_vocab)}")
print(f"Target vocab size (Russian): {len(tgt_vocab)}")


Source vocab size (English): 11338
Target vocab size (Russian): 31619


## Dataset dan DataLoader Preparation

Tahap ini mempersiapkan data agar dapat dibatch dan diproses efisien oleh model.

### Class `TranslationDataset`
Custom PyTorch Dataset untuk pasangan kalimat translation (English → Russian):
- Menyimpan pasangan teks source dan target
- Menyimpan referensi ke vocabulary untuk konversi token-ID
- Membatasi panjang maksimum sequence untuk efisiensi
- Method `__getitem__()` mengembalikan token (bukan ID) untuk fleksibilitas preprocessing di collate function

### Fungsi `collate_fn()`
Fungsi ini sangat krusial untuk batch processing:

**Proses yang dilakukan:**
1. **Menentukan panjang maksimum** dalam batch untuk source dan target
2. **Konversi token ke ID** menggunakan vocabulary
3. **Penambahan special tokens**:
   - Target input: Menambahkan `<sos>` di awal
   - Target output: Menambahkan `<eos>` di akhir
4. **Padding**: Menambahkan `<pad>` token hingga semua sequence dalam batch memiliki panjang sama
5. **Transpose**: Mengubah shape dari `(batch, seq_len)` menjadi `(seq_len, batch)` sesuai format PyTorch Transformer
6. **Padding mask**: Membuat boolean mask untuk menandai posisi padding (True = padding, False = token aktual)

### Data Splitting
Data dibagi menjadi:
- **Train**: 80% untuk training model
- **Validation**: 10% untuk monitoring performance selama training
- **Test**: 10% untuk evaluasi akhir

DataLoader dibuat dengan batch size dan shuffle sesuai kebutuhan training.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# -----------------------------
# Dataset Class
# -----------------------------
class TranslationDataset(Dataset):
    """Paired English → Russian sentences that are tokenized on access.

    Items are ``(src_tokens, tgt_tokens)`` lists of string tokens, not ids.
    The two vocabularies are only stored on the instance.
    """

    def __init__(self, src_texts, tgt_texts, src_vocab, tgt_vocab, max_len=MAX_LEN):
        assert len(src_texts) == len(tgt_texts)
        self.src_texts, self.tgt_texts = src_texts, tgt_texts
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab
        self.max_len = max_len

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src_texts)

    def __getitem__(self, idx):
        """Return the truncated token lists of pair `idx`."""
        # Two positions are held back on the target side for <sos>/<eos>
        tgt_limit = self.max_len - 2
        src_tokens = tokenize(self.src_texts[idx])
        tgt_tokens = tokenize(self.tgt_texts[idx])
        return src_tokens[:self.max_len], tgt_tokens[:tgt_limit]


# -----------------------------
# Fungsi Collate untuk batching
# -----------------------------
def collate_fn(batch, src_vocab, tgt_vocab):
    """Turn a list of (src_tokens, tgt_tokens) pairs into padded batch tensors.

    Args:
        batch: list of ``(src_tokens, tgt_tokens)`` pairs of string tokens.
        src_vocab: source vocabulary (needs ``stoi`` and ``tokens_to_ids``).
        tgt_vocab: target vocabulary (needs ``stoi`` and ``tokens_to_ids``).

    Returns:
        A tuple ``(src, tgt_in, tgt_out, src_pad_mask, tgt_pad_mask)``:
          * src: LongTensor (max_src_len, batch) of source ids;
          * tgt_in: LongTensor (max_tgt_len, batch), ``<sos>`` + target ids;
          * tgt_out: LongTensor (max_tgt_len, batch), target ids + ``<eos>``;
          * src_pad_mask / tgt_pad_mask: bool tensors (batch, seq_len), True at
            padded positions of ``src`` / ``tgt_in``.
    """
    # Each side is padded with its OWN vocabulary's <pad> id. Using the source
    # id for the target only works while both vocabularies happen to give
    # <pad> the same index.
    src_pad_id = src_vocab.stoi["<pad>"]
    tgt_pad_id = tgt_vocab.stoi["<pad>"]
    sos_id = tgt_vocab.stoi["<sos>"]
    eos_id = tgt_vocab.stoi["<eos>"]

    src_seqs, tgt_in_seqs, tgt_out_seqs = [], [], []

    max_src_len = max(len(src_tokens) for src_tokens, _ in batch)
    # Each target row gains only one special token (<sos> on the input, <eos>
    # on the output). The +2 is kept so the returned shapes stay unchanged; it
    # leaves one trailing all-pad position in every batch.
    max_tgt_len = max(len(tgt_tokens) for _, tgt_tokens in batch) + 2

    for src_tokens, tgt_tokens in batch:
        # Source: ids padded on the right
        src_ids = src_vocab.tokens_to_ids(src_tokens)
        src_seqs.append(src_ids + [src_pad_id] * (max_src_len - len(src_ids)))

        # Target: decoder input and expected output, shifted by one position
        tgt_ids = tgt_vocab.tokens_to_ids(tgt_tokens)
        tgt_padding = [tgt_pad_id] * (max_tgt_len - len(tgt_ids) - 1)
        tgt_in_seqs.append([sos_id] + tgt_ids + tgt_padding)
        tgt_out_seqs.append(tgt_ids + [eos_id] + tgt_padding)

    # Convert to tensors shaped (seq_len, batch_size)
    src_tensor = torch.LongTensor(src_seqs).transpose(0, 1)
    tgt_in_tensor = torch.LongTensor(tgt_in_seqs).transpose(0, 1)
    tgt_out_tensor = torch.LongTensor(tgt_out_seqs).transpose(0, 1)

    # Padding masks are (batch, seq_len): True at <pad> positions
    src_pad_mask = (src_tensor.transpose(0, 1) == src_pad_id)
    tgt_pad_mask = (tgt_in_tensor.transpose(0, 1) == tgt_pad_id)

    return src_tensor, tgt_in_tensor, tgt_out_tensor, src_pad_mask, tgt_pad_mask


In [None]:
import random

# Shuffle the sentence pairs, then split them 80 / 10 / remainder
pairs = list(zip(data[SRC_COL].tolist(), data[TGT_COL].tolist()))
random.shuffle(pairs)

train_split = int(0.8 * len(pairs))
val_split = int(0.1 * len(pairs))
val_end = train_split + val_split

train_data, val_data, test_data = pairs[:train_split], pairs[train_split:val_end], pairs[val_end:]

# Unzip each split back into parallel source / target tuples
train_src, train_tgt = zip(*train_data)
val_src, val_tgt = zip(*val_data)
test_src, test_tgt = zip(*test_data)

# Build the datasets
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(split_src, split_tgt, src_vocab, tgt_vocab)
    for split_src, split_tgt in ((train_src, train_tgt), (val_src, val_tgt), (test_src, test_tgt))
)

# Build the DataLoaders; the collate function is bound to the two vocabularies
def collate(batch):
    return collate_fn(batch, src_vocab, tgt_vocab)

# Only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)

print(f"Train batches: {len(train_loader)} | Val batches: {len(val_loader)} | Test batches: {len(test_loader)}")


Train batches: 2908 | Val batches: 364 | Test batches: 364


## Arsitektur Transformer - Definisi Class

### Class `PositionalEncoding`

Positional Encoding menambahkan informasi posisi ke dalam token embeddings karena Transformer tidak memiliki mekanisme urutan bawaan seperti RNN/LSTM.

**Implementasi:**
- Menggunakan sinusoidal functions (sin dan cos) seperti pada paper "Attention is All You Need"
- Formula:
  - PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
  - PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
- Positional encoding ditambahkan langsung ke embedding dengan operasi penjumlahan
- Menggunakan `register_buffer()` agar tensor tidak dianggap sebagai parameter yang perlu di-train
- Mendukung embedding size ganjil dan genap

**Input/Output:**
- Input: `(seq_len, batch, emb_size)`
- Output: sama dengan input, dengan positional information terintegrasi

---

### Class `TransformerModel`

Class utama yang merangkum seluruh arsitektur Transformer untuk task machine translation.

**Komponen utama:**

1. **Token Embeddings:**
   - `src_tok_emb`: Embedding layer untuk source language (English)
   - `tgt_tok_emb`: Embedding layer untuk target language (Russian)
   - Embedding di-scale dengan √(emb_size) untuk stabilitas training

2. **Positional Encoding:**
   - Instance dari `PositionalEncoding` class
   - Diterapkan setelah embedding untuk menambahkan informasi posisi

3. **Transformer Core:**
   - Menggunakan `nn.Transformer` dari PyTorch
   - Konfigurasi:
     - `d_model`: Dimensi embedding (EMB_SIZE = 128)
     - `nhead`: Jumlah attention heads (NHEAD = 4)
     - `num_encoder_layers`: Jumlah layer encoder (1)
     - `num_decoder_layers`: Jumlah layer decoder (1)
     - `dim_feedforward`: Dimensi hidden layer FFN (FFN_HID_DIM = 256)
     - `dropout`: Dropout rate untuk regularisasi (0.1)
     - `batch_first=False`: Format input (seq_len, batch, emb_size)

4. **Generator (Output Layer):**
   - Linear layer yang memetakan output decoder ke vocabulary size
   - Output: logits untuk setiap token di vocabulary target

**Method penting:**

- **`forward()`**: Full forward pass untuk training
  - Input: source sequence, target input sequence, masks
  - Process: embedding → positional encoding → transformer → generator
  - Output: logits dengan shape (T, N, V_tgt)

- **`encode()`**: Hanya menjalankan encoder
  - Digunakan saat inference untuk encode source sentence sekali
  - Return: memory (encoded representation)

- **`decode()`**: Hanya menjalankan decoder
  - Digunakan saat inference untuk generate token secara iteratif
  - Input: target sequence (yang sudah digenerate), memory dari encoder

**Inisialisasi parameter:**
- Menggunakan Xavier uniform initialization untuk stabilitas training
- Diterapkan ke semua parameter dengan dimensi > 1

In [None]:
# -----------------------------
# Positional Encoding + Transformer model
# -----------------------------
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal position table added to token embeddings.

    Even feature columns hold sin(pos * w_i) and odd columns cos(pos * w_i),
    with w_i = exp(-2i * ln(10000) / emb_size). The table is stored as the
    buffer "pe" with shape (maxlen, 1, emb_size), so it is not a parameter.
    """
    def __init__(self, emb_size: int, maxlen: int = 5000):
        super().__init__()
        position = torch.arange(0, maxlen, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_size, 2).float() * (-math.log(10000.0) / emb_size))
        angles = position * div_term  # (maxlen, ceil(emb_size / 2))

        table = torch.zeros(maxlen, emb_size)
        table[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one cosine column fewer than sine
        # columns, so only the first emb_size // 2 frequencies are used.
        table[:, 1::2] = torch.cos(angles[:, :emb_size // 2])

        self.register_buffer("pe", table.unsqueeze(1))  # (maxlen, 1, emb_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (seq_len, batch, emb_size)
        returns: x plus the first seq_len rows of the position table (same shape)
        """
        seq_len = x.size(0)
        return x + self.pe[:seq_len].to(x.device)


class TransformerModel(nn.Module):
    """
    Encoder-decoder translation model built on torch.nn.Transformer.

    Source and target ids go through separate learned embeddings, are scaled
    by sqrt(emb_size), receive positional encodings, and are passed to the
    transformer. A final linear layer maps decoder states to target-vocabulary
    logits. All tensors are sequence-first: (seq_len, batch, ...).
    """
    def __init__(
        self,
        src_vocab_size: int,
        tgt_vocab_size: int,
        emb_size: int = EMB_SIZE,
        nhead: int = NHEAD,
        num_encoder_layers: int = NUM_ENCODER_LAYERS,
        num_decoder_layers: int = NUM_DECODER_LAYERS,
        dim_feedforward: int = FFN_HID_DIM,
        dropout: float = DROPOUT,
        max_len: int = MAX_LEN
    ):
        super().__init__()
        self.emb_size = emb_size

        # Token embeddings and the shared positional table
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size)
        self.pos_enc = PositionalEncoding(emb_size, maxlen=max_len + 5)

        # Sequence-first layout: inputs are (seq_len, batch, emb)
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=False,
        )

        # Projects decoder states to target-vocabulary logits
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for weight in filter(lambda p: p.dim() > 1, self.parameters()):
            nn.init.xavier_uniform_(weight)

    def _embed(self, embedding: nn.Embedding, tokens: torch.LongTensor) -> torch.Tensor:
        """Look up `tokens`, scale by sqrt(emb_size) and add positional encodings."""
        scaled = embedding(tokens) * math.sqrt(self.emb_size)
        return self.pos_enc(scaled)

    def forward(
        self,
        src: torch.LongTensor,
        tgt: torch.LongTensor,
        src_key_padding_mask: torch.BoolTensor = None,
        tgt_key_padding_mask: torch.BoolTensor = None,
        tgt_mask: torch.Tensor = None
    ) -> torch.Tensor:
        """
        src: (S, N) source ids
        tgt: (T, N) decoder input ids (with <sos> at the start)
        returns logits: (T, N, V_tgt)

        The source padding mask is also used as the memory padding mask.
        """
        src_emb = self._embed(self.src_tok_emb, src)   # (S, N, E)
        tgt_emb = self._embed(self.tgt_tok_emb, tgt)   # (T, N, E)

        decoded = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask,
        )  # (T, N, E)

        return self.generator(decoded)  # (T, N, V_tgt)

    def encode(self, src: torch.LongTensor, src_key_padding_mask: torch.BoolTensor):
        """Run only the encoder and return its memory (useful during inference)."""
        return self.transformer.encoder(
            self._embed(self.src_tok_emb, src),
            src_key_padding_mask=src_key_padding_mask,
        )

    def decode(self, tgt: torch.LongTensor, memory: torch.Tensor,
               tgt_key_padding_mask: torch.BoolTensor = None,
               memory_key_padding_mask: torch.BoolTensor = None,
               tgt_mask: torch.Tensor = None):
        """Run only the decoder stack on `tgt` against `memory`; returns decoder states, not logits."""
        return self.transformer.decoder(
            tgt=self._embed(self.tgt_tok_emb, tgt),
            memory=memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )


In [None]:
# -----------------------------
# Mask helper: subsequent mask untuk decoder autoregressive
# -----------------------------
def generate_square_subsequent_mask(sz: int, device=None) -> torch.Tensor:
    """
    Build the causal attention mask for a decoder sequence of length `sz`.

    Args:
        sz: sequence length; the mask has shape (sz, sz).
        device: device to create the mask on. Defaults to the notebook-wide
            DEVICE, which was the only option before this parameter existed.

    Returns:
        Float tensor with -inf strictly above the diagonal (future positions,
        masked) and 0 elsewhere (allowed).
    """
    target_device = DEVICE if device is None else device
    # Allocate directly on the target device instead of building on CPU and copying
    return torch.triu(torch.full((sz, sz), float("-inf"), device=target_device), diagonal=1)


## Training Process - Pelatihan Model

Proses training dilakukan dengan monitoring per-batch untuk memantau performa model secara detail.

### Fungsi `generate_square_subsequent_mask()`
Membuat causal mask (subsequent mask) untuk decoder:
- Mencegah attention melihat token di masa depan (future tokens)
- Menggunakan upper-triangular matrix dengan nilai -inf untuk posisi yang di-mask
- Essential untuk autoregressive generation yang benar

---

### Fungsi `compute_loss_and_acc()`
Fungsi evaluasi untuk validation/test set:

**Proses:**
1. Set model ke mode evaluasi (`model.eval()`)
2. Matikan gradient computation (`torch.no_grad()`)
3. Iterasi melalui semua batch dalam dataloader
4. Untuk setiap batch:
   - Forward pass dengan causal mask
   - Hitung loss (ignore padding tokens)
   - Hitung accuracy dari prediksi yang benar
5. Aggregate loss dan accuracy dari semua batch
6. Return ke mode training

**Metrik:**
- **Loss**: Cross-entropy loss rata-rata per token (excluding padding)
- **Accuracy**: Persentase token yang diprediksi dengan benar

---

### Fungsi `train_one_epoch()`
Fungsi training untuk satu epoch dengan **monitoring per-batch**:

**Alur training per batch:**

1. **Data Loading**: Ambil batch data (src, tgt_in, tgt_out, masks)
2. **Move to Device**: Transfer semua tensor ke GPU/CPU
3. **Create Causal Mask**: Generate subsequent mask untuk decoder
4. **Forward Pass**:
   - Zero gradient dari optimizer
   - Pass data melalui model
   - Flatten logits dan target untuk loss computation
5. **Backward Pass**:
   - Hitung loss dengan CrossEntropyLoss
   - Backpropagation: `loss.backward()`
   - Gradient clipping untuk stabilitas (max norm = 1.0)
   - Update weights: `optimizer.step()`
6. **Compute Training Metrics**:
   - Hitung accuracy batch saat ini
   - Track correct predictions vs total tokens
7. **Validation per Batch**:
   - Jalankan evaluasi pada validation set
   - Dapatkan ValLoss dan ValAcc
8. **Display Metrics**:
   - **TrainLoss**: Loss dari batch saat ini
   - **TrainAcc**: Accuracy dari batch saat ini
   - **ValLoss**: Loss dari seluruh validation set
   - **ValAcc**: Accuracy dari seluruh validation set

**Output format:**
```
Batch 1/450 | TrainLoss: 5.2341 | TrainAcc: 0.1234 | ValLoss: 5.1234 | ValAcc: 0.1456
Batch 2/450 | TrainLoss: 5.0123 | TrainAcc: 0.1567 | ValLoss: 4.9876 | ValAcc: 0.1678
...
```

---

### Fungsi `train_with_logging()`
Versi enhanced dari `train_one_epoch()` dengan logging untuk visualisasi:

**Fitur tambahan:**
- Menyimpan semua metrics ke dictionary untuk plotting
- Log fields: batch number, train_loss, train_acc, val_loss, val_acc
- Return logs untuk visualisasi grafik

### Fungsi `plot_training_logs()`
Visualisasi metrics training:
- **Plot 1**: Training Loss vs Validation Loss per batch
- **Plot 2**: Training Accuracy vs Validation Accuracy per batch
- Membantu mengidentifikasi overfitting atau underfitting
- Monitoring konvergensi model secara visual

In [None]:
# -----------------------------
# Training dan Evaluasi
# -----------------------------
import torch.nn.functional as F

# Fungsi evaluasi (val/test)
def compute_loss_and_acc(model, dataloader, criterion, tgt_vocab, device=DEVICE):
    """Evaluate `model` on every batch of `dataloader`.

    Args:
        model: called as ``model(src, tgt_in, src_key_padding_mask=...,
            tgt_key_padding_mask=..., tgt_mask=...)`` and expected to return logits.
        dataloader: yields ``(src, tgt_in, tgt_out, src_mask, tgt_mask)`` batches.
        criterion: loss applied to flattened logits and targets.
        tgt_vocab: target vocabulary; its "<pad>" id marks positions to ignore.
        device: device the batches and the causal mask are moved to.

    Returns:
        ``(avg_loss, avg_acc)``: loss and token accuracy averaged over all
        non-pad target tokens (0.0 for both when there are none).

    Side effect: the model is left in training mode on return.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    correct_tokens = 0
    pad_id = tgt_vocab.stoi["<pad>"]

    with torch.no_grad():
        for src, tgt_in, tgt_out, src_mask, tgt_mask in dataloader:
            src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
            src_mask, tgt_mask = src_mask.to(device), tgt_mask.to(device)

            # Move the causal mask to `device` explicitly: the helper creates it
            # on the global DEVICE, which may differ from the argument.
            causal_mask = generate_square_subsequent_mask(tgt_in.size(0)).to(device)

            logits = model(
                src, tgt_in,
                src_key_padding_mask=src_mask,
                tgt_key_padding_mask=tgt_mask,
                tgt_mask=causal_mask
            )

            logits_flat = logits.reshape(-1, logits.size(-1))
            tgt_flat = tgt_out.reshape(-1)

            non_pad = tgt_flat != pad_id
            n_tokens = non_pad.sum().item()

            # Weight the batch loss by its token count.
            # NOTE(review): assumes `criterion` averages over non-pad tokens
            # (mean reduction with ignore_index=pad_id) — confirm where it is built.
            loss = criterion(logits_flat, tgt_flat)
            total_loss += loss.item() * n_tokens

            preds = logits_flat.argmax(dim=1)
            correct_tokens += (preds[non_pad] == tgt_flat[non_pad]).sum().item()
            total_tokens += n_tokens

    # max(..., 1) avoids division by zero on an empty loader
    avg_loss = total_loss / max(total_tokens, 1)
    avg_acc = correct_tokens / max(total_tokens, 1)
    model.train()
    return avg_loss, avg_acc



# Fungsi training 1 epoch
def train_one_epoch(model, optimizer, criterion, train_loader, val_loader, tgt_vocab, device=DEVICE):
    """Train `model` for one pass over `train_loader`, printing metrics per batch.

    After every optimizer step the whole `val_loader` is evaluated with
    compute_loss_and_acc, and one line with the batch's training loss and
    accuracy plus the validation loss and accuracy is printed.

    Args:
        model: the network to train.
        optimizer: optimizer stepping the model parameters.
        criterion: loss applied to flattened logits and targets.
        train_loader / val_loader: yield ``(src, tgt_in, tgt_out, src_mask, tgt_mask)``.
        tgt_vocab: target vocabulary; its "<pad>" id is excluded from accuracy.
        device: device the batches and the causal mask are moved to.
    """
    model.train()
    pad_id = tgt_vocab.stoi["<pad>"]
    num_batches = len(train_loader)

    for batch_idx, (src, tgt_in, tgt_out, src_mask, tgt_mask) in enumerate(train_loader, start=1):
        src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
        src_mask, tgt_mask = src_mask.to(device), tgt_mask.to(device)

        # Move the causal mask to `device` explicitly: the helper creates it on
        # the global DEVICE, which may differ from the argument.
        causal_mask = generate_square_subsequent_mask(tgt_in.size(0)).to(device)

        optimizer.zero_grad()
        logits = model(
            src, tgt_in,
            src_key_padding_mask=src_mask,
            tgt_key_padding_mask=tgt_mask,
            tgt_mask=causal_mask
        )

        # reshape (not view): tgt_out is a transposed, non-contiguous tensor
        logits_flat = logits.reshape(-1, logits.size(-1))
        tgt_flat = tgt_out.reshape(-1)

        loss = criterion(logits_flat, tgt_flat)

        loss.backward()
        # Clip the gradient norm to 1.0 before stepping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Token accuracy of this batch, ignoring padded positions
        with torch.no_grad():
            preds = logits_flat.argmax(dim=1)
            non_pad = tgt_flat != pad_id
            correct = (preds[non_pad] == tgt_flat[non_pad]).sum().item()
            total = non_pad.sum().item()
            train_acc = correct / max(total, 1)

        # Full validation pass after every training batch
        val_loss, val_acc = compute_loss_and_acc(model, val_loader, criterion, tgt_vocab, device)

        print(f"Batch {batch_idx}/{num_batches} | "
              f"TrainLoss: {loss.item():.4f} | TrainAcc: {train_acc:.4f} | "
              f"ValLoss: {val_loss:.4f} | ValAcc: {val_acc:.4f}")



In [None]:
import matplotlib.pyplot as plt

# -----------------------------
# Training dengan logging hasil per batch
# -----------------------------
def train_with_logging(model, optimizer, criterion, train_loader, val_loader, tgt_vocab, device=DEVICE):
    """Train for one pass over `train_loader` and record per-batch metrics.

    After every optimizer step the whole `val_loader` is evaluated with
    compute_loss_and_acc; the metrics are printed and appended to the log.

    Args:
        model: the network to train.
        optimizer: optimizer stepping the model parameters.
        criterion: loss applied to flattened logits and targets.
        train_loader / val_loader: yield ``(src, tgt_in, tgt_out, src_mask, tgt_mask)``.
        tgt_vocab: target vocabulary; its "<pad>" id is excluded from accuracy.
        device: device the batches and the causal mask are moved to.

    Returns:
        dict with keys "batch", "train_loss", "train_acc", "val_loss" and
        "val_acc", each a list holding one entry per training batch.
    """
    model.train()
    pad_id = tgt_vocab.stoi["<pad>"]
    num_batches = len(train_loader)

    # Per-batch metric history
    logs = {
        "batch": [],
        "train_loss": [],
        "train_acc": [],
        "val_loss": [],
        "val_acc": []
    }

    for batch_idx, (src, tgt_in, tgt_out, src_mask, tgt_mask) in enumerate(train_loader, start=1):
        src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
        src_mask, tgt_mask = src_mask.to(device), tgt_mask.to(device)

        # Move the causal mask to `device` explicitly: the helper creates it on
        # the global DEVICE, which may differ from the argument.
        causal_mask = generate_square_subsequent_mask(tgt_in.size(0)).to(device)

        optimizer.zero_grad()
        logits = model(
            src, tgt_in,
            src_key_padding_mask=src_mask,
            tgt_key_padding_mask=tgt_mask,
            tgt_mask=causal_mask
        )

        # reshape, not view: tgt_out comes out of collate_fn transposed and is
        # therefore non-contiguous, on which view(-1) raises a RuntimeError.
        logits_flat = logits.reshape(-1, logits.size(-1))
        tgt_flat = tgt_out.reshape(-1)
        loss = criterion(logits_flat, tgt_flat)

        loss.backward()
        # Clip the gradient norm to 1.0 before stepping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Token accuracy of this batch, ignoring padded positions
        with torch.no_grad():
            preds = logits_flat.argmax(dim=1)
            non_pad = tgt_flat != pad_id
            correct = (preds[non_pad] == tgt_flat[non_pad]).sum().item()
            total = non_pad.sum().item()
            train_acc = correct / max(total, 1)

        # Full validation pass after every training batch
        val_loss, val_acc = compute_loss_and_acc(model, val_loader, criterion, tgt_vocab, device)

        train_loss = loss.item()
        logs["batch"].append(batch_idx)
        logs["train_loss"].append(train_loss)
        logs["train_acc"].append(train_acc)
        logs["val_loss"].append(val_loss)
        logs["val_acc"].append(val_acc)

        print(f"Batch {batch_idx}/{num_batches} | "
              f"TrainLoss: {train_loss:.4f} | TrainAcc: {train_acc:.4f} | "
              f"ValLoss: {val_loss:.4f} | ValAcc: {val_acc:.4f}")

    return logs


# -----------------------------
# Plot loss & accuracy curves from the training logs
# -----------------------------
def plot_training_logs(logs):
    """Draw per-batch loss and accuracy curves in two side-by-side panels.

    Args:
        logs: Mapping with the keys ``"batch"``, ``"train_loss"``,
            ``"val_loss"``, ``"train_acc"`` and ``"val_acc"``. Every series is
            plotted against ``logs["batch"]``.

    The left panel shows train/validation loss, the right panel shows
    train/validation accuracy. The figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    # One row per panel, left to right:
    # (y-axis label, train key, train legend, val key, val legend, title)
    panels = (
        ("Loss", "train_loss", "Train Loss", "val_loss", "Val Loss",
         "Training & Validation Loss per Batch"),
        ("Accuracy", "train_acc", "Train Accuracy", "val_acc", "Val Accuracy",
         "Training & Validation Accuracy per Batch"),
    )

    plt.figure(figsize=(12, 5))
    for position, panel in enumerate(panels, start=1):
        y_label, train_key, train_legend, val_key, val_legend, heading = panel
        plt.subplot(1, 2, position)
        plt.plot(logs["batch"], logs[train_key], label=train_legend)
        plt.plot(logs["batch"], logs[val_key], label=val_legend)
        plt.xlabel("Batch")
        plt.ylabel(y_label)
        plt.title(heading)
        plt.legend()
    plt.tight_layout()
    plt.show()


## Model Inference - Proses Translation

Setelah model di-train, tahap inference digunakan untuk melakukan translation dari English ke Russian.

### Fungsi `translate_sentence()`
Implementasi **greedy decoding** untuk generate translation secara autoregressive.

**Alur proses inference:**

#### 1. Preprocessing Input
```python
tokens = tokenize(sentence)[:max_len]
src_ids = src_vocab.tokens_to_ids(tokens)
src_tensor = torch.LongTensor([src_ids]).transpose(0, 1).to(device)
```
- Tokenisasi kalimat input
- Konversi token ke ID numerik
- Buat tensor dengan shape (seq_len, 1)
- Transfer ke device (GPU/CPU)

#### 2. Encode Source Sentence (Sekali)
```python
memory = model.encode(src_tensor, src_mask)
```
- Jalankan encoder (dengan `src_mask` sebagai padding mask untuk source) guna mendapatkan representation dari source sentence
- Memory ini akan digunakan berulang kali di decoder
- Efisien karena encoding hanya dilakukan sekali

#### 3. Greedy Decoding (Iteratif)
Proses generate token satu per satu:

**Inisialisasi:**
- Mulai dengan `<sos>` token sebagai input decoder
- `ys = [[<sos>]]`

**Loop untuk setiap posisi (max_len):**

a. **Generate Causal Mask**
   - Buat subsequent mask sesuai panjang sequence saat ini
   - Mencegah attention ke future positions

b. **Decode**
   - Pass current sequence melalui decoder
   - Input: sequence yang sudah digenerate, memory dari encoder
   - Output: hidden states

c. **Generate Logits**
   - Pass hidden state terakhir melalui generator layer
   - Dapatkan logits (skor yang belum dinormalisasi) untuk setiap token di vocabulary

d. **Select Next Token (Greedy)**
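   - Pilih token dengan skor tertinggi: `argmax(logits)` (sama dengan memilih probability tertinggi, karena softmax tidak mengubah urutan skor)
   - Ini adalah strategi "greedy" - selalu pilih token terbaik pada langkah saat ini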

e. **Append Token**
   - Tambahkan token baru ke sequence
   - `ys = torch.cat([ys, next_token])`

f. **Check Stopping Condition**
   - Jika token adalah `<eos>`, stop generation
   - Atau jika sudah mencapai max_len

#### 4. Post-processing Output
```python
out_ids.remove(tgt_vocab.stoi["<sos>"])
out_ids = out_ids[:out_ids.index(tgt_vocab.stoi["<eos>"])]
tokens_out = tgt_vocab.ids_to_tokens(out_ids)
return " ".join(tokens_out)
```
- Hapus `<sos>` token dari hasil
- Potong sequence mulai dari `<eos>` token, sehingga `<eos>` dan semua token setelahnya dibuang
- Konversi ID kembali ke token strings
- Join token menjadi kalimat lengkap

**Karakteristik Greedy Decoding:**
- ✅ Cepat dan deterministik
- ✅ Sederhana untuk diimplementasikan
- ❌ Tidak selalu menghasilkan translation terbaik secara global
- ❌ Tidak bisa "melihat ke belakang" untuk memperbaiki pilihan sebelumnya

**Alternative strategies** (tidak diimplementasikan di sini):
- **Beam Search**: Maintain top-k candidates untuk hasil lebih baik
- **Sampling**: Random sampling dari distribution untuk variasi output
- **Temperature**: Control randomness dalam generation

In [None]:
# -----------------------------
# Translation function (greedy decoding)
# -----------------------------
def translate_sentence(model, sentence, src_vocab, tgt_vocab, max_len=MAX_LEN, device=DEVICE):
    """Translate a single sentence with greedy autoregressive decoding.

    Args:
        model: Model exposing ``encode``, ``decode`` and ``generator``; it is
            switched to eval mode by this call.
        sentence: Raw source sentence; it is passed through ``tokenize``.
        src_vocab: Source vocabulary providing ``tokens_to_ids`` and ``stoi``.
        tgt_vocab: Target vocabulary providing ``ids_to_tokens`` and ``stoi``
            (with ``<sos>``, ``<eos>`` and ``<pad>`` entries).
        max_len: Cap on the number of source tokens kept and on the number of
            decoding steps.
        device: Device on which the tensors built here are placed.

    Returns:
        The generated target tokens joined by single spaces, without the
        leading ``<sos>`` and cut at the first ``<eos>``.
    """
    model.eval()

    sos_id = tgt_vocab.stoi["<sos>"]
    eos_id = tgt_vocab.stoi["<eos>"]
    tgt_pad_id = tgt_vocab.stoi["<pad>"]

    source_tokens = tokenize(sentence)[:max_len]
    source_ids = src_vocab.tokens_to_ids(source_tokens)
    # Lay the source out as a column: (1, seq_len) -> (seq_len, 1).
    src_tensor = torch.LongTensor([source_ids]).transpose(0, 1).to(device)
    # Padding mask in (1, seq_len) layout; True marks <pad> positions.
    src_mask = src_tensor.transpose(0, 1) == src_vocab.stoi["<pad>"]

    with torch.no_grad():
        # The source is encoded once and reused at every decoding step.
        memory = model.encode(src_tensor, src_mask)
        ys = torch.LongTensor([[sos_id]]).to(device)

        for _ in range(max_len):
            causal_mask = generate_square_subsequent_mask(ys.size(0))
            decoded = model.decode(
                ys, memory,
                tgt_key_padding_mask=ys.transpose(0, 1) == tgt_pad_id,
                memory_key_padding_mask=src_mask,
                tgt_mask=causal_mask,
            )
            # Only the newest position is needed to choose the next token.
            last_step = decoded.transpose(0, 1)[:, -1, :]
            next_id = model.generator(last_step).argmax(dim=1).item()
            next_column = torch.LongTensor([[next_id]]).to(device)
            ys = torch.cat([ys, next_column], dim=0)
            if next_id == eos_id:
                break

    generated = ys.squeeze(1).tolist()
    # Drop the first <sos> occurrence (the seed token placed at position 0).
    if sos_id in generated:
        del generated[generated.index(sos_id)]
    # Keep only what precedes the first <eos>.
    if eos_id in generated:
        generated = generated[:generated.index(eos_id)]

    return " ".join(tgt_vocab.ids_to_tokens(generated))


In [None]:
# -----------------------------
# Main training loop
# -----------------------------
def main():
    """Run the full pipeline: load data, train, evaluate, print sample translations.

    Reads the parallel corpus from ``DATA_PATH``, shuffles it, and splits it
    into train (80%), validation (10%) and test (the remainder). A
    ``TransformerModel`` is trained for ``NUM_EPOCHS`` epochs, test
    loss/accuracy are printed, and a few example sentences are translated
    with ``translate_sentence``.

    Uses the module-level ``src_vocab`` and ``tgt_vocab``.

    Returns:
        tuple: ``(model, history)`` where ``model`` is the trained model and
        ``history`` is a list with one entry per epoch, each being the log
        dict returned by ``train_one_epoch`` (suitable for
        ``plot_training_logs``). Callers that ignore the return value are
        unaffected.
    """
    print("Loading dataset...")
    df = pd.read_csv(DATA_PATH)
    src_texts = df[SRC_COL].tolist()
    tgt_texts = df[TGT_COL].tolist()

    # Re-split the data.
    # NOTE(review): the shuffle is not seeded in this function, so unless a
    # seed is set elsewhere the split differs between runs — confirm.
    pairs = list(zip(src_texts, tgt_texts))
    random.shuffle(pairs)
    train_split = int(0.8 * len(pairs))
    val_split = int(0.1 * len(pairs))

    train_data = pairs[:train_split]
    val_data = pairs[train_split:train_split + val_split]
    test_data = pairs[train_split + val_split:]

    train_src, train_tgt = zip(*train_data)
    val_src, val_tgt = zip(*val_data)
    test_src, test_tgt = zip(*test_data)

    # Datasets and DataLoaders
    train_dataset = TranslationDataset(train_src, train_tgt, src_vocab, tgt_vocab)
    val_dataset = TranslationDataset(val_src, val_tgt, src_vocab, tgt_vocab)
    test_dataset = TranslationDataset(test_src, test_tgt, src_vocab, tgt_vocab)

    def collate(batch):
        """Bind the vocabularies so the DataLoader can call collate_fn(batch)."""
        return collate_fn(batch, src_vocab, tgt_vocab)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)

    # Model setup
    model = TransformerModel(len(src_vocab), len(tgt_vocab)).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    pad_id = tgt_vocab.stoi["<pad>"]
    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

    # Train for NUM_EPOCHS epochs, keeping each epoch's logs instead of
    # discarding them so they can be plotted afterwards.
    print("Starting training...\n")
    history = []
    for epoch in range(1, NUM_EPOCHS + 1):
        print(f"=== Epoch {epoch}/{NUM_EPOCHS} ===")
        epoch_logs = train_one_epoch(model, optimizer, criterion, train_loader, val_loader, tgt_vocab)
        history.append(epoch_logs)

    # Final evaluation on the held-out test split
    test_loss, test_acc = compute_loss_and_acc(model, test_loader, criterion, tgt_vocab)
    print(f"\nFinal Test Loss: {test_loss:.4f} | Final Test Accuracy: {test_acc:.4f}")

    # Inference examples
    print("\n--- Translation Examples ---")
    examples = [
        "Hello, how are you?",
        "I love programming and coffee.",
        "This is a simple test."
    ]
    for ex in examples:
        translation = translate_sentence(model, ex, src_vocab, tgt_vocab)
        print(f"English: {ex}")
        print(f"Russian: {translation}\n")

    # Hand back the trained model and logs so later cells can reuse them.
    return model, history

# Run the full pipeline (training, evaluation, sample translations) when this
# code is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()


Loading dataset...




Starting training...

=== Epoch 1/1 ===




Batch 1/2908 | TrainLoss: 10.3474 | TrainAcc: 0.0000 | ValLoss: 10.3330 | ValAcc: 0.0000
Batch 2/2908 | TrainLoss: 10.3341 | TrainAcc: 0.0016 | ValLoss: 10.3111 | ValAcc: 0.0000
Batch 3/2908 | TrainLoss: 10.3074 | TrainAcc: 0.0000 | ValLoss: 10.2892 | ValAcc: 0.0001
Batch 4/2908 | TrainLoss: 10.2875 | TrainAcc: 0.0049 | ValLoss: 10.2676 | ValAcc: 0.0228
Batch 5/2908 | TrainLoss: 10.2743 | TrainAcc: 0.0179 | ValLoss: 10.2464 | ValAcc: 0.0923
Batch 6/2908 | TrainLoss: 10.2572 | TrainAcc: 0.0580 | ValLoss: 10.2257 | ValAcc: 0.1447
Batch 7/2908 | TrainLoss: 10.2254 | TrainAcc: 0.1302 | ValLoss: 10.2055 | ValAcc: 0.1601
Batch 8/2908 | TrainLoss: 10.2193 | TrainAcc: 0.1564 | ValLoss: 10.1859 | ValAcc: 0.1606
Batch 9/2908 | TrainLoss: 10.1997 | TrainAcc: 0.1616 | ValLoss: 10.1670 | ValAcc: 0.1606
Batch 10/2908 | TrainLoss: 10.1825 | TrainAcc: 0.1493 | ValLoss: 10.1486 | ValAcc: 0.1606
Batch 11/2908 | TrainLoss: 10.1526 | TrainAcc: 0.1634 | ValLoss: 10.1307 | ValAcc: 0.1606
Batch 12/2908 | Tra