In [1]:
import math
import torch
import torch.nn as nn

class EmbeddingWithProjection(nn.Module):
    def __init__(self, vocab_size, d_embed, d_model,  
                 max_position_embeddings =512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_embed = d_embed
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(self.vocab_size, self.d_embed)
        self.projection = nn.Linear(self.d_embed, self.d_model)
        self.scaling = float(math.sqrt(self.d_model))

        self.layernorm = nn.LayerNorm(self.d_model)
        self.dropout = nn.Dropout(p=dropout)

    @staticmethod
    def create_positional_encoding(seq_length, d_model, batch_size=1):
        # Create position indices: [seq_length, 1]
        position = torch.arange(seq_length).unsqueeze(1).float()
        
        # Create dimension indices: [1, d_model//2]
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * 
            (-math.log(10000.0) / d_model)
        )
        
        # Create empty tensor: [seq_length, d_model]
        pe = torch.zeros(seq_length, d_model)
        
        # Compute sin and cos
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        # Add batch dimension and expand: [batch_size, seq_length, d_model]
        pe = pe.unsqueeze(0).expand(batch_size, -1, -1)
        
        return pe

    
    
    def forward(self, x):
        assert x.dtype == torch.long, f"Input tensor must have dtype torch.long, got {x.dtype}"
        batch_size, seq_length = x.size() # [batch, seq_length]

        # token embedding
        token_embedding = self.embedding(x)                                                            #[2, 16, 1024]     
        # project the scaled token embedding to the d_model space
        token_embedding =  self.projection(token_embedding) * self.scaling                             #[2, 16, 768]

        # add positional encodings to projected, 
        # scaled embeddings before applying layer norm and dropout.
        device = x.device  # lấy thiết bị của input (CPU/GPU)
        positional_encoding = self.create_positional_encoding(seq_length, self.d_model, batch_size).to(device)
            #[2, 16, 768]
        
        # In addition, we apply dropout to the sums of the embeddings 
        # in both the encoder and decoder stacks. For the base model, we use a rate of Pdrop = 0.1.
        normalized_sum = self.layernorm(token_embedding + positional_encoding)
        final_output = self.dropout(normalized_sum)
        return final_output

In [2]:
class TransformerAttention(nn.Module):
    """
    Transformer Scaled Dot Product Attention Module
    Args:
        d_model: Total dimension of the model.
        num_head: Number of attention heads.
        dropout: Dropout rate for attention scores.
        bias: Whether to include bias in linear projections.

    Inputs:
        sequence: input sequence for self-attention and the query for cross-attention
        key_value_state: input for the key, values for cross-attention
    """
    def __init__(self, d_model, num_head, dropout=0.1, bias=True): # infer d_k, d_v, d_q from d_model
        super().__init__()  # Missing in the original implementation
        assert d_model % num_head == 0, "d_model must be divisible by num_head"
        self.d_model = d_model
        self.num_head = num_head
        self.d_head=d_model//num_head
        self.dropout_rate = dropout  # Store dropout rate separately

        # linear transformations
        self.q_proj = nn.Linear(d_model, d_model, bias=bias)
        self.k_proj = nn.Linear(d_model, d_model, bias=bias)
        self.v_proj = nn.Linear(d_model, d_model, bias=bias)
        self.output_proj = nn.Linear(d_model, d_model, bias=bias)

        # Dropout layer
        self.dropout = nn.Dropout(p=dropout)

        # Initiialize scaler
        self.scaler = float(1.0 / math.sqrt(self.d_head)) # Store as float in initialization
        

    def forward(self, sequence, key_value_states = None, att_mask=None):
        """Input shape: [batch_size, seq_len, d_model=num_head * d_head]"""
        batch_size, seq_len, model_dim = sequence.size()

        # Check only critical input dimensions
        assert model_dim == self.d_model, f"Input dimension {model_dim} doesn't match model dimension {self.d_model}"
        if key_value_states is not None:
            assert key_value_states.size(-1) == self.d_model, \
            f"Cross attention key/value dimension {key_value_states.size(-1)} doesn't match model dimension {self.d_model}"


        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        
        # Linear projections and reshape for multi-head
        Q_state = self.q_proj(sequence)
        if is_cross_attention:
            kv_seq_len = key_value_states.size(1)
            K_state = self.k_proj(key_value_states)
            V_state = self.v_proj(key_value_states)
        else:
            kv_seq_len = seq_len
            K_state = self.k_proj(sequence)
            V_state = self.v_proj(sequence)

        #[batch_size, self.num_head, seq_len, self.d_head]
        Q_state = Q_state.view(batch_size, seq_len, self.num_head, self.d_head).transpose(1,2) 
            
        # in cross-attention, key/value sequence length might be different from query sequence length
        K_state = K_state.view(batch_size, kv_seq_len, self.num_head, self.d_head).transpose(1,2)
        V_state = V_state.view(batch_size, kv_seq_len, self.num_head, self.d_head).transpose(1,2)

        # Scale Q by 1/sqrt(d_k)
        Q_state = Q_state * self.scaler
    
    
        # Compute attention matrix: QK^T
        self.att_matrix = torch.matmul(Q_state, K_state.transpose(-1,-2)) 

    
        # apply attention mask to attention matrix
        if att_mask is not None and not isinstance(att_mask, torch.Tensor):
            raise TypeError("att_mask must be a torch.Tensor")

        if att_mask is not None:
            self.att_matrix = self.att_matrix + att_mask
        
        # apply softmax to the last dimension to get the attention score: softmax(QK^T)
        att_score = F.softmax(self.att_matrix, dim = -1)
    
        # apply drop out to attention score
        att_score = self.dropout(att_score)
    
        # get final output: softmax(QK^T)V
        att_output = torch.matmul(att_score, V_state)
    
        # concatinate all attention heads
        att_output = att_output.transpose(1, 2)
        att_output = att_output.contiguous().view(batch_size, seq_len, self.num_head*self.d_head) 
    
        # final linear transformation to the concatenated output
        att_output = self.output_proj(att_output)

        assert att_output.size() == (batch_size, seq_len, self.d_model), \
        f"Final output shape {att_output.size()} incorrect"

        return att_output

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class FFN(nn.Module):
    """
    Position-wise Feed-Forward Networks
    This consists of two linear transformations with a ReLU activation in between.
    
    FFN(x) = max(0, xW1 + b1 )W2 + b2
    d_model: embedding dimension (e.g., 512)
    d_ff: feed-forward dimension (e.g., 2048)
    
    """
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.d_model=d_model
        self.d_ff= d_ff
        
        # Linear transformation y = xW+b
        self.fc1 = nn.Linear(self.d_model, self.d_ff, bias = True)
        self.fc2 = nn.Linear(self.d_ff, self.d_model, bias = True)
        
        # for potential speed up
        # Pre-normalize the weights (can help with training stability)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)


    def forward(self, input):
        # check input and first FF layer dimension matching
        batch_size, seq_length, d_input = input.size()
        assert self.d_model == d_input, "d_model must be the same dimension as the input"

        # First linear transformation followed by ReLU
        # There's no need for explicit torch.max() as F.relu() already implements max(0,x)
        f1 = F.relu(self.fc1(input))

        # max(0, xW_1 + b_1)W_2 + b_2 
        f2 =  self.fc2(f1)

        return f2

        

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerEncoder(nn.Module):
    """
    Encoder layer of the Transformer
    Sublayers: TransformerAttention
               Residual LayerNorm
               FNN
               Residual LayerNorm
    Args:
            d_model: 512 model hidden dimension
            d_embed: 512 embedding dimension, same as d_model in transformer framework
            d_ff: 2048 hidden dimension of the feed forward network
            num_head: 8 Number of attention heads.
            dropout:  0.1 dropout rate 
            
            bias: Whether to include bias in linear projections.
              
    """

    def __init__(
        self, d_model, d_ff,
        num_head, dropout=0.1,
        bias=True
    ):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        
        # attention sublayer
        self.att = TransformerAttention(
            d_model = d_model,
            num_head = num_head,
            dropout = dropout,
            bias = bias
        )
        
        # FFN sublayer
        self.ffn = FFN(
            d_model = d_model,
            d_ff = d_ff
        )

        # Dropout layer
        self.dropout = nn.Dropout(p=dropout)

        # layer-normalization layer
        self.LayerNorm_att = nn.LayerNorm(self.d_model)
        self.LayerNorm_ffn = nn.LayerNorm(self.d_model)

        
    def forward(self, embed_input, padding_mask=None):
       
        batch_size, seq_len, _ = embed_input.size()
        
        ## First sublayer: self attion 
        att_sublayer = self.att(sequence = embed_input, key_value_states = None, 
                                att_mask = padding_mask)  # [batch_size, sequence_length, d_model]
        
        # apply dropout before layer normalization for each sublayer
        att_sublayer = self.dropout(att_sublayer)
        # Residual layer normalization
        att_normalized = self.LayerNorm_att(embed_input + att_sublayer)           # [batch_size, sequence_length, d_model]
        
        ## Second sublayer: FFN
        ffn_sublayer = self.ffn(att_normalized)                                   # [batch_size, sequence_length, d_model]
        ffn_sublayer = self.dropout(ffn_sublayer)
        ffn_normalized = self.LayerNorm_ffn(att_normalized + ffn_sublayer )       # [batch_size, sequence_length, d_model]
    

        return ffn_normalized
net = TransformerEncoder( d_model = 512, d_ff =2048, num_head=8, dropout=0.1, bias=True )
print(net)

TransformerEncoder(
  (att): TransformerAttention(
    (q_proj): Linear(in_features=512, out_features=512, bias=True)
    (k_proj): Linear(in_features=512, out_features=512, bias=True)
    (v_proj): Linear(in_features=512, out_features=512, bias=True)
    (output_proj): Linear(in_features=512, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ffn): FFN(
    (fc1): Linear(in_features=512, out_features=2048, bias=True)
    (fc2): Linear(in_features=2048, out_features=512, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (LayerNorm_att): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (LayerNorm_ffn): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerDecoder(nn.Module):
    """
    Decoder layer of the Transformer
    Sublayers: TransformerAttention with self-attention
               Residual LayerNorm
               TransformerAttention with cross-attention
               Residual LayerNorm
               FNN
               Residual LayerNorm
    Args:
            d_model: 512 model hidden dimension
            d_embed: 512 embedding dimension, same as d_model in transformer framework
            d_ff: 2048 hidden dimension of the feed forward network
            num_head: 8 Number of attention heads.
            dropout:  0.1 dropout rate 
            
            bias: Whether to include bias in linear projections.
              
    """

    def __init__(
        self, d_model, d_ff,
        num_head, dropout=0.1,
        bias=True
    ):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        
        # attention sublayer
        self.att = TransformerAttention(
            d_model = d_model,
            num_head = num_head,
            dropout = dropout,
            bias = bias
        )
        
        # FFN sublayer
        self.ffn = FFN(
            d_model = d_model,
            d_ff = d_ff
        )

        
        # Dropout layer
        self.dropout = nn.Dropout(p=dropout)

        # layer-normalization layer
        self.LayerNorm_att1 = nn.LayerNorm(self.d_model)
        self.LayerNorm_att2 = nn.LayerNorm(self.d_model)
        self.LayerNorm_ffn = nn.LayerNorm(self.d_model)

    @staticmethod
    def create_causal_mask(seq_len):
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
        mask = mask.masked_fill(mask == 1, float('-inf'))
        return mask

    
    def forward(self, embed_input, cross_input, padding_mask=None):
        """
        Args:
        embed_input: Decoder input sequence [batch_size, seq_len, d_model]
        cross_input: Encoder output sequence [batch_size, encoder_seq_len, d_model]
        casual_attention_mask: Causal mask for self-attention [batch_size, seq_len, seq_len]
        padding_mask: Padding mask for cross-attention [batch_size, seq_len, encoder_seq_len]
        Returns:
        Tensor: Decoded output [batch_size, seq_len, d_model]
        """
        batch_size, seq_len, _ = embed_input.size()
        
        assert embed_input.size(-1) == self.d_model, f"Input dimension {embed_input.size(-1)} doesn't match model dimension {self.d_model}"
        assert cross_input.size(-1) == self.d_model, "Encoder output dimension doesn't match model dimension"


        # Generate and expand causal mask for self-attention
        causal_mask = self.create_causal_mask(seq_len).to(embed_input.device)  # [seq_len, seq_len]
        causal_mask = causal_mask.unsqueeze(0).unsqueeze(1)  # [1, 1, seq_len, seq_len]


        ## First sublayer: self attion 
        # After embedding and positional encoding, input sequence feed into current attention sublayer
        # Or, the output of the previous encoder/decoder feed into current attention sublayer
        att_sublayer1 = self.att(sequence = embed_input, key_value_states = None, 
                                att_mask = causal_mask)  # [batch_size, num_head, sequence_length, d_model]
        # apply dropout before layer normalization for each sublayer
        att_sublayer1 = self.dropout(att_sublayer1)
        # Residual layer normalization
        att_normalized1 = self.LayerNorm_att1(embed_input + att_sublayer1)           # [batch_size, sequence_length, d_model]

        ## Second sublayer: cross attention
        # Query from the output of previous attention output, or training data
        # Key, Value from output of Encoder of the same layer
        att_sublayer2 = self.att(sequence = att_normalized1, key_value_states = cross_input, 
                                att_mask = padding_mask)  # [batch_size, sequence_length, d_model]
        # apply dropout before layer normalization for each sublayer
        att_sublayer2 = self.dropout(att_sublayer2)
        # Residual layer normalization
        att_normalized2 = self.LayerNorm_att2(att_normalized1 + att_sublayer2)           # [batch_size, sequence_length, d_model]
        
        
        # Third sublayer: FFN
        ffn_sublayer = self.ffn(att_normalized2)                                   # [batch_size, sequence_length, d_model]
        ffn_sublayer = self.dropout(ffn_sublayer)
        ffn_normalized = self.LayerNorm_ffn(att_normalized2 + ffn_sublayer )       # [batch_size, sequence_length, d_model]
    

        return ffn_normalized
net = TransformerDecoder( d_model = 512, d_ff =2048, num_head=8, dropout=0.1, bias=True )
print(net)

TransformerDecoder(
  (att): TransformerAttention(
    (q_proj): Linear(in_features=512, out_features=512, bias=True)
    (k_proj): Linear(in_features=512, out_features=512, bias=True)
    (v_proj): Linear(in_features=512, out_features=512, bias=True)
    (output_proj): Linear(in_features=512, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ffn): FFN(
    (fc1): Linear(in_features=512, out_features=2048, bias=True)
    (fc2): Linear(in_features=2048, out_features=512, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (LayerNorm_att1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (LayerNorm_att2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (LayerNorm_ffn): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)


In [6]:
class TransformerEncoderDecoder(nn.Module):
    """
    Encoder-Decoder stack of the Transformer
    Sublayers:  Encoder x 6
                Decoder x 6
    Args:
            d_model: 512 model hidden dimension
            d_embed: 512 embedding dimension, same as d_model in transformer framework
            d_ff: 2048 hidden dimension of the feed forward network
            num_head: 8 Number of attention heads.
            dropout:  0.1 dropout rate 
            
            bias: Whether to include bias in linear projections.
              
    """
    def __init__(
        self, num_layer,
        d_model, d_ff,
        num_head, dropout=0.1,
        bias=True
    ):
        super().__init__()
        self.num_layer = num_layer
        self.d_model = d_model
        self.d_ff = d_ff
        self.num_head = num_head
        self.dropout = dropout
        self.bias = bias
        
        # Encoder stack
        self.encoder_stack = nn.ModuleList([ TransformerEncoder(
                                        d_model = self.d_model, 
                                        d_ff = self.d_ff,
                                        num_head = self.num_head, 
                                        dropout = self.dropout,
                                        bias = self.bias) for _ in range(self.num_layer)])

        # Decoder stack
        self.decoder_stack = nn.ModuleList([ TransformerDecoder(
                                        d_model = self.d_model, 
                                        d_ff = self.d_ff,
                                        num_head = self.num_head, 
                                        dropout = self.dropout,
                                        bias = self.bias) for _ in range(self.num_layer)])

    
    def forward(self, embed_encoder_input, embed_decoder_input, padding_mask=None):
        # Process through all encoder layers first
        encoder_output = embed_encoder_input
        for encoder in self.encoder_stack:
            encoder_output = encoder(encoder_output, padding_mask)
        
        # Use final encoder output for all decoder layers
        decoder_output = embed_decoder_input
        for decoder in self.decoder_stack:
            decoder_output = decoder(decoder_output, encoder_output, padding_mask)
        
        return decoder_output

In [7]:
class Transformer(nn.Module):
    def __init__(
        self, 
        num_layer,
        d_model, d_embed, d_ff,
        num_head,
        src_vocab_size, 
        tgt_vocab_size,
        max_position_embeddings=512,
        dropout=0.1,
        bias=True
    ):
        super().__init__()
        
        self.tgt_vocab_size = tgt_vocab_size
        
        # Source and target embeddings
        self.src_embedding = EmbeddingWithProjection(
            vocab_size=src_vocab_size,
            d_embed=d_embed,
            d_model=d_model,
            max_position_embeddings=max_position_embeddings,
            dropout=dropout
        )
        
        self.tgt_embedding = EmbeddingWithProjection(
            vocab_size=tgt_vocab_size,
            d_embed=d_embed,
            d_model=d_model,
            max_position_embeddings=max_position_embeddings,
            dropout=dropout
        )
        
        # Encoder-Decoder stack
        self.encoder_decoder = TransformerEncoderDecoder(
            num_layer=num_layer,
            d_model=d_model,
            d_ff=d_ff,
            num_head=num_head,
            dropout=dropout,
            bias=bias
        )
        
        # Output projection and softmax
        self.output_projection = nn.Linear(d_model, tgt_vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def shift_target_right(self, tgt_tokens):
        # Shift target tokens right by padding with zeros at the beginning
        batch_size, seq_len = tgt_tokens.size()
        
        # Create start token (zeros)
        start_tokens = torch.zeros(batch_size, 1, dtype=tgt_tokens.dtype, device=tgt_tokens.device)
        
        # Concatenate start token and remove last token
        shifted_tokens = torch.cat([start_tokens, tgt_tokens[:, :-1]], dim=1)
        
        return shifted_tokens
        
    def forward(self, src_tokens, tgt_tokens, padding_mask=None):
        """
        Args:
            src_tokens: source sequence [batch_size, src_len]
            tgt_tokens: target sequence [batch_size, tgt_len]
            padding_mask: padding mask [batch_size, 1, 1, seq_len]
        Returns:
            output: [batch_size, tgt_len, tgt_vocab_size] log probabilities
        """
        # Shift target tokens right for teacher forcing
        shifted_tgt_tokens = self.shift_target_right(tgt_tokens)
        
        # Embed source and target sequences
        src_embedding = self.src_embedding(src_tokens)
        tgt_embedding = self.tgt_embedding(shifted_tgt_tokens)
        
        # Pass through encoder-decoder stack
        decoder_output = self.encoder_decoder(
            embed_encoder_input=src_embedding,
            embed_decoder_input=tgt_embedding,
            padding_mask=padding_mask
        )
        
        # Project to vocabulary size and apply log softmax
        logits = self.output_projection(decoder_output)
        log_probs = self.softmax(logits)
        
        return log_probs

In [8]:
import os
import re
from pathlib import Path
from random import Random

BASE_PATH = "/kaggle/input/evb-corpus-news/EVBCorpus_v1/EVBNews"
OUT_DIR = Path("processed_evbcorpus")
OUT_DIR.mkdir(exist_ok=True)

def extract_pairs(file_path):
    """Trích xuất các cặp câu song ngữ <s id='en...'> và <s id='vn...'>"""
    text = Path(file_path).read_text(encoding="utf-8", errors="ignore")

    en_sentences = re.findall(r"<s id='en\d+'>(.*?)</s>", text, flags=re.DOTALL)
    vi_sentences = re.findall(r"<s id='vn\d+'>(.*?)</s>", text, flags=re.DOTALL)

    pairs = []
    for en, vi in zip(en_sentences, vi_sentences):
        en = " ".join(en.strip().split())
        vi = " ".join(vi.strip().split())
        if en and vi:
            pairs.append((en, vi))
    return pairs

all_pairs = []
for filename in sorted(os.listdir(BASE_PATH)):
    if filename.endswith(".sgml"):
        file_path = os.path.join(BASE_PATH, filename)
        all_pairs.extend(extract_pairs(file_path))

print(f"Tổng số cặp câu trích xuất: {len(all_pairs)}")

rnd = Random(42)
rnd.shuffle(all_pairs)
n = len(all_pairs)
n_val = int(0.01 * n)
n_test = int(0.01 * n)

train_pairs = all_pairs[: n - n_val - n_test]
val_pairs = all_pairs[n - n_val - n_test : n - n_test]
test_pairs = all_pairs[n - n_test :]

def write_split(pairs, prefix):
    src_path = OUT_DIR / f"{prefix}.en"
    tgt_path = OUT_DIR / f"{prefix}.vi"
    with open(src_path, "w", encoding="utf-8") as fe, open(tgt_path, "w", encoding="utf-8") as fv:
        for src, tgt in pairs:
            fe.write(src + "\n")
            fv.write(tgt + "\n")

write_split(train_pairs, "train")
write_split(val_pairs, "val")
write_split(test_pairs, "test")

print(f"Train/Val/Test: {len(train_pairs)} / {len(val_pairs)} / {len(test_pairs)}")
print(f"Dữ liệu đã lưu trong: {OUT_DIR.resolve()}")


Tổng số cặp câu trích xuất: 45308
Train/Val/Test: 44402 / 453 / 453
Dữ liệu đã lưu trong: /kaggle/working/processed_evbcorpus


In [9]:
!pip install -q transformers==4.44.2 datasets tqdm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 w

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm

In [None]:
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#Dataset
class TranslationDataset(Dataset):
    def __init__(self, src_path, tgt_path, tokenizer, max_len=64):
        self.src_texts = open(src_path, encoding="utf-8").read().strip().split("\n")
        self.tgt_texts = open(tgt_path, encoding="utf-8").read().strip().split("\n")
        assert len(self.src_texts) == len(self.tgt_texts), "⚠️ Số dòng src/tgt không khớp!"
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src = self.src_texts[idx]
        tgt = self.tgt_texts[idx]

        src_enc = self.tokenizer(src, truncation=True, max_length=self.max_len, return_tensors="pt")
        tgt_enc = self.tokenizer(tgt, truncation=True, max_length=self.max_len, return_tensors="pt")

        return {
            "src_input": src_enc["input_ids"].squeeze(0),
            "tgt_input": tgt_enc["input_ids"].squeeze(0),
            "tgt_label": tgt_enc["input_ids"].squeeze(0),
        }

#Collate function
def collate_fn(batch):
    src_batch = [item["src_input"] for item in batch]
    tgt_batch = [item["tgt_input"] for item in batch]
    label_batch = [item["tgt_label"] for item in batch]

    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    tgt_batch = torch.nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    label_batch = torch.nn.utils.rnn.pad_sequence(label_batch, batch_first=True, padding_value=tokenizer.pad_token_id)

    return {
        "src_input": src_batch,
        "tgt_input": tgt_batch,
        "tgt_label": label_batch
    }

# Dataloader
train_dataset = TranslationDataset("processed_evbcorpus/train.en", "processed_evbcorpus/train.vi", tokenizer)
val_dataset = TranslationDataset("processed_evbcorpus/val.en", "processed_evbcorpus/val.vi", tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collate_fn)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



Config 1: d_model=128, embed=128, num_layer=4, 50 epoch

In [12]:
src_vocab_size = len(tokenizer)
tgt_vocab_size = len(tokenizer)

model = Transformer(
    num_layer=4,
    d_model=128,
    d_embed=128,
    d_ff=256,
    num_head=8,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    dropout=0.2
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#Loss + Optimizer
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

#Training Loop
for epoch in range(50):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        src = batch["src_input"].to(device)
        tgt_out = batch["tgt_label"].to(device)
        tgt_in = tgt_out[:, :-1]
        labels = tgt_out[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_in)  
        loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            src = batch["src_input"].to(device)
            tgt_out = batch["tgt_label"].to(device)
            tgt_in = tgt_out[:, :-1]
            labels = tgt_out[:, 1:]
            output = model(src, tgt_in)
            loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
            val_loss += loss.item()

    print(f" Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")

Epoch 1: 100%|██████████| 2776/2776 [05:08<00:00,  9.01it/s]


 Epoch 1 | Train Loss: 6.4927 | Val Loss: 6.0462


Epoch 2: 100%|██████████| 2776/2776 [05:03<00:00,  9.15it/s]


 Epoch 2 | Train Loss: 5.9541 | Val Loss: 5.7784


Epoch 3: 100%|██████████| 2776/2776 [05:03<00:00,  9.15it/s]


 Epoch 3 | Train Loss: 5.7236 | Val Loss: 5.5940


Epoch 4: 100%|██████████| 2776/2776 [05:03<00:00,  9.14it/s]


 Epoch 4 | Train Loss: 5.5482 | Val Loss: 5.5053


Epoch 5: 100%|██████████| 2776/2776 [05:04<00:00,  9.12it/s]


 Epoch 5 | Train Loss: 5.4039 | Val Loss: 5.3935


Epoch 6: 100%|██████████| 2776/2776 [05:03<00:00,  9.15it/s]


 Epoch 6 | Train Loss: 5.2893 | Val Loss: 5.3416


Epoch 7: 100%|██████████| 2776/2776 [05:05<00:00,  9.08it/s]


 Epoch 7 | Train Loss: 5.1914 | Val Loss: 5.3137


Epoch 8: 100%|██████████| 2776/2776 [05:04<00:00,  9.13it/s]


 Epoch 8 | Train Loss: 5.1083 | Val Loss: 5.2451


Epoch 9: 100%|██████████| 2776/2776 [05:06<00:00,  9.07it/s]


 Epoch 9 | Train Loss: 5.0315 | Val Loss: 5.1856


Epoch 10: 100%|██████████| 2776/2776 [05:07<00:00,  9.04it/s]


 Epoch 10 | Train Loss: 4.9644 | Val Loss: 5.1498


Epoch 11: 100%|██████████| 2776/2776 [05:07<00:00,  9.04it/s]


 Epoch 11 | Train Loss: 4.8986 | Val Loss: 5.1124


Epoch 12: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 12 | Train Loss: 4.8382 | Val Loss: 5.0575


Epoch 13: 100%|██████████| 2776/2776 [05:06<00:00,  9.05it/s]


 Epoch 13 | Train Loss: 4.7733 | Val Loss: 5.0060


Epoch 14: 100%|██████████| 2776/2776 [05:04<00:00,  9.11it/s]


 Epoch 14 | Train Loss: 4.7144 | Val Loss: 4.9684


Epoch 15: 100%|██████████| 2776/2776 [05:02<00:00,  9.18it/s]


 Epoch 15 | Train Loss: 4.6577 | Val Loss: 4.9554


Epoch 16: 100%|██████████| 2776/2776 [05:03<00:00,  9.14it/s]


 Epoch 16 | Train Loss: 4.6049 | Val Loss: 4.8696


Epoch 17: 100%|██████████| 2776/2776 [05:05<00:00,  9.10it/s]


 Epoch 17 | Train Loss: 4.5552 | Val Loss: 4.8466


Epoch 18: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 18 | Train Loss: 4.5077 | Val Loss: 4.8064


Epoch 19: 100%|██████████| 2776/2776 [05:07<00:00,  9.02it/s]


 Epoch 19 | Train Loss: 4.4645 | Val Loss: 4.7813


Epoch 20: 100%|██████████| 2776/2776 [05:07<00:00,  9.01it/s]


 Epoch 20 | Train Loss: 4.4206 | Val Loss: 4.7279


Epoch 21: 100%|██████████| 2776/2776 [05:08<00:00,  9.00it/s]


 Epoch 21 | Train Loss: 4.3804 | Val Loss: 4.7144


Epoch 22: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 22 | Train Loss: 4.3432 | Val Loss: 4.6962


Epoch 23: 100%|██████████| 2776/2776 [05:05<00:00,  9.09it/s]


 Epoch 23 | Train Loss: 4.3029 | Val Loss: 4.6380


Epoch 24: 100%|██████████| 2776/2776 [05:03<00:00,  9.14it/s]


 Epoch 24 | Train Loss: 4.2698 | Val Loss: 4.6262


Epoch 25: 100%|██████████| 2776/2776 [05:04<00:00,  9.13it/s]


 Epoch 25 | Train Loss: 4.2355 | Val Loss: 4.6026


Epoch 26: 100%|██████████| 2776/2776 [05:07<00:00,  9.04it/s]


 Epoch 26 | Train Loss: 4.2035 | Val Loss: 4.5366


Epoch 27: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 27 | Train Loss: 4.1723 | Val Loss: 4.5481


Epoch 28: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 28 | Train Loss: 4.1415 | Val Loss: 4.5097


Epoch 29: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 29 | Train Loss: 4.1140 | Val Loss: 4.4846


Epoch 30: 100%|██████████| 2776/2776 [05:04<00:00,  9.12it/s]


 Epoch 30 | Train Loss: 4.0892 | Val Loss: 4.4458


Epoch 31: 100%|██████████| 2776/2776 [05:07<00:00,  9.04it/s]


 Epoch 31 | Train Loss: 4.0625 | Val Loss: 4.4787


Epoch 32: 100%|██████████| 2776/2776 [05:08<00:00,  9.01it/s]


 Epoch 32 | Train Loss: 4.0378 | Val Loss: 4.4252


Epoch 33: 100%|██████████| 2776/2776 [05:06<00:00,  9.05it/s]


 Epoch 33 | Train Loss: 4.0119 | Val Loss: 4.3964


Epoch 34: 100%|██████████| 2776/2776 [05:07<00:00,  9.02it/s]


 Epoch 34 | Train Loss: 3.9875 | Val Loss: 4.3945


Epoch 35: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 35 | Train Loss: 3.9657 | Val Loss: 4.3699


Epoch 36: 100%|██████████| 2776/2776 [05:06<00:00,  9.05it/s]


 Epoch 36 | Train Loss: 3.9458 | Val Loss: 4.3687


Epoch 37: 100%|██████████| 2776/2776 [05:07<00:00,  9.01it/s]


 Epoch 37 | Train Loss: 3.9239 | Val Loss: 4.3472


Epoch 38: 100%|██████████| 2776/2776 [05:07<00:00,  9.02it/s]


 Epoch 38 | Train Loss: 3.9017 | Val Loss: 4.3274


Epoch 39: 100%|██████████| 2776/2776 [05:05<00:00,  9.07it/s]


 Epoch 39 | Train Loss: 3.8839 | Val Loss: 4.2925


Epoch 40: 100%|██████████| 2776/2776 [05:05<00:00,  9.10it/s]


 Epoch 40 | Train Loss: 3.8641 | Val Loss: 4.2576


Epoch 41: 100%|██████████| 2776/2776 [05:04<00:00,  9.11it/s]


 Epoch 41 | Train Loss: 3.8458 | Val Loss: 4.2479


Epoch 42: 100%|██████████| 2776/2776 [05:05<00:00,  9.09it/s]


 Epoch 42 | Train Loss: 3.8261 | Val Loss: 4.2801


Epoch 43: 100%|██████████| 2776/2776 [05:07<00:00,  9.03it/s]


 Epoch 43 | Train Loss: 3.8084 | Val Loss: 4.2282


Epoch 44: 100%|██████████| 2776/2776 [05:06<00:00,  9.05it/s]


 Epoch 44 | Train Loss: 3.7924 | Val Loss: 4.2047


Epoch 45: 100%|██████████| 2776/2776 [05:07<00:00,  9.02it/s]


 Epoch 45 | Train Loss: 3.7746 | Val Loss: 4.2022


Epoch 46: 100%|██████████| 2776/2776 [05:07<00:00,  9.04it/s]


 Epoch 46 | Train Loss: 3.7586 | Val Loss: 4.1663


Epoch 47: 100%|██████████| 2776/2776 [05:07<00:00,  9.02it/s]


 Epoch 47 | Train Loss: 3.7437 | Val Loss: 4.2019


Epoch 48: 100%|██████████| 2776/2776 [05:07<00:00,  9.01it/s]


 Epoch 48 | Train Loss: 3.7291 | Val Loss: 4.1497


Epoch 49: 100%|██████████| 2776/2776 [05:07<00:00,  9.02it/s]


 Epoch 49 | Train Loss: 3.7140 | Val Loss: 4.1747


Epoch 50: 100%|██████████| 2776/2776 [05:05<00:00,  9.09it/s]


 Epoch 50 | Train Loss: 3.7009 | Val Loss: 4.1339


In [13]:
# ===== 8. Test Evaluation =====
test_dataset = TranslationDataset(
    "processed_evbcorpus/test.en", "processed_evbcorpus/test.vi", tokenizer
)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

model.eval()
test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        src = batch["src_input"].to(device)
        tgt_out = batch["tgt_label"].to(device)
        tgt_in = tgt_out[:, :-1]
        labels = tgt_out[:, 1:]

        output = model(src, tgt_in)
        loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
        test_loss += loss.item()

print(f" Test Loss: {test_loss / len(test_loader):.4f}")


Testing: 100%|██████████| 29/29 [00:01<00:00, 24.08it/s]

 Test Loss: 4.0633





In [14]:
def translate(sentence, max_len=64):
    model.eval()
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=max_len).to(device)

    src = tokens["input_ids"]

    tgt = torch.tensor([[tokenizer.cls_token_id]], device=device)

    for _ in range(max_len):
        with torch.no_grad():
            out = model(src, tgt)
            next_token = out[:, -1, :].argmax(dim=-1)
            tgt = torch.cat([tgt, next_token.unsqueeze(0)], dim=1)
            if next_token.item() == tokenizer.sep_token_id:
                break

    decoded = tokenizer.decode(tgt[0], skip_special_tokens=True)
    return decoded


In [15]:
!pip install -q sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
import sacrebleu

references, hypotheses = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating BLEU"):
        src_texts = [tokenizer.decode(x, skip_special_tokens=True) for x in batch["src_input"]]
        tgt_texts = [tokenizer.decode(x, skip_special_tokens=True) for x in batch["tgt_label"]]

        for s, t in zip(src_texts, tgt_texts):
            pred = translate(s)
            references.append(t)
            hypotheses.append(pred)

bleu = sacrebleu.corpus_bleu(hypotheses, [references])
print(f" BLEU score: {bleu.score:.2f}")


Evaluating BLEU: 100%|██████████| 29/29 [02:30<00:00,  5.18s/it]

 BLEU score: 1.08





config 2: d_model=256, d_embed=256, num_layer=4, 20 epoch

In [17]:
src_vocab_size = len(tokenizer)
tgt_vocab_size = len(tokenizer)

model = Transformer(
    num_layer=4,
    d_model=256,
    d_embed=256,
    d_ff=512,
    num_head=8,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    dropout=0.2
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss + Optimizer 
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Training Loop 
for epoch in range(20):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        src = batch["src_input"].to(device)
        tgt_out = batch["tgt_label"].to(device)

        tgt_in = tgt_out[:, :-1]
        labels = tgt_out[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_in) 
        loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            src = batch["src_input"].to(device)
            tgt_out = batch["tgt_label"].to(device)
            tgt_in = tgt_out[:, :-1]
            labels = tgt_out[:, 1:]
            output = model(src, tgt_in)
            loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
            val_loss += loss.item()

    print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")

Epoch 1: 100%|██████████| 2776/2776 [07:14<00:00,  6.40it/s]


Epoch 1 | Train Loss: 6.2924 | Val Loss: 5.8048


Epoch 2: 100%|██████████| 2776/2776 [07:13<00:00,  6.40it/s]


Epoch 2 | Train Loss: 5.6650 | Val Loss: 5.5249


Epoch 3: 100%|██████████| 2776/2776 [07:14<00:00,  6.40it/s]


Epoch 3 | Train Loss: 5.3549 | Val Loss: 5.3206


Epoch 4: 100%|██████████| 2776/2776 [07:15<00:00,  6.38it/s]


Epoch 4 | Train Loss: 5.1333 | Val Loss: 5.1909


Epoch 5: 100%|██████████| 2776/2776 [07:15<00:00,  6.37it/s]


Epoch 5 | Train Loss: 4.9569 | Val Loss: 5.1062


Epoch 6: 100%|██████████| 2776/2776 [07:14<00:00,  6.38it/s]


Epoch 6 | Train Loss: 4.8068 | Val Loss: 4.9955


Epoch 7: 100%|██████████| 2776/2776 [07:12<00:00,  6.42it/s]


Epoch 7 | Train Loss: 4.6831 | Val Loss: 4.9417


Epoch 8: 100%|██████████| 2776/2776 [07:12<00:00,  6.42it/s]


Epoch 8 | Train Loss: 4.5755 | Val Loss: 4.8872


Epoch 9: 100%|██████████| 2776/2776 [07:13<00:00,  6.41it/s]


Epoch 9 | Train Loss: 4.4778 | Val Loss: 4.8614


Epoch 10: 100%|██████████| 2776/2776 [07:12<00:00,  6.42it/s]


Epoch 10 | Train Loss: 4.3955 | Val Loss: 4.8018


Epoch 11: 100%|██████████| 2776/2776 [07:13<00:00,  6.41it/s]


Epoch 11 | Train Loss: 4.3186 | Val Loss: 4.7706


Epoch 12: 100%|██████████| 2776/2776 [07:11<00:00,  6.44it/s]


Epoch 12 | Train Loss: 4.2512 | Val Loss: 4.7382


Epoch 13: 100%|██████████| 2776/2776 [07:11<00:00,  6.44it/s]


Epoch 13 | Train Loss: 4.1900 | Val Loss: 4.7155


Epoch 14: 100%|██████████| 2776/2776 [07:10<00:00,  6.45it/s]


Epoch 14 | Train Loss: 4.1340 | Val Loss: 4.6885


Epoch 15: 100%|██████████| 2776/2776 [07:09<00:00,  6.46it/s]


Epoch 15 | Train Loss: 4.0839 | Val Loss: 4.6676


Epoch 16: 100%|██████████| 2776/2776 [07:09<00:00,  6.46it/s]


Epoch 16 | Train Loss: 4.0361 | Val Loss: 4.6512


Epoch 17: 100%|██████████| 2776/2776 [07:09<00:00,  6.46it/s]


Epoch 17 | Train Loss: 3.9935 | Val Loss: 4.6636


Epoch 18: 100%|██████████| 2776/2776 [07:09<00:00,  6.46it/s]


Epoch 18 | Train Loss: 3.9523 | Val Loss: 4.6194


Epoch 19: 100%|██████████| 2776/2776 [07:09<00:00,  6.46it/s]


Epoch 19 | Train Loss: 3.9146 | Val Loss: 4.6218


Epoch 20: 100%|██████████| 2776/2776 [07:10<00:00,  6.45it/s]


Epoch 20 | Train Loss: 3.8795 | Val Loss: 4.5935


In [18]:
# Test Evaluation
test_dataset = TranslationDataset(
    "processed_evbcorpus/test.en", "processed_evbcorpus/test.vi", tokenizer
)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

model.eval()
test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        src = batch["src_input"].to(device)
        tgt_out = batch["tgt_label"].to(device)
        tgt_in = tgt_out[:, :-1]
        labels = tgt_out[:, 1:]

        output = model(src, tgt_in)
        loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
        test_loss += loss.item()

print(f" Test Loss: {test_loss / len(test_loader):.4f}")


Testing: 100%|██████████| 29/29 [00:01<00:00, 19.22it/s]

 Test Loss: 4.6238





In [19]:
import sacrebleu

references, hypotheses = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating BLEU"):
        src_texts = [tokenizer.decode(x, skip_special_tokens=True) for x in batch["src_input"]]
        tgt_texts = [tokenizer.decode(x, skip_special_tokens=True) for x in batch["tgt_label"]]

        for s, t in zip(src_texts, tgt_texts):
            pred = translate(s)
            references.append(t)
            hypotheses.append(pred)

bleu = sacrebleu.corpus_bleu(hypotheses, [references])
print(f" BLEU score: {bleu.score:.2f}")


Evaluating BLEU: 100%|██████████| 29/29 [02:36<00:00,  5.41s/it]

 BLEU score: 0.83





Cách config đầu tiên train trên 50 epoch nên kết quả tốt hơn, dù d_model và d_embed thấp hơn. Tuy nhiên, khi dùng sarceBLEU để đo hiệu suất của kết quả dịch ở cả 2 config thì vẫn khá thấp ( 0.83 - 1.08).Điều này có thể là do Tokenization thiếu hiệu quả và Kích thước dữ liệu chưa đủ. Cụ thể, việc sử dụng Tokenization dựa trên từ (Word-level Tokenization) đơn giản cho cặp ngôn ngữ phức tạp như Anh-Việt đã tạo ra một bộ từ vựng (Vocabulary) khổng lồ, khiến quá nhiều từ trong bộ dữ liệu EVBCorpus bị gán nhãn là <UNK> (Unknown) khi giới hạn kích thước Vocab, dẫn đến hiện tượng Out-of-Vocabulary (OOV) nghiêm trọng làm mất thông tin ngữ nghĩa và khiến mô hình chỉ có thể dịch các chuỗi vô nghĩa.