# **En-Vi Neural Machine Translation using RNNs, Transformer Model**

## **Dataset**

In [1]:
!pip install -q datasets

In [2]:
from datasets import load_dataset

ds = load_dataset("thainq107/iwslt2015-en-vi")

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [4]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## **Tokenizer**

In [5]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models

# Tạo tokenizer dạng word-based
tokenizer_en = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer_vi = Tokenizer(models.WordLevel(unk_token="<unk>"))

tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer_vi.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordLevelTrainer(
    vocab_size=15000,
    min_frequency=2,
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

# Huấn luyện tokenizer
tokenizer_en.train_from_iterator(ds["train"]["en"], trainer)
tokenizer_vi.train_from_iterator(ds["train"]["vi"], trainer)

# Lưu tokenizer
tokenizer_en.save("tokenizer_en.json")
tokenizer_vi.save("tokenizer_vi.json")

In [6]:
len(tokenizer_en.get_vocab()), len(tokenizer_vi.get_vocab())

(15000, 13684)

In [7]:
tokenizer_en.encode("how are you")

Encoding(num_tokens=3, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
tokenizer_en.encode("how are you").ids

[81, 27, 18]

In [9]:
tokenizer_vi.encode("bạn có khoẻ không").ids

[18, 9, 596, 14]

## **Encoding**

In [10]:
from transformers import PreTrainedTokenizerFast

# Load tokenizer đã train vào PreTrainedTokenizerFast
tokenizer_en = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_en.json",
    unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos>"
)
tokenizer_vi = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_vi.json",
    unk_token="<unk>", pad_token="<pad>", bos_token="<bos>", eos_token="<eos>"
)

In [11]:
len(tokenizer_en), len(tokenizer_vi)

(15000, 13684)

In [12]:
MAX_LEN = 75

def preprocess_function(examples):
    src_texts = examples["en"]
    tgt_texts = ["<bos> " + sent + "<eos>" for sent in examples["vi"]]

    src_encodings = tokenizer_en(
        src_texts, padding="max_length", truncation=True, max_length=MAX_LEN
    )
    tgt_encodings = tokenizer_vi(
        tgt_texts, padding="max_length", truncation=True, max_length=MAX_LEN
    )

    return {
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)

In [13]:
tokenizer_vi.unk_token_id

1

In [14]:
tokenizer_vi.pad_token_id, tokenizer_vi.bos_token_id, tokenizer_vi.eos_token_id

(0, 2, 3)

In [15]:
preprocessed_ds['train']

Dataset({
    features: ['en', 'vi', 'input_ids', 'labels'],
    num_rows: 133317
})

In [16]:
print(preprocessed_ds['train'][0])

{'en': 'Rachel Pike : The science behind a climate headline', 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu', 'input_ids': [6675, 1, 57, 60, 339, 604, 13, 744, 5643, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [2, 1960, 66, 1157, 131, 8, 376, 113, 38, 417, 735, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


## **Model**

### **RNN**

In [17]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqRNNConfig(PretrainedConfig):
    def __init__(self,
                 vocab_size_src=10000, vocab_size_tgt=10000,
                 embedding_dim=128, hidden_size=128, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.dropout = dropout

class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))  # B x S x H
        output, hidden = self.gru(embedded)  # B x S x H, B x H
        return output, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding_dim, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)  # LM Head

    def forward(self, input, hidden):
        output = self.embedding(input)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)  # B x 1 x Vocab
        return output, hidden

class Seq2SeqRNNModel(PreTrainedModel):
    config_class = Seq2SeqRNNConfig

    def __init__(self, config, tokenizer_en):
        super().__init__(config)
        self.encoder = EncoderRNN(
            config.vocab_size_src, config.embedding_dim,
            config.hidden_size, config.dropout)
        self.decoder = DecoderRNN(
            config.hidden_size, config.embedding_dim, config.vocab_size_tgt)
        self.BOS_IDX = tokenizer_en.bos_token_id
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore PAD Token

    def forward(self, input_ids, labels):
        batch_size, seq_len = labels.shape
        decoder_input = torch.full((batch_size, 1), self.BOS_IDX, dtype=torch.long).to(input_ids.device)  # ✅ Sửa lỗi
        encoder_output, decoder_hidden = self.encoder(input_ids)
        decoder_outputs = []

        for i in range(seq_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)
            decoder_input = labels[:, i].unsqueeze(1)  # Teacher forcing

        logits = torch.cat(decoder_outputs, dim=1)  # B x S x Vocab
        loss = self.loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
        return {"loss": loss, "logits": logits}
    

In [18]:
config_RNN= Seq2SeqRNNConfig(
    vocab_size_src=len(tokenizer_en), vocab_size_tgt=len(tokenizer_vi)
)
model_RNN = Seq2SeqRNNModel(config_RNN, tokenizer_en)

In [19]:
model_RNN

Seq2SeqRNNModel(
  (encoder): EncoderRNN(
    (embedding): Embedding(15000, 128)
    (gru): GRU(128, 128, batch_first=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(13684, 128)
    (gru): GRU(128, 128, batch_first=True)
    (out): Linear(in_features=128, out_features=13684, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
)

### **Transformer**

In [20]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

def generate_square_subsequent_mask(sz, device):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]
    device = src.device

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device).to(torch.bool)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=device).type(torch.bool)
    src_padding_mask = (src == 0)
    tgt_padding_mask = (tgt == 0)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

class Seq2SeqTransformerConfig(PretrainedConfig):
    def __init__(
            self, vocab_size_src=10000, vocab_size_tgt=10000, max_seq_length=50,
            d_model=256, num_heads=8, num_layers=6, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_src = vocab_size_src
        self.vocab_size_tgt = vocab_size_tgt
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout

class Seq2SeqTransformerModel(PreTrainedModel):
    config_class = Seq2SeqTransformerConfig

    def __init__(self, config):
        super().__init__(config)

        self.embedding_src = nn.Embedding(
            config.vocab_size_src, config.d_model)
        self.embedding_tgt = nn.Embedding(
            config.vocab_size_tgt, config.d_model)

        self.position_embedding_src = nn.Embedding(
            config.max_seq_length, config.d_model)
        self.position_embedding_tgt = nn.Embedding(
            config.max_seq_length, config.d_model)

        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers,
            num_decoder_layers=config.num_layers,
            dropout=config.dropout,
            batch_first=True
        )

        self.generator = nn.Linear(
            config.d_model, config.vocab_size_tgt
            )
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore PAD token

    def forward(self, input_ids, labels):
        tgt_input = labels[:, :-1]
        tgt_output = labels[:, 1:]
        batch_size, seq_len_src = input_ids.shape
        _, seq_len_tgt = tgt_input.shape

        src_positions = torch.arange(seq_len_src, device=input_ids.device).unsqueeze(0)
        tgt_positions = torch.arange(seq_len_tgt, device=labels.device).unsqueeze(0)

        src_embedded = self.embedding_src(input_ids) + self.position_embedding_src(src_positions)
        tgt_embedded = self.embedding_tgt(tgt_input) + self.position_embedding_tgt(tgt_positions)

        src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask = create_mask(input_ids, tgt_input)

        outs = self.transformer(
            src_embedded, tgt_embedded, src_mask, tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        logits = self.generator(outs)
        loss = self.loss_fn(logits.permute(0, 2, 1), tgt_output)

        return {"loss": loss, "logits": logits}

    def encode(self, src, src_mask):
        _, seq_len_src = src.shape
        src_positions = torch.arange(
            seq_len_src, device=src.device).unsqueeze(0)
        src_embedded = self.embedding_src(src) + self.position_embedding_src(
            src_positions)
        return self.transformer.encoder(src_embedded, src_mask)

    def decode(self, tgt, encoder_output, tgt_mask):
        _, seq_len_tgt = tgt.shape
        tgt_positions = torch.arange(
            seq_len_tgt, device=tgt.device).unsqueeze(0)
        tgt_embedded = self.embedding_tgt(tgt) + self.position_embedding_tgt(
            tgt_positions)
        return self.transformer.decoder(
            tgt_embedded, encoder_output, tgt_mask
        )

In [21]:
# Khởi tạo config
config_Trans= Seq2SeqTransformerConfig(
    vocab_size_src=len(tokenizer_en), vocab_size_tgt=len(tokenizer_vi), max_seq_length=75
)

# Tạo mô hình
model_Trans = Seq2SeqTransformerModel(config_Trans)

In [22]:
model_Trans

Seq2SeqTransformerModel(
  (embedding_src): Embedding(15000, 256)
  (embedding_tgt): Embedding(13684, 256)
  (position_embedding_src): Embedding(75, 256)
  (position_embedding_tgt): Embedding(75, 256)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=

### **Test Model**

In [23]:
input_ids = torch.tensor([preprocessed_ds['train'][0]['input_ids']])
labels = torch.tensor([preprocessed_ds['train'][0]['labels']])

In [24]:
input_ids

tensor([[6675,    1,   57,   60,  339,  604,   13,  744, 5643,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0]])

In [25]:
labels

tensor([[   2, 1960,   66, 1157,  131,    8,  376,  113,   38,  417,  735,    3,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0]])

In [26]:
pred_RNN = model_RNN(input_ids, labels)

In [27]:
pred_RNN

{'loss': tensor(9.5407, grad_fn=<NllLossBackward0>),
 'logits': tensor([[[ 0.1400,  0.0261,  0.1070,  ...,  0.2158,  0.3991, -0.1225],
          [ 0.1310,  0.0776,  0.0281,  ...,  0.2829,  0.3201,  0.0632],
          [ 0.1640,  0.0950,  0.0842,  ...,  0.4068,  0.4350,  0.1265],
          ...,
          [-0.1849, -0.4295, -0.0366,  ...,  0.0797,  0.3550,  0.0900],
          [-0.1849, -0.4295, -0.0366,  ...,  0.0797,  0.3550,  0.0900],
          [-0.1849, -0.4295, -0.0366,  ...,  0.0797,  0.3550,  0.0900]]],
        grad_fn=<CatBackward0>)}

In [28]:
pred_Trans = model_Trans(input_ids, labels)

In [29]:
pred_Trans

{'loss': tensor(9.8202, grad_fn=<NllLoss2DBackward0>),
 'logits': tensor([[[ 0.4558, -0.2175, -0.4031,  ...,  0.1920, -0.5069,  0.4263],
          [ 0.0118,  0.3549, -0.5619,  ...,  0.5031, -0.5127,  0.1225],
          [ 0.2352,  0.2604, -0.5220,  ...,  0.1060, -0.7448,  0.0319],
          ...,
          [ 0.3062,  0.7774, -0.7057,  ...,  0.4390, -0.7227, -0.3419],
          [-0.1526,  0.1448, -0.7680,  ..., -0.1041, -0.5709, -0.2234],
          [ 0.1949,  0.0654, -0.6790,  ...,  0.1697, -0.7941, -0.1305]]],
        grad_fn=<ViewBackward0>)}

## **Trainer**

In [30]:
# Disable wandb
import os
os.environ['WANDB_DISABLED'] = 'true'

# Use wandb
# import wandb
# wandb.init(
#     project="en-vi-machine-translation",
#     name="transformer" # "gru"
# )

In [31]:
from transformers import Trainer, TrainingArguments

# Cấu hình training RNN
training_args_RNN = TrainingArguments(
    output_dir="./en-vi-machine-translation-rnn",
    logging_dir="logs/rnn",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=25,
    learning_rate=2e-5,
    save_total_limit=1,
    # report_to="wandb",
)

# RNN
trainer_RNN = Trainer(
    model=model_RNN,
    args=training_args_RNN,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"]
)

# Cấu hình training Transformer
training_args_Trans = TrainingArguments(
    output_dir="./en-vi-machine-translation-transformer",
    logging_dir="logs/transformer",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=25,
    learning_rate=2e-5,
    save_total_limit=1,
    # report_to="wandb",
)

# Transformer
trainer_Trans = Trainer(
    model=model_Trans,
    args=training_args_Trans,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"]
)




Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### **GRU**

In [32]:
trainer_RNN.train()

Epoch,Training Loss,Validation Loss
1,9.0566,7.784903
2,6.91,6.466119
3,6.3144,6.204129
4,6.1454,6.098507
5,6.0638,6.03778
6,6.0088,5.99056
7,5.961,5.944368
8,5.9131,5.897158
9,5.864,5.848826
10,5.8156,5.802661


TrainOutput(global_step=13025, training_loss=5.898351783679223, metrics={'train_runtime': 1399.0611, 'train_samples_per_second': 2382.258, 'train_steps_per_second': 9.31, 'total_flos': 2944709228925000.0, 'train_loss': 5.898351783679223, 'epoch': 25.0})

### **Transformer**

In [33]:
trainer_Trans.train()

Epoch,Training Loss,Validation Loss
1,6.6629,5.707497
2,5.5037,5.247213
3,5.145,4.977166
4,4.9138,4.788469
5,4.7514,4.658486
6,4.6291,4.558489
7,4.5322,4.477421
8,4.4525,4.410285
9,4.3841,4.351749
10,4.3248,4.296859


TrainOutput(global_step=26050, training_loss=4.413269047654751, metrics={'train_runtime': 4565.3503, 'train_samples_per_second': 730.048, 'train_steps_per_second': 5.706, 'total_flos': 3.1317297161085e+16, 'train_loss': 4.413269047654751, 'epoch': 25.0})

## **Inference**

### **Transformer**

In [34]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, device="cpu"):
    src = src.to(device)
    src_mask = src_mask.to(device)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    print('v')
    for i in range(max_len-1):
        memory = memory.to(device)
        tgt_mask = (generate_square_subsequent_mask(ys.size(1), device)
                    .type(torch.bool)).to(device)
        out = model.decode(ys, memory, tgt_mask)
        prob = model.generator(out[:, -1, :]) # LM Head
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word[-1].item() # index

        ys = torch.cat([ys,torch.ones(1, 1).type_as(
            src.data).fill_(next_word)], dim=1)
        if next_word == 3: #EOS : 3
            break
    return ys

def translate(model, src_sentence, device):
    model.eval()
    input_ids = tokenizer_en([src_sentence], return_tensors='pt')['input_ids'].to(device)
    num_tokens = input_ids.shape[1]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).to(device)
    tgt_tokens = greedy_decode(
        model,  input_ids, src_mask, max_len=num_tokens + 5, start_symbol=2, device=device)
    return tokenizer_vi.decode(tgt_tokens.detach().cpu()[0])

In [35]:
translate(model_Trans, "i go to school", model_Trans.device)  # => toi den truong

v


'<bos> Và tôi muốn học được học để học'

## **Evaluate**

In [36]:
!pip install -q sacrebleu==2.5.1

In [37]:
from tqdm import tqdm
import sacrebleu

pred_sentences, tgt_sentences = [], []

# Get the test examples (limited to 100 for faster testing)
test_en = ds['test']['en'][:100]  
test_vi = ds['test']['vi'][:100]

for src_sentence, tgt_sentence in tqdm(zip(test_en, test_vi), total=len(test_en)):
    try:
        pred_sentence = translate(model_Trans, src_sentence, model_Trans.device)
        pred_sentences.append(pred_sentence)
        tgt_sentences.append(tgt_sentence)
    except Exception as e:
        print(f"Error translating: '{src_sentence}' - {str(e)}")
        continue

if pred_sentences:
    bleu_score = sacrebleu.corpus_bleu(pred_sentences, [tgt_sentences], force=True)
    print(f"BLEU score: {bleu_score.score}")
else:
    print("No successful translations to evaluate.")

bleu_score = sacrebleu.corpus_bleu(pred_sentences, [tgt_sentences], force=True)
bleu_score

  0%|          | 0/100 [00:00<?, ?it/s]

v


  2%|▏         | 2/100 [00:00<00:27,  3.53it/s]

v
v


  3%|▎         | 3/100 [00:01<00:38,  2.52it/s]

v


  4%|▍         | 4/100 [00:01<00:37,  2.59it/s]

v


  6%|▌         | 6/100 [00:01<00:27,  3.46it/s]

v
v


  7%|▋         | 7/100 [00:02<00:24,  3.86it/s]

v


  8%|▊         | 8/100 [00:02<00:27,  3.36it/s]

v


  9%|▉         | 9/100 [00:02<00:27,  3.33it/s]

v
v


 11%|█         | 11/100 [00:03<00:20,  4.42it/s]

v


 12%|█▏        | 12/100 [00:03<00:24,  3.61it/s]

v


 13%|█▎        | 13/100 [00:04<00:28,  3.00it/s]

v


 15%|█▌        | 15/100 [00:04<00:23,  3.60it/s]

v
v


 16%|█▌        | 16/100 [00:04<00:25,  3.30it/s]

v


 17%|█▋        | 17/100 [00:05<00:30,  2.76it/s]

v


 19%|█▉        | 19/100 [00:05<00:23,  3.49it/s]

v
v


 20%|██        | 20/100 [00:06<00:22,  3.56it/s]

v


 21%|██        | 21/100 [00:06<00:22,  3.54it/s]

v
v


 23%|██▎       | 23/100 [00:06<00:16,  4.65it/s]

v


 24%|██▍       | 24/100 [00:07<00:24,  3.13it/s]

v


 25%|██▌       | 25/100 [00:07<00:24,  3.10it/s]

v


 27%|██▋       | 27/100 [00:07<00:19,  3.78it/s]

v
v


 28%|██▊       | 28/100 [00:08<00:22,  3.15it/s]

v


 29%|██▉       | 29/100 [00:08<00:21,  3.32it/s]

v


 30%|███       | 30/100 [00:08<00:20,  3.44it/s]

v


 32%|███▏      | 32/100 [00:09<00:16,  4.22it/s]

v
v


 33%|███▎      | 33/100 [00:09<00:17,  3.77it/s]

v


 34%|███▍      | 34/100 [00:09<00:16,  4.02it/s]

v


 36%|███▌      | 36/100 [00:10<00:15,  4.26it/s]

v


 37%|███▋      | 37/100 [00:10<00:12,  4.93it/s]

v
v


 39%|███▉      | 39/100 [00:11<00:13,  4.52it/s]

v
v


 40%|████      | 40/100 [00:11<00:14,  4.24it/s]

v


 41%|████      | 41/100 [00:11<00:14,  4.09it/s]

v


 42%|████▏     | 42/100 [00:11<00:17,  3.40it/s]

v


 43%|████▎     | 43/100 [00:12<00:16,  3.53it/s]

v


 44%|████▍     | 44/100 [00:12<00:16,  3.35it/s]

v


 45%|████▌     | 45/100 [00:12<00:17,  3.11it/s]

v


 47%|████▋     | 47/100 [00:13<00:16,  3.21it/s]

v
v


 48%|████▊     | 48/100 [00:13<00:16,  3.16it/s]

v


 49%|████▉     | 49/100 [00:14<00:15,  3.22it/s]

v


 51%|█████     | 51/100 [00:14<00:14,  3.48it/s]

v


 52%|█████▏    | 52/100 [00:14<00:11,  4.03it/s]

v
v


 54%|█████▍    | 54/100 [00:15<00:09,  4.79it/s]

v
v


 56%|█████▌    | 56/100 [00:15<00:08,  5.01it/s]

v
v


 57%|█████▋    | 57/100 [00:16<00:11,  3.73it/s]

v


 58%|█████▊    | 58/100 [00:16<00:15,  2.66it/s]

v


 60%|██████    | 60/100 [00:17<00:12,  3.26it/s]

v
v


 61%|██████    | 61/100 [00:17<00:13,  2.87it/s]

v


 62%|██████▏   | 62/100 [00:18<00:13,  2.81it/s]

v


 64%|██████▍   | 64/100 [00:18<00:09,  3.68it/s]

v
v


 65%|██████▌   | 65/100 [00:18<00:09,  3.81it/s]

v


 66%|██████▌   | 66/100 [00:18<00:09,  3.68it/s]

v


 68%|██████▊   | 68/100 [00:19<00:08,  3.68it/s]

v
v


 69%|██████▉   | 69/100 [00:19<00:09,  3.39it/s]

v


 70%|███████   | 70/100 [00:20<00:08,  3.38it/s]

v


 72%|███████▏  | 72/100 [00:20<00:07,  3.67it/s]

v
v


 73%|███████▎  | 73/100 [00:21<00:09,  2.98it/s]

v


 75%|███████▌  | 75/100 [00:21<00:07,  3.20it/s]

v
v


 77%|███████▋  | 77/100 [00:22<00:06,  3.39it/s]

v
v


 78%|███████▊  | 78/100 [00:23<00:09,  2.35it/s]

v


 79%|███████▉  | 79/100 [00:23<00:08,  2.54it/s]

v


 80%|████████  | 80/100 [00:23<00:06,  2.88it/s]

v


 82%|████████▏ | 82/100 [00:24<00:04,  3.65it/s]

v
v


 83%|████████▎ | 83/100 [00:24<00:05,  2.89it/s]

v


 84%|████████▍ | 84/100 [00:24<00:05,  3.12it/s]

v


 85%|████████▌ | 85/100 [00:25<00:04,  3.32it/s]

v


 86%|████████▌ | 86/100 [00:25<00:04,  3.37it/s]

v


 87%|████████▋ | 87/100 [00:26<00:05,  2.41it/s]

v


 88%|████████▊ | 88/100 [00:26<00:05,  2.21it/s]

v


 89%|████████▉ | 89/100 [00:27<00:04,  2.44it/s]

v
v


 91%|█████████ | 91/100 [00:27<00:02,  3.69it/s]

v


 93%|█████████▎| 93/100 [00:27<00:01,  4.05it/s]

v
v


 94%|█████████▍| 94/100 [00:27<00:01,  4.14it/s]

v


 96%|█████████▌| 96/100 [00:28<00:00,  4.56it/s]

v
v


 97%|█████████▋| 97/100 [00:28<00:00,  4.55it/s]

v


 98%|█████████▊| 98/100 [00:28<00:00,  4.21it/s]

v


 99%|█████████▉| 99/100 [00:29<00:00,  3.82it/s]

v


100%|██████████| 100/100 [00:29<00:00,  3.37it/s]

BLEU score: 2.021023866977245





BLEU = 2.02 21.3/4.9/1.0/0.2 (BP = 1.000 ratio = 1.045 hyp_len = 2807 ref_len = 2685)

In [38]:
import sacrebleu

pred_sentences = ['tôi đang đi học']
tgt_sentences = ['tôi đang đi tới trường']
bleu_score = sacrebleu.corpus_bleu(
    pred_sentences, [tgt_sentences], force=True
)
bleu_score

BLEU = 46.31 75.0/66.7/50.0/50.0 (BP = 0.779 ratio = 0.800 hyp_len = 4 ref_len = 5)