In [1]:
import warnings
warnings.filterwarnings('ignore')

### Wandb session

In [2]:
# import wandb
# wandb.login(key='6f62d15846ae3ef63e0c80b3ab1f4de94d5cc8a4')
# wandb.init(project='augmented_test')

import os
os.environ["WANDB_DISABLED"] = "true"


# model1.1.1:
Description of the model format a.b.c
* **a** means the phase of the dataset. Phase 1 means the first 1000 high-quality English-Bahnar sentences
* **b** means the direction of translation. Direction 1 means Bahnar to English
* **c** means the special technique and version to develop this model.

Technique 0 means just feed English-Bahnar data into the pretrained model. Technique 1 means feed both English-Bahnar and English-Vietnamese into the model


### Cleaning the kaggle/working directory

In [3]:
import os
import shutil

directory = '/kaggle/working'

# Loop through each item in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)  # Removes files and links
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)  # Removes directories and their contents
    except Exception as e:
        print(f'Failed to delete {file_path}. Reason: {e}')

print("Cleaning complete.")

Cleaning complete.


### Installing dependencies

In [4]:
!pip install -q -U sentencepiece
!pip install -q -U datasets
!pip install -q -U sacrebleu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.0.13 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cufft-c

### Importing libraries

In [5]:
from datasets import Dataset, concatenate_datasets

In [6]:
import torch
import sentencepiece
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, concatenate_datasets

import sacrebleu

2025-05-27 06:18:09.996161: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748326690.228945      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748326690.301669      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Reproducibility

In [7]:
RANDOM_SEED = 42

torch.manual_seed(RANDOM_SEED)
# If you are using CUDA
torch.cuda.manual_seed_all(RANDOM_SEED)
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 384
PADDING_LEN = 228

### Loading the pretrained model

In [8]:
model_name = "Helsinki-NLP/opus-mt-vi-en"  # Example model for Bahnar to English
en_ba_tokenizer = AutoTokenizer.from_pretrained(model_name)
en_vi_tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/289M [00:00<?, ?B/s]

### Loading the dataset

In [9]:
with open('/kaggle/input/enviba/train.ba', 'r', encoding='utf-8') as file_ba:
    ba_data = file_ba.readlines()
with open('/kaggle/input/enviba/train2.en', 'r', encoding='utf-8') as file_en:
    en_data = file_en.readlines()
assert len(ba_data) == len(en_data), "The files don't have the same number of lines."
train_df = pd.DataFrame({'English': en_data, 'Bahnar': ba_data})
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=RANDOM_SEED)

ba_en_train_dataset = Dataset.from_pandas(train_df)
ba_en_val_dataset = Dataset.from_pandas(val_df)

with open('/kaggle/input/train-vie-eng-phomt-dev-dataset/dev.vi', 'r', encoding='utf-8') as file_vi:
    vi_data = file_vi.readlines()
with open('/kaggle/input/train-vie-eng-phomt-dev-dataset/dev.en', 'r', encoding='utf-8') as file_en:
    en_data = file_en.readlines()
assert len(vi_data) == len(en_data), "The files don't have the same number of lines."
train_df = pd.DataFrame({'English': en_data, 'Vietnamese': vi_data})

vi_en_train_dataset = Dataset.from_pandas(train_df[:4000])

In [10]:

with open('/kaggle/input/enviba/test.ba', 'r', encoding='utf-8') as file_ba:
    ba_data = file_ba.readlines()
with open('/kaggle/input/enviba/test.en', 'r', encoding='utf-8') as file_en:
    en_data = file_en.readlines()
assert len(ba_data) == len(en_data), "The files don't have the same number of lines."
test_df = pd.DataFrame({'English': en_data, 'Bahnar': ba_data})
ba_en_test_dataset = Dataset.from_pandas(test_df)

print(len(ba_en_train_dataset), len(ba_en_val_dataset), len(ba_en_test_dataset))
print(len(vi_en_train_dataset))

8347 928 1987
4000


### Adding more data with augmentation


#### Investigate current tokenizer

In [11]:
en_ba_tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-vi-en', vocab_size=53739, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	53738: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [12]:
len(en_ba_tokenizer.encoder)

53739

In [13]:
print(ba_en_train_dataset[0]["Bahnar"])
print(en_ba_tokenizer.tokenize(ba_en_train_dataset[0]["Bahnar"]))
list(en_ba_tokenizer.encode(ba_en_train_dataset[0]["Bahnar"]))

Mĭ ơ̆p ngoh weng 'yŏk 'mưĭ dơnol, tơbưk.

['▁M', 'ĭ', '▁ơ', '̆', 'p', '▁ngo', 'h', '▁we', 'ng', "▁'", 'y', 'ŏ', 'k', "▁'", 'm', 'ư', 'ĭ', '▁dơ', 'n', 'ol', ',', '▁tơ', 'b', 'ư', 'k', '.']


[1757,
 1,
 1685,
 49915,
 883,
 18645,
 1266,
 54,
 373,
 587,
 337,
 53675,
 1735,
 587,
 78,
 21999,
 1,
 7207,
 270,
 4341,
 3,
 3063,
 1603,
 21999,
 1735,
 2,
 0]

In [14]:
import numpy as np 
import random
import math
mask_token = "<unk>"
mask_token_id = 1

def token_dropout(examples, word_dropout_ratio = 0.3, en_ba_tokenizer = en_ba_tokenizer):
    inputs = examples['Bahnar']
    inputs = en_ba_tokenizer.batch_encode_plus(inputs)['input_ids']
    modified_inputs = []
    for single_input in inputs:
        single_input = np.array(single_input)
        # Create a mask array where elements are True with probability word_dropout_ratio
        mask = np.random.random(single_input.shape) < word_dropout_ratio
        # Apply the mask to replace elements with mask_token_id
        single_input[mask] = mask_token_id
        single_input[-1] = 0
        modified_inputs.append(single_input)
    examples["Bahnar"] = en_ba_tokenizer.batch_decode(modified_inputs)
    return examples

def token_replacement(examples, word_dropout_ratio = 0.3, en_ba_tokenizer = en_ba_tokenizer):
    inputs = examples['Bahnar']
    inputs = en_ba_tokenizer.batch_encode_plus(inputs)['input_ids']
    modified_inputs = []
    for single_input in inputs:
        single_input = np.array(single_input)
        
        # Create a mask array where elements are True with probability word_dropout_ratio
        mask = np.random.random(single_input.shape) < word_dropout_ratio
        random_replacements = np.random.randint(2, 15, size=mask.sum()) #may change later
        # Replace the original token IDs at the masked positions
        single_input[mask] = random_replacements
        single_input[-1] = 0
        modified_inputs.append(single_input)
    examples["Bahnar"] = en_ba_tokenizer.batch_decode(modified_inputs)
    return examples

def sentence_boundary(examples, proportion = 0.5, en_ba_tokenizer = en_ba_tokenizer):
    inputs = en_ba_tokenizer.batch_encode_plus(examples['Bahnar'])['input_ids']
    targets = en_ba_tokenizer.batch_encode_plus(examples['English'])['input_ids']
    modified_inputs = []
    modified_targets = []
    for i in range(len(inputs)):
        uniform_p = np.random.uniform(high=proportion)
        s1, t1, s2, t2 = inputs[i-1][:-1], targets[i-1][:-1], inputs[i][:-1], targets[i][:-1]
        p1S1, p1T1 = math.ceil(uniform_p*len(s1)), math.ceil(uniform_p*len(t1))
        p2S2, p2T2 = math.ceil(uniform_p*len(s2)), math.ceil(uniform_p*len(t2))
        modified_inputs.append(s1[p1S1:] + s2[:p2S2]+[0])
        modified_targets.append(t1[p1T1:] + t2[:p2T2]+[0])
    examples["Bahnar"] = en_ba_tokenizer.batch_decode(modified_inputs)
    examples["English"] = en_ba_tokenizer.batch_decode(modified_targets)
    return examples


def token_swap(examples, swap_ratio = 0.1, en_ba_tokenizer = en_ba_tokenizer):
    inputs = en_ba_tokenizer.batch_encode_plus(examples['Bahnar'])['input_ids']
    targets = en_ba_tokenizer.batch_encode_plus(examples['English'])['input_ids']
    modified_inputs = []
    for single_input in inputs:
        n_words_swap=int(len(single_input)*swap_ratio)
        moved_pos=set()
        while(len(moved_pos)<n_words_swap):
            pos1, pos2 = random.sample(range(len(single_input)-1), 2)
            single_input[pos1], single_input[pos2] = single_input[pos2], single_input[pos1]
            moved_pos.add(pos1)
            moved_pos.add(pos2)
        single_input[-1] = 0
        modified_inputs.append(single_input)
    examples["Bahnar"] = en_ba_tokenizer.batch_decode(modified_inputs)
    return examples

# augmented = ba_en_train_dataset.map(token_swap, batched = True)

In [15]:
def word_dropout(examples, word_dropout_ratio = 0.3, en_ba_tokenizer = en_ba_tokenizer):
    inputs = examples['Bahnar']
    modified_inputs = []
    for single_input in inputs:
        single_input = single_input.split(" ")
        # Create a mask array where elements are True with probability word_dropout_ratio
        mask = np.random.random(len(single_input)) < word_dropout_ratio
        # Apply the mask to replace elements with mask_token
        single_input = [mask_token if mask[i] else word for i, word in enumerate(single_input)]
        modified_inputs.append(" ".join(map(str, single_input)))
    examples["Bahnar"] = modified_inputs
    return examples


def sentence_boundary_word(examples, proportion = 0.5, en_ba_tokenizer = en_ba_tokenizer):
    inputs = [sentence.split(" ") for sentence in examples['Bahnar']]
    targets = [sentence.split(" ") for sentence in examples['English']]
    modified_inputs = []
    modified_targets = []
    for i in range(len(inputs)):
        uniform_p = np.random.uniform(high=proportion)
        s1, t1, s2, t2 = inputs[i-1], targets[i-1], inputs[i][:-1], targets[i]
        p1S1, p1T1 = math.ceil(uniform_p*len(s1)), math.ceil(uniform_p*len(t1))
        p2S2, p2T2 = math.ceil(uniform_p*len(s2)), math.ceil(uniform_p*len(t2))
        modified_inputs.append(s1[p1S1:] + s2[:p2S2])
        modified_targets.append(t1[p1T1:] + t2[:p2T2])
    examples["Bahnar"] = [" ".join(map(str, single_input)) for single_input in modified_inputs]
    examples["English"] = [" ".join(map(str, single_target)) for single_target in modified_targets]
    return examples


def word_swap(examples, swap_ratio = 0.1, en_ba_tokenizer = en_ba_tokenizer):
    inputs = [sentence.split(" ") for sentence in examples['Bahnar']]
    modified_inputs = []
    for single_input in inputs:
        n_words_swap=int(len(single_input)*swap_ratio)
        moved_pos=set()
        while(len(moved_pos)<n_words_swap):
            pos1, pos2 = random.sample(range(len(single_input)-1), 2)
            single_input[pos1], single_input[pos2] = single_input[pos2], single_input[pos1]
            moved_pos.add(pos1)
            moved_pos.add(pos2)
        modified_inputs.append(single_input)
    examples["Bahnar"] = [" ".join(map(str, single_input)) for single_input in modified_inputs]
    return examples

# augmented = ba_en_train_dataset.map(word_dropout, batched = True)
# ba_en_train_dataset = concatenate_datasets([ba_en_train_dataset, augmented])

In [16]:
# !pip install -q -U requests nlpaug

In [17]:
##TF_IDF 

# import os
# os.environ["MODEL_DIR"] = '../model'
# import re

# import nlpaug.augmenter.word as naw
# import nlpaug.model.word_stats as nmw

# def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
#     token_pattern = re.compile(token_pattern)
#     return token_pattern.findall(text)

# train_x = ba_en_train_dataset[:]["Bahnar"]
# # Tokenize input
# train_x_tokens = [_tokenizer(x) for x in train_x]

# # Train TF-IDF model
# tfidf_model = nmw.TfIdf()
# tfidf_model.train(train_x_tokens)
# tfidf_model.save('.')

# # Load TF-IDF augmenter
# aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)

# def tfidf_replacement(examples, aug_model = aug):
#     inputs = examples['Bahnar']
#     modified_inputs = aug_model.augment(inputs)
#     examples["Bahnar"] = modified_inputs
#     return examples


In [18]:
# augmented[1]
# augmented = ba_en_train_dataset.map(tfidf_replacement, batched = True)
# ba_en_train_dataset = concatenate_datasets([ba_en_train_dataset, augmented])

In [19]:
# !pip install underthesea

In [20]:
# #Pseudo augmentation
# #Load Vi-Ba dictionary
# with open('/kaggle/input/vibadict/dict.vi', 'r', encoding='utf-8') as file_vi:
#     vi_dict = file_vi.readlines()
# with open('/kaggle/input/vibadict/dict.ba', 'r', encoding='utf-8') as file_ba:
#     ba_dict = file_ba.readlines()
# with open('/kaggle/input/vibadict/dict2.en', 'r', encoding='utf-8') as file_en:
#     en_dict = file_en.readlines()
# assert len(vi_dict) == len(ba_dict)

# dict_df = pd.DataFrame({'Vietnamese': [vntokenizer(tokenizer, x[:-1]) for x in vi_dict], 'Bahnar': [x[:-1] for x in ba_dict]})

In [21]:
# #Test EN dict
# preds = []
# refs = []
# dictionary = {en_dict[i][:-1]: ba_dict[i][:-1] for i in range(1, len(ba_dict))}
# print("done dict")
# test_examples = [{'Bahnar': ex['Bahnar'], 'English': ex['English']} for ex in ba_en_test_dataset]
# for example in test_examples:
#     refs.append(example['Bahnar'])
#     _input = example['English']
#     words = _input.split(" ")
#     indices_to_replace = [i for i, word in enumerate(words) if word in dictionary]
#     for i in indices_to_replace:
#         words[i] = dictionary[words[i]]  # Replace word with its mapped meaning
#     preds.append(" ".join(words))

# print(refs[:10])
# print(preds[:10])
    
# # Calculate and print the BLEU score
# bleu = sacrebleu.corpus_bleu(preds, refs)
# print("BLEU: ", round(bleu.score, 2))

# # Calculate CHRF
# chrf = sacrebleu.corpus_chrf(preds, refs)
# print("CHRF:", round(chrf.score, 2))

# # Calculate TER
# metric = sacrebleu.metrics.TER()
# ter = metric.corpus_score(preds, refs)
# print("TER:", round(ter.score, 2))    

In [22]:
# dictionary

In [23]:
# import underthesea

# import random


# dictionary = {vi_dict[i][:-1]: ba_dict[i][:-1] for i in range(len(ba_dict))}
# print("done dict")
# def word_mapping(examples, word_mapping_ratio = 0.7, dictionary = dictionary):
#     inputs = examples['Vietnamese']
#     modified_inputs = []
#     for single_input in inputs:
#         words = underthesea.word_tokenize(single_input)
#         #For every sentence, randomly choose words to map with its meaning in the dictionary
#         #until the number of replaced words reach the ratio or there is no more replacable words 

#         replaceable_indices = [i for i, word in enumerate(words) if word in dictionary]
#         num_replacements = min(len(replaceable_indices), int(len(words) * word_mapping_ratio))

#         if num_replacements > 0:
#             indices_to_replace = random.sample(replaceable_indices, num_replacements)
#             for i in indices_to_replace:
#                 words[i] = dictionary[words[i]]  # Replace word with its mapped meaning

#         modified_inputs.append(" ".join(words))
#     examples["Vietnamese"] = modified_inputs
#     return examples

# augmented_pivot = vi_en_train_dataset.map(word_mapping, batched = True)

In [24]:
# vi_en_train_dataset = concatenate_datasets([vi_en_train_dataset, augmented_pivot])

In [25]:
# augmented_pivot[:10]["Vietnamese"]

# Preprocess the dataset with tokenizer

In [27]:
def preprocess_function_target(examples):
    inputs = examples['Bahnar']
    targets = examples['English']
    try:
        model_inputs = en_ba_tokenizer(inputs, max_length=PADDING_LEN, padding='max_length', truncation=True)
    
        # Set up the tokenizer for Bahnar
        with en_ba_tokenizer.as_target_tokenizer():
            labels = en_ba_tokenizer(targets, max_length=PADDING_LEN, padding='max_length', truncation=True)
    
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
    except:
        print(inputs)
    

tokenized_ba_en = ba_en_train_dataset.map(preprocess_function_target, batched=True)
tokenized_ba_en_val = ba_en_val_dataset.map(preprocess_function_target, batched=True)

Map:   0%|          | 0/8347 [00:00<?, ? examples/s]

Map:   0%|          | 0/928 [00:00<?, ? examples/s]

In [28]:
def preprocess_function_source(examples):
    inputs = examples['Vietnamese']
    targets = examples['English']
    model_inputs = en_vi_tokenizer(inputs, max_length=PADDING_LEN, padding='max_length', truncation=True)

    # Set up the tokenizer for Vietnamese
    with en_vi_tokenizer.as_target_tokenizer():
        labels = en_vi_tokenizer(targets, max_length=PADDING_LEN, padding='max_length', truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_vi_en = vi_en_train_dataset.map(preprocess_function_source, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

### Concatenate Vietnamese-English and Bahnar-English Dataset

In [29]:
# tokenized_viAndba_en = concatenate_datasets([tokenized_vi_en, tokenized_ba_en ])
tokenized_viAndba_en = tokenized_ba_en
# viOrba_en_dataset is now a combined dataset
print(tokenized_viAndba_en)

Dataset({
    features: ['English', 'Bahnar', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 8347
})


In [30]:
# shuffled_dataset = tokenized_viAndba_en.shuffle(seed=RANDOM_SEED)


# Train the model

In [31]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

print((len(tokenized_viAndba_en)))
# Use data collator
data_collator = DataCollatorForSeq2Seq(en_ba_tokenizer, model=model)

# Step 3: Fine-Tuning the Model on Bahnar to English
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.015,  # L2 regularization
    save_total_limit=3,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_viAndba_en,
    eval_dataset=tokenized_ba_en_val,
    data_collator=data_collator,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


8347


Epoch,Training Loss,Validation Loss
1,No log,0.442061
2,0.565800,0.381456
3,0.565800,0.34664
4,0.357800,0.323059
5,0.357800,0.307425
6,0.305500,0.296401
7,0.305500,0.288246
8,0.276500,0.283
9,0.276500,0.280288
10,0.262000,0.279319


TrainOutput(global_step=2610, training_loss=0.34957316533815813, metrics={'train_runtime': 1650.84, 'train_samples_per_second': 50.562, 'train_steps_per_second': 1.581, 'total_flos': 5040040677212160.0, 'train_loss': 0.34957316533815813, 'epoch': 10.0})

# Test the model

In [32]:
import torch
from tqdm.auto import tqdm

def evaluate_model(model, encode_tokenizer, decode_tokenizer, dataset, device):
    model.eval()
    prt = True

    predictions, references = [], []
    for example in tqdm(dataset, desc="Translating"):
        with encode_tokenizer.as_target_tokenizer():
            input_ids = encode_tokenizer.encode(example['Bahnar'], return_tensors='pt').to(device)

        output_ids = model.generate(input_ids)[0]
        pred = decode_tokenizer.decode(output_ids, skip_special_tokens=True)
        predictions.append([pred])

        references.append([example['English']])

    return predictions, references


In [33]:
test_examples = [{'Bahnar': ex['Bahnar'], 'English': ex['English']} for ex in ba_en_test_dataset]
predictions, references = evaluate_model(model, en_ba_tokenizer, en_vi_tokenizer, test_examples, device)

Translating:   0%|          | 0/1987 [00:00<?, ?it/s]

In [34]:
predictions[0], references[0] # Check format of predictions and references

(["At the same time, the district's news and the district's localities have been funded with the Prime Minister's plan on building a new countryside, leading to the Prime Minister's leadership."],
 ['Cooperate with the Department of Information and Communications and relevant units in preparing propagation publications and holding press conferences\n'])

In [35]:
len(predictions), len(references)

(1987, 1987)

### Calculate sacreBLEU

In [36]:
preds = []
refs = []
for pred in predictions:
    preds.append(pred[0])

for ref in references:
    refs.append(ref)

print(refs[:2])
print(preds[:2])

[['Cooperate with the Department of Information and Communications and relevant units in preparing propagation publications and holding press conferences\n'], ['In case of lack of space, the following priority order shall be given: out-of-province students, poor students, students with good academic records and first-year students.\n']]
["At the same time, the district's news and the district's localities have been funded with the Prime Minister's plan on building a new countryside, leading to the Prime Minister's leadership.", "At the same time, it is known that the capital source of the people's trust, the capital source, the capital of the district, the capital of the entire district, the capital of the entire district, the capital of the locality and the locality."]


In [37]:
# Calculate and print the BLEU score
bleu = sacrebleu.corpus_bleu(preds, refs)
print("BLEU: ", round(bleu.score, 2))

# Calculate CHRF
chrf = sacrebleu.corpus_chrf(preds, refs)
print("CHRF:", round(chrf.score, 2))

# Calculate TER
metric = sacrebleu.metrics.TER()
ter = metric.corpus_score(preds, refs)
print("TER:", round(ter.score, 2))

# wandb.log({"BLEU": round(bleu.score, 2), "CHRF": round(chrf.score, 2), "TER": round(ter.score, 2)})


BLEU:  51.77
CHRF: 37.7
TER: 113.4


### Test on our dataset:

In [38]:
with open('/kaggle/input/test-eng-bdq-phase2dataset-v1/bdq-eng.test.bdq', 'r', encoding='utf-8') as file_ba:
    ba_data = file_ba.readlines()
with open('/kaggle/input/test-eng-bdq-phase2dataset-v1/bdq-eng.test.eng', 'r', encoding='utf-8') as file_en:
    en_data = file_en.readlines()
assert len(ba_data) == len(en_data), "The files don't have the same number of lines."
test_df = pd.DataFrame({'English': en_data, 'Bahnar': ba_data})
ba_en_test_dataset = Dataset.from_pandas(test_df)

test_examples = [{'Bahnar': ex['Bahnar'], 'English': ex['English']} for ex in ba_en_test_dataset]
predictions, references = evaluate_model(model, en_ba_tokenizer, en_vi_tokenizer, test_examples, device)

Translating:   0%|          | 0/1000 [00:00<?, ?it/s]

In [39]:
predictions[0], references[0] # Check format of predictions and references

(["It's a long, beautiful place to build a new countryside."],
 ['I release you, but can do no more.\n'])

In [40]:
len(predictions), len(references)

(1000, 1000)

In [41]:
preds = []
refs = []
for pred in predictions:
    preds.append(pred[0])

for ref in references:
    refs.append(ref)

print(refs[:2])
print(preds[:2])

# Calculate and print the BLEU score
bleu = sacrebleu.corpus_bleu(preds, refs)
print("BLEU: ", round(bleu.score, 2))

# Calculate CHRF
chrf = sacrebleu.corpus_chrf(preds, refs)
print("CHRF:", round(chrf.score, 2))

# Calculate TER
metric = sacrebleu.metrics.TER()
ter = metric.corpus_score(preds, refs)
print("TER:", round(ter.score, 2))

# wandb.log({"BLEU": round(bleu.score, 2), "CHRF": round(chrf.score, 2), "TER": round(ter.score, 2)})

[['I release you, but can do no more.\n'], ['You have as much right to be a knight as any man.\n']]
["It's a long, beautiful place to build a new countryside.", "I'm the leader of the people who are the ones who are the ones who are the ones who are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the children of."]
BLEU:  22.96
CHRF: 30.42
TER: 58.19


### Saving the model

In [42]:
# Step 5: Save the Model
model.save_pretrained("./model1.1.1")
en_ba_tokenizer.save_pretrained("./tokenizer1.1.1")
en_vi_tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/source.spm',
 './tokenizer/target.spm',
 './tokenizer/added_tokens.json')

### Display translated and references for qualitative evaluation

In [43]:
# Display the first 3 prediction-reference pairs
for i in range(10):
    print(f"Prediction {i+1}: {predictions[i][0]}")
    print(f"Reference {i+1}: {references[i][0]}")

Prediction 1: It's a long, beautiful place to build a new countryside.
Reference 1: I release you, but can do no more.

Prediction 2: I'm the leader of the people who are the ones who are the ones who are the ones who are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the ones that are the children of.
Reference 2: You have as much right to be a knight as any man.

Prediction 3: I'm sorry. I'm sorry.
Reference 3: No. Leave me alone.

Prediction 4: I'm sorry, I'm sorry, I'm sorry.
Reference 4: I'm here to help!

Prediction 5: I'm sorry, I'm sorry, I'm sorry. I'm sorry, I'm sorry.
Reference 5: All you wanna do is pee.

Prediction 6: At the beginning of the 20th century, the district's districts have built a new countryside in the district.
Reference 6: And these platforms were certainly very helpful to a

### Clean up the output

In [44]:

import os
import shutil

def clear_directory_except_subdir(directory, subdir):
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        if os.path.isdir(item_path) and item_path not in subdir:
            shutil.rmtree(item_path)  # Remove directories
        elif os.path.isfile(item_path) and item_path not in subdir:
            os.remove(item_path)  # Remove files

# Usage
working_dir = '/kaggle/working'
results_dir = ['/kaggle/working/results', '/kaggle/working/en-ba-model', '/kaggle/working/en-ba-tokenizer', '/kaggle/working/en-ba-aug-tokenizer', '/kaggle/working/en-ba-aug-model']
clear_directory_except_subdir(working_dir, results_dir)


# Zip the model

In [45]:
!zip -r model1.1.1.zip /kaggle/working/model1.1.1


zip error: Nothing to do! (try: zip -r model1.1.1.zip . -i /kaggle/working/model1.1.1)


In [46]:
!zip -r tokenizer1.1.1.zip /kaggle/working/tokenizer1.1.1


zip error: Nothing to do! (try: zip -r tokenizer1.1.1.zip . -i /kaggle/working/tokenizer1.1.1)


In [47]:
!zip -r tokenizer.zip /kaggle/working/tokenizer


zip error: Nothing to do! (try: zip -r tokenizer.zip . -i /kaggle/working/tokenizer)


# **Load the model and test**

In [48]:
# import torch
# from tqdm.auto import tqdm

# def evaluate_model(model, encode_tokenizer, decode_tokenizer, dataset, device):
#     model.eval()

#     predictions, references = [], []
#     for example in tqdm(dataset, desc="Translating"):
#         with encode_tokenizer.as_target_tokenizer():
#             input_ids = encode_tokenizer.encode(example['Bahnar'], return_tensors='pt').to(device)

#         output_ids = model.generate(input_ids)[0]
#         pred = decode_tokenizer.decode(output_ids, skip_special_tokens=True)
#         predictions.append([pred])

#         references.append([example['English']])

#     return predictions, references


In [49]:
# model_path = '/kaggle/working/model1.1.1'
# tokenizer_path = '/kaggle/working/model1.1.1'

# # Load the model and tokenizer
# loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
# loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# # Evaluate the model
# test_examples = [{'Bahnar': ex['Bahnar'], 'English': ex['English']} for ex in ba_en_test_dataset]
# predictions, references = evaluate_model(loaded_model, en_ba_tokenizer, en_ba_tokenizer, test_examples, device)

In [50]:
# preds = []
# refs = []
# for pred in predictions:
#     preds.append(pred[0])

# for ref in references:
#     refs.append(ref)

# print(refs[:10])
# print(preds[:10])

In [51]:
# # Calculate and print the BLEU score
# bleu = sacrebleu.corpus_bleu(preds, refs)
# print("BLEU: ", round(bleu.score, 2))

# # Calculate CHRF
# chrf = sacrebleu.corpus_chrf(preds, refs)
# print("CHRF:", round(chrf.score, 2))

# # Calculate TER
# metric = sacrebleu.metrics.TER()
# ter = metric.corpus_score(preds, refs)
# print("TER:", round(ter.score, 2))