In [None]:
!pip install transformers sentencepiece datasets sacrebleu

In [None]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset
from transformers import MT5Tokenizer
from transformers import MT5Config, MT5ForConditionalGeneration
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm_notebook
from tqdm import tqdm
import re
import sacrebleu


In [None]:
# Character whitelists used by clean_text: anything NOT matched by the class is deleted.
# All patterns are case-insensitive, so upper-case letters survive until clean_text lowercases them.
regex_vietnamese = re.compile(r'[^a-zđáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựỳỵỷỹ\s]', re.IGNORECASE)
regex_english = re.compile(r"[^a-zA-Z\s']", re.IGNORECASE)
# 'ü' is part of Spanish orthography (e.g. "pingüino"); without it the letter is silently dropped.
regex_spanish = re.compile(r'[^a-záéíóúüñ\s]', re.IGNORECASE)

def clean_text(text, regex):
    """Normalise one sentence for training.

    Args:
        text: Raw cell value; anything that is falsy or not a str yields "".
        regex: Compiled pattern matching the characters to delete.

    Returns:
        The text with matched characters removed, lower-cased, and with all
        whitespace runs collapsed to single spaces (no leading/trailing space).
    """
    if not text or not isinstance(text, str):
        return ""
    filtered = regex.sub('', text)
    lowered = filtered.lower().strip()
    return " ".join(lowered.split())
def load_and_prepare_data_vi_en(file_path):
    """Load a Vietnamese-English parallel CSV and return a cleaned DataFrame.

    Args:
        file_path: Path to a CSV that has 'vi' and 'en' columns.

    Returns:
        DataFrame whose 'vi' and 'en' columns were passed through clean_text,
        with incomplete pairs and duplicate rows removed and a fresh 0..n-1 index.
    """
    df = pd.read_csv(file_path)
    df['vi'] = df['vi'].apply(lambda x: clean_text(x, regex_vietnamese))
    df['en'] = df['en'].apply(lambda x: clean_text(x, regex_english))

    # clean_text turns NaN / non-string cells into "", so dropna() alone never removes
    # pairs with a missing side; filter the empty strings explicitly.
    has_both_sides = (df['vi'] != "") & (df['en'] != "")
    df = df[has_both_sides].dropna().drop_duplicates().reset_index(drop=True)
    return df


def load_and_prepare_data_en_es(file_path):
    """Load an English-Spanish parallel CSV and return a cleaned DataFrame.

    Args:
        file_path: Path to a CSV that has 'en' and 'es' columns.

    Returns:
        DataFrame whose 'en' and 'es' columns were passed through clean_text,
        with incomplete pairs and duplicate rows removed and a fresh 0..n-1 index.
    """
    df = pd.read_csv(file_path)
    df['en'] = df['en'].apply(lambda x: clean_text(x, regex_english))
    df['es'] = df['es'].apply(lambda x: clean_text(x, regex_spanish))

    # clean_text turns NaN / non-string cells into "", so dropna() alone never removes
    # pairs with a missing side; filter the empty strings explicitly.
    has_both_sides = (df['en'] != "") & (df['es'] != "")
    df = df[has_both_sides].dropna().drop_duplicates().reset_index(drop=True)
    return df

def load_and_prepare_data_vi_es(file_path):
    """Load a Vietnamese-Spanish parallel CSV and return a cleaned DataFrame.

    Args:
        file_path: Path to a CSV that has 'vi' and 'es' columns.

    Returns:
        DataFrame whose 'vi' and 'es' columns were passed through clean_text,
        with incomplete pairs and duplicate rows removed and a fresh 0..n-1 index.
    """
    df = pd.read_csv(file_path)
    df['vi'] = df['vi'].apply(lambda x: clean_text(x, regex_vietnamese))
    df['es'] = df['es'].apply(lambda x: clean_text(x, regex_spanish))

    # clean_text turns NaN / non-string cells into "", so dropna() alone never removes
    # pairs with a missing side; filter the empty strings explicitly.
    has_both_sides = (df['vi'] != "") & (df['es'] != "")
    df = df[has_both_sides].dropna().drop_duplicates().reset_index(drop=True)
    return df

# Cleaned training corpora, one frame per language pair (paths are Kaggle input datasets).
df_vi_en = load_and_prepare_data_vi_en('/kaggle/input/data-du-an-cong-nghe-thong-tin/vi-en/train_data_en_vi.csv')
df_en_es = load_and_prepare_data_en_es('/kaggle/input/data-du-an-cong-nghe-thong-tin/en_es/train_data_en_es.csv')
df_vi_es = load_and_prepare_data_vi_es('/kaggle/input/data-vi-es/df_vi_es.csv')


In [None]:
# Cleaned test and validation splits for the vi-en and en-es pairs, using the same
# cleaning functions as the training data.
test_df_vi_en = load_and_prepare_data_vi_en('/kaggle/input/data-du-an-cong-nghe-thong-tin/vi-en/test_data_en_vi.csv')
val_df_vi_en = load_and_prepare_data_vi_en('/kaggle/input/data-du-an-cong-nghe-thong-tin/vi-en/validation_data_en_vi.csv')
test_df_en_es = load_and_prepare_data_en_es('/kaggle/input/data-du-an-cong-nghe-thong-tin/en_es/test_data_en_es.csv')
val_df_en_es = load_and_prepare_data_en_es('/kaggle/input/data-du-an-cong-nghe-thong-tin/en_es/validation_data_en_es.csv')


In [None]:
# Run the vi-es test set through the same cleaning as every other split; reading it raw
# would leave casing/punctuation the training data never contained, and a NaN cell would
# make encode_str_with_lang_tag fail when concatenating the language tag.
test_vi_es = load_and_prepare_data_vi_es("/kaggle/input/data-test-song-ngu/test_vi_es.csv")

In [None]:
# Tokenizer loaded from the published 'google/mt5-small' checkpoint.
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

In [None]:
# Library-default MT5 configuration (no checkpoint name is passed here).
config = MT5Config()

# Use the pad token id as the first token fed to the decoder.
config.decoder_start_token_id = config.pad_token_id

# Built from the config object only; weights are restored later by load_checkpoint.
model = MT5ForConditionalGeneration(config)


In [None]:
# Language code -> control token that is prepended to the source text to select the
# target language of the translation.
LANG_TOKEN_MAPPING = dict(
    en='<en>',
    vi='<vi>',
    es='<es>',
)

In [None]:
# Register the language tags as special tokens, then grow the model's embedding matrix
# to the tokenizer's new vocabulary size.
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

In [None]:
def encode_str(text, tokenizer, seq_len):
    """Tokenize `text` into a fixed-width id tensor.

    Args:
        text: String to encode.
        tokenizer: Object exposing `encode(...)` and `pad_token_id`.
        seq_len: Exact width of the result; longer inputs are truncated by the
            tokenizer, shorter ones are right-padded with `pad_token_id`.

    Returns:
        Tensor of shape (1, seq_len).
    """
    token_ids = tokenizer.encode(text, max_length=seq_len, truncation=True, return_tensors="pt")

    missing = seq_len - token_ids.size(1)
    if missing <= 0:
        return token_ids

    pad_block = torch.full((1, missing), tokenizer.pad_token_id)
    return torch.cat([token_ids, pad_block], dim=1)

In [None]:
def encode_str_with_lang_tag(text, target_lang, tokenizer, seq_len):
    """Encode `text` prefixed with the control token of `target_lang`.

    The tag is looked up in LANG_TOKEN_MAPPING and joined to the text with a single
    space; the result is encoded by encode_str to a (1, seq_len) tensor.
    """
    tag = LANG_TOKEN_MAPPING[target_lang]
    tagged_text = ' '.join((tag, text))
    return encode_str(tagged_text, tokenizer, seq_len)


def process_dataset_with_lang_tag(dataset, input_lang, target_lang, tokenizer, seq_len):
    """Encode one translation direction of a parallel DataFrame.

    Args:
        dataset: DataFrame with one text column per language code.
        input_lang: Column name of the source language.
        target_lang: Column name of the target language; also selects the language tag
            prepended to every source sentence.
        tokenizer: Tokenizer forwarded to the encode helpers.
        seq_len: Fixed width of every encoded sequence.

    Returns:
        A new DataFrame with columns 'input_encoded' and 'target_encoded', indexed like
        `dataset`. The input frame is left untouched, so encoding both directions of the
        same frame cannot overwrite each other's columns.
    """
    input_encoded = dataset[input_lang].apply(
        lambda text: encode_str_with_lang_tag(text, target_lang, tokenizer, seq_len)
    )
    target_encoded = dataset[target_lang].apply(
        lambda text: encode_str(text, tokenizer, seq_len)
    )

    return pd.DataFrame({'input_encoded': input_encoded, 'target_encoded': target_encoded})

In [None]:
# Fixed token width shared by every encoded source and target sequence.
seq_len = 101

# Each parallel corpus is encoded in both directions, forward direction first.
processed_vi_en, processed_en_vi = (
    process_dataset_with_lang_tag(df_vi_en, src, tgt, tokenizer, seq_len)
    for src, tgt in (('vi', 'en'), ('en', 'vi'))
)

processed_en_es, processed_es_en = (
    process_dataset_with_lang_tag(df_en_es, src, tgt, tokenizer, seq_len)
    for src, tgt in (('en', 'es'), ('es', 'en'))
)

processed_vi_es, processed_es_vi = (
    process_dataset_with_lang_tag(df_vi_es, src, tgt, tokenizer, seq_len)
    for src, tgt in (('vi', 'es'), ('es', 'vi'))
)

In [None]:
# Encode the held-out splits in both directions of each language pair.
processed_test_vi_en = process_dataset_with_lang_tag(test_df_vi_en, 'vi', 'en', tokenizer, seq_len)
processed_test_en_vi = process_dataset_with_lang_tag(test_df_vi_en, 'en', 'vi', tokenizer, seq_len)

processed_test_en_es = process_dataset_with_lang_tag(test_df_en_es, 'en', 'es', tokenizer, seq_len)
processed_test_es_en = process_dataset_with_lang_tag(test_df_en_es, 'es', 'en', tokenizer, seq_len)

processed_val_vi_en = process_dataset_with_lang_tag(val_df_vi_en, 'vi', 'en', tokenizer, seq_len)
processed_val_en_vi = process_dataset_with_lang_tag(val_df_vi_en, 'en', 'vi', tokenizer, seq_len)

# The en-es validation pairs must come from val_df_en_es; test_df_vi_en has no 'es'
# column (KeyError) and is test data, not validation data.
processed_val_en_es = process_dataset_with_lang_tag(val_df_en_es, 'en', 'es', tokenizer, seq_len)
processed_val_es_en = process_dataset_with_lang_tag(val_df_en_es, 'es', 'en', tokenizer, seq_len)

processed_test_vi_es = process_dataset_with_lang_tag(test_vi_es, 'vi', 'es', tokenizer, seq_len)
processed_test_es_vi = process_dataset_with_lang_tag(test_vi_es, 'es', 'vi', tokenizer, seq_len)

In [None]:
def split_dataset_fixed(data, val_count=2000, random_state=42, test_count=0):
    """Shuffle `data` and carve off a fixed-size validation split.

    Args:
        data: DataFrame to split.
        val_count: Number of rows placed in the validation split.
        random_state: Seed for the shuffle, so the split is reproducible.
        test_count: Number of leading shuffled rows to hold out before the validation
            rows (they are not returned). Defaults to 0, i.e. nothing is held out.

    Returns:
        (train_data, val_data): the remaining rows and the validation rows. Both keep
        the positional index of the shuffled frame.

    Raises:
        ValueError: If `data` has no rows left for training after removing
            `test_count + val_count` rows.
    """
    if len(data) <= test_count + val_count:
        raise ValueError(
            f"Dataset has {len(data)} rows; need more than {test_count + val_count} "
            "to leave a non-empty training split."
        )

    shuffled_data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    val_data = shuffled_data.iloc[test_count:test_count + val_count]

    train_data = shuffled_data.iloc[test_count + val_count:]

    return train_data, val_data

In [None]:
# Pairs that ship with their own validation files are only shuffled (seeded, fresh index).
train_data_vi_en, train_data_en_vi, train_data_en_es, train_data_es_en = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (processed_vi_en, processed_en_vi, processed_en_es, processed_es_en)
)

# vi-es has no separate validation file, so 2000 rows are split off per direction.
(train_data_vi_es, val_data_vi_es), (train_data_es_vi, val_data_es_vi) = (
    split_dataset_fixed(frame, val_count=2000, random_state=42)
    for frame in (processed_vi_es, processed_es_vi)
)


In [None]:

# Stack all six translation directions into one training frame with a fresh index.
train_data_total = pd.concat(
    [
        train_data_vi_en,
        train_data_en_vi,
        train_data_vi_es,
        train_data_es_vi,
        train_data_en_es,
        train_data_es_en,
    ],
    ignore_index=True,
)

# Same stacking, in the same direction order, for the validation frames.
val_data_total = pd.concat(
    [
        processed_val_vi_en,
        processed_val_en_vi,
        val_data_vi_es,
        val_data_es_vi,
        processed_val_en_es,
        processed_val_es_en,
    ],
    ignore_index=True,
)



In [None]:
# Persist the combined splits. The two columns hold torch tensors, so the CSV stores
# their printed form (the 'tensor([[...]])' text that convert_string_to_tensor parses).
train_data_total.to_csv("train_data_total.csv", index=False)
val_data_total.to_csv("val_data_total.csv", index=False)


In [None]:
# Reload the exported training split from a Kaggle input dataset. This replaces the
# in-memory frame; the encoded columns come back as plain strings.
train_data_total =  pd.read_csv("/kaggle/input/data-da-ngu/data_da_ngu/train_data_total.csv")


In [None]:
# Reload the exported validation split; the encoded columns come back as plain strings.
val_data_total =  pd.read_csv("/kaggle/input/data-da-ngu/data_da_ngu/val_data_total.csv")


In [None]:
# Seeded one-off shuffle: the training DataLoader is created with shuffle=False, so this
# fixes the batch order, which the resume logic relies on when it skips batches by position.
train_data_total = train_data_total.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
# Seeded shuffle of the validation rows so the six directions are interleaved.
val_data_total = val_data_total.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
def convert_string_to_tensor(string):
    """Parse the printed form of a 2-D id tensor back into a 1-D tensor.

    Args:
        string: Text such as 'tensor([[1, 2, 3]])', possibly wrapped over several lines.

    Returns:
        1-D integer tensor holding the parsed ids.

    Raises:
        ValueError: If any comma-separated piece is not an integer literal.
    """
    # Collapse redundant whitespace and line breaks before stripping the wrapper.
    collapsed = re.sub(r'\s+', ' ', string)
    payload = re.sub(r'tensor\(\[\[|\]\]\)', '', collapsed).strip()

    return torch.tensor(list(map(int, payload.split(','))))

# Turn the stringified tensors from the CSV back into 1-D id tensors.
# Not idempotent: re-running on already converted columns fails, because
# convert_string_to_tensor passes each value to re.sub, which needs a str.
train_data_total['input_encoded'] = train_data_total['input_encoded'].apply(convert_string_to_tensor)
train_data_total['target_encoded'] = train_data_total['target_encoded'].apply(convert_string_to_tensor)


In [None]:
# Same string -> tensor conversion for the validation split (also not re-runnable).
val_data_total['input_encoded'] = val_data_total['input_encoded'].apply(convert_string_to_tensor)
val_data_total['target_encoded'] = val_data_total['target_encoded'].apply(convert_string_to_tensor)


In [None]:
class TranslationDataset(Dataset):
    """Map-style dataset over a DataFrame with 'input_encoded' / 'target_encoded' columns.

    Every item is a dict with 1-D id tensors under 'input' and 'target'. Frames built
    in-notebook by encode_str hold (1, seq_len) tensors while frames parsed back from
    CSV hold (seq_len,) tensors; both are flattened here so the default collate function
    always yields (batch, seq_len) batches.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        item = self.dataframe.iloc[idx]
        return {
            'input': self._as_1d(item['input_encoded']),
            'target': self._as_1d(item['target_encoded'])
        }

    @staticmethod
    def _as_1d(ids):
        """Return `ids` as a flat tensor, dropping a leading singleton batch dimension."""
        return torch.as_tensor(ids).reshape(-1)


In [None]:
# Training dataset over the combined, CSV-reloaded frame.
dataset = TranslationDataset(train_data_total)

In [None]:
# Validation dataset over the combined, CSV-reloaded frame.
val_dataset = TranslationDataset(val_data_total)

In [None]:
# vi<->es test sets, built from the frames encoded in this notebook (not from CSV).
test_vi_es_dataset = TranslationDataset(processed_test_vi_es)
test_es_vi_dataset = TranslationDataset(processed_test_es_vi)

# Unshuffled loaders so evaluation order is deterministic.
test_vi_es_loader = DataLoader(test_vi_es_dataset, batch_size=15, shuffle=False)
test_es_vi_loader = DataLoader(test_es_vi_dataset, batch_size=15, shuffle=False)

In [None]:
# One test dataset per remaining translation direction (vi->en, es->en, en->vi, en->es).
test_data_vi_en_dataset = TranslationDataset(processed_test_vi_en)

test_data_es_en_dataset = TranslationDataset(processed_test_es_en)

test_data_en_vi_dataset = TranslationDataset(processed_test_en_vi)

test_data_en_es_dataset = TranslationDataset(processed_test_en_es)


In [None]:
# Matching unshuffled loaders, batch size 15, for the four test datasets above.
test_data_vi_en_test_loader = DataLoader(test_data_vi_en_dataset, batch_size=15, shuffle=False)

test_data_es_en_test_loader = DataLoader(test_data_es_en_dataset, batch_size=15, shuffle=False)

test_data_en_vi_test_loader = DataLoader(test_data_en_vi_dataset, batch_size=15, shuffle=False)

test_data_en_es_test_loader = DataLoader(test_data_en_es_dataset, batch_size=15, shuffle=False)




In [None]:
# shuffle=False keeps the batch order fixed; the training loop resumes from a checkpoint
# by skipping the first `start_global_step` batches, which requires a stable order.
train_loader = DataLoader(dataset, batch_size=15, shuffle=False)

In [None]:
# Despite the name, this loader iterates the validation dataset.
test_loader = DataLoader(val_dataset, batch_size=15, shuffle=False)

In [None]:
def tensor_to_string(tensor):
    """Decode a tensor of token ids to text with the notebook-level `tokenizer`.

    The tensor is moved to the CPU and converted to a NumPy array first; special
    tokens are omitted from the decoded string.
    """
    ids_on_cpu = tensor.cpu()
    return tokenizer.decode(ids_on_cpu.numpy(), skip_special_tokens=True)


In [None]:
# AdamW over all model parameters with a constant learning rate of 5e-5 (no scheduler is
# attached anywhere in this notebook). Its state is replaced when load_checkpoint runs.
# NOTE(review): this is the AdamW imported from `transformers`; confirm it still exists in
# the installed version, since the install cell does not pin versions.
optimizer = AdamW(model.parameters(), lr=5e-5)


In [None]:
def test_bleu_on_subset_hf(model, test_loader, device, num_batches_to_test, max_length=512):
    """Compute corpus-level sacreBLEU of `model` on the first batches of `test_loader`.

    Args:
        model: Seq2seq model exposing `generate` and `config.pad_token_id`.
        test_loader: Iterable of batches with 'input' and 'target' id tensors, shaped
            (batch, seq_len) or (batch, 1, seq_len).
        device: Device the batches are moved to; the model must already be on it.
        num_batches_to_test: Upper bound on the number of batches that are evaluated.
        max_length: Maximum generated sequence length passed to `generate`.

    Returns:
        The BLEU score as a float, or 0.0 when no batch was evaluated.

    Note:
        Leaves the model in eval mode and decodes with the notebook-level `tokenizer`.
    """
    model.eval()
    pad_token_id = model.config.pad_token_id
    references = []
    hypotheses = []

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            if i >= num_batches_to_test:
                break
            input_ids = batch['input'].to(device)
            target_ids = batch['target'].to(device)

            # Items encoded in-notebook carry a singleton dimension: (batch, 1, seq_len).
            input_ids = input_ids.reshape(input_ids.size(0), -1)
            target_ids = target_ids.reshape(target_ids.size(0), -1)

            # Inputs are right-padded to a fixed width; mask the padding exactly as the
            # training loop does.
            attention_mask = (input_ids != pad_token_id).long()

            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length)
            hypotheses.extend(tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs)
            references.extend(tokenizer.decode(ids, skip_special_tokens=True) for ids in target_ids)

    if not hypotheses:
        # Empty loader or num_batches_to_test <= 0: nothing to score.
        return 0.0

    # sacrebleu expects a list of reference streams; there is one reference per hypothesis.
    # The module is already imported by the notebook, unlike `load_metric`.
    return sacrebleu.corpus_bleu(hypotheses, [references]).score

In [None]:
# Checkpoint to resume from (a Kaggle input dataset file).
checkpoint_path = '/kaggle/input/da-ngu-tu-dau-v1-step100000-150000-epoch7/_step_240000_epoch_21.pt'
# Used as a filename prefix by train_and_save_model: new checkpoints are written as
# '<model_path>_step_<n>_epoch_<e>.pt' and '<model_path>_epoch_<e>.pt'.
model_path = '/kaggle/working/'


In [None]:
def train_and_save_model(model, train_loader, test_loader, optimizer, device, num_epochs, save_path, tensor_to_string, start_epoch, start_global_step):
    """Train `model`, optionally resuming in the middle of an epoch, and save checkpoints.

    Args:
        model: Seq2seq model whose forward accepts `input_ids`, `attention_mask` and
            `labels`, returns an object with `.loss`, and exposes `config.pad_token_id`.
        train_loader: Iterable of batches with 'input' and 'target' id tensors. Its order
            must be identical between runs for resuming to skip the right batches.
        test_loader: Unused; kept so existing calls keep working.
        optimizer: Optimizer over the model parameters.
        device: Device the model and the batches are moved to.
        num_epochs: Exclusive upper bound of the epoch index.
        save_path: Filename prefix for the checkpoints.
        tensor_to_string: Unused; kept so existing calls keep working.
        start_epoch: Epoch index to start from.
        start_global_step: Number of batches of the first epoch to skip because a previous
            run already trained on them. `global_step` counts batches within one epoch and
            is reset to 0 when the epoch ends.

    Side effects:
        Writes '<save_path>_step_<n>_epoch_<e>.pt' every 80000 steps and
        '<save_path>_epoch_<e>.pt' after every epoch; prints running loss averages.
    """
    global_step = 0
    model.to(device)
    pad_token_id = model.config.pad_token_id

    for epoch in range(start_epoch, num_epochs):
        model.train()
        total_loss = 0.0
        trained_steps = 0      # batches actually optimised this epoch (excludes skipped ones)
        interval_loss = 0.0
        interval_steps = 0     # batches accumulated since the last interval report
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            # Fast-forward over the batches a previous run already consumed.
            if global_step < start_global_step:
                global_step += 1
                continue
            input_ids = batch['input'].to(device)

            attention_mask = (input_ids != pad_token_id).long()

            target_ids = batch['target'].to(device)
            # Targets are right-padded to a fixed width. Label positions set to -100 are
            # ignored by the loss, so the model is not trained (and the reported loss is
            # not diluted) on padding.
            labels = target_ids.masked_fill(target_ids == pad_token_id, -100)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            loss_item = loss.item()
            total_loss += loss_item
            trained_steps += 1
            interval_loss += loss_item
            interval_steps += 1
            global_step += 1

            if global_step % 5000 == 0:
                # After a resume the first interval is shorter than 5000 steps, so divide
                # by the number of steps really accumulated.
                avg_interval_loss = interval_loss / interval_steps
                print(f"Average Loss over last {interval_steps} steps at step {global_step}: {avg_interval_loss}")
                interval_loss = 0.0
                interval_steps = 0

            if global_step % 80000 == 0 :
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'global_step': global_step,
                    'epoch': epoch
                }, f"{save_path}_step_{global_step}_epoch_{epoch}.pt")

            progress_bar.set_postfix({'loss': loss_item})

        # Average over the batches trained in this run; skipped batches contributed no
        # loss. max(..., 1) guards the case where the whole epoch was skipped.
        avg_loss = total_loss / max(trained_steps, 1)
        print(f"Trung bình Loss Epoch {epoch+1}: {avg_loss}")

        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step,
            'epoch': epoch
        }, f"{save_path}_epoch_{epoch}.pt")

        # Only the first (resumed) epoch skips batches; later epochs start from batch 0.
        start_global_step = 0
        global_step = 0

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_checkpoint(model, optimizer, checkpoint_path, device):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: Module that is moved to `device` and receives 'model_state_dict'.
        optimizer: Optimizer that receives 'optimizer_state_dict'.
        checkpoint_path: File written by torch.save with the keys 'model_state_dict',
            'optimizer_state_dict', 'epoch' and 'global_step'.
        device: Target device, also used as `map_location` when loading.

    Returns:
        (epoch, global_step) exactly as stored in the checkpoint.
    """
    model.to(device)
    state = torch.load(checkpoint_path, map_location=device)

    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])

    return state['epoch'], state['global_step']

# Move the model to the target device (load_checkpoint does the same again).
model.to(device)

# Restore weights and optimizer state; the returned counters tell the loop where to resume.
start_epoch, start_global_step = load_checkpoint(model, optimizer, checkpoint_path, device)

# Continue training up to epoch index 30 (exclusive), writing checkpoints under model_path.
train_and_save_model(model, train_loader, test_loader, optimizer, device, 30, model_path, tensor_to_string, start_epoch, start_global_step)



In [None]:
# Score every translation direction on its full test loader. Each call is capped by the
# length of the loader it evaluates, and the vi<->es directions use the loaders that
# actually exist (test_vi_es_loader / test_es_vi_loader).
bleu_score_of_vi_en = test_bleu_on_subset_hf(model, test_data_vi_en_test_loader, device, len(test_data_vi_en_test_loader),  max_length=512)
print(f"BLEU score of vi to en : {bleu_score_of_vi_en}")

bleu_score_of_vi_es = test_bleu_on_subset_hf(model, test_vi_es_loader, device, len(test_vi_es_loader),  max_length=512)
print(f"BLEU score of vi to es : {bleu_score_of_vi_es}")

bleu_score_of_es_en = test_bleu_on_subset_hf(model, test_data_es_en_test_loader, device, len(test_data_es_en_test_loader),  max_length=512)
print(f"BLEU score of es to en : {bleu_score_of_es_en}")

bleu_score_of_en_vi = test_bleu_on_subset_hf(model, test_data_en_vi_test_loader, device, len(test_data_en_vi_test_loader),  max_length=512)
print(f"BLEU score of en to vi : {bleu_score_of_en_vi}")

bleu_score_of_en_es = test_bleu_on_subset_hf(model, test_data_en_es_test_loader, device, len(test_data_en_es_test_loader),  max_length=512)
print(f"BLEU score of en to es : {bleu_score_of_en_es}")

bleu_score_of_es_vi = test_bleu_on_subset_hf(model, test_es_vi_loader, device, len(test_es_vi_loader),  max_length=512)
print(f"BLEU score of es to vi : {bleu_score_of_es_vi}")