In [None]:
from glob import glob 
import os
import json
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
import evaluate
from sklearn.model_selection import train_test_split

# Directory that is scanned for the corpus `*.json` files.
DATA_FOLDER = "data"



In [None]:
# Collect every corpus file in DATA_FOLDER in a stable, sorted order.
json_files = sorted(glob(os.path.join(DATA_FOLDER, '*.json')))

# Parallel lists: entry i of `vi_texts` pairs with entry i of `en_texts`.
vi_texts, en_texts = [], []

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        records = json.load(f)['data']
    # Lowercase each sentence and tag it with its language prefix.
    vi_texts.extend("vi: " + record['vi'].lower() for record in records)
    en_texts.extend("en: " + record['en'].lower() for record in records)

# One column per language, wrapped in a Hugging Face Dataset.
data_dict = dict(vi=vi_texts, en=en_texts)
dataset = Dataset.from_dict(data_dict)

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained checkpoint to fine-tune, plus its tokenizer.
model_name = "VietAI/envit5-translation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of Vietnamese/English pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "vi" list (source sentences) and an
            "en" list (target sentences), as handed over by
            ``Dataset.map(..., batched=True)``.
        max_length: Fixed length that every source and target sequence is
            padded or truncated to.

    Returns:
        The source encoding with an extra "labels" entry holding the target
        token ids; padding positions in "labels" are set to -100. Code that
        decodes labels must map -100 back to the pad id first.
    """
    # Pad to one fixed width. Padding to the longest item of each map batch
    # yields rows of different widths across map batches, which the Trainer's
    # default collator cannot stack for "labels". Plain lists are returned
    # (no return_tensors) because Dataset.map stores Python/Arrow data.
    inputs = tokenizer(examples["vi"], padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(examples["en"], padding="max_length", truncation=True, max_length=max_length)

    # -100 is the label value the loss ignores, so padding does not
    # contribute to the training objective.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Run tokenize_function over the whole corpus, one batch of examples at a time.
dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/7034 [00:00<?, ? examples/s]

In [None]:
# BLEU scorer loaded through the `evaluate` library; compute_metrics calls it.
bleu_metric = evaluate.load(path="bleu")

def compute_metrics(pred):
    """Compute BLEU for one evaluation pass.

    Args:
        pred: ``(predictions, labels)`` pair. ``predictions`` may be token ids
            shaped (batch, seq), raw logits shaped (batch, seq, vocab), or a
            tuple whose first element is one of those. ``labels`` holds the
            reference token ids and may use -100 for ignored positions.

    Returns:
        The dictionary produced by ``bleu_metric.compute``.
    """
    predictions, labels = pred

    # A plain Trainer can hand back a tuple of model outputs; the logits (or
    # ids) come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Raw logits carry a trailing vocabulary axis; reduce them to token ids.
    if getattr(predictions, "ndim", 2) == 3:
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id

    def _decodable(batch):
        # Negative ids (the -100 ignore/padding marker) are not in the
        # vocabulary; swap them for the pad id so decoding can skip them.
        return [[pad_id if token_id < 0 else int(token_id) for token_id in row] for row in batch]

    # Turn predicted and reference token ids back into text.
    decoded_preds = tokenizer.batch_decode(_decodable(predictions), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_decodable(labels), skip_special_tokens=True)

    # BLEU expects a list of references per prediction.
    bleu_score = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return bleu_score

In [None]:
# Hold out 10% of the examples for evaluation. The Dataset's own splitter is
# used because the Trainer needs `datasets.Dataset` objects, which sklearn's
# train_test_split does not return. The fixed seed keeps the split identical
# across kernel restarts.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce model outputs to predicted token ids during evaluation.

    Keeping only the argmax ids avoids accumulating full-vocabulary logits
    for the entire evaluation set, and gives compute_metrics token ids that
    can be decoded directly.
    """
    # Seq2seq models may return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

# Fine-tune the model
trainer.train()

In [None]:
# Import necessary libraries
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq
from evaluate import load

# Pretrained checkpoint to fine-tune; the tokenizer is loaded first, then the model.
model_name = "VietAI/envit5-translation"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)

# Read the demo corpus from disk
with open("demo.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Walk the records once, collecting the language-prefixed source ("vi") and
# target ("en") sentences side by side
vi_texts, en_texts = [], []
for item in dataset["data"]:
    vi_texts.append("vi: " + item["vi"])
    en_texts.append("en: " + item["en"])

# Column name -> list of values, in the layout Dataset.from_dict expects
data_dict = dict(vi=vi_texts, en=en_texts)

# Wrap the columns in a Hugging Face dataset
train_dataset = Dataset.from_dict(data_dict)

# Tokenization function
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of Vietnamese/English pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "vi" list (source sentences) and an
            "en" list (target sentences), as handed over by
            ``Dataset.map(..., batched=True)``.
        max_length: Fixed length that every source and target sequence is
            padded or truncated to.

    Returns:
        The source encoding with an extra "labels" entry holding the target
        token ids; padding positions in "labels" are set to -100. Code that
        decodes labels must map -100 back to the pad id first.
    """
    # Tokenize the "vi" column as the input and the "en" column as the target
    inputs = tokenizer(examples["vi"], padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(examples["en"], padding="max_length", truncation=True, max_length=max_length)

    # -100 is the label value the loss ignores, so the padded tail of each
    # target does not contribute to the training objective.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Run tokenize_function over the dataset, one batch of examples at a time
train_dataset = train_dataset.map(function=tokenize_function, batched=True)

# Collator that assembles seq2seq training batches for this model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# BLEU scorer loaded through the `evaluate` library
bleu_metric = load(path="bleu")
def compute_metrics(pred):
    """Compute BLEU for one evaluation pass.

    Args:
        pred: ``(predictions, labels)`` pair. ``predictions`` may be token ids
            shaped (batch, seq), raw logits shaped (batch, seq, vocab), or a
            tuple whose first element is one of those. ``labels`` holds the
            reference token ids and may use -100 for ignored positions.

    Returns:
        The dictionary produced by ``bleu_metric.compute``.
    """
    # Extract predictions and labels
    predictions, labels = pred

    # A plain Trainer can hand back a tuple of model outputs; the logits (or
    # ids) come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Raw logits carry a trailing vocabulary axis; reduce them to token ids.
    if getattr(predictions, "ndim", 2) == 3:
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id

    def _decodable(batch):
        # Negative ids (the -100 ignore/padding marker) are not in the
        # vocabulary; swap them for the pad id so decoding can skip them.
        return [[pad_id if token_id < 0 else int(token_id) for token_id in row] for row in batch]

    # Decode the predicted token IDs and the ground truth token IDs
    decoded_preds = tokenizer.batch_decode(_decodable(predictions), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_decodable(labels), skip_special_tokens=True)

    # Compute BLEU score (one list of references per prediction)
    bleu_score = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return bleu_score


def preprocess_logits_for_metrics(logits, labels):
    """Reduce model outputs to predicted token ids during evaluation.

    Keeping only the argmax ids avoids accumulating full-vocabulary logits
    for the entire evaluation set, and gives compute_metrics token ids that
    can be decoded directly.
    """
    # Seq2seq models may return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Per-epoch evaluation needs held-out data, so carve 10% off the training set.
# A single-example dataset cannot be split; train on it without evaluation.
if len(train_dataset) > 1:
    split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
    train_split, eval_split = split_dataset["train"], split_dataset["test"]
    evaluation_strategy = "epoch"
else:
    train_split, eval_split = train_dataset, None
    evaluation_strategy = "no"

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy=evaluation_strategy,  # Evaluate after every epoch when eval data exists
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,          # Number of passes over the training data
    weight_decay=0.01,
    save_total_limit=2,
    gradient_accumulation_steps=8  # Accumulate gradients over 8 batches
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

# Fine-tune the model
trainer.train()

# Write the fine-tuned model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")