In [None]:
!pip install transformers huggingface_hub datasets torch evaluate sacrebleu

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [None]:
# Interactive Hugging Face Hub login; the fine-tuned models are pushed to the Hub further down.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `save_the_datasets` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have 

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq, DataCollatorWithPadding

from datasets import load_dataset

import evaluate
import numpy as np

In [None]:
# Base checkpoints, addressed by index further down:
#   0 -> grammar correction, 1 -> English-to-Vietnamese, 2 -> Vietnamese-to-English.
model_names = [
    'deep-learning-analytics/GrammarCorrector',
    'NlpHUST/t5-en-vi-small',
    'NlpHUST/t5-vi-en-small',
]

# SacreBLEU scorer consumed by compute_metrics.
metric = evaluate.load("sacrebleu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute the SacreBLEU score for one evaluation pass.

    Relies on the notebook-level globals ``tokenizer`` and ``metric``.

    Args:
        eval_pred: ``(predictions, labels)`` pair as handed over by the trainer.
            ``predictions`` may be token ids, raw logits, or a tuple whose first
            element holds the logits / token ids. ``labels`` are token ids in
            which ``-100`` marks positions to ignore.

    Returns:
        dict: ``{"bleu": <score>}`` where the score is ``result["score"]`` of the metric.
    """
    predictions, labels = eval_pred

    # Seq2seq models return several tensors; the logits / token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw logits carry an extra vocabulary axis: reduce them to token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 (ignore index / cross-batch padding) is not a valid token id and cannot
    # be decoded, so swap it for the tokenizer's pad token on both sides.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip surrounding whitespace; the metric expects a list of references
    # per prediction (one sentence may have several reference translations).
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [None]:
def getModelAndData(model_name):
    """Load a T5 tokenizer/model pair and freeze everything but the last decoder block.

    The encoder, the shared embedding module and every decoder block except the
    final one get ``requires_grad = False``; the final decoder block is explicitly
    set to ``requires_grad = True``. Parameters not touched here keep the flag
    they had after loading.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)`` -- in that order.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    def _set_trainable(module, trainable):
        # Toggle gradient tracking for every parameter of ``module``.
        for weight in module.parameters():
            weight.requires_grad = trainable

    # 1. Freeze the whole encoder.
    _set_trainable(model.encoder, False)

    # 2. Freeze the shared parameters (the embeddings are not fine-tuned).
    _set_trainable(model.shared, False)

    # 3. model.decoder.block is a ModuleList: only its last entry stays trainable.
    last_index = len(model.decoder.block) - 1
    for position, decoder_block in enumerate(model.decoder.block):
        _set_trainable(decoder_block, position == last_index)

    return tokenizer, model

In [None]:
import os
# Process-wide settings: switch off the Weights & Biases integration and
# configure the PyTorch CUDA allocator to use expandable segments.
os.environ.update({
    "WANDB_DISABLED": "true",
    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
})

def get_filenames_from_path(path):
  """Recursively list every file found under ``path``.

  Args:
    path: Directory to walk.

  Returns:
    list: One joined ``root/file`` path per file, in ``os.walk`` order.
    Empty when the walk yields no files.
  """
  return [
    os.path.join(folder, name)
    for folder, _, names in os.walk(path)
    for name in names
  ]

# Collect the path of every file under the translation and grammar dataset folders.
# NOTE(review): these paths presumably require Google Drive to be mounted first;
# no mount call is visible in this notebook -- confirm before a fresh run.
translate_data = get_filenames_from_path('/content/drive/MyDrive/CDIO3_Dataset/Translation')
grammar_data = get_filenames_from_path('/content/drive/MyDrive/CDIO3_Dataset/Grammarly')

In [None]:
import random

def select_and_split_dataset(dataset_list, seed=42):
    """Pick one CSV file at random, remove it from the list and split it 80/10/10.

    The chosen path is removed from ``dataset_list`` in place, so the caller's
    list shrinks by one entry on every call. The file choice itself uses the
    global ``random`` state; ``seed`` only drives the two shuffled splits.

    Args:
        dataset_list: List of CSV file paths (``;``-delimited) to choose from.
        seed: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        tuple: ``(splits, dataset_list)`` where ``splits`` maps ``"train"``,
        ``"validation"`` and ``"test"`` to datasets and ``dataset_list`` is the
        same list object without the chosen file.

    Raises:
        ValueError: If ``dataset_list`` is empty.
    """
    if not dataset_list:
        raise ValueError("Danh sách dataset rỗng!")

    # Draw one file and take it out of the pool so it is not drawn again.
    chosen_path = random.choice(dataset_list)
    dataset_list.remove(chosen_path)

    selected_file = {'train': chosen_path}
    print(f"Đang sử dụng file: {selected_file}")

    full_dataset = load_dataset("csv", data_files=selected_file, delimiter=";")["train"]

    # First cut: 80% train / 20% held out.
    first_cut = full_dataset.train_test_split(test_size=0.2, shuffle=True, seed=seed)
    # Second cut: halve the held-out part into validation and test (10% each).
    second_cut = first_cut["test"].train_test_split(test_size=0.5, shuffle=True, seed=seed)

    splits = {
        "train": first_cut["train"],
        "validation": second_cut["train"],
        "test": second_cut["test"],
    }
    return splits, dataset_list

In [None]:
def tokenize_grammar(examples):
  """Tokenize a batch of grammar-correction pairs for seq2seq training.

  Uses the notebook-level ``tokenizer``. Inputs and targets are truncated and
  padded to 512 tokens.

  Args:
    examples: Batch with an ``'input'`` column (source sentences) and an
      ``'output'`` column (corrected sentences).

  Returns:
    The tokenized inputs with an added ``'labels'`` entry holding the target
    token ids, where padding positions are replaced by ``-100``.
  """
  inputs = examples['input']
  targets = examples['output']
  model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

  # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
  labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")

  # Labels are padded to a fixed length here, so mask the padding with -100;
  # otherwise the loss would also be computed on the pad tokens.
  pad_id = tokenizer.pad_token_id
  model_inputs['labels'] = [
      [token if token != pad_id else -100 for token in sequence]
      for sequence in labels['input_ids']
  ]
  return model_inputs

def tokenize_translate_en(examples):
  """Tokenize a batch of English -> Vietnamese translation pairs.

  Uses the notebook-level ``tokenizer``. Sequences are truncated to 512 tokens
  and left unpadded.

  Args:
    examples: Batch with an ``'en'`` column (sources) and a ``'vi'`` column (targets).

  Returns:
    The tokenized inputs with an added ``'labels'`` entry holding the target token ids.
  """
  inputs = examples['en']
  targets = examples['vi']
  model_inputs = tokenizer(inputs, max_length=512, truncation=True)
  # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
  labels = tokenizer(text_target=targets, max_length=512, truncation=True)
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

def tokenize_translate_vi(examples):
  """Tokenize a batch of Vietnamese -> English translation pairs.

  Uses the notebook-level ``tokenizer``. Sequences are truncated to 512 tokens
  and left unpadded.

  Args:
    examples: Batch with a ``'vi'`` column (sources) and an ``'en'`` column (targets).

  Returns:
    The tokenized inputs with an added ``'labels'`` entry holding the target token ids.
  """
  inputs = examples['vi']
  targets = examples['en']
  model_inputs = tokenizer(inputs, max_length=512, truncation=True)
  # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
  labels = tokenizer(text_target=targets, max_length=512, truncation=True)
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

In [None]:
def push_to_hub(model, tokenizer, model_name):
  """Upload the model, then the tokenizer, to the Hub repository ``model_name``.

  Args:
    model: Object exposing ``push_to_hub(repo_name)``.
    tokenizer: Object exposing ``push_to_hub(repo_name)``.
    model_name: Target repository name passed to both uploads.
  """
  for artifact in (model, tokenizer):
    artifact.push_to_hub(model_name)

# Model grammar

## Get model and tokenizer

In [None]:
# Load the grammar-correction checkpoint; getModelAndData returns (tokenizer, model) in that order.
tokenizer, model = getModelAndData(model_names[0])

## Preprocess Data

In [None]:
# Draw one grammar file (it is removed from grammar_data) and unpack its three splits.
dataset, grammar_data = select_and_split_dataset(grammar_data)
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

Đang sử dụng file: {'train': '/content/drive/MyDrive/CDIO3_Dataset/Grammarly/grammarly_73.csv'}


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the training split (feature names and row count).
train_dataset

Dataset({
    features: ['input', 'output'],
    num_rows: 1600
})

In [None]:
# Tokenize the three splits in batches, in the order train -> test -> validation.
train_dataset, test_dataset, val_dataset = (
    split.map(tokenize_grammar, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Use the seq2seq collator, as in the translation sections below. It knows about
# the 'labels' field and pads it with -100 whenever sequences differ in length.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Training

In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/CDIO3_Dataset/Models/grammar",
    eval_strategy="epoch",
    report_to="none",  # replaces the deprecated WANDB_DISABLED environment variable
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Evaluate on the validation split during training; test_dataset stays held out
    # for a final trainer.evaluate(test_dataset) once training is done.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    processing_class=tokenizer,  # `tokenizer=` is deprecated on Trainer
    # Reduce logits to token ids batch by batch so the full-vocabulary logits are not
    # accumulated for the whole evaluation set before compute_metrics runs.
    # NOTE(review): this scores teacher-forced predictions, not generated text.
    preprocess_logits_for_metrics=lambda logits, labels: (
        logits[0] if isinstance(logits, tuple) else logits
    ).argmax(dim=-1),
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
# Fine-tune the grammar model; evaluation runs once per epoch (eval_strategy="epoch").
trainer.train()

## Push to hub

In [None]:
# Upload the fine-tuned grammar model and its tokenizer to the Hub.
push_to_hub(model, tokenizer, 'truongpvk41081/grammarly_correction_model')

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


# Model en2vi

## Get model and tokenizer

In [None]:
# getModelAndData returns (tokenizer, model) -- unpack in that order so the two are not swapped.
tokenizer, model = getModelAndData(model_names[1])

tokenizer_config.json:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

## Preprocess Data

In [None]:
# Draw one translation file (it is removed from translate_data) and unpack its three splits.
dataset, translate_data = select_and_split_dataset(translate_data)
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

In [None]:
# Tokenize the three splits (English source, Vietnamese target): train -> test -> validation.
train_dataset, test_dataset, val_dataset = (
    split.map(tokenize_translate_en, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

## Training

In [None]:
# Seq2seq collator: the translation examples are tokenized without padding, so batches are padded here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/CDIO3_Dataset/Models/en2vi",
    eval_strategy="epoch",
    report_to="none",  # replaces the deprecated WANDB_DISABLED environment variable
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Evaluate on the validation split during training; test_dataset stays held out
    # for a final trainer.evaluate(test_dataset) once training is done.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    processing_class=tokenizer,  # `tokenizer=` is deprecated on Trainer
    # Reduce logits to token ids batch by batch so the full-vocabulary logits are not
    # accumulated for the whole evaluation set before compute_metrics runs.
    # NOTE(review): this scores teacher-forced predictions, not generated text.
    preprocess_logits_for_metrics=lambda logits, labels: (
        logits[0] if isinstance(logits, tuple) else logits
    ).argmax(dim=-1),
)

  trainer = Trainer(


In [None]:
# Fine-tune the English -> Vietnamese model; evaluation runs once per epoch.
trainer.train()

In [None]:
# Upload the fine-tuned English -> Vietnamese model and its tokenizer to the Hub.
push_to_hub(model, tokenizer, 'truongpvk41081/translate-en-2-vi')

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

# Model vi2en

## Get model and tokenizer

In [None]:
# getModelAndData returns (tokenizer, model) -- unpack in that order so the two are not swapped.
tokenizer, model = getModelAndData(model_names[2])

tokenizer_config.json:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

## Preprocess Data

In [None]:
# Draw another translation file (it is removed from translate_data) and unpack its three splits.
dataset, translate_data = select_and_split_dataset(translate_data)
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

In [None]:
# Tokenize the three splits (Vietnamese source, English target): train -> test -> validation.
train_dataset, test_dataset, val_dataset = (
    split.map(tokenize_translate_vi, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

## Training

In [None]:
# Seq2seq collator: the translation examples are tokenized without padding, so batches are padded here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/CDIO3_Dataset/Models/vi2en",
    eval_strategy="epoch",
    report_to="none",  # replaces the deprecated WANDB_DISABLED environment variable
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Evaluate on the validation split during training; test_dataset stays held out
    # for a final trainer.evaluate(test_dataset) once training is done.
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    processing_class=tokenizer,  # `tokenizer=` is deprecated on Trainer
    # Reduce logits to token ids batch by batch so the full-vocabulary logits are not
    # accumulated for the whole evaluation set before compute_metrics runs.
    # NOTE(review): this scores teacher-forced predictions, not generated text.
    preprocess_logits_for_metrics=lambda logits, labels: (
        logits[0] if isinstance(logits, tuple) else logits
    ).argmax(dim=-1),
)

  trainer = Trainer(


In [None]:
# Fine-tune the Vietnamese -> English model; evaluation runs once per epoch.
trainer.train()

In [None]:
# Upload the fine-tuned Vietnamese -> English model and its tokenizer to the Hub.
push_to_hub(model, tokenizer, 'truongpvk41081/translate-vi-2-en')

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]