In [5]:
!pip install torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install tensorflow
!pip install flax jax jaxlib

Looking in indexes: https://download.pytorch.org/whl/cu118


In [9]:
!pip install --upgrade accelerate
!pip install transformers[torch]


Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0


In [1]:
import os
import json
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import numpy as np

# 1. Load dữ liệu (có thể chỉ có training_data.jsonl, sau đó split thành train/validation)
data_file = "training_data.jsonl"
dataset = load_dataset("json", data_files=data_file, split="train")

split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# 2. Chọn pre-trained model và tokenizer
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# 3. Hàm tiền xử lý
max_source_length = 512
max_target_length = 128

def preprocess_function(examples):

    inputs = [f"Question: {instr}\nAnswer:" for instr in examples["instruction"]]
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)

# 4. Chuẩn bị Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 5. Cấu hình các tham số huấn luyện
training_args = Seq2SeqTrainingArguments(
    output_dir="mt5_vn_law_chatbot",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,  # nếu máy bạn hỗ trợ CUDA fp16
    save_total_limit=3,
)
# 6. Tạo Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 7. Train mô hình
trainer.train()

# Sau khi train xong, bạn có thể lưu mô hình
trainer.save_model("mt5_vn_law_chatbot_final")




You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,15.9525
200,6.3208
300,3.8517
400,3.2543
500,2.9758
600,2.8199
700,2.7688
