# Import thư viện cần thiết 

In [1]:
import importlib.metadata
import logging

import numpy as np
import pandas as pd
import pkg_resources
import torch
import transformers
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm  # Use tqdm directly instead of tqdm.auto
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, TrainerCallback

2025-05-28 08:35:33.789391: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748421333.966670      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748421334.019805      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Chuẩn bị dữ liệu và huấn luyện mô hình

In [2]:
# Enable verbose logging so the Hugging Face library messages are shown.
logging.basicConfig(level=logging.INFO)
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Report the transformers and tqdm versions used by this run.
# importlib.metadata is the standard-library way to query an installed
# distribution's version; it replaces the deprecated pkg_resources API.
print(f"Transformers version: {transformers.__version__}")
print(f"Tqdm version: {importlib.metadata.version('tqdm')}")

# 1. Read and prepare the data
file_path = "/kaggle/input/vietnamese-social-comments2/vne_dataset.csv"
df = pd.read_csv(file_path)
# Keep only the two columns used below and drop rows where either is missing.
# .copy() gives an independent frame, so adding the 'labels' column below
# cannot raise a SettingWithCopyWarning.
df = df[['comment', 'label']].dropna().copy()

# Convert the text labels to integer class ids
label_map = {"positive": 0, "negative": 1, "neutral": 2, "toxic": 3}
df['labels'] = df['label'].map(label_map)

# Series.map yields NaN for every value that is not a key of label_map, which
# would also turn the column into floats. Fail loudly instead of training on
# corrupted targets.
unmapped_labels = df.loc[df['labels'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unknown label values in {file_path}: {sorted(map(str, unmapped_labels))}"
    )
df['labels'] = df['labels'].astype('int64')

# 2. Convert to a Dataset and tokenize
model_name = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the 'comment' entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 64.

    Args:
        examples: Mapping that provides the texts under the 'comment' key.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": 64,
    }
    texts = examples['comment']
    return tokenizer(texts, **tokenizer_options)

# preserve_index=False keeps the pandas index (which may be non-contiguous
# after dropna) from being stored as an extra dataset column.
raw_dataset = Dataset.from_pandas(df[['comment', 'labels']], preserve_index=False)
# A fixed seed makes the 80/20 train/test partition reproducible across runs.
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Drop the raw text column and return the remaining columns as torch tensors
tokenized_dataset = tokenized_dataset.remove_columns(['comment'])
tokenized_dataset.set_format('torch')

# 3. Load the model
# The number of classes and the id <-> name mappings are derived from
# label_map, so the saved config carries the real label names and cannot
# drift out of sync with the data preparation step.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=label_map,
)

# 4. Metric function
def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the argmax over
            axis 1 is used as the predicted class) and ``label_ids``.

    Returns:
        dict with the keys "accuracy" and "f1_macro".
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    references = p.label_ids
    accuracy = accuracy_score(references, predicted_classes)
    macro_f1 = f1_score(references, predicted_classes, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

# 5. Training configuration
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    # Evaluate and checkpoint once per epoch; the two strategies match, which
    # load_best_model_at_end relies on.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    logging_first_step=True,
    learning_rate=2e-5,
    # Reload the checkpoint with the best macro-F1 when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook still runs (in full precision) on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# 6. Custom callback that displays a training progress bar
class TqdmCallback(TrainerCallback):
    """Show a single tqdm progress bar covering the whole training run.

    The bar length is taken from ``state.max_steps`` (the Trainer's own
    "Total optimization steps" figure) instead of being recomputed from a
    hard-coded dataset size. It therefore stays correct for any dataset,
    batch size, device count (including CPU-only runs, where ``args.n_gpu``
    is 0) and gradient-accumulation setting.
    """

    def __init__(self):
        super().__init__()
        self.progress_bar = None
        # Last global step already reflected in the bar.
        self._last_step = 0

    def on_train_begin(self, args, state, control, **kwargs):
        """Create the bar, sized to the Trainer's total number of optimization steps."""
        # Close a bar left over from a previous train() call on the same callback.
        if self.progress_bar is not None:
            self.progress_bar.close()
        # Start from the current global step so a resumed run shows the right offset.
        self._last_step = state.global_step
        self.progress_bar = tqdm(
            total=state.max_steps,
            initial=state.global_step,
            desc="Training",
            position=0,
            leave=True,
        )

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the bar by the number of optimization steps completed since the last update."""
        if self.progress_bar is not None:
            self.progress_bar.update(state.global_step - self._last_step)
            self._last_step = state.global_step

    def on_train_end(self, args, state, control, **kwargs):
        """Close the bar when training finishes."""
        if self.progress_bar is not None:
            self.progress_bar.close()
            self.progress_bar = None

# 7. Build the Trainer
trainer = Trainer(
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)

# Register the custom progress-bar callback
progress_callback = TqdmCallback()
trainer.add_callback(progress_callback)

# 8. Train (kept as the last expression so the cell displays the TrainOutput)
trainer.train()

[INFO|tokenization_auto.py:795] 2025-05-28 08:35:46,696 >> Could not locate the tokenizer configuration file, will try to use the model config instead.


Transformers version: 4.51.3
Tqdm version: 4.67.1


config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

[INFO|configuration_utils.py:693] 2025-05-28 08:35:46,847 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--vinai--phobert-base-v2/snapshots/e2375d266bdf39c6e8e9a87af16a5da3190b0cc8/config.json
[INFO|configuration_utils.py:765] 2025-05-28 08:35:46,850 >> Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2060] 2025-05-28 08:35:47,582 >> loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--vinai--phobert-base-v2/snapshots/e2375d266bdf39c6e8e9a87af16a5da3190b0cc8/vocab.txt
[INFO|tokenization_utils_base.py:2060] 2025-05-28 08:35:47,583 >> loading file bpe.codes from cache at /root/.cache/huggingface/hub/models--vinai--phobert-base-v2/snapshots/e2375d266bdf39c6e8e9a87af16a5da3190b0cc8/bpe.codes
[INFO|tokenization_utils_base.py:2060] 2025-05-28 08:35:47,583 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2060] 2025-05-28 08:35:47,584 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2060] 2025-05-28 08:35:47,585 >> loading file tokenizer_config.json from cache at None
[INFO|tokenization_utils_base.py:2060] 2025-05-28 08:35:47,585 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--vinai--phobert-base-v2/snapshots/e2375d266bdf39c6

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Map:   0%|          | 0/980 [00:00<?, ? examples/s]

[INFO|configuration_utils.py:693] 2025-05-28 08:35:50,889 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--vinai--phobert-base-v2/snapshots/e2375d266bdf39c6e8e9a87af16a5da3190b0cc8/config.json
[INFO|configuration_utils.py:765] 2025-05-28 08:35:50,891 >> Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embeddin

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

[INFO|modeling_utils.py:1124] 2025-05-28 08:35:53,481 >> loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--vinai--phobert-base-v2/snapshots/e2375d266bdf39c6e8e9a87af16a5da3190b0cc8/pytorch_model.bin
[INFO|safetensors_conversion.py:61] 2025-05-28 08:35:53,552 >> Attempting to create safetensors variant
[INFO|safetensors_conversion.py:74] 2025-05-28 08:35:53,722 >> Safetensors PR exists
[INFO|modeling_utils.py:4920] 2025-05-28 08:35:53,914 >> Some weights of the model checkpoint at vinai/phobert-base-v2 were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT exp

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

[INFO|trainer.py:748] 2025-05-28 08:35:54,373 >> Using auto half precision backend
[INFO|trainer.py:2414] 2025-05-28 08:35:54,763 >> ***** Running training *****
[INFO|trainer.py:2415] 2025-05-28 08:35:54,764 >>   Num examples = 3,916
[INFO|trainer.py:2416] 2025-05-28 08:35:54,765 >>   Num Epochs = 5
[INFO|trainer.py:2417] 2025-05-28 08:35:54,766 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:2419] 2025-05-28 08:35:54,766 >>   Training with DataParallel so batch size has been adjusted to: 32
[INFO|trainer.py:2420] 2025-05-28 08:35:54,768 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2421] 2025-05-28 08:35:54,768 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2422] 2025-05-28 08:35:54,769 >>   Total optimization steps = 615
[INFO|trainer.py:2423] 2025-05-28 08:35:54,771 >>   Number of trainable parameters = 135,001,348


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.9203,0.72939,0.754082,0.753089
2,0.5876,0.511414,0.826531,0.824004
3,0.4009,0.47787,0.839796,0.838113
4,0.3081,0.435322,0.856122,0.854221
5,0.2801,0.42081,0.862245,0.861094


Training:  20%|██        | 123/615 [00:38<02:14,  3.66it/s][INFO|trainer.py:4307] 2025-05-28 08:36:33,155 >> 
***** Running Evaluation *****
[INFO|trainer.py:4309] 2025-05-28 08:36:33,156 >>   Num examples = 980
[INFO|trainer.py:4312] 2025-05-28 08:36:33,157 >>   Batch size = 32
[INFO|trainer.py:3984] 2025-05-28 08:36:36,123 >> Saving model checkpoint to ./results/checkpoint-123
[INFO|configuration_utils.py:419] 2025-05-28 08:36:36,125 >> Configuration saved in ./results/checkpoint-123/config.json
[INFO|modeling_utils.py:3572] 2025-05-28 08:36:37,340 >> Model weights saved in ./results/checkpoint-123/model.safetensors
[INFO|tokenization_utils_base.py:2510] 2025-05-28 08:36:37,342 >> tokenizer config file saved in ./results/checkpoint-123/tokenizer_config.json
[INFO|tokenization_utils_base.py:2519] 2025-05-28 08:36:37,343 >> Special tokens file saved in ./results/checkpoint-123/special_tokens_map.json
[INFO|tokenization_utils_base.py:2572] 2025-05-28 08:36:37,344 >> added tokens file sa

TrainOutput(global_step=615, training_loss=0.5283274119462424, metrics={'train_runtime': 220.6874, 'train_samples_per_second': 88.723, 'train_steps_per_second': 2.787, 'total_flos': 643975871784960.0, 'train_loss': 0.5283274119462424, 'epoch': 5.0})

# Lưu mô hình 

In [3]:
# 9. Save the model and the tokenizer into the same directory
output_path = "/kaggle/working/phobert-4class"
trainer.save_model(output_path)
tokenizer.save_pretrained(output_path)

[INFO|trainer.py:3984] 2025-05-28 08:39:35,637 >> Saving model checkpoint to /kaggle/working/phobert-4class
[INFO|configuration_utils.py:419] 2025-05-28 08:39:35,640 >> Configuration saved in /kaggle/working/phobert-4class/config.json
[INFO|modeling_utils.py:3572] 2025-05-28 08:39:36,677 >> Model weights saved in /kaggle/working/phobert-4class/model.safetensors
[INFO|tokenization_utils_base.py:2510] 2025-05-28 08:39:36,679 >> tokenizer config file saved in /kaggle/working/phobert-4class/tokenizer_config.json
[INFO|tokenization_utils_base.py:2519] 2025-05-28 08:39:36,680 >> Special tokens file saved in /kaggle/working/phobert-4class/special_tokens_map.json
[INFO|tokenization_utils_base.py:2572] 2025-05-28 08:39:36,680 >> added tokens file saved in /kaggle/working/phobert-4class/added_tokens.json
[INFO|tokenization_utils_base.py:2510] 2025-05-28 08:39:36,685 >> tokenizer config file saved in /kaggle/working/phobert-4class/tokenizer_config.json
[INFO|tokenization_utils_base.py:2519] 2025-

('/kaggle/working/phobert-4class/tokenizer_config.json',
 '/kaggle/working/phobert-4class/special_tokens_map.json',
 '/kaggle/working/phobert-4class/vocab.txt',
 '/kaggle/working/phobert-4class/bpe.codes',
 '/kaggle/working/phobert-4class/added_tokens.json')

# Kiểm tra mô hình 

In [4]:
# 1. Load the saved tokenizer and model (tokenizer first, then model)
model_path = "/kaggle/working/phobert-4class"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForSequenceClassification.from_pretrained(model_path),
)

# 2. Metric function (same computation as in the training cell)
def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the argmax over
            axis 1 is used as the predicted class) and ``label_ids``.

    Returns:
        dict with the keys "accuracy" and "f1_macro".
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    references = p.label_ids
    accuracy = accuracy_score(references, predicted_classes)
    macro_f1 = f1_score(references, predicted_classes, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

# 3. Build a Trainer that is only used to evaluate on the test split
training_args = TrainingArguments(
    report_to="none",  # No logging integrations are needed for evaluation only
    logging_dir="./logs",
    eval_strategy="epoch",
    per_device_eval_batch_size=16,
    output_dir="./results",
)

trainer = Trainer(
    processing_class=tokenizer,  # processing_class is used instead of tokenizer to avoid a FutureWarning
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    model=model,
)

# 4. Evaluate the model on the test split and print every metric to 4 decimals
print("Đánh giá mô hình trên tập test:")
eval_results = trainer.evaluate()
print("Kết quả đánh giá:")
for key in eval_results:
    value = eval_results[key]
    print(f"{key}: {value:.4f}")

# 5. Predict labels for new examples
def predict_new_examples(comments, tokenizer, model, label_map, batch_size=32, max_length=64):
    """Predict a label name for each comment.

    Args:
        comments: A comment string or a sequence of comment strings.
        tokenizer: Callable that turns a list of strings into a mapping of
            tensors (called with ``return_tensors="pt"``).
        model: Module whose forward output exposes ``logits``. It is switched
            to eval mode and moved to CUDA when available, otherwise to CPU.
        label_map: dict mapping label name -> integer class id.
        batch_size: Number of comments sent through the model at once, so
            long inputs do not have to fit in memory as a single batch.
        max_length: Length every comment is truncated/padded to.

    Returns:
        List of label names, one per comment, in input order. An empty input
        yields an empty list without touching the tokenizer or the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Accept a single string for convenience; otherwise materialize the input
    # so it can be sliced into batches.
    comments = [comments] if isinstance(comments, str) else list(comments)
    if not comments:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Invert the mapping once: class id -> label name.
    reverse_label_map = {v: k for k, v in label_map.items()}

    predicted_labels = []
    with torch.no_grad():
        for start in range(0, len(comments), batch_size):
            batch = comments[start:start + batch_size]
            inputs = tokenizer(
                batch,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors="pt",
            )
            inputs = {key: val.to(device) for key, val in inputs.items()}
            logits = model(**inputs).logits
            # tolist() yields plain Python ints for the dictionary lookup.
            class_ids = torch.argmax(logits, dim=-1).cpu().tolist()
            predicted_labels.extend(reverse_label_map[class_id] for class_id in class_ids)
    return predicted_labels

# Label name -> class id mapping (same values as the one used for training)
label_map = {
    name: class_id
    for class_id, name in enumerate(["positive", "negative", "neutral", "toxic"])
}

# New example comments to classify
new_comments = [
    "Bài viết này hay quá, cảm ơn tác giả!",
    "Chất lượng sản phẩm quá tệ, không đáng tiền.",
    "Tôi không có ý kiến gì về vấn đề này.",
    "Thật là một lũ ngu ngốc, toàn nói nhảm!"
]

# Predict a label for each new comment and print comment/label pairs
print("\nDự đoán trên các bình luận mới:")
predicted_labels = predict_new_examples(new_comments, tokenizer, model, label_map)
comment_label_pairs = zip(new_comments, predicted_labels)
for comment, label in comment_label_pairs:
    print(f"Bình luận: {comment}")
    print(f"Nhãn dự đoán: {label}\n")

[INFO|tokenization_utils_base.py:2058] 2025-05-28 08:39:36,819 >> loading file vocab.txt
[INFO|tokenization_utils_base.py:2058] 2025-05-28 08:39:36,819 >> loading file bpe.codes
[INFO|tokenization_utils_base.py:2058] 2025-05-28 08:39:36,820 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:2058] 2025-05-28 08:39:36,820 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:2058] 2025-05-28 08:39:36,821 >> loading file tokenizer_config.json
[INFO|tokenization_utils_base.py:2058] 2025-05-28 08:39:36,821 >> loading file tokenizer.json
[INFO|tokenization_utils_base.py:2058] 2025-05-28 08:39:36,822 >> loading file chat_template.jinja
[INFO|configuration_utils.py:691] 2025-05-28 08:39:36,932 >> loading configuration file /kaggle/working/phobert-4class/config.json
[INFO|configuration_utils.py:765] 2025-05-28 08:39:36,934 >> Model config RobertaConfig {
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "

Đánh giá mô hình trên tập test:




Kết quả đánh giá:
eval_loss: 0.4208
eval_accuracy: 0.8622
eval_f1_macro: 0.8611
eval_runtime: 3.0649
eval_samples_per_second: 319.7480
eval_steps_per_second: 10.1140

Dự đoán trên các bình luận mới:
Bình luận: Bài viết này hay quá, cảm ơn tác giả!
Nhãn dự đoán: positive

Bình luận: Chất lượng sản phẩm quá tệ, không đáng tiền.
Nhãn dự đoán: negative

Bình luận: Tôi không có ý kiến gì về vấn đề này.
Nhãn dự đoán: neutral

Bình luận: Thật là một lũ ngu ngốc, toàn nói nhảm!
Nhãn dự đoán: toxic

