In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Pretrained checkpoint to fine-tune, and where data/model artifacts live.
MODEL_NAME = "vinai/phobert-base-v2"
DATA_PATH = "data/question_dataset.csv"
SAVE_PATH = "models/question_classifier"

# Optimisation hyperparameters.
NUM_EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Question categories; a label's position in this list is its integer class id.
LABELS = ["Definition", "Yes/No", "List", "Factoid", "Inference"]
id2label = dict(enumerate(LABELS))
label2id = {name: index for index, name in id2label.items()}

class QuestionDataset(Dataset):
    """Map-style dataset that pairs question strings with integer class labels.

    Samples are tokenized lazily, one at a time, when they are indexed.

    Args:
        texts: Indexable collection of questions (each is converted with ``str``).
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of questions in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the question at position ``item`` and return it as a dict.

        Returns:
            A dict with the raw ``'text'``, the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and ``'labels'`` as a ``torch.long``
            tensor.
        """
        question = str(self.texts[item])
        target = self.labels[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(question, **tokenizer_options)

        sample = {'text': question}
        # The tokenizer returns tensors with a leading batch axis; flatten it
        # away so the DataLoader can stack samples into a batch itself.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

def _evaluate(model, data_loader, device):
    """Return ``(mean batch loss, accuracy)`` of ``model`` over ``data_loader``.

    The model is switched to eval mode and gradients are disabled, so no
    weights are updated. The caller must call ``model.train()`` again before
    resuming training.
    """
    model.eval()
    total_loss = 0.0
    predictions, targets = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            predictions.extend(outputs.logits.argmax(dim=-1).tolist())
            targets.extend(labels.tolist())
    return total_loss / len(data_loader), accuracy_score(targets, predictions)


def train():
    """Fine-tune ``MODEL_NAME`` as a question-type classifier and save it.

    Reads ``DATA_PATH`` (a CSV with ``question`` and ``label`` columns), holds
    out 20% of the rows for validation, trains for ``NUM_EPOCHS`` epochs with
    AdamW, prints the training loss plus validation loss/accuracy after every
    epoch, and finally writes the model and tokenizer to ``SAVE_PATH``.

    Raises:
        ValueError: If the CSV contains a label that is not listed in
            ``LABELS`` (including missing labels).
    """
    df = pd.read_csv(DATA_PATH)
    df['label_id'] = df['label'].map(label2id)

    # ``map`` silently yields NaN for labels missing from ``label2id``; fail
    # early with a readable message instead of feeding NaN class ids onward.
    unknown_labels = df.loc[df['label_id'].isna(), 'label'].unique().tolist()
    if unknown_labels:
        raise ValueError(
            f"Unknown labels in {DATA_PATH}: {unknown_labels}. Expected one of {LABELS}."
        )
    df['label_id'] = df['label_id'].astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        df['question'].tolist(),
        df['label_id'].tolist(),
        test_size=0.2,
        random_state=42
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = QuestionDataset(X_train, y_train, tokenizer)
    val_dataset = QuestionDataset(X_val, y_val, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(LABELS),
        id2label=id2label,
        label2id=label2id
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCHS):
        print(f"--- Epoch {epoch + 1}/{NUM_EPOCHS} ---")
        # Re-enter training mode every epoch: _evaluate() leaves the model in
        # eval mode.
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Score the held-out split so overfitting is visible during training.
        val_loss, val_accuracy = _evaluate(model, val_loader, device)
        print(f"Validation loss: {val_loss:.4f} | Validation accuracy: {val_accuracy:.4f}")

    os.makedirs(SAVE_PATH, exist_ok=True)
    model.save_pretrained(SAVE_PATH)
    tokenizer.save_pretrained(SAVE_PATH)
    print(f"Đã lưu mô hình vào {SAVE_PATH}")


# Run the fine-tuning pipeline defined by train() when this cell is executed.
train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Epoch 1/10 ---


  return forward_call(*args, **kwargs)


Average training loss: 1.5647
--- Epoch 2/10 ---
Average training loss: 1.3890
--- Epoch 3/10 ---
Average training loss: 1.1826
--- Epoch 4/10 ---
Average training loss: 0.9487
--- Epoch 5/10 ---
Average training loss: 0.6927
--- Epoch 6/10 ---
Average training loss: 0.4863
--- Epoch 7/10 ---
Average training loss: 0.3459
--- Epoch 8/10 ---
Average training loss: 0.2603
--- Epoch 9/10 ---
Average training loss: 0.2110
--- Epoch 10/10 ---
Average training loss: 0.1700
Đã lưu mô hình vào models/question_classifier
