In [11]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL_NAME = "vinai/phobert-base"  # checkpoint id; swap in another model if desired

# The tokenizer and the classifier are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model with an 8-way output layer.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=8,
)

# Mark the task as single-label classification on the model config.
model.config.problem_type = "single_label_classification"

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd

# Read the three pre-processed splits from the sibling lstm project directory.
train = pd.read_csv("../lstm/processed-data/train.csv")
val = pd.read_csv("../lstm/processed-data/val.csv")
test = pd.read_csv("../lstm/processed-data/test.csv")

In [3]:
# Pull the text ("comment") and target ("label") columns out of each split
# as plain Python lists.
X_train = train["comment"].to_list()
y_train = train["label"].to_list()

X_val = val["comment"].to_list()
y_val = val["label"].to_list()

X_test = test["comment"].to_list()
y_test = test["label"].to_list()

In [4]:
# Force every comment to a str so the tokenizer only ever sees strings.
# NOTE(review): a missing value (NaN) would become the literal text "nan" here;
# confirm the CSVs contain no empty comments.
X_train = list(map(str, X_train))
X_val = list(map(str, X_val))
X_test = list(map(str, X_test))

In [9]:
def tokenize_texts(texts, max_length=128):
    """Run the notebook-level ``tokenizer`` over a list of strings.

    Args:
        texts: the strings to encode.
        max_length: length every sequence is padded or truncated to.

    Returns:
        The tokenizer's output, requested as PyTorch tensors
        (``return_tensors="pt"``).
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Encode the training and validation comments (X_train / X_val) ahead of
# training; both calls rely on tokenize_texts' default max_length.
train_encodings = tokenize_texts(texts=X_train)
val_encodings = tokenize_texts(texts=X_val)

In [12]:
class EmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    ``encodings`` is a mapping from field name to an indexable batch of values;
    ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # expected to hold integer class ids

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at ``idx`` plus ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # The label is stored under 'labels' as an int64 tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Build the datasets from integer class ids. The notebook's label-conversion
# cell sits below this one, so without normalising here the datasets would
# capture the raw values read from the CSV (floats, per that later cell).
# int(float(...)) also fails loudly on NaN instead of letting it be cast
# silently inside __getitem__.
train_dataset = EmotionDataset(train_encodings, [int(float(label)) for label in y_train])
val_dataset = EmotionDataset(val_encodings, [int(float(label)) for label in y_val])

In [13]:
# Normalise labels to integer class ids: parse each value with float() first,
# then truncate to int (so a float label such as 3.0 becomes 3).
y_train = [int(number) for number in map(float, y_train)]
y_val = [int(number) for number in map(float, y_val)]
y_test = [int(number) for number in map(float, y_test)]

In [14]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Checkpointing: write under ./results and keep at most 2 checkpoints.
    output_dir='./results',
    save_total_limit=2,
    # Schedule and per-device batch sizes for training and evaluation.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimiser settings.
    learning_rate=2e-5,
    weight_decay=0.01,
)

# NOTE(review): no evaluation schedule or metric function is configured here;
# confirm how val_dataset is meant to be used.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Launch fine-tuning.
trainer.train()



Step,Training Loss


KeyboardInterrupt: 

In [17]:
# New Vietnamese text for prediction
new_texts = ["Hôm qua bà ngoại mình mất."]

# Tokenize with the same settings used for the training data.
new_encodings = tokenizer(new_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
# Trainer may have moved the model to an accelerator; the inputs must live on
# the same device as the model's weights.
new_encodings = new_encodings.to(model.device)

# Switch to inference mode: trainer.train() leaves the model in training mode,
# where dropout makes predictions non-deterministic.
model.eval()

# Get predictions without tracking gradients (not needed for inference).
with torch.no_grad():
    outputs = model(**new_encodings)

# Highest-scoring class index; .item() is valid because exactly one text is passed.
predictions = torch.argmax(outputs.logits, dim=-1)
print("Predicted emotion label:", predictions.item())

Predicted emotion label: 1
