In [None]:
%%capture
!pip install pytorch-lightning
!pip install torchmetrics
!pip install transformers
!pip install datasets

In [None]:

import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torchmetrics
from datasets import load_dataset
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                         DataCollatorWithPadding)

pl.seed_everything(seed=2401)

class VNFeedbackDataModule(pl.LightningDataModule):
    def __init__(self, tokenizer, batch_size=64, num_workers=4):
        super().__init__()
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def setup(self, stage=None):
        #Load and config dataset
        self.dataset = load_dataset("uitnlp/vietnamese_students_feedback")

        def tokenize_function(examples):
            return self.tokenizer(examples["sentence"], truncation=True)

        self.tokenized_dataset = self.dataset.map(tokenize_function, batched=True)

        self.tokenized_dataset = self.tokenized_dataset.rename_column("sentiment", "labels")

        self.tokenized_dataset = self.tokenized_dataset.remove_columns(["sentence", "topic"])

        self.tokenized_dataset.set_format("torch")

    def train_dataloader(self):
        return DataLoader(
            self.tokenized_dataset["train"],
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            collate_fn=self.data_collator
        )

    def val_dataloader(self):
        return DataLoader(
            self.tokenized_dataset["validation"],
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=self.data_collator
        )

    def test_dataloader(self):
        return DataLoader(
            self.tokenized_dataset["test"],
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=self.data_collator
        )

class SentimentModel(pl.LightningModule):
    def __init__(self, lr=2e-5, weight_decay=0.01):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "vinai/phobert-base",
            num_labels=3
        )
        self.lr = lr
        self.weight_decay = weight_decay

        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=3)
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=3)
        self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=3)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=3)
        self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=3)

    def forward(self, **inputs):
        return self.model(**inputs)

    def configure_optimizers(self):
        return torch.optim.AdamW(
            self.parameters(),
            lr=self.lr,
            weight_decay=self.weight_decay
        )

    def training_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        preds = F.softmax(logits, dim=-1)
        self.train_acc(preds, batch["labels"])

        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", self.train_acc, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        preds = F.softmax(logits, dim=-1)
        self.val_acc(preds, batch["labels"])
        self.val_f1(preds, batch["labels"])

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", self.val_acc, prog_bar=True)
        self.log("val_f1", self.val_f1, prog_bar=True)

    def test_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        logits = outputs.logits

        preds = F.softmax(logits, dim=-1)
        self.test_acc(preds, batch["labels"])
        self.test_f1(preds, batch["labels"])

        self.log("test_acc", self.test_acc, prog_bar=True)
        self.log("test_f1", self.test_f1, prog_bar=True)




INFO:lightning_fabric.utilities.seed:Seed set to 2401


In [None]:
# Load model pretrained
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

# Create data module
data_module = VNFeedbackDataModule(tokenizer)


model = SentimentModel()

# Callback
checkpoint_callback = ModelCheckpoint(
    monitor="val_f1",
    mode="max",
    dirpath="checkpoints",
    filename="phobert-sentiment-{epoch:02d}-{val_f1:.2f}"
)

logger = pl_loggers.TensorBoardLogger("logs/")

trainer = pl.Trainer(
    max_epochs=6,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    logger=logger,
    callbacks=[checkpoint_callback],
    deterministic=True
)

trainer.fit(model, data_module)

trainer.test(model, data_module)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type                             | Params | Mode 
-----------------------------------------------------------------------
0 | model     | RobertaForSequenceClassification | 135 M  | eval 
1 | train_acc | MulticlassAccuracy               | 0      | train
2 | val_acc   | MulticlassAccuracy               | 0      | train
3 | val_f1    | MulticlassF1Score                | 0      | train
4 | test_acc  | MulticlassAccuracy               | 0      | train
5 | test_f1   | MulticlassF1Score                | 0      | train
-----------------------------------------------------------------------
135 M     Trainable params
0         Non-trainable params
135 M     Total params
540.002   Total estimated model params size (MB)
5         Modules in train mode
230       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=6` reached.


Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_acc': 0.9273531436920166, 'test_f1': 0.9273531436920166}]

In [None]:
# prompt: connect drive và lưu lại model với     model.model.save_pretrained("sentiment_model")

from google.colab import drive
drive.mount('/content/drive')
model.model.save_pretrained("/content/drive/MyDrive/sentiment_model2")
tokenizer.save_pretrained("/content/drive/MyDrive/sentiment_model2")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/sentiment_model2/tokenizer_config.json',
 '/content/drive/MyDrive/sentiment_model2/special_tokens_map.json',
 '/content/drive/MyDrive/sentiment_model2/vocab.txt',
 '/content/drive/MyDrive/sentiment_model2/bpe.codes',
 '/content/drive/MyDrive/sentiment_model2/added_tokens.json')