<a href="https://colab.research.google.com/github/taolati20011/INT3405/blob/main/BERTtrain_samplecode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

INSTALL DEPENDENCES

In [1]:
!pip install pytorch-lightning
!pip install transformers
!pip install sentencepiece
!pip install fairseq



In [2]:
# Mount to drive
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Replace this path to your dataset directory
DATA_ROOT_DIR = "/content/drive/MyDrive/shopee-sentiment"
!ls $DATA_ROOT_DIR

sample_submission.csv  train.csv	     train_preprocess_unsegment.csv
test.csv	       train.gsheet
test_preprocess.csv    train_preprocess.csv


In [4]:
# Import some dependences
import pandas as pd
import numpy as np
from torch.utils.data import random_split, DataLoader, Dataset
import pytorch_lightning as pl
import torch.nn as nn
import torch
from typing import Optional

train_ratio = 0.8
DATA_DIR = DATA_ROOT_DIR + '/train_preprocess_unsegment.csv'

In [5]:
# Use pandas to read csv, this will return a excel like table data
train = pd.read_csv(DATA_DIR, index_col=0)
train.head()

Unnamed: 0,id,text,class,preprocess_text
0,dee6dfc5,"Đến quán 2 lần thôi , rất là thích !\nQuán tuy...",1,đến quán 2 lần thôi rất là thích quán tuy nằm ...
1,800813f5,Đến quán vào tối chủ_nhật . Có band hát . Khá ...,0,đến quán vào tối chủ nhật có band hát khá ổn t...
2,6553e47f,Phục_vụ lâu quá mặc_dù khách rất vắng .\nĐợi g...,0,phục vụ lâu quá mặc dù khách rất vắng đợi gần ...
3,b45a1ff1,"Ko gian bé_tí , quán chật_chội , đông người nê...",0,ko gian bé tí quán chật chội đông người nên ...
4,f92694b0,"Khi mình order , đặt bánh thì nhận được sự tiế...",1,khi mình order đặt bánh thì nhận được sự tiếp ...


In [6]:
class SentimentData(Dataset):
  def __init__(self, data_dir):
    """
    Args:
    data_dir (string): Directory with the csv file
    """
    self.df = pd.read_csv(data_dir, index_col = 0)
  
  def __len__(self):
    """
    length of the dataset, i.e. number of rows in the csv file
    Returns: int 
    """
    return len(self.df)
  
  def __getitem__(self, idx):
    """
    given a row index, returns the corresponding row of the csv file
    Returns: text (string), label (int) 
    """
    text = self.df["preprocess_text"][idx]
    label = self.df["class"][idx]

    return text, label

class SentimentDataModule(pl.LightningDataModule):
    """
    Module class for sentiment analysis. this class is used to load the data to the model. 
    It is a subclass of LightningDataModule. 
    """
    def __init__(self, data_dir: str = DATA_DIR, batch_size: int = 16):
        """
        Args:
            data_dir (string): Directory with the csv file
            batch_size (int): batch size for dataloader
        """
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

    def setup(self, stage: Optional[str] = None):
        """
        Loads the data to the model. 
        the data is loaded in the setup function, so that it is loaded only once. 
        """
        data_full = SentimentData(self.data_dir)
        train_size = round(len(data_full) * train_ratio)
        val_size = len(data_full) - train_size
        print(len(data_full), train_size, val_size)
        self.data_train, self.data_val = random_split(data_full, [train_size, val_size])

    def train_dataloader(self):
        """
        Returns: dataloader for training
        """
        return DataLoader(self.data_train, batch_size=self.batch_size)

    def val_dataloader(self):
        """
        Returns: dataloader for validation
        """
        return DataLoader(self.data_val, batch_size=self.batch_size)

# Do some Test with data
if __name__ == "__main__":
	dm = SentimentDataModule(DATA_DIR)
	dm.setup()
	idx = 0
	for item in (dm.train_dataloader()):
		print(idx)
		print(item)
		idx += 1
		if idx > 5: break

27000 21600 5400
0
[('chuyện xảy ra cũng lâu rồi nhưng nhắc lại vẫn thấy đúng là 1 trải nghiệm khá tệ đến sumo bq tô hiến thành vào lúc họ mới khai trương ăn chương trình 30k j đó đồ ăn tạm ổn không gian đẹp và mới mà nhân viên thì chán quá bọn mình đi 15 ng ngồi 1 phòng vip chỉ sợ nhân viên vất vả nên bảo thôi các em không phải nướng bọn anh tự nướng cho anh gọi thì mang đồ cho anh turn đầu rất nhanh và đầy đặn nhưng những lần sau thì càng ngày càng ít và càng lâu như kiểu sợ mình ăn hết cả bếp vậy 15 ng có 2 bếp mà mỗi lần bê đc 2 đĩa bò mỗi đĩa 4 miếng j đó thì sao xuể ngồi chờ các em bê lên hết cả hơi tráng miệng đax đặn để lại hoa quả mà cũng lờ đi k chuẩn bị báo hết cái gì cugx hết xong mình ra ngoài đi wc 3 em gái nhân viên ngồi tám may quá về rồi phục vụ đám này chắc hôm nay phải đòi tăng lương thực sự rất giận và tự hứa k bao giờ quay lại nữa   mất đến gần 5tr tiền ăn mà bị đối xử như ăn mày vậy', '1 tô như này mà 15k   tin được ko ăn no lắm luôn mà lại ngon nữa satế cực cay n

## **MODEL**

In [7]:
from fairseq.data import Dictionary
import sentencepiece as spm
from os.path import join as pjoin
from transformers import PreTrainedTokenizer
import sentencepiece as spm

class XLMRobertaTokenizer(PreTrainedTokenizer):
    """
    XLM-RoBERTa tokenizer adapted from transformers.PreTrainedTokenizer. This helps to convert the input text into 
    tokenized format. eg, 
    
    input: "Hello, how are you?" output: ["1", "2", "3", "65", "2", "1"]
    
    this class also provides the method to convert the tokenized format into the original text.
    
    eg, input: ["1", "2", "3", "65", "2", "1"] output: "Hello, how are you?"
    
    """
    def __init__(
            self,
            pretrained_file,
            bos_token="<s>",
            eos_token="</s>",
            sep_token="</s>",
            cls_token="<s>",
            unk_token="<unk>",
            pad_token="<pad>",
            mask_token="<mask>",
            **kwargs
    ):
        """
        :param pretrained_file: path to the pretrained model file
        :param bos_token: beginning of sentence token
        :param eos_token: end of sentence token
        :param sep_token: separation token
        :param cls_token: classification token
        :param unk_token: unknown token
        :param pad_token: padding token
        :param mask_token: mask token
        """
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
        
        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(
            sentencepiece_model)  # please dont use anything from sp_model bcz it makes everything goes wrong
        self.bpe_dict = Dictionary().load(vocab_file)
        # Mimic fairseq token-to-id alignment for the first 4 token
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 0
        self.fairseq_tokens_to_ids["<mask>"] = len(self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    def _tokenize(self, text):
        """ Tokenize a string. """
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        """ Size of the base vocabulary (without the added tokens) """
        return len(self.bpe_dict) + self.fairseq_offset + 1  # Add the <mask> token

    def get_vocab(self):
        """ Returns the vocabulary as a list of tokens. """
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab
    

In [8]:
from transformers import XLMRobertaConfig, XLMRobertaForSequenceClassification
import torch

pretrained_path = '/content/drive/MyDrive/envibert/'
!ls $pretrained_path
# load tokenizer
roberta = XLMRobertaForSequenceClassification.from_pretrained(pretrained_path)
tokenizer = XLMRobertaTokenizer(pretrained_path)

config.json  dict.txt  model.pt  pytorch_model.bin  sentencepiece.bpe.model


Some weights of the model checkpoint at /content/drive/MyDrive/envibert/ were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/envibert/ and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this mode

In [9]:
# try to convert some text into numbers
inputs = ["Tôi ghét nó", "Tôi thích nó", "Tôi quý nó"]
inputs = tokenizer(inputs, return_tensors='pt')
print(inputs)
outputs = roberta(**inputs, labels=torch.tensor([0, 1, 1]))
print(outputs)

{'input_ids': tensor([[ 842, 8919,  543],
        [ 842,  648,  543],
        [ 842,  976,  543]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])}
SequenceClassifierOutput(loss=tensor(0.6868, grad_fn=<NllLossBackward0>), logits=tensor([[0.0120, 0.0346],
        [0.0425, 0.0999],
        [0.0039, 0.0082]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [10]:
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score


class SentimentRoberta(pl.LightningModule):
    """
    SentimentRoberta class inherits from LightningModule
    This class is used to train a model using PyTorch Lightning
    It overrides the following methods:
        - forward : forward pass of the model
        - training_step : training step of the model
        - validation_step : validation step of the model
        - validation_epoch_end : end of the validation epoch
        - configure_optimizers : configure optimizers
    """
    def __init__(self, lr_roberta, lr_classifier):
        """
        Initialize the model with the following parameters:
            - lr_roberta : learning rate of the roberta model
            - lr_classifier : learning rate of the classifier model
        """
        super().__init__()
        self.roberta = XLMRobertaForSequenceClassification.from_pretrained(pretrained_path)
        self.tokenizer = XLMRobertaTokenizer(pretrained_path)
        self.lr_roberta = lr_roberta
        self.lr_classifer = lr_classifier

    def forward(self, texts, labels=None):
        """
        Forward pass of the model
        Args:
            - texts : input texts
            - labels : labels of the input texts
        """
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        for key in inputs:
            inputs[key] = inputs[key].to(self.device)

        outputs = self.roberta(**inputs, labels=labels)
        return outputs

    def configure_optimizers(self):
        """
        Configure optimizers
        This method is used to configure the optimizers of the model by using the learning rate
        for specific parameter of the roberta model and the classifier model
        """
        roberta_params = self.roberta.roberta.named_parameters()
        classifier_params = self.roberta.classifier.named_parameters()

        grouped_params = [
            {"params": [p for n, p in roberta_params], "lr": self.lr_roberta},
            {"params": [p for n, p in classifier_params], "lr": self.lr_classifer}
        ]
        optimizer = torch.optim.AdamW(
            grouped_params
        )
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.98)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'f1/val',
            }
        }

    def training_step(self, batch, batch_idx):
        """
        Training step of the model
        Args:
            - batch : batch of the data
            - batch_idx : index of the batch
        """
        texts, labels = batch
        outputs = self(texts, labels=labels)

        if len(outputs.values()) == 3:
            loss, logits, _ = outputs.values()
        else:
            loss, logits = outputs.values()
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Validation step of the model, used to compute the metrics
        Args:
            - batch : batch of the data
            - batch_idx : index of the batch
        """
        texts, labels = batch
        outputs = self(texts, labels=labels)

        if len(outputs.values()) == 3:
            loss, logits, _ = outputs.values()
        else:
            loss, logits = outputs.values()

        output_scores = torch.softmax(logits, dim=-1)
        return loss, output_scores, labels

    def validation_epoch_end(self, validation_step_outputs):
        """
        End of the validation epoch, this method will be called at the end of the validation epoch,
        it will compute the multiple metrics of classification problem
        Args:
            - validation_step_outputs : outputs of the validation step
        """

        val_preds = torch.tensor([], device=self.device)
        val_scores = torch.tensor([], device=self.device)
        val_labels = torch.tensor([], device=self.device)
        val_loss = 0
        total_item = 0

        for idx, item in enumerate(validation_step_outputs):
            loss, output_scores, labels = item

            predictions = torch.argmax(output_scores, dim=-1)
            val_preds = torch.cat((val_preds, predictions), dim=0)
            val_scores = torch.cat((val_scores, output_scores[:, 1]), dim=0)
            val_labels = torch.cat((val_labels, labels), dim=0)

            val_loss += loss
            total_item += 1

        # print("VAL PREDS", val_preds.shape)
        # print("VAL SCORES", val_scores.shape)
        # print("VAL LABELS", val_labels.shape)
        val_preds = val_preds.cpu().numpy()
        val_scores = val_scores.cpu().numpy()
        val_labels = val_labels.cpu().numpy()

        reports = classification_report(val_labels, val_preds, output_dict=True)
        print("VAL LABELS", val_labels)
        print("VAL SCORES", val_scores)
        try:
            auc = roc_auc_score(val_labels, val_scores)
        except Exception as e:
            print(e)
            print("Cannot calculate AUC. Default to 0")
            auc = 0
        accuracy = accuracy_score(val_labels, val_preds)

        print(classification_report(val_labels, val_preds))

        self.log("loss/val", val_loss)
        self.log("auc/val", auc)
        self.log("accuracy/val", accuracy)
        self.log("precision/val", reports["weighted avg"]["precision"])
        self.log("recall/val", reports["weighted avg"]["recall"])
        self.log("f1/val", reports["weighted avg"]["f1-score"])

In [11]:
trainer = pl.Trainer(
    fast_dev_run=True,
)
model = SentimentRoberta(lr_roberta=1e-5, lr_classifier=3e-3)
dm = SentimentDataModule()

trainer.fit(model, dm)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  "GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`."
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).
Some weights of the model checkpoint at /content/drive/MyDrive/envibert/ were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificatio

27000 21600 5400


  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1.]
VAL SCORES [0.00533327 0.00639464 0.00211217 0.016949   0.00575063 0.01762968
 0.06442668 0.02085559 0.00798262 0.00322134 0.00846163 0.00462749
 0.01220848 0.01852747 0.02916402 0.00787626]
              precision    recall  f1-score   support

         0.0       0.56      1.00      0.72         9
         1.0       0.00      0.00      0.00         7

    accuracy                           0.56        16
   macro avg       0.28      0.50      0.36        16
weighted avg       0.32      0.56      0.40        16



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## **Training**

In [12]:
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

torch.manual_seed(123)

tb_logger = pl_loggers.TensorBoardLogger('/content/drive/MyDrive/colab/tb_logs/')

trainer = pl.Trainer(
    min_epochs=1,
    max_epochs=5,
    gpus=1,
    precision=16,
    val_check_interval=0.5,
    # check_val_every_n_epoch=1,
    callbacks=[
      ModelCheckpoint(
          dirpath='/content/drive/MyDrive/ckpt',
          save_top_k=3,
          monitor='f1/val',
      ), 
      EarlyStopping('f1/val', patience=5)
    ],
    fast_dev_run=False,
    logger=tb_logger
)

dm.setup(stage="fit")
trainer.fit(model, dm)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                                | Params
----------------------------------------------------------------
0 | roberta | XLMRobertaForSequenceClassification | 70.7 M
----------------------------------------------------------------
70.7 M    Trainable params
0         Non-trainable params
70.7 M    Total params
141.409   Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1.
 0. 1. 0. 0. 1. 0. 0. 0.]
VAL SCORES [0.00534326 0.00641316 0.00211983 0.01694742 0.00576373 0.01759375
 0.06459446 0.02088422 0.00795339 0.00322876 0.0082676  0.00457386
 0.01219413 0.01854656 0.02909076 0.00789198 0.00818791 0.00708012
 0.00818791 0.00370005 0.01693116 0.00993642 0.00778566 0.00871138
 0.0014579  0.05203782 0.02042984 0.00575255 0.01859997 0.00789198
 0.02375733 0.00326035]
              precision    recall  f1-score   support

         0.0       0.59      1.00      0.75        19
         1.0       0.00      0.00      0.00        13

    accuracy                           0.59        32
   macro avg       0.30      0.50      0.37        32
weighted avg       0.35      0.59      0.44        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 0.]
VAL SCORES [0.05608106 0.9935993  0.9034647  ... 0.02691231 0.99297464 0.9790157 ]
              precision    recall  f1-score   support

         0.0       0.96      0.78      0.86      2683
         1.0       0.82      0.97      0.89      2717

    accuracy                           0.87      5400
   macro avg       0.89      0.87      0.87      5400
weighted avg       0.89      0.87      0.87      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 0.]
VAL SCORES [0.06841291 0.9496227  0.3479801  ... 0.04268228 0.8713654  0.74363375]
              precision    recall  f1-score   support

         0.0       0.91      0.88      0.89      2683
         1.0       0.88      0.91      0.90      2717

    accuracy                           0.89      5400
   macro avg       0.89      0.89      0.89      5400
weighted avg       0.89      0.89      0.89      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 0.]
VAL SCORES [3.1236487e-03 9.9939287e-01 5.1978034e-01 ... 6.1662967e-04 9.9835473e-01
 9.9429202e-01]
              precision    recall  f1-score   support

         0.0       0.94      0.85      0.89      2683
         1.0       0.86      0.95      0.90      2717

    accuracy                           0.90      5400
   macro avg       0.90      0.90      0.90      5400
weighted avg       0.90      0.90      0.90      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 0.]
VAL SCORES [0.06365676 0.9218515  0.44699255 ... 0.01578456 0.91030645 0.7736015 ]
              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90      2683
         1.0       0.88      0.92      0.90      2717

    accuracy                           0.90      5400
   macro avg       0.90      0.90      0.90      5400
weighted avg       0.90      0.90      0.90      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 0.]
VAL SCORES [0.00334391 0.9995566  0.4471434  ... 0.00337005 0.99877197 0.9924521 ]
              precision    recall  f1-score   support

         0.0       0.94      0.85      0.89      2683
         1.0       0.86      0.95      0.90      2717

    accuracy                           0.90      5400
   macro avg       0.90      0.90      0.90      5400
weighted avg       0.90      0.90      0.90      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 0.]
VAL SCORES [0.0282746  0.89796406 0.34565726 ... 0.01819437 0.78825736 0.7749668 ]
              precision    recall  f1-score   support

         0.0       0.90      0.90      0.90      2683
         1.0       0.90      0.90      0.90      2717

    accuracy                           0.90      5400
   macro avg       0.90      0.90      0.90      5400
weighted avg       0.90      0.90      0.90      5400



  f"DataModule.{name} has already been called, so it will not be called again. "
