<a href="https://colab.research.google.com/github/taolati20011/INT3405/blob/main/BERTtrain_multilingual_cased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

INSTALL DEPENDENCES

In [15]:
!pip install pytorch-lightning
!pip install transformers
!pip install sentencepiece
!pip install fairseq



In [16]:
# Mount to drive
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Replace this path to your dataset directory
DATA_ROOT_DIR = "/content/drive/MyDrive/shopee-sentiment"
!ls $DATA_ROOT_DIR

sample_submission.csv  train.csv	     train_preprocess_unsegment.csv
test.csv	       train.gsheet
test_preprocess.csv    train_preprocess.csv


In [18]:
# Import some dependences
import pandas as pd
import numpy as np
from torch.utils.data import random_split, DataLoader, Dataset
import pytorch_lightning as pl
import torch.nn as nn
import torch
from typing import Optional

train_ratio = 0.8
DATA_DIR = DATA_ROOT_DIR + '/train_preprocess_unsegment.csv'

In [19]:
# Use pandas to read csv, this will return a excel like table data
train = pd.read_csv(DATA_DIR, index_col=0)
train.head()

Unnamed: 0,id,text,class,preprocess_text
0,dee6dfc5,"Đến quán 2 lần thôi , rất là thích !\nQuán tuy...",1,đến quán 2 lần thôi rất là thích quán tuy nằm ...
1,800813f5,Đến quán vào tối chủ_nhật . Có band hát . Khá ...,0,đến quán vào tối chủ nhật có band hát khá ổn t...
2,6553e47f,Phục_vụ lâu quá mặc_dù khách rất vắng .\nĐợi g...,0,phục vụ lâu quá mặc dù khách rất vắng đợi gần ...
3,b45a1ff1,"Ko gian bé_tí , quán chật_chội , đông người nê...",0,ko gian bé tí quán chật chội đông người nên ...
4,f92694b0,"Khi mình order , đặt bánh thì nhận được sự tiế...",1,khi mình order đặt bánh thì nhận được sự tiếp ...


In [20]:
class SentimentData(Dataset):
  def __init__(self, data_dir):
    """
    Args:
    data_dir (string): Directory with the csv file
    """
    self.df = pd.read_csv(data_dir, index_col = 0)
  
  def __len__(self):
    """
    length of the dataset, i.e. number of rows in the csv file
    Returns: int 
    """
    return len(self.df)
  
  def __getitem__(self, idx):
    """
    given a row index, returns the corresponding row of the csv file
    Returns: text (string), label (int) 
    """
    text = self.df["preprocess_text"][idx]
    label = self.df["class"][idx]

    return text, label

class SentimentDataModule(pl.LightningDataModule):
    """
    Module class for sentiment analysis. this class is used to load the data to the model. 
    It is a subclass of LightningDataModule. 
    """
    def __init__(self, data_dir: str = DATA_DIR, batch_size: int = 16):
        """
        Args:
            data_dir (string): Directory with the csv file
            batch_size (int): batch size for dataloader
        """
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

    def setup(self, stage: Optional[str] = None):
        """
        Loads the data to the model. 
        the data is loaded in the setup function, so that it is loaded only once. 
        """
        data_full = SentimentData(self.data_dir)
        train_size = round(len(data_full) * train_ratio)
        val_size = len(data_full) - train_size
        print(len(data_full), train_size, val_size)
        self.data_train, self.data_val = random_split(data_full, [train_size, val_size])

    def train_dataloader(self):
        """
        Returns: dataloader for training
        """
        return DataLoader(self.data_train, batch_size=self.batch_size)

    def val_dataloader(self):
        """
        Returns: dataloader for validation
        """
        return DataLoader(self.data_val, batch_size=self.batch_size)

# Do some Test with data
if __name__ == "__main__":
	dm = SentimentDataModule(DATA_DIR)
	dm.setup()
	idx = 0
	for item in (dm.train_dataloader()):
		print(idx)
		print(item)
		idx += 1
		if idx > 5: break

27000 21600 5400
0
[('chả hiểu nhân viên làm ăn kiểu gì mua 5 miếng mực mang về về đến nơi nở ra cháy thui đầy mỡ cầm diã xiên vào còn k đc thế mà cũng lấy tiền rồi phục vụ đc', 'mình không biết nên dùng quán hay nhà hàng vì không gian không quá rộng nhưng trông rất sang trọng và sạch sẽ ấn tượng đầu tiên từ sự lịch sự của nhân viên tại đây món chính ở đây là piza mang một hương vị rất riêng không giống với các loại bánh piza mình vẫn ăn ở sài gòn nhân viên cũng rất chiều lòng khách khi lịch sự hỏi mình muốn dùng loại bánh đế dày hay mỏng đặc biệt còn cho phép mình chọn nhân bánh theo sở thích nhưng vì không rành lắm nên mình chỉ chọn loại bánh king mark   mình đi 2 người nhưng ăn không hết vì bánh khá to thức uống ở đây nói chung là khá tốt đặc biệt có món nước cóc nhìn rất đẹp mắt và ngon trước đây mình không nghĩ cóc có thể ép lấy nước được nói chung ở bmt không tìm được nhiều nơi họp mặt ăn tối như thế này', 'không gian quán trang trí đẹp tươi mát nước thì không ngon lắm có dịch vụ

## **MODEL**

In [21]:
from fairseq.data import Dictionary
import sentencepiece as spm
from os.path import join as pjoin
from transformers import PreTrainedTokenizer
import sentencepiece as spm

class XLMRobertaTokenizer(PreTrainedTokenizer):
    """
    XLM-RoBERTa tokenizer adapted from transformers.PreTrainedTokenizer. This helps to convert the input text into 
    tokenized format. eg, 
    
    input: "Hello, how are you?" output: ["1", "2", "3", "65", "2", "1"]
    
    this class also provides the method to convert the tokenized format into the original text.
     
    eg, input: ["1", "2", "3", "65", "2", "1"] output: "Hello, how are you?"
    
    """
    def __init__(
            self,
            pretrained_file,
            bos_token="<s>",
            eos_token="</s>",
            sep_token="</s>",
            cls_token="<s>",
            unk_token="<unk>",
            pad_token="<pad>",
            mask_token="<mask>",
            **kwargs
    ):
        """
        :param pretrained_file: path to the pretrained model file
        :param bos_token: beginning of sentence token
        :param eos_token: end of sentence token
        :param sep_token: separation token
        :param cls_token: classification token
        :param unk_token: unknown token
        :param pad_token: padding token
        :param mask_token: mask token
        """
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
        
        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(
            sentencepiece_model)  # please dont use anything from sp_model bcz it makes everything goes wrong
        self.bpe_dict = Dictionary().load(vocab_file)
        # Mimic fairseq token-to-id alignment for the first 4 token
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 0
        self.fairseq_tokens_to_ids["<mask>"] = len(self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    def _tokenize(self, text):
        """ Tokenize a string. """
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        """ Size of the base vocabulary (without the added tokens) """
        return len(self.bpe_dict) + self.fairseq_offset + 1  # Add the <mask> token

    def get_vocab(self):
        """ Returns the vocabulary as a list of tokens. """
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab
    

In [22]:
from transformers import XLMRobertaConfig, XLMRobertaForSequenceClassification
import torch

pretrained_path = '/content/drive/MyDrive/envibert/'
!ls $pretrained_path
multi_model = 'bert-base-multilingual-cased'
# load tokenizer
roberta = XLMRobertaForSequenceClassification.from_pretrained(multi_model)
tokenizer = XLMRobertaTokenizer(pretrained_path)

config.json  dict.txt  model.pt  pytorch_model.bin  sentencepiece.bpe.model


You are using a model of type bert to instantiate a model of type xlm-roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing XLMRobertaForSequenceClassification: ['bert.encoder.layer.2.attention.self.value.bias', 'bert.encoder.layer.4.attention.self.query.bias', 'bert.encoder.layer.3.attention.self.query.bias', 'bert.encoder.layer.3.attention.output.dense.weight', 'bert.encoder.layer.2.attention.output.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.weight', 'cls.seq_relationship.bias', 'bert.encoder.layer.8.attention.output.dense.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.weight', 'bert.encoder.layer.6.attention.self.value.weight', 'cls.predictions.bias', 'bert.encoder.layer.1.attention.self.query.bias', 'bert.encoder.layer.9.attention.self.key.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.att

In [23]:
# try to convert some text into numbers
inputs = ["Tôi ghét nó", "Tôi thích nó", "Tôi quý nó"]
inputs = tokenizer(inputs, return_tensors='pt')
print(inputs)
outputs = roberta(**inputs, labels=torch.tensor([0, 1, 1]))
print(outputs)

{'input_ids': tensor([[ 842, 8919,  543],
        [ 842,  648,  543],
        [ 842,  976,  543]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])}
SequenceClassifierOutput(loss=tensor(0.6560, grad_fn=<NllLossBackward0>), logits=tensor([[0.1081, 0.1682],
        [0.1700, 0.2207],
        [0.1125, 0.3616]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [24]:
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score


class SentimentRoberta(pl.LightningModule):
    """
    SentimentRoberta class inherits from LightningModule
    This class is used to train a model using PyTorch Lightning
    It overrides the following methods:
        - forward : forward pass of the model
        - training_step : training step of the model
        - validation_step : validation step of the model
        - validation_epoch_end : end of the validation epoch
        - configure_optimizers : configure optimizers
    """
    def __init__(self, lr_roberta, lr_classifier):
        """
        Initialize the model with the following parameters:
            - lr_roberta : learning rate of the roberta model
            - lr_classifier : learning rate of the classifier model
        """
        super().__init__()
        self.roberta = XLMRobertaForSequenceClassification.from_pretrained(pretrained_path)
        self.tokenizer = XLMRobertaTokenizer(pretrained_path)
        self.lr_roberta = lr_roberta
        self.lr_classifer = lr_classifier

    def forward(self, texts, labels=None):
        """
        Forward pass of the model
        Args:
            - texts : input texts
            - labels : labels of the input texts
        """
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        for key in inputs:
            inputs[key] = inputs[key].to(self.device)

        outputs = self.roberta(**inputs, labels=labels)
        return outputs

    def configure_optimizers(self):
        """
        Configure optimizers
        This method is used to configure the optimizers of the model by using the learning rate
        for specific parameter of the roberta model and the classifier model
        """
        roberta_params = self.roberta.roberta.named_parameters()
        classifier_params = self.roberta.classifier.named_parameters()

        grouped_params = [
            {"params": [p for n, p in roberta_params], "lr": self.lr_roberta},
            {"params": [p for n, p in classifier_params], "lr": self.lr_classifer}
        ]
        optimizer = torch.optim.AdamW(
            grouped_params
        )
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.98)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'f1/val',
            }
        }

    def training_step(self, batch, batch_idx):
        """
        Training step of the model
        Args:
            - batch : batch of the data
            - batch_idx : index of the batch
        """
        texts, labels = batch
        outputs = self(texts, labels=labels)

        if len(outputs.values()) == 3:
            loss, logits, _ = outputs.values()
        else:
            loss, logits = outputs.values()
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Validation step of the model, used to compute the metrics
        Args:
            - batch : batch of the data
            - batch_idx : index of the batch
        """
        texts, labels = batch
        outputs = self(texts, labels=labels)

        if len(outputs.values()) == 3:
            loss, logits, _ = outputs.values()
        else:
            loss, logits = outputs.values()

        output_scores = torch.softmax(logits, dim=-1)
        return loss, output_scores, labels

    def validation_epoch_end(self, validation_step_outputs):
        """
        End of the validation epoch, this method will be called at the end of the validation epoch,
        it will compute the multiple metrics of classification problem
        Args:
            - validation_step_outputs : outputs of the validation step
        """

        val_preds = torch.tensor([], device=self.device)
        val_scores = torch.tensor([], device=self.device)
        val_labels = torch.tensor([], device=self.device)
        val_loss = 0
        total_item = 0

        for idx, item in enumerate(validation_step_outputs):
            loss, output_scores, labels = item

            predictions = torch.argmax(output_scores, dim=-1)
            val_preds = torch.cat((val_preds, predictions), dim=0)
            val_scores = torch.cat((val_scores, output_scores[:, 1]), dim=0)
            val_labels = torch.cat((val_labels, labels), dim=0)

            val_loss += loss
            total_item += 1

        # print("VAL PREDS", val_preds.shape)
        # print("VAL SCORES", val_scores.shape)
        # print("VAL LABELS", val_labels.shape)
        val_preds = val_preds.cpu().numpy()
        val_scores = val_scores.cpu().numpy()
        val_labels = val_labels.cpu().numpy()

        reports = classification_report(val_labels, val_preds, output_dict=True)
        print("VAL LABELS", val_labels)
        print("VAL SCORES", val_scores)
        try:
            auc = roc_auc_score(val_labels, val_scores)
        except Exception as e:
            print(e)
            print("Cannot calculate AUC. Default to 0")
            auc = 0
        accuracy = accuracy_score(val_labels, val_preds)

        print(classification_report(val_labels, val_preds))

        self.log("loss/val", val_loss)
        self.log("auc/val", auc)
        self.log("accuracy/val", accuracy)
        self.log("precision/val", reports["weighted avg"]["precision"])
        self.log("recall/val", reports["weighted avg"]["recall"])
        self.log("f1/val", reports["weighted avg"]["f1-score"])

In [25]:
trainer = pl.Trainer(
    fast_dev_run=True,
)
model = SentimentRoberta(lr_roberta=1e-5, lr_classifier=3e-3)
dm = SentimentDataModule()

trainer.fit(model, dm)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  "GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`."
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).
Some weights of the model checkpoint at /content/drive/MyDrive/envibert/ were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificatio

27000 21600 5400


  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1.]
VAL SCORES [0.24849708 0.19594167 0.29711363 0.27237555 0.42527446 0.2254733
 0.47977757 0.16361475 0.26737562 0.554814   0.67098767 0.39491248
 0.3205925  0.4494153  0.38901308 0.5723379 ]
              precision    recall  f1-score   support

         0.0       0.77      1.00      0.87        10
         1.0       1.00      0.50      0.67         6

    accuracy                           0.81        16
   macro avg       0.88      0.75      0.77        16
weighted avg       0.86      0.81      0.79        16



## **Training**

In [26]:
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

torch.manual_seed(123)

tb_logger = pl_loggers.TensorBoardLogger('/content/drive/MyDrive/colab/tb_logs/')

trainer = pl.Trainer(
    min_epochs=1,
    max_epochs=5,
    gpus=1,
    precision=16,
    val_check_interval=0.5,
    # check_val_every_n_epoch=1,
    callbacks=[
      ModelCheckpoint(
          dirpath='/content/drive/MyDrive/ckpt',
          save_top_k=3,
          monitor='f1/val',
      ), 
      EarlyStopping('f1/val', patience=5)
    ],
    fast_dev_run=False,
    logger=tb_logger
)

dm.setup(stage="fit")
trainer.fit(model, dm)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                                | Params
----------------------------------------------------------------
0 | roberta | XLMRobertaForSequenceClassification | 70.7 M
----------------------------------------------------------------
70.7 M    Trainable params
0         Non-trainable params
70.7 M    Total params
141.409   Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0.
 1. 1. 0. 1. 0. 1. 0. 1.]
VAL SCORES [0.2486254  0.19620943 0.29752097 0.27212125 0.4254262  0.22584318
 0.4795589  0.16371268 0.26759952 0.55468154 0.67033863 0.39487165
 0.3209277  0.44936997 0.38998207 0.57340974 0.39717826 0.33928013
 0.31462938 0.6766733  0.43231803 0.64220816 0.25818568 0.41507664
 0.28572062 0.34064966 0.20850107 0.4358565  0.2666436  0.6581979
 0.4210756  0.38300648]
              precision    recall  f1-score   support

         0.0       0.58      1.00      0.73        15
         1.0       1.00      0.35      0.52        17

    accuracy                           0.66        32
   macro avg       0.79      0.68      0.63        32
weighted avg       0.80      0.66      0.62        32



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 1.]
VAL SCORES [0.07613248 0.6695288  0.0141451  ... 0.01308339 0.99256825 0.99298817]
              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90      2699
         1.0       0.88      0.93      0.90      2701

    accuracy                           0.90      5400
   macro avg       0.90      0.90      0.90      5400
weighted avg       0.90      0.90      0.90      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 1.]
VAL SCORES [0.08858347 0.8327813  0.00602695 ... 0.00477372 0.99722075 0.9960699 ]
              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90      2699
         1.0       0.89      0.93      0.91      2701

    accuracy                           0.90      5400
   macro avg       0.91      0.90      0.90      5400
weighted avg       0.91      0.90      0.90      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 1.]
VAL SCORES [0.0297883  0.81390494 0.00150705 ... 0.0025809  0.99758446 0.9971551 ]
              precision    recall  f1-score   support

         0.0       0.92      0.89      0.91      2699
         1.0       0.89      0.93      0.91      2701

    accuracy                           0.91      5400
   macro avg       0.91      0.91      0.91      5400
weighted avg       0.91      0.91      0.91      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 1.]
VAL SCORES [0.09347093 0.9725715  0.00369285 ... 0.0021998  0.99566    0.9953904 ]
              precision    recall  f1-score   support

         0.0       0.94      0.87      0.90      2699
         1.0       0.88      0.94      0.91      2701

    accuracy                           0.91      5400
   macro avg       0.91      0.90      0.90      5400
weighted avg       0.91      0.91      0.90      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 1.]
VAL SCORES [3.4979649e-02 9.2434698e-01 7.5836002e-04 ... 3.4496887e-03 9.9726379e-01
 9.9735802e-01]
              precision    recall  f1-score   support

         0.0       0.94      0.87      0.90      2699
         1.0       0.88      0.94      0.91      2701

    accuracy                           0.91      5400
   macro avg       0.91      0.91      0.91      5400
weighted avg       0.91      0.91      0.91      5400



Validating: 0it [00:00, ?it/s]

VAL LABELS [0. 1. 0. ... 0. 1. 1.]
VAL SCORES [0.01973801 0.93189764 0.00495331 ... 0.00385436 0.9969187  0.99603146]
              precision    recall  f1-score   support

         0.0       0.92      0.90      0.91      2699
         1.0       0.90      0.92      0.91      2701

    accuracy                           0.91      5400
   macro avg       0.91      0.91      0.91      5400
weighted avg       0.91      0.91      0.91      5400



  f"DataModule.{name} has already been called, so it will not be called again. "
