<a href="https://colab.research.google.com/github/tienhuynh96/Aspect-Based-Sentiment-Analysis-Project/blob/main/Demo_%5Bcolab_2%5D_Aspect_Based_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Data preparation

## 1.1 Download and load dataset

In [None]:
# Download the dataset archive from Google Drive (saved as data.zip in the working directory)
# The dataset is SemEval-2014 Task 4: Aspect Based Sentiment Analysis
# It has already been preprocessed: punctuation removed, text normalized and split on whitespace
!gdown 1d7JABk4jViI-USjLsWmhGkvzi8uQIL5C

Downloading...
From: https://drive.google.com/uc?id=1d7JABk4jViI-USjLsWmhGkvzi8uQIL5C
To: /content/data.zip
  0% 0.00/151k [00:00<?, ?B/s]100% 151k/151k [00:00<00:00, 45.1MB/s]


In [None]:
# Unzip dataset
!unzip ./data.zip

Archive:  ./data.zip
   creating: data/
  inflating: __MACOSX/._data         
  inflating: data/restaurants_train.csv  
  inflating: __MACOSX/data/._restaurants_train.csv  
  inflating: data/restaurants_test.csv  
  inflating: __MACOSX/data/._restaurants_test.csv  


In [None]:
# Load dataset
import pandas as pd

# Load the training split; each row holds the columns Tokens, Tags and Polarities,
# stored as the string form of a Python list (see the sample shown below)
train_df = pd.read_csv('./data/restaurants_train.csv')
# Load the test split (same layout as the training split)
test_df = pd.read_csv('./data/restaurants_test.csv')

In [None]:
# Show the first row of the training set to inspect the raw column format
train_df.iloc[0]

Tokens        ['But', 'the', 'staff', 'was', 'so', 'horrible...
Tags                                [0, 0, 1, 0, 0, 0, 0, 0, 0]
Polarities                  [-1, -1, 0, -1, -1, -1, -1, -1, -1]
Name: 0, dtype: object

## 1.2 Tokenization

In [None]:
# Sub-word tokenizer
# This project feeds BERT sub-word (WordPiece) tokens rather than whole words
from transformers import BertTokenizer

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# List the special tokens known to the tokenizer
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [None]:
# Vocabulary id of the [CLS] token
tokenizer.cls_token_id

101

In [None]:
# Vocabulary id of the [SEP] token
tokenizer.sep_token_id

102

## 1.3 Build Dataset

In [None]:
import torch
# Customize data to train
from torch.utils.data import Dataset

# Define a custom Dataset class for Aspect Based Sentiment Analysis (ABSA)
class ABSADataset(Dataset):
    """Dataset that turns ABSA dataframe rows into BERT sentence/aspect inputs.

    The first three columns of every row are read as Tokens, Tags and
    Polarities, each stored as the string form of a Python list, e.g.
    "['But', 'the', 'staff']" and "[-1, -1, 0]". A polarity of -1 marks a
    word that is not part of the aspect; any other value is the sentiment
    label of the aspect word.

    Args:
        df: dataframe holding the samples.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (a BERT tokenizer in this notebook).
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    @staticmethod
    def _parse_list(raw):
        """Parse the string form of a list stored in a dataframe cell.

        ``ast.literal_eval`` is tried first so that elements containing
        apostrophes or ", " survive intact. If the cell is not a valid Python
        literal, the original strip-and-split parsing is used as a fallback.
        """
        # Local import keeps the class self-contained inside its notebook cell
        import ast

        if not isinstance(raw, str):
            return list(raw)
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        # Legacy parsing: drop quotes, strip the brackets and split on ", "
        return raw.replace("'", "").strip("][").split(', ')

    def __getitem__(self, idx):
        """Build the model inputs for row ``idx``.

        Returns:
            A tuple ``(bert_tokens, ids_tensor, segment_tensor, pols_tensor)``:
            the sub-word tokens laid out as [CLS] + sentence + [SEP] + aspect,
            their vocabulary ids, the segment ids (0 for [CLS], the sentence
            and [SEP]; 1 for the aspect) and the polarity label as a scalar tensor.
        """
        # Tags are unpacked to document the column layout but are not needed here
        tokens, _tags, pols = self.df.iloc[idx, :3].values
        tokens = self._parse_list(tokens)
        pols = self._parse_list(pols)

        # Sub-word tokens of the whole sentence
        bert_tokens = []
        # Sub-word tokens of the aspect term only
        bert_att = []
        # Polarity label; stays 0 when the row contains no aspect word
        pols_label = 0

        for i, token in enumerate(tokens):
            # Split the word into sub-word tokens
            sub_tokens = self.tokenizer.tokenize(str(token))
            bert_tokens += sub_tokens

            polarity = int(pols[i])
            # Any polarity other than -1 marks an aspect word
            if polarity != -1:
                bert_att += sub_tokens
                # The last aspect word seen determines the label
                pols_label = polarity

        # Segment ids: 0 for [CLS] + sentence + [SEP], 1 for the aspect tokens
        segment_ids = [0] * (len(bert_tokens) + 2) + [1] * len(bert_att)
        # Final layout: [CLS] + sentence + [SEP] + aspect
        bert_tokens = ['[CLS]'] + bert_tokens + ['[SEP]'] + bert_att

        # Map sub-word tokens to vocabulary ids
        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        ids_tensor = torch.tensor(bert_ids)
        pols_tensor = torch.tensor(pols_label)
        segment_tensor = torch.tensor(segment_ids)

        return bert_tokens, ids_tensor, segment_tensor, pols_tensor

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.df)

In [None]:
# Build the datasets: wrap each dataframe together with the shared tokenizer
train_ds = ABSADataset(train_df, tokenizer)
test_ds = ABSADataset(test_df, tokenizer)

In [None]:
# Check a sample: fetch the first item to inspect the tokens, ids, segment ids and label
next(iter(train_ds))

(['[CLS]',
  'but',
  'the',
  'staff',
  'was',
  'so',
  'horrible',
  'to',
  'us',
  '[SEP]',
  'staff'],
 tensor([ 101, 2021, 1996, 3095, 2001, 2061, 9202, 2000, 2149,  102, 3095]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 tensor(0))

## 1.4 Dataloader

In [None]:
# Padding
# pad_sequence (from torch.nn.utils.rnn) zero-pads every sample to the length of the longest sample in the batch
from torch.nn.utils.rnn import pad_sequence

def padding(samples):
    """Collate dataset items into zero-padded batch tensors.

    Each item is indexed as (tokens, ids, segment ids, label); the token
    strings at index 0 are not used.

    Returns:
        (ids_tensors, segments_tensors, masks_tensors, label_ids), where the
        mask holds 1 wherever the padded id is non-zero and 0 elsewhere.
    """
    # Token ids (index 1), padded to the longest sequence in the batch
    batch_ids = pad_sequence([item[1] for item in samples], batch_first=True)

    # Segment ids (index 2), padded the same way
    batch_segments = pad_sequence([item[2] for item in samples], batch_first=True)

    # Scalar labels (index 3) stacked into a single 1-D tensor
    batch_labels = torch.stack([item[3] for item in samples])

    # Attention mask for BERT: 1 for real tokens, 0 for padding (id 0)
    batch_masks = (batch_ids != 0).to(torch.long)

    return batch_ids, batch_segments, batch_masks, batch_labels

In [None]:
# Build DataLoader
from torch.utils.data import DataLoader

# batch_size = 2  #for check
batch_size = 32

# Training batches are reshuffled every epoch
train_loader = DataLoader(
    train_ds, batch_size=batch_size, shuffle=True, collate_fn=padding
)
# Evaluation data is not shuffled: the evaluation loss is averaged per batch,
# so a fixed batch composition keeps the reported metrics reproducible
test_loader = DataLoader(
    test_ds, batch_size=batch_size, shuffle=False, collate_fn=padding
)

In [None]:
# Inspect one collated batch: (token ids, segment ids, attention masks, labels)
next(iter(train_loader))

(tensor([[  101,  1045,  2052,  ...,     0,     0,     0],
         [  101,  3621,  2682,  ...,     0,     0,     0],
         [  101,  2256, 15610,  ...,     0,     0,     0],
         ...,
         [  101,  1996,  2158,  ...,     0,     0,     0],
         [  101,  2049,  2025,  ...,     0,     0,     0],
         [  101,  2151,  2065,  ...,     0,     0,     0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 1, 2, 0, 1, 0, 2, 0, 2, 2, 0, 2, 2, 2,
         2, 0, 0, 2, 0, 1, 0, 2]))

In [None]:
# Shape of the token-id tensor of one batch: (batch size, padded sequence length)
next(iter(train_loader))[0].shape

torch.Size([32, 72])

# 2. Model

In [None]:
from transformers import BertModel

# Build model using Bert pretrain model.
class ABSABert(torch.nn.Module):
    """BERT encoder followed by a linear layer that predicts aspect polarity.

    Args:
        model_name: name or path of the pretrained BERT checkpoint to load.
    """

    def __init__(self, model_name):
        super().__init__()
        # Pretrained BERT encoder
        self.bert = BertModel.from_pretrained(model_name)
        # Classification head: hidden size -> 3 polarity classes (0, 1, 2)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 3)
        # Cross entropy over the 3 classes
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, masks_tensors, segments_tensors, lable_tensors=None):
        """Run the encoder and the classification head.

        Args:
            ids_tensors: token ids, passed to BERT as ``input_ids``.
            masks_tensors: attention mask, passed as ``attention_mask``.
            segments_tensors: segment ids, passed as ``token_type_ids``.
            lable_tensors: optional class labels. Defaults to None so that
                inference callers do not need to pass a placeholder.

        Returns:
            ``(loss, logits)`` when labels are given, otherwise ``logits`` only.
        """
        out_dict = self.bert(
            input_ids=ids_tensors,
            attention_mask=masks_tensors,
            # segment tensor
            token_type_ids=segments_tensors
        )
        # Classify from the "pooler_output" entry of the BERT output
        linear_outputs = self.linear(out_dict['pooler_output'])

        # Without labels there is nothing to compute a loss against
        if lable_tensors is None:
            return linear_outputs

        loss = self.loss_fn(linear_outputs, lable_tensors)
        return loss, linear_outputs

In [None]:
# Instantiate the model (loads the pretrained BERT weights for model_name)
model = ABSABert(model_name)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Set device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Move the model parameters to the selected device
model.to(device)

ABSABert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

# 3. Training

## 3.1 Build training function

In [None]:
import time
import numpy as np

# Define train function
def train_epoch(model, optimizer, train_loader, device):
    """Train the model for one pass over ``train_loader``.

    Args:
        model: module called with keyword arguments ``ids_tensors``,
            ``masks_tensors``, ``segments_tensors`` and ``lable_tensors``,
            returning ``(loss, outputs)``.
        optimizer: optimizer updating the model parameters.
        train_loader: iterable of batches
            ``(ids_tensors, segments_tensors, masks_tensors, label_ids)``.
        device: device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses as a float.
    """
    # Make sure dropout is active even if the model was left in eval mode
    # (for example by a previous call to train(), which returns it in eval mode)
    model.train()

    losses = []
    for ids_tensors, segments_tensors, masks_tensors, label_ids in train_loader:
        # Put data on the device
        ids_tensors = ids_tensors.to(device)
        segments_tensors = segments_tensors.to(device)
        masks_tensors = masks_tensors.to(device)
        label_ids = label_ids.to(device)

        # Clear gradients first so stale ones can never leak into this step
        optimizer.zero_grad()

        # Forward pass; the outputs are not needed during training
        loss, _ = model(
            ids_tensors=ids_tensors,
            masks_tensors=masks_tensors,
            segments_tensors=segments_tensors,
            lable_tensors=label_ids
        )

        # Save the loss
        losses.append(loss.item())
        # Backward pass and weight update
        loss.backward()
        optimizer.step()

    # Return average loss
    return sum(losses)/len(losses)

# Define evaluate function
def evaluate_epoch(model, valid_loader, device):
    """Evaluate the model on ``valid_loader`` without updating weights.

    The model is switched to evaluation mode for the duration of the call, so
    dropout does not perturb the metrics, and is put back into its previous
    mode afterwards.

    Args:
        model: module called with keyword arguments ``ids_tensors``,
            ``masks_tensors``, ``segments_tensors`` and ``lable_tensors``,
            returning ``(loss, outputs)``.
        valid_loader: iterable of batches
            ``(ids_tensors, segments_tensors, masks_tensors, label_ids)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean of the per-batch losses, accuracy)``.
    """
    losses = []
    preds, labels = [], []

    # Remember the current mode so the caller's state is left untouched
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation
        with torch.no_grad():
            for ids_tensors, segments_tensors, masks_tensors, label_ids in valid_loader:
                # Put data on the device
                ids_tensors = ids_tensors.to(device)
                segments_tensors = segments_tensors.to(device)
                masks_tensors = masks_tensors.to(device)
                label_ids = label_ids.to(device)

                loss, outputs = model(
                    ids_tensors=ids_tensors,
                    masks_tensors=masks_tensors,
                    segments_tensors=segments_tensors,
                    lable_tensors=label_ids
                )

                # Save the loss
                losses.append(loss.item())

                # Predicted class = index of the largest score along dim 1
                preds.extend(outputs.argmax(dim=1).tolist())
                labels.extend(label_ids.tolist())
    finally:
        model.train(was_training)

    # Element-wise comparison, then the mean gives the accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
    # Return average loss and accuracy
    return sum(losses)/len(losses), acc

# Define train function
def train(model, model_name, save_model, optimizer, train_loader, valid_loader, num_epochs, device):
    """Train for ``num_epochs`` epochs and keep the checkpoint with the lowest validation loss.

    Args:
        model: model to train.
        model_name: used as the checkpoint file name (``<model_name>.pt``).
        save_model: directory the checkpoint is written to.
        optimizer: optimizer updating the model parameters.
        train_loader: batches used for training.
        valid_loader: batches used for evaluation after every epoch.
        num_epochs: number of epochs to run.
        device: device used for training and for reloading the checkpoint.

    Returns:
        ``(model, metrics)``: the model in eval mode with the best checkpoint
        loaded (when one was written), and a dict with the per-epoch lists
        'train_loss', 'valid_accuracy', 'valid_loss' and 'time'.
    """
    # Local import keeps the function self-contained inside its notebook cell
    import os

    train_losses = []
    eval_accs, eval_losses = [], []
    # Any finite validation loss beats the initial value
    best_loss_eval = float('inf')
    times = []

    checkpoint_path = save_model + f'/{model_name}.pt'
    # Create the target directory (and any sub-directory implied by model_name)
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    checkpoint_saved = False

    for epoch in range(1, num_epochs+1):
        epoch_start_time = time.time()
        # Training
        train_loss = train_epoch(model, optimizer, train_loader, device)
        train_losses.append(train_loss)

        # Evaluation
        eval_loss, eval_acc = evaluate_epoch(model, valid_loader, device)
        eval_accs.append(eval_acc)
        eval_losses.append(eval_loss)

        # Save only when the validation loss improves, and remember the new
        # best value so later, worse epochs cannot overwrite the checkpoint
        if eval_loss < best_loss_eval:
            best_loss_eval = eval_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        epoch_time = time.time() - epoch_start_time
        times.append(epoch_time)
        # Print loss and accuracy at the end of the epoch
        print("-" * 59)
        print(
            "| End of epoch {:3d} | Time: {:5.2f}s | Train Loss {:8.3f} "
            "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
                epoch, epoch_time, train_loss, eval_acc, eval_loss
            )
        )
        print("-" * 59)

    # Reload the best parameters; skipped when this run wrote no checkpoint,
    # so a stale file from an earlier run is never loaded by mistake
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    # Leave the model in evaluation mode for inference
    model.eval()
    metrics = {
        'train_loss': train_losses,
        'valid_accuracy': eval_accs,
        'valid_loss': eval_losses,
        'time': times
    }
    return model, metrics

In [None]:
# Set optimizer: AdamW over all model parameters with learning rate 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
# Sanity-check train_epoch: run a single training epoch and show the average loss
loss = train_epoch(model, optimizer, train_loader, device)
loss

0.8637604280910661

In [None]:
# Sanity-check evaluate_epoch: evaluate on the test loader and show (average loss, accuracy)
loss, acc = evaluate_epoch(model, test_loader, device)
loss, acc

(0.5851292737892696, 0.7837354781054513)

## 3.2 Training

In [None]:
# Create folder model
!mkdir "./model"

In [None]:
# Directory where the best checkpoint is written
save_model = "./model"
# Start from a fresh model so the sanity-check epoch run above does not carry over
model = ABSABert(model_name)
# Put model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set optimizer (bound to the parameters of the fresh model)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Set number of epochs
num_epochs = 5
# Train model
# NOTE(review): test_loader is passed as the validation loader, so the checkpoint
# is selected on the test split — consider holding out a separate validation set
best_model, metrics = train(
    model, model_name, save_model, optimizer, train_loader, test_loader, num_epochs, device
)

-----------------------------------------------------------
| End of epoch   1 | Time: 46.77s | Train Loss    0.829 | Valid Accuracy    0.748 | Valid Loss    0.644 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   2 | Time: 39.83s | Train Loss    0.539 | Valid Accuracy    0.801 | Valid Loss    0.507 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   3 | Time: 39.13s | Train Loss    0.353 | Valid Accuracy    0.814 | Valid Loss    0.469 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   4 | Time: 40.16s | Train Loss    0.218 | Valid Accuracy    0.777 | Valid Loss    0.623 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   5 | Time: 39.52s | Trai

# 4. Prediction

In [None]:
# Show the first row of the test set; it is used as the prediction example below
test_df.iloc[0]

Tokens        ['The', 'bread', 'is', 'top', 'notch', 'as', '...
Tags                                   [0, 1, 0, 0, 0, 0, 0, 0]
Polarities                      [-1, 2, -1, -1, -1, -1, -1, -1]
Name: 0, dtype: object

In [None]:
# Define predict function
# Input include: sentence and aspect
def predict(model, tokenizer, sentence, aspect, device):
    """Predict the polarity class of ``aspect`` within ``sentence``.

    The model is switched to evaluation mode for the forward pass, so dropout
    cannot make the prediction non-deterministic, and is put back into its
    previous mode afterwards.

    Args:
        model: torch module called as ``model(ids, mask, segments, None)``
            and returning the class scores.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        sentence: sentence text.
        aspect: aspect term to classify.
        device: device the input tensors are moved to.

    Returns:
        ``(word_pieces, predicted class index, raw model outputs)``.
    """
    # Tokenize the sentence and the aspect into sub-word tokens
    t1 = tokenizer.tokenize(sentence)
    t2 = tokenizer.tokenize(aspect)

    # Model input layout: [CLS] + sentence + [SEP] + aspect
    word_pieces = ['[CLS]'] + t1 + ['[SEP]'] + t2

    # Segment ids: 0 for [CLS] + sentence + [SEP], 1 for the aspect tokens
    segment_ids = [0] * (len(t1) + 2) + [1] * len(t2)

    # Convert tokens to ids and build batch-of-one tensors on the device
    input_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([input_ids]).to(device)
    segment_tensor = torch.tensor([segment_ids]).to(device)
    # A single unpadded sequence: every position is a real token
    mask_tensor = torch.ones_like(input_tensor)

    # Remember the current mode so the caller's state is left untouched
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_tensor, mask_tensor, segment_tensor, None)
            # Index of the highest score is the predicted class
            _, predictions = torch.max(outputs, dim=1)
    finally:
        model.train(was_training)

    return word_pieces, int(predictions), outputs

In [None]:
# Check: rebuild the plain sentence from the stringified token list of the first test row
" ".join(test_df.iloc[0]["Tokens"].replace("'", "").strip("][").split(', '))

'The bread is top notch as well'

In [None]:
# Check with a sample
# Rebuild the input sentence from the stringified token list of the first test row
sentence = " ".join(test_df.iloc[0]["Tokens"].replace("'", "").strip("][").split(', '))
# Aspect term to classify (the word tagged as the aspect in that row)
aspect = "bread"
# Predict; returns (word pieces, predicted class index, raw model outputs)
predict(best_model, tokenizer, sentence, aspect, device)

(['[CLS]',
  'the',
  'bread',
  'is',
  'top',
  'notch',
  'as',
  'well',
  '[SEP]',
  'bread'],
 2,
 tensor([[-1.9732, -1.9526,  3.3590]], device='cuda:0'))