<a href="https://colab.research.google.com/github/tienhuynh96/Aspect-Based-Sentiment-Analysis-Project/blob/main/Demo_%5Bcolab_1%5D_Aspect_Based_Term_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Data preparation

## 1.1 Download and load dataset

In [None]:
# Upgrade gdown => for download
!pip install --upgrade gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 5.1.0
    Uninstalling gdown-5.1.0:
      Successfully uninstalled gdown-5.1.0
Successfully installed gdown-5.2.0


In [None]:
# Download the dataset: SemEval-2014 Task 4 (Aspect Based Sentiment Analysis).
# The data has already been preprocessed: punctuation removed, text normalized,
# and sentences separated into tokens on whitespace.
!gdown 1d7JABk4jViI-USjLsWmhGkvzi8uQIL5C

Downloading...
From: https://drive.google.com/uc?id=1d7JABk4jViI-USjLsWmhGkvzi8uQIL5C
To: /content/data.zip
  0% 0.00/151k [00:00<?, ?B/s]100% 151k/151k [00:00<00:00, 83.3MB/s]


In [None]:
# Unzip dataset
!unzip ./data.zip

Archive:  ./data.zip
   creating: data/
  inflating: __MACOSX/._data         
  inflating: data/restaurants_train.csv  
  inflating: __MACOSX/data/._restaurants_train.csv  
  inflating: data/restaurants_test.csv  
  inflating: __MACOSX/data/._restaurants_test.csv  


In [None]:
# Load dataset
import pandas as pd

# Load train dataset
train_df = pd.read_csv('./data/restaurants_train.csv')
# Load test dataset
test_df = pd.read_csv('./data/restaurants_test.csv')

In [None]:
# Show a sample dataset
train_df.iloc[0]

Tokens        ['But', 'the', 'staff', 'was', 'so', 'horrible...
Tags                                [0, 0, 1, 0, 0, 0, 0, 0, 0]
Polarities                  [-1, -1, 0, -1, -1, -1, -1, -1, -1]
Name: 0, dtype: object

## 1.2 Tokenization

In [None]:
# Tokenizer for sub-word
# We try sub word in this project
from transformers import BertTokenizer

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [None]:
tokenizer.cls_token_id

101

In [None]:
tokenizer.sep_token_id

102

## 1.3 Build Dataset

In [None]:
import torch
# Customize data to train
from torch.utils.data import Dataset

# Define a custom Dataset class for Aspect Based Sentiment Analysis (ABSA)
class ABSADataset(Dataset):
    """Sub-word level dataset for Aspect Based Sentiment Analysis (ABSA).

    The first three columns of ``df`` are read as tokens, tags and polarities.
    Each cell is expected to hold the string form of a Python list, e.g.
    ``"['the', 'staff']"`` for tokens and ``"[0, 1]"`` for tags/polarities.

    Args:
        df: DataFrame holding one sample per row.
        tokenizer: Object exposing ``tokenize(str)`` (returns a list of
            sub-word strings) and ``convert_tokens_to_ids(list)``.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    @staticmethod
    def _parse_list(raw, strip_quotes=False):
        """Turn the serialized list stored in a cell into a real Python list.

        ``ast.literal_eval`` is tried first so that elements containing ", "
        or quote characters survive intact. If the cell is not a valid Python
        literal, the original character-based parsing is used as a fallback.
        """
        import ast  # local import: keeps this class self-contained in the notebook

        if isinstance(raw, (list, tuple)):
            return list(raw)
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return list(parsed)

        # Legacy fallback: strip the brackets and split on ", "
        text = str(raw)
        if strip_quotes:
            text = text.replace("'", "")
        return text.strip("][").split(", ")

    def __getitem__(self, idx):
        """Return ``(bert_tokens, ids_tensor, tags_tensor, pols_tensor)`` for row ``idx``.

        Every word is split into sub-words by the tokenizer; the word's tag and
        polarity are repeated for each of its sub-words so that all returned
        sequences have the same length.

        Raises:
            ValueError: if tokens, tags and polarities of the row differ in length.
        """
        raw_tokens, raw_tags, raw_pols = self.df.iloc[idx, :3].values

        tokens = [str(tok) for tok in self._parse_list(raw_tokens, strip_quotes=True)]
        tags = [int(tag) for tag in self._parse_list(raw_tags)]
        pols = [int(pol) for pol in self._parse_list(raw_pols)]

        if not (len(tokens) == len(tags) == len(pols)):
            raise ValueError(
                f"Row {idx}: got {len(tokens)} tokens, {len(tags)} tags and "
                f"{len(pols)} polarities; they must have the same length"
            )

        bert_tokens = []
        bert_tags = []
        bert_pols = []
        for word, tag, pol in zip(tokens, tags, pols):
            # One word may become several sub-words
            pieces = self.tokenizer.tokenize(word)
            bert_tokens += pieces
            # Repeat the word-level labels once per sub-word
            bert_tags += [tag] * len(pieces)
            bert_pols += [pol] * len(pieces)

        # Convert sub-words to vocabulary ids
        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        # Explicit dtype so that an empty sample still yields integer tensors
        ids_tensor = torch.tensor(bert_ids, dtype=torch.long)
        tags_tensor = torch.tensor(bert_tags, dtype=torch.long)
        pols_tensor = torch.tensor(bert_pols, dtype=torch.long)

        return bert_tokens, ids_tensor, tags_tensor, pols_tensor

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

In [None]:
# Build dataset
train_ds = ABSADataset(train_df, tokenizer)
test_ds = ABSADataset(test_df, tokenizer)

In [None]:
next(iter(train_ds))

(['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us'],
 tensor([2021, 1996, 3095, 2001, 2061, 9202, 2000, 2149]),
 tensor([0, 0, 1, 0, 0, 0, 0, 0]),
 tensor([-1, -1,  0, -1, -1, -1, -1, -1]))

## 1.4 Dataloader

In [None]:
# Padding
# pad_sequence (from torch.nn.utils.rnn) pads every sequence to the length of the longest one in the batch
from torch.nn.utils.rnn import pad_sequence

def padding(samples):
    """Collate a list of dataset items into padded batch tensors.

    Each sample is indexed as ``sample[1]`` (token ids), ``sample[2]`` (tags)
    and ``sample[3]`` (polarities). All three are zero-padded to the longest
    sequence in ``samples``.

    Returns:
        Tuple ``(ids_tensors, tags_tensors, pols_tensors, masks_tensors)`` where
        the mask is a long tensor holding 1 wherever the token id is non-zero.
    """
    def _pad_field(position):
        # Gather the same field from every sample and pad it batch-first
        return pad_sequence([sample[position] for sample in samples], batch_first=True)

    ids_tensors = _pad_field(1)
    tags_tensors = _pad_field(2)
    pols_tensors = _pad_field(3)

    # Attention mask for BERT: 1 for non-zero token ids, 0 elsewhere
    masks_tensors = (ids_tensors != 0).to(torch.long)

    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [None]:
# Build DataLoader
from torch.utils.data import DataLoader

batch_size = 32
# Training batches are reshuffled every epoch
train_loader = DataLoader(
    train_ds, batch_size=batch_size, shuffle=True, collate_fn=padding
)
# Evaluation keeps a fixed order: batch composition (and therefore padding)
# stays the same across runs, so the reported loss/accuracy are reproducible
test_loader = DataLoader(
    test_ds, batch_size=batch_size, shuffle=False, collate_fn=padding
)

In [None]:
next(iter(train_loader))

(tensor([[ 1996,  2326,  2001,  ...,     0,     0,     0],
         [ 2096,  1996,  4707,  ...,     0,     0,     0],
         [ 2151,  2065,  2017,  ...,     0,     0,     0],
         ...,
         [ 1996, 17688,  8694,  ...,     0,     0,     0],
         [ 2305,  2302,  1037,  ...,     0,     0,     0],
         [ 2093,  5352,  3601,  ...,     0,     0,     0]]),
 tensor([[0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 1, 2,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0]]),
 tensor([[-1,  2, -1,  ...,  0,  0,  0],
         [-1, -1, -1,  ...,  0,  0,  0],
         [-1, -1, -1,  ...,  0,  0,  0],
         ...,
         [-1, -1, -1,  ...,  0,  0,  0],
         [-1, -1, -1,  ...,  0,  0,  0],
         [-1, -1, -1,  ...,  0,  0,  0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ...

In [None]:
next(iter(train_loader))[0].shape

torch.Size([32, 38])

# 2. Model

In [None]:
from transformers import BertModel

# Build model using Bert pretrain model.
class ABTEBert(torch.nn.Module):
    """Token classifier for aspect term extraction on top of a pretrained BERT.

    A single linear layer maps every token's BERT hidden state to
    ``num_labels`` logits.

    Args:
        model_name: Name or path passed to ``BertModel.from_pretrained``.
        num_labels: Number of tag classes (defaults to 3, i.e. tags 0, 1, 2).
    """

    def __init__(self, model_name, num_labels=3):
        super(ABTEBert, self).__init__()
        self.num_labels = num_labels
        # Pretrained BERT encoder
        self.bert = BertModel.from_pretrained(model_name)
        # Per-token classification head: hidden_size -> num_labels
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)
        # Cross entropy over the tag classes
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, masks_tensors=None, tags_tensors=None):
        """Compute per-token logits and, when tags are given, the loss.

        Args:
            ids_tensors: Token ids passed to BERT as ``input_ids``.
            masks_tensors: Attention mask passed to BERT (may be ``None``).
            tags_tensors: Gold tags; when ``None`` no loss is computed.

        Returns:
            ``(loss, logits)`` if ``tags_tensors`` is given, otherwise ``logits``.
        """
        # return_dict=False makes BERT return a tuple instead of a dict-like object
        bert_outputs = self.bert(
            input_ids=ids_tensors, attention_mask=masks_tensors, return_dict=False
        )
        # First tuple element: the per-token hidden states
        sequence_output = bert_outputs[0]

        linear_outputs = self.linear(sequence_output)

        if tags_tensors is None:
            return linear_outputs

        # Flatten to (tokens, classes) vs (tokens,); the class count is read
        # from the logits so the head size is not hard-coded here
        flat_logits = linear_outputs.reshape(-1, linear_outputs.size(-1))
        flat_tags = tags_tensors.reshape(-1)
        loss = self.loss_fn(flat_logits, flat_tags)
        return loss, linear_outputs

In [None]:
# Call model
model = ABTEBert(model_name)

In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Put model to device
model.to(device)

ABTEBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

# 3. Training

## 3.1 Build training function

In [None]:
import time
import numpy as np
from sklearn.metrics import classification_report

# Define train function
def train_epoch(model, optimizer, train_loader, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(ids_tensors=..., masks_tensors=...,
            tags_tensors=...)`` and returning ``(loss, outputs)``.
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Iterable of ``(ids, tags, pols, masks)`` batches.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses, or ``nan`` if the loader yields no batch.
    """
    # Make sure dropout etc. are in training mode, whatever mode the model was
    # left in by a previous evaluation or prediction
    model.train()

    losses = []
    for ids_tensors, tags_tensors, _, masks_tensors in train_loader:
        # Move the batch to the target device (polarities are unused here)
        ids_tensors = ids_tensors.to(device)
        tags_tensors = tags_tensors.to(device)
        masks_tensors = masks_tensors.to(device)

        # Clear gradients first so nothing stale leaks into this step
        optimizer.zero_grad()

        loss, _ = model(
            ids_tensors=ids_tensors,
            masks_tensors=masks_tensors,
            tags_tensors=tags_tensors
        )

        # Backward pass and weight update
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    if not losses:
        return float("nan")
    return sum(losses) / len(losses)

# Define evaluate function
def evaluate_epoch(model, valid_loader, device):
    """Evaluate ``model`` on ``valid_loader`` without updating weights.

    The model is switched to evaluation mode for the duration of the call
    (dropout disabled) and its previous mode is restored afterwards.

    Accuracy is computed over real tokens only: positions whose attention mask
    is 0 (padding) are excluded. The loss is whatever the model returns.

    Args:
        model: Module called as ``model(ids_tensors=..., masks_tensors=...,
            tags_tensors=...)`` and returning ``(loss, outputs)``.
        valid_loader: Iterable of ``(ids, tags, pols, masks)`` batches.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``; each is ``nan`` if there is nothing to average.
    """
    losses = []
    correct, total = 0, 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for ids_tensors, tags_tensors, _, masks_tensors in valid_loader:
                ids_tensors = ids_tensors.to(device)
                tags_tensors = tags_tensors.to(device)
                masks_tensors = masks_tensors.to(device)

                loss, outputs = model(
                    ids_tensors=ids_tensors,
                    masks_tensors=masks_tensors,
                    tags_tensors=tags_tensors
                )
                losses.append(loss.item())

                # Predicted class = index of the largest logit along the last axis
                predictions = outputs.argmax(dim=2)
                # Only count positions that hold a real token
                valid = masks_tensors.bool()
                correct += ((predictions == tags_tensors) & valid).sum().item()
                total += valid.sum().item()
    finally:
        # Hand the model back in the mode the caller left it in
        model.train(was_training)

    mean_loss = sum(losses) / len(losses) if losses else float("nan")
    acc = correct / total if total else float("nan")
    return mean_loss, acc

# Define train function
def train(model, model_name, save_model, optimizer, train_loader, valid_loader, num_epochs, device):
    """Train for ``num_epochs`` epochs and return the best model with its metrics.

    After every epoch the model is evaluated on ``valid_loader``; whenever the
    validation loss improves on the best value seen so far, the weights are
    saved to ``save_model + '/' + model_name + '.pt'``. At the end the best
    checkpoint written during this call is loaded back into ``model``.

    Args:
        model: Model to train.
        model_name: Used as the checkpoint file name (without extension).
        save_model: Directory the checkpoint is written to; it must exist.
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Batches used for training.
        valid_loader: Batches used for evaluation.
        num_epochs: Number of epochs to run.
        device: Device used for training and evaluation.

    Returns:
        ``(model, metrics)`` where ``metrics`` holds the per-epoch lists
        ``train_loss``, ``valid_accuracy``, ``valid_loss`` and ``time``.
    """
    train_losses = []
    eval_accs, eval_losses = [], []
    times = []

    checkpoint_path = save_model + f'/{model_name}.pt'
    best_loss_eval = float("inf")
    # Tracks whether THIS call wrote a checkpoint, so a stale file is never loaded
    checkpoint_saved = False

    for epoch in range(1, num_epochs+1):
        epoch_start_time = time.time()

        # Training
        train_loss = train_epoch(model, optimizer, train_loader, device)
        train_losses.append(train_loss)

        # Evaluation
        eval_loss, eval_acc = evaluate_epoch(model, valid_loader, device)
        eval_accs.append(eval_acc)
        eval_losses.append(eval_loss)

        # Save only when the validation loss improves, and remember the new best
        # (without this update every epoch would overwrite the checkpoint)
        if eval_loss < best_loss_eval:
            best_loss_eval = eval_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        elapsed = time.time() - epoch_start_time
        times.append(elapsed)

        # Print loss, acc at the end of the epoch
        print("-" * 59)
        print(
            "| End of epoch {:3d} | Time: {:5.2f}s | Train Loss {:8.3f} "
            "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
                epoch, elapsed, train_loss, eval_acc, eval_loss
            )
        )
        print("-" * 59)

    # Restore the best weights (parameters and buffers) if any were saved
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    # Leave the returned model in evaluation mode
    model.eval()

    metrics = {
        'train_loss': train_losses,
        'valid_accuracy': eval_accs,
        'valid_loss': eval_losses,
        'time': times
    }
    return model, metrics

In [None]:
# Set optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
# Check train_epoch
loss = train_epoch(model, optimizer, train_loader, device)
loss

0.2623182161983135

In [None]:
# Check evaluate_epoch
loss, acc = evaluate_epoch(model, test_loader, device)
loss, acc

(0.22837289316313608, 0.9127831495727324)

## 3.2 Training

In [None]:
# Create folder model
!mkdir "./model"

In [None]:
# Folder where the best checkpoint is written
save_model = "./model"
# Number of passes over the training data
num_epochs = 5

# Fresh model, moved to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ABTEBert(model_name)
model.to(device)

# Optimizer over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Train; the test split is used as the validation set
best_model, metrics = train(
    model=model,
    model_name=model_name,
    save_model=save_model,
    optimizer=optimizer,
    train_loader=train_loader,
    valid_loader=test_loader,
    num_epochs=num_epochs,
    device=device,
)

-----------------------------------------------------------
| End of epoch   1 | Time: 40.76s | Train Loss    0.261 | Valid Accuracy    0.908 | Valid Loss    0.236 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   2 | Time: 36.48s | Train Loss    0.167 | Valid Accuracy    0.912 | Valid Loss    0.219 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   3 | Time: 37.09s | Train Loss    0.119 | Valid Accuracy    0.918 | Valid Loss    0.226 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   4 | Time: 39.61s | Train Loss    0.077 | Valid Accuracy    0.915 | Valid Loss    0.266 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch   5 | Time: 36.83s | Trai

# 4. Prediction

In [None]:
# Define predict function
def predict(best_model, sentence, device, tokenizer_override=None):
    """Predict a tag for every sub-word of ``sentence``.

    Args:
        best_model: Model called as ``best_model(input_ids, None, None)`` and
            returning per-token logits.
        sentence: Raw text to tag.
        device: Device the input tensor is moved to.
        tokenizer_override: Optional tokenizer to use instead of the
            notebook-level ``tokenizer``.

    Returns:
        ``(word_pieces, predictions, outputs)``: the sub-word tokens, the
        predicted class index per sub-word, and the raw model output.

    Raises:
        ValueError: if ``sentence`` yields no tokens.
    """
    active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override

    # Tokenize the sentence into sub-words
    word_pieces = list(active_tokenizer.tokenize(sentence))
    if not word_pieces:
        raise ValueError("sentence produced no tokens; nothing to predict")

    # Convert sub-words to ids and build a batch of size 1 on the target device
    input_ids = active_tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)

    # Use the model that was passed in (not a notebook global), with dropout
    # disabled; its previous mode is restored afterwards
    was_training = best_model.training
    best_model.eval()
    try:
        with torch.no_grad():
            # No mask and no tags: the model returns only the logits
            outputs = best_model(input_tensor, None, None)
            _, predictions = torch.max(outputs, dim=2)
    finally:
        best_model.train(was_training)

    # Predictions of the single sentence in the batch, as a plain list
    predictions = predictions[0].tolist()
    return word_pieces, predictions, outputs

In [None]:
sentence = " ".join(test_df.iloc[0]["Tokens"].replace("'", "").strip("][").split(', '))
predict(best_model, sentence, device)

(['the', 'bread', 'is', 'top', 'notch', 'as', 'well'],
 [0, 1, 0, 0, 0, 0, 0],
 tensor([[[ 6.4141, -2.6148, -2.8782],
          [-1.9732,  3.9786, -2.0171],
          [ 5.6106, -2.2266, -2.3923],
          [ 5.5387, -2.0726, -2.5704],
          [ 5.6542, -2.3159, -2.3395],
          [ 5.4828, -2.1373, -2.5462],
          [ 5.3447, -2.1985, -2.5673]]], device='cuda:0'))