# Finetuning SmSA
SmSA is a sentiment analysis dataset with three possible labels: `positive`, `negative`, and `neutral`.

In [1]:
# Add the IndoNLU checkout stored on Google Drive to the module search path.
# Presumably this is what lets the later `utils.*` imports resolve — TODO confirm.
# NOTE(review): this cell sits before the Drive mount cell; appending a path that
# does not exist yet is harmless, but the imports only work once Drive is mounted.
import sys
sys.path.append('/content/drive/MyDrive/test_teknikal_kecilin_syalwadea/indonlu/')

In [2]:
# Mount Google Drive at /content/drive; the repository, dataset and model paths
# used throughout this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os, sys
# NOTE(review): both statements are relative to the current working directory, so
# re-running this cell moves one more level up each time. Run it once per session.
# Relative output paths written later (e.g. 'pred.txt') land in the directory chosen here.
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn
from utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [4]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is handed, in order, to Python's `random`, NumPy's global
    RNG, PyTorch's CPU generator and PyTorch's CUDA generator.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def count_param(module, trainable=False):
    """Count the scalar parameters held by a module.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When true, only parameters with ``requires_grad`` set are
            counted; otherwise every parameter is counted.

    Returns:
        Total number of elements across the selected parameters.
    """
    n_elements = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        n_elements += param.numel()
    return n_elements

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts with an 'lr' key.

    Returns:
        The 'lr' value of the first group, or ``None`` when there are no groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metric_dict):
    """Render a metrics mapping as a single space-separated line.

    Each entry becomes ``name:value`` with the value formatted to two decimals,
    in the mapping's iteration order.

    Args:
        metric_dict: Mapping from metric name to a numeric value.

    Returns:
        The formatted string (empty when the mapping is empty).
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [5]:
# Set random seed: one fixed value is pushed to Python `random`, NumPy and torch
# (CPU + CUDA) via set_seed before the model is created and the data is shuffled.
set_seed(26092020)

# Load Model

In [6]:
# Tokenizer, config and weights all come from the same pretrained checkpoint.
checkpoint_name = 'indobenchmark/indobert-base-p1'

# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
config = BertConfig.from_pretrained(checkpoint_name)
# Size the classification head to the number of labels declared by the dataset class.
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained(checkpoint_name, config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Display the module tree of the freshly loaded classifier.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
# Total parameter count; `trainable` is left at its default, so every parameter is counted.
count_param(model)

124443651

# Prepare Dataset

In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed, comma-separated dataset from Drive
file_path = '/content/drive/MyDrive/test_teknikal_kecilin_syalwadea/400exhuma_preprocessed_data.csv'
df = pd.read_csv(file_path, sep=',')

# Split the dataset into train, test, and validation sets (80% / 10% / 10%):
# 20% is held out first, then halved. Fixed random_state keeps the split repeatable.
train_df, test_valid_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, valid_df = train_test_split(test_valid_df, test_size=0.5, random_state=42)

# Save the splits as TSV files.
# to_csv does not create missing directories, so create the target folder first;
# on a fresh runtime /content/sentiment does not exist yet.
os.makedirs('/content/sentiment', exist_ok=True)
# NOTE(review): the dataset-path cell reads train/valid/test.tsv from the
# `smsa_doc-sentiment-prosa` folder on Drive, not from /content/sentiment —
# confirm how these files are meant to reach that folder.
# NOTE(review): the files are written with a header row; verify that
# DocumentSentimentDataset expects one.
train_df.to_csv('/content/sentiment/train.tsv', sep='\t', index=False)
test_df.to_csv('/content/sentiment/test.tsv', sep='\t', index=False)
valid_df.to_csv('/content/sentiment/valid.tsv', sep='\t', index=False)

In [9]:
# The three splits sit side by side in one folder of the IndoNLU checkout on Drive.
dataset_dir = '/content/drive/MyDrive/test_teknikal_kecilin_syalwadea/indonlu/dataset/smsa_doc-sentiment-prosa'
train_dataset_path = f'{dataset_dir}/train.tsv'
valid_dataset_path = f'{dataset_dir}/valid.tsv'
test_dataset_path = f'{dataset_dir}/test.tsv'

In [10]:
# One dataset object per split, all tokenized with the same tokenizer and lowercased.
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# Settings shared by every loader; only the training loader shuffles.
# NOTE(review): num_workers=16 may exceed the CPU count of the runtime — confirm.
loader_kwargs = dict(max_seq_len=512, batch_size=32, num_workers=16)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)



In [11]:
# Label <-> index lookup tables defined on the dataset class.
# i2w is used below to turn predicted class indices back into label names.
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


# Test model on sample sentences

In [12]:
# Classify one sentence with the not-yet-fine-tuned model (baseline check).
text = 'film exhuma bagus'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no_grad avoids building an autograd graph for this forward pass.
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: film exhuma bagus | Label : neutral (37.124%)


In [13]:
# Classify one sentence with the not-yet-fine-tuned model (baseline check).
text = 'film exhuma jelek'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no_grad avoids building an autograd graph for this forward pass.
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: film exhuma jelek | Label : negative (36.605%)


In [14]:
# Classify one sentence with the not-yet-fine-tuned model (baseline check).
text = 'mau nonton exhuma plis review'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no_grad avoids building an autograd graph for this forward pass.
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: mau nonton exhuma plis review | Label : negative (40.409%)


# Fine Tuning & Evaluation

In [15]:
import shutil

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint and, when flagged, mirror it as the best model.

    Args:
        state: Object handed to ``torch.save``.
        is_best: When truthy, the file just written is also copied to
            ``best_model_path``.
        checkpoint_path: Destination file for the checkpoint.
        best_model_path: Destination file for the best-model copy.
    """
    # Always persist the latest state first.
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best model is a plain file copy of the checkpoint written above.
    shutil.copyfile(checkpoint_path, best_model_path)

In [16]:
# Shell escape: show the GPU attached to this runtime and its current memory usage.
!nvidia-smi

Wed Mar 20 16:59:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [17]:
import torch
# Sanity check before the cells below, which call .cuda() and pass device='cuda'.
torch.cuda.is_available()

True

In [18]:
# Move the model to the GPU first, then build the optimizer from the parameters
# that will actually be trained; this is the ordering the torch.optim docs recommend.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [19]:
# Train
# Each epoch runs one optimisation pass over train_loader followed by one
# evaluation pass over valid_loader, printing loss and metrics for both.
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    # Re-enable autograd explicitly in case an earlier cell switched it off globally.
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the forward function)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Accumulate predictions and references for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Running mean of the loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric; average over the number of batches rather than
    # relying on the loop index surviving the loop.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/len(train_loader), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    # Scope the grad-free region to the validation pass instead of flipping the
    # global grad mode, so no hidden state leaks into later cells.
    with torch.no_grad():
        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            total_loss += loss.item()

            # Calculate evaluation metrics on everything seen so far; recomputed
            # every batch only to keep the progress-bar text up to date.
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/len(valid_loader), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:1.0897 LR:0.00000300: 100%|██████████| 10/10 [00:05<00:00,  1.69it/s]


(Epoch 1) TRAIN LOSS:1.0897 ACC:0.42 F1:0.41 REC:0.42 PRE:0.42 LR:0.00000300


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
VALID LOSS:0.9239 ACC:0.62 F1:0.26 REC:0.33 PRE:0.21: 100%|██████████| 2/2 [00:01<00:00,  1.70it/s]
  _warn_prf(average, modifier, msg_start, len(result))


(Epoch 1) VALID LOSS:0.9239 ACC:0.62 F1:0.26 REC:0.33 PRE:0.21


(Epoch 2) TRAIN LOSS:1.0278 LR:0.00000300: 100%|██████████| 10/10 [00:03<00:00,  2.66it/s]
  _warn_prf(average, modifier, msg_start, len(result))


(Epoch 2) TRAIN LOSS:1.0278 ACC:0.48 F1:0.31 REC:0.39 PRE:0.38 LR:0.00000300


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
VALID LOSS:0.8098 ACC:0.62 F1:0.26 REC:0.33 PRE:0.21: 100%|██████████| 2/2 [00:00<00:00,  2.58it/s]
  _warn_prf(average, modifier, msg_start, len(result))


(Epoch 2) VALID LOSS:0.8098 ACC:0.62 F1:0.26 REC:0.33 PRE:0.21


(Epoch 3) TRAIN LOSS:0.9620 LR:0.00000300: 100%|██████████| 10/10 [00:03<00:00,  2.98it/s]


(Epoch 3) TRAIN LOSS:0.9620 ACC:0.53 F1:0.40 REC:0.44 PRE:0.71 LR:0.00000300


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
VALID LOSS:0.8110 ACC:0.62 F1:0.36 REC:0.39 PRE:0.38: 100%|██████████| 2/2 [00:00<00:00,  2.58it/s]
  _warn_prf(average, modifier, msg_start, len(result))


(Epoch 3) VALID LOSS:0.8110 ACC:0.62 F1:0.36 REC:0.39 PRE:0.38


(Epoch 4) TRAIN LOSS:0.9192 LR:0.00000300: 100%|██████████| 10/10 [00:03<00:00,  2.82it/s]


(Epoch 4) TRAIN LOSS:0.9192 ACC:0.59 F1:0.49 REC:0.53 PRE:0.65 LR:0.00000300


VALID LOSS:0.8015 ACC:0.65 F1:0.52 REC:0.50 PRE:0.59: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s]


(Epoch 4) VALID LOSS:0.8015 ACC:0.65 F1:0.52 REC:0.50 PRE:0.59


(Epoch 5) TRAIN LOSS:0.8684 LR:0.00000300: 100%|██████████| 10/10 [00:03<00:00,  2.55it/s]


(Epoch 5) TRAIN LOSS:0.8684 ACC:0.63 F1:0.56 REC:0.58 PRE:0.65 LR:0.00000300


VALID LOSS:0.7125 ACC:0.68 F1:0.54 REC:0.51 PRE:0.63: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s]


(Epoch 5) VALID LOSS:0.7125 ACC:0.68 F1:0.54 REC:0.51 PRE:0.63


(Epoch 6) TRAIN LOSS:0.7988 LR:0.00000300: 100%|██████████| 10/10 [00:03<00:00,  2.53it/s]


(Epoch 6) TRAIN LOSS:0.7988 ACC:0.69 F1:0.64 REC:0.64 PRE:0.71 LR:0.00000300


VALID LOSS:0.6605 ACC:0.75 F1:0.67 REC:0.66 PRE:0.70: 100%|██████████| 2/2 [00:01<00:00,  1.28it/s]


(Epoch 6) VALID LOSS:0.6605 ACC:0.75 F1:0.67 REC:0.66 PRE:0.70


(Epoch 7) TRAIN LOSS:0.7279 LR:0.00000300: 100%|██████████| 10/10 [00:04<00:00,  2.24it/s]


(Epoch 7) TRAIN LOSS:0.7279 ACC:0.74 F1:0.72 REC:0.71 PRE:0.75 LR:0.00000300


VALID LOSS:0.5589 ACC:0.78 F1:0.69 REC:0.67 PRE:0.75: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]


(Epoch 7) VALID LOSS:0.5589 ACC:0.78 F1:0.69 REC:0.67 PRE:0.75


(Epoch 8) TRAIN LOSS:0.6617 LR:0.00000300: 100%|██████████| 10/10 [00:04<00:00,  2.22it/s]


(Epoch 8) TRAIN LOSS:0.6617 ACC:0.80 F1:0.77 REC:0.76 PRE:0.80 LR:0.00000300


VALID LOSS:0.4883 ACC:0.80 F1:0.72 REC:0.72 PRE:0.77: 100%|██████████| 2/2 [00:01<00:00,  1.56it/s]


(Epoch 8) VALID LOSS:0.4883 ACC:0.80 F1:0.72 REC:0.72 PRE:0.77


(Epoch 9) TRAIN LOSS:0.5762 LR:0.00000300: 100%|██████████| 10/10 [00:04<00:00,  2.32it/s]


(Epoch 9) TRAIN LOSS:0.5762 ACC:0.84 F1:0.83 REC:0.82 PRE:0.84 LR:0.00000300


VALID LOSS:0.4282 ACC:0.80 F1:0.74 REC:0.75 PRE:0.77: 100%|██████████| 2/2 [00:01<00:00,  1.17it/s]


(Epoch 9) VALID LOSS:0.4282 ACC:0.80 F1:0.74 REC:0.75 PRE:0.77


(Epoch 10) TRAIN LOSS:0.5044 LR:0.00000300: 100%|██████████| 10/10 [00:04<00:00,  2.28it/s]


(Epoch 10) TRAIN LOSS:0.5044 ACC:0.85 F1:0.83 REC:0.82 PRE:0.85 LR:0.00000300


VALID LOSS:0.3897 ACC:0.82 F1:0.78 REC:0.79 PRE:0.79: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s]

(Epoch 10) VALID LOSS:0.3897 ACC:0.82 F1:0.78 REC:0.79 PRE:0.79





In [20]:
# Evaluate on test
# Only predictions are collected; the loss and labels returned by the forward
# function are discarded.
model.eval()

list_hyp = []

# Grad-free region scoped to the prediction loop rather than switching autograd
# off globally for the rest of the session.
with torch.no_grad():
    for batch_data in tqdm(test_loader, leave=True, total=len(test_loader)):
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction
# reset_index() turns the row position into an explicit 'index' column in the file.
# 'pred.txt' is relative to the current working directory.
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

100%|██████████| 2/2 [00:00<00:00,  2.41it/s]

    index     label
0       0  positive
1       1  negative
2       2  positive
3       3   neutral
4       4   neutral
5       5  negative
6       6   neutral
7       7   neutral
8       8  negative
9       9  positive
10     10  positive
11     11   neutral
12     12   neutral
13     13  positive
14     14  positive
15     15  positive
16     16  positive
17     17  negative
18     18  positive
19     19   neutral
20     20  positive
21     21  positive
22     22   neutral
23     23   neutral
24     24  positive
25     25  negative
26     26   neutral
27     27  negative
28     28   neutral
29     29  positive
30     30  positive
31     31   neutral
32     32  positive
33     33  negative
34     34  positive
35     35  positive
36     36   neutral
37     37   neutral
38     38  negative
39     39  positive





In [24]:
# Persist the fine-tuned model to Drive. This pickles the whole module object
# (not just a state_dict); the "run model" cell loads it back with torch.load.
# NOTE(review): a fully pickled module presumably needs the same model class to be
# importable at load time — saving model.state_dict() is the more portable option.
torch.save(model, '/content/drive/MyDrive/test_teknikal_kecilin_syalwadea/indonlu/indonlu_model.pth')

# Test fine-tuned model on sample sentences

In [25]:
# Classify one sentence with the fine-tuned model.
# NOTE(review): the output stored under this cell shows a different input sentence
# ('exhuma bagus tapi jelek'), so it appears to come from an earlier version of
# the cell — re-run to refresh it.
text = 'film exhuma bagus'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: an explicit no_grad keeps this cell independent of whatever
# global grad mode earlier cells left behind.
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: exhuma bagus tapi jelek | Label : positive (84.873%)


In [22]:
# Classify one sentence with the fine-tuned model.
text = 'film exhuma jelek'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: an explicit no_grad keeps this cell independent of whatever
# global grad mode earlier cells left behind.
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: film exhuma jelek | Label : negative (61.890%)


In [23]:
# Classify one sentence with the fine-tuned model.
text = 'mau nonton exhuma plis review'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: an explicit no_grad keeps this cell independent of whatever
# global grad mode earlier cells left behind.
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: mau nonton exhuma plis review | Label : neutral (82.813%)


# Run Model

In [26]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
# NOTE(review): `config` is built here but not passed to anything in this cell;
# the model restored below comes entirely from the saved file.
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model: restore the fine-tuned module saved earlier, mapped onto the CPU.
loc = torch.device('cpu')
# The file holds a fully pickled module, so it needs weights_only=False; newer
# PyTorch releases default to weights_only=True and would refuse to load it.
# Unpickling can execute arbitrary code — only load files you created yourself.
model = torch.load('/content/drive/MyDrive/test_teknikal_kecilin_syalwadea/indonlu/indonlu_model.pth', map_location=loc, weights_only=False)
# Make sure dropout is disabled for inference regardless of the mode the model was saved in.
model = model.eval()

In [27]:
# Classify one sentence with the model restored from disk.
text = 'mau nonton exhuma plis review'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: an explicit no_grad keeps this cell independent of whatever
# global grad mode earlier cells left behind.
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: mau nonton exhuma plis review | Label : neutral (82.813%)
