# Import Library

In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertModel, BertConfig

import pandas as pd
import numpy as np

from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset and saved
# models can be accessed by path under /content/MyDrive.
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


# Load Dataset

In [None]:
# Location of the labelled sentiment dataset (CSV) inside the mounted Drive.
file_path = "/content/MyDrive/MyDrive/research-dataset.csv"

In [None]:
# Read the CSV into a DataFrame and display it; the columns used downstream
# are 'Sentence' (text) and 'Sentiment' (label).
df = pd.read_csv(file_path)
df

Unnamed: 0,Sentence,Sentiment
0,benaran ngomong begitu nih,NETRAL
1,mbg ini masih jalan sih perkembangannya gimana ya,NETRAL
2,semoga ini menjadi awal yang baik untuk peruba...,POSITIF
3,itu pinjaman ya nanti baliknya ada lebih sedik...,NEGATIF
4,biasanya habis omon omon ada yang klarifikasi,NEGATIF
...,...,...
1241,semoga ini jadi awal yang baik,POSITIF
1242,kami optimis indonesia akan lebih baik di bawa...,POSITIF
1243,ini contoh nyata pemimpin yang peduli dengan r...,POSITIF
1244,kerja keras bapak sangat kami apresiasi,POSITIF


In [None]:
# Remove stray leading/trailing whitespace from the labels so that variants
# such as 'POSITIF ' collapse into a single class. The vectorised .str
# accessor leaves missing values as NaN instead of raising AttributeError
# the way a per-element .strip() call does.
df['Sentiment'] = df['Sentiment'].str.strip()

# Class distribution after normalisation.
df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
NEGATIF,424
NETRAL,422
POSITIF,400


In [None]:
# Lower-case every sentence. The vectorised .str accessor replaces the manual
# append loop (no leftover temporary list in the notebook namespace) and
# keeps missing values as NaN instead of failing on them.
df['Sentence'] = df['Sentence'].str.lower()
df

Unnamed: 0,Sentence,Sentiment
0,benaran ngomong begitu nih,NETRAL
1,mbg ini masih jalan sih perkembangannya gimana ya,NETRAL
2,semoga ini menjadi awal yang baik untuk peruba...,POSITIF
3,itu pinjaman ya nanti baliknya ada lebih sedik...,NEGATIF
4,biasanya habis omon omon ada yang klarifikasi,NEGATIF
...,...,...
1241,semoga ini jadi awal yang baik,POSITIF
1242,kami optimis indonesia akan lebih baik di bawa...,POSITIF
1243,ini contoh nyata pemimpin yang peduli dengan r...,POSITIF
1244,kerja keras bapak sangat kami apresiasi,POSITIF


# Configuration

In [None]:
# Checkpoint shared by the tokenizer and the encoder, named once so the two
# can never drift apart.
PRETRAINED_NAME = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
bert_model = BertModel.from_pretrained(PRETRAINED_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

# Splitting

In [None]:
# Inputs are the sentences, targets are the sentiment labels.
X, y = df['Sentence'], df['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Inspect the held-out sentences; the index shows which original rows were
# drawn into the test split.
X_test

Unnamed: 0,Sentence
735,yang menunda pengangkatan honorer itu jelas me...
442,skill issue wowo sejak dulu ya begitu ada alas...
1117,perlu waktu untuk lihat dampaknya
333,tapi pendukung dan buzzer seolah olah saya men...
327,perlu ada kejelasan teknis dalam pelaksanaan
...,...
1144,semoga semakin banyak kebijakan yang memperhat...
1060,harapannya bener bener ada hasil
1190,sip nih siapa pun yang sama hak rakyat bakal k...
423,selagi pelaku masih hidup bisa jadi balik lagi


# Tokenize

In [None]:
# Upper bound on the number of tokens kept per sentence.
MAX_LEN = 128

# Tokenizer options shared by both splits: pad, truncate at MAX_LEN tokens
# and return PyTorch tensors.
encode_options = dict(
    padding=True,
    truncation=True,
    max_length=MAX_LEN,
    return_tensors='pt',
)

train_encoded = tokenizer(X_train.tolist(), **encode_options)
test_encoded = tokenizer(X_test.tolist(), **encode_options)

In [None]:
# Inspect the encoded training set: input_ids, token_type_ids and
# attention_mask tensors.
train_encoded

{'input_ids': tensor([[    2,  6043,  2569,  ...,     0,     0,     0],
        [    2,  3453, 21919,  ...,     0,     0,     0],
        [    2,  2132,   308,  ...,     0,     0,     0],
        ...,
        [    2,   916,    79,  ...,     0,     0,     0],
        [    2,   664,   256,  ...,     0,     0,     0],
        [    2, 21785,  4229,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

# Encode Label

In [None]:
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
encoder = LabelEncoder().fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [None]:
# Inspect the integer-encoded test labels.
y_test

array([1, 1, 1, 0, 1, 1, 2, 2, 1, 0, 1, 2, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0,
       0, 1, 2, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 1, 1, 2,
       1, 1, 2, 0, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 0,
       2, 2, 0, 2, 1, 2, 2, 0, 1, 2, 0, 0, 0, 1, 2, 1, 2, 1, 2, 0, 1, 1,
       0, 0, 1, 0, 2, 1, 0, 2, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2, 2, 0, 0, 1,
       2, 1, 2, 0, 2, 0, 0, 2, 0, 1, 1, 2, 2, 2, 0, 1, 2, 2, 1, 1, 0, 2,
       0, 0, 2, 1, 1, 2, 2, 0, 2, 1, 1, 2, 2, 2, 0, 0, 1, 2, 2, 2, 0, 0,
       1, 2, 1, 0, 2, 1, 1, 0, 2, 2, 0, 2, 1, 0, 2, 1, 2, 0, 0, 0, 1, 1,
       0, 1, 0, 2, 2, 1, 2, 2, 1, 0, 1, 1, 2, 2, 1, 0, 2, 0, 1, 0, 0, 1,
       2, 0, 0, 2, 2, 1, 2, 0, 2, 1, 1, 1, 1, 2, 2, 0, 1, 0, 0, 1, 2, 2,
       2, 0, 1, 0, 1, 2, 2, 2, 0, 0, 1, 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 0,
       2, 0, 0, 2, 1, 0, 0, 1])

# Preparing Dataset

In [None]:
# Number of sentences per mini-batch. The DataLoader default is 1, which
# makes every pass over BERT needlessly slow; the encodings are already
# padded to a common length, so they can be batched directly.
BATCH_SIZE = 16

X_train_input_ids = train_encoded['input_ids']
X_train_attention_mask = train_encoded['attention_mask']
# CrossEntropyLoss expects int64 class indices; make the dtype explicit
# rather than relying on the platform's default NumPy integer width.
y_train = torch.tensor(y_train, dtype=torch.long)

X_test_input_ids = test_encoded['input_ids']
X_test_attention_mask = test_encoded['attention_mask']
y_test = torch.tensor(y_test, dtype=torch.long)

# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(X_train_input_ids, X_train_attention_mask, y_train)
test_dataset = TensorDataset(X_test_input_ids, X_test_attention_mask, y_test)

# Shuffle only the training data; evaluation order does not affect the
# metrics, so the test loader is kept deterministic.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Modeling

In [None]:
class ClassifyUsingBERT(nn.Module):
  """Sentence classifier: a BERT encoder followed by dropout and a linear head.

  Args:
    bert_model: encoder whose forward pass accepts ``input_ids`` and
      ``attention_mask`` and returns an object exposing ``pooler_output``.
    hidden_size: width of ``pooler_output`` (input size of the linear head).
    n_classes: number of output classes.
    lr: learning rate used by :meth:`get_optimizer`.
  """

  def __init__(self, bert_model, hidden_size=768, n_classes=3, lr=2e-5):
    super().__init__()
    self.bert = bert_model
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(in_features=hidden_size, out_features=n_classes)
    self.learning_rate = lr

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class scores (logits) for a batch of encoded sentences."""
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    pooled_output = outputs.pooler_output
    output = self.drop(pooled_output)
    return self.out(output)

  def _device(self):
    """Device the model's parameters live on; inputs are moved there before use."""
    return next(self.parameters()).device

  def predict(self, sentence):
    """Predict the label(s) for a sentence or a list of sentences.

    Relies on the notebook-level ``tokenizer``, ``MAX_LEN`` and fitted
    ``encoder`` (LabelEncoder) objects.

    Returns:
      Array of decoded label strings, one per input sentence.
    """
    encoded = tokenizer(
        sentence,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='pt'
    )
    device = self._device()

    # Inference must run with dropout disabled; remember the current mode so
    # calling predict() in the middle of training does not silently change it.
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        logits = self(
            encoded['input_ids'].to(device),
            encoded['attention_mask'].to(device),
        )
        # argmax over logits equals argmax over softmax probabilities, so the
        # softmax is not needed to pick the class.
        pred = torch.argmax(logits, dim=1)
    finally:
      self.train(was_training)

    # .cpu() keeps this working when the model lives on a GPU.
    return encoder.inverse_transform(pred.cpu().numpy())

  def get_optimizer(self):
    """Build an Adam optimizer over all parameters (encoder and head)."""
    return Adam(self.parameters(), lr=self.learning_rate)

  def training_steps(self, dataloader: DataLoader, epochs: int = 100):
    """Fine-tune the model with cross-entropy loss.

    Args:
      dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
      epochs: maximum number of passes over ``dataloader``.

    The summed batch loss is printed every 10th epoch (starting with epoch 0)
    and training stops early once that sum drops below 1e-4.
    """
    optimizer = self.get_optimizer()
    criterion = nn.CrossEntropyLoss()
    device = self._device()

    # Enable dropout (and any other train-only behaviour) for fine-tuning.
    self.train()

    for epoch in range(epochs):
      total_loss = 0.0

      for input_ids, attention_mask, labels in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # One parameter update per batch: clear stale gradients, backpropagate
        # this batch's loss, then step. Stepping only once per epoch would
        # apply a single update from gradients summed over every batch.
        optimizer.zero_grad()
        loss = criterion(self(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

      if epoch % 10 == 0:
        print(f'Epoch {epoch}: Loss {total_loss}')

      if total_loss < 0.0001:
        break

In [None]:
# Wrap the pretrained encoder in the classifier, with one output per
# sentiment class (NEGATIF, NETRAL, POSITIF). The linear head starts from
# random weights.
model = ClassifyUsingBERT(bert_model, n_classes=3)

# Predict Without Training

In [None]:
# Baseline check before any fine-tuning: the classification head is still
# untrained, so this prediction is not expected to be meaningful.
sentence = "Kerja bagus pak prabowo"
model.predict(sentence)

array(['NETRAL'], dtype=object)

# Training

In [None]:
# Fine-tune for a single pass over the training loader; the loss for
# epoch 0 is printed by training_steps.
model.training_steps(train_dataloader, epochs=1)

Epoch 0: Loss 704.7072067186236


# Predict

In [None]:
# Same kind of spot check as before training, now using the fine-tuned model.
sentence = "Kerja bagus"
model.predict(sentence)

array(['POSITIF'], dtype=object)

# Test

In [None]:
# Collect ground-truth and predicted class indices over the whole test set.
real_y = []
y_pred = []

# Disable dropout for evaluation.
model.eval()
# Inputs have to live on the same device as the model's parameters.
device = next(model.parameters()).device

with torch.no_grad():
  for input_ids, attention_mask, labels in test_dataloader:
    logits = model(input_ids.to(device), attention_mask.to(device))
    # Store plain Python ints rather than 0-dim tensors: that is what the
    # sklearn metrics expect, and .tolist() also works for GPU tensors.
    real_y.extend(labels.tolist())
    y_pred.extend(torch.argmax(logits, dim=1).tolist())

In [None]:
# Report accuracy plus macro-averaged recall, F1 and precision on the test set.
print('BERT Result')
print(f'Accuracy: {accuracy_score(real_y, y_pred)}')

macro_metrics = (
    ('Recall', recall_score),
    ('F1', f1_score),
    ('Precision', precision_score),
)
for metric_name, metric_fn in macro_metrics:
  print(f'{metric_name}: {metric_fn(real_y, y_pred, average="macro")}')

BERT Result
Accuracy: 0.76
Recall: 0.7538808967950296
F1: 0.7520487849999992
Precision: 0.7709121870041411


## Model Parameters

In [None]:
# List every parameter tensor (encoder and classification head) with its shape.
for param_name, weights in model.named_parameters():
  print(param_name, weights.shape)

bert.embeddings.word_embeddings.weight torch.Size([50000, 768])
bert.embeddings.position_embeddings.weight torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight torch.Size([2, 768])
bert.embeddings.LayerNorm.weight torch.Size([768])
bert.embeddings.LayerNorm.bias torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias torch.Size([768])
bert.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.output.dense.bias torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
bert.encoder

# Save Model

In [None]:
# Single target directory for the backbone, the head and the tokenizer.
SAVE_DIR = '/content/MyDrive/MyDrive/saved_bert'

# BERT backbone in Hugging Face format. This runs first, as in the original
# cell order, before the head is written into the same directory.
model.bert.save_pretrained(SAVE_DIR)

# Linear classification head as a native PyTorch state dict.
torch.save(model.out.state_dict(), f'{SAVE_DIR}/out.pt')

# Tokenizer files; kept as the last expression so the written paths are shown.
tokenizer.save_pretrained(SAVE_DIR)

('/content/MyDrive/MyDrive/saved_bert/tokenizer_config.json',
 '/content/MyDrive/MyDrive/saved_bert/special_tokens_map.json',
 '/content/MyDrive/MyDrive/saved_bert/vocab.txt',
 '/content/MyDrive/MyDrive/saved_bert/added_tokens.json')

# Load Model

In [None]:
# Directory holding the saved backbone, head and tokenizer.
LOAD_DIR = '/content/MyDrive/MyDrive/saved_bert'

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained(LOAD_DIR)

# Load BERT backbone
bert_model = BertModel.from_pretrained(LOAD_DIR)

# Recreate the linear head and load its weights. map_location='cpu' lets a
# head saved from a GPU runtime be restored on a CPU-only runtime.
out = nn.Linear(768, 3)
out.load_state_dict(torch.load(f'{LOAD_DIR}/out.pt', map_location='cpu'))

# Reassemble the classifier from the restored backbone and head.
model = ClassifyUsingBERT(bert_model, n_classes=3)
model.out = out

# A freshly constructed module starts in training mode; switch to eval so
# dropout is disabled for inference. The trailing ';' suppresses the model repr.
model.eval();