# Import Libraries

In [None]:
!pip install numpy==1.26.0
!pip install gensim



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence

from gensim.models import Word2Vec

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

from google.colab import drive

In [None]:
# Fetch the NLTK resources: tokenizer data (punkt_tab) and the stopword corpora.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive; the dataset and the saved model are read from under this mount point.
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


# Load Data

In [None]:
# Location of the CSV dataset under the Drive mount point.
file_path = "/content/MyDrive/MyDrive/research-dataset.csv"

In [None]:
# Load the dataset; the cells below use its `Sentence` and `Sentiment` columns.
df = pd.read_csv(file_path)
df

Unnamed: 0,Sentence,Sentiment
0,benaran ngomong begitu nih,NETRAL
1,mbg ini masih jalan sih perkembangannya gimana ya,NETRAL
2,semoga ini menjadi awal yang baik untuk peruba...,POSITIF
3,itu pinjaman ya nanti baliknya ada lebih sedik...,NEGATIF
4,biasanya habis omon omon ada yang klarifikasi,NEGATIF
...,...,...
1241,semoga ini jadi awal yang baik,POSITIF
1242,kami optimis indonesia akan lebih baik di bawa...,POSITIF
1243,ini contoh nyata pemimpin yang peduli dengan r...,POSITIF
1244,kerja keras bapak sangat kami apresiasi,POSITIF


In [None]:
# Remove stray leading/trailing whitespace from the labels (vectorised string op,
# so labels that differ only by whitespace collapse into one class).
df['Sentiment'] = df['Sentiment'].str.strip()

In [None]:
# Check the class balance of the sentiment labels.
df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
NEGATIF,424
NETRAL,422
POSITIF,400


# Preprocess

In [None]:
# Load the stopword list once; looking it up per token re-reads the corpus every time.
indonesian_stopwords = set(stopwords.words('indonesian'))


def clean_sentence(sentence: str) -> str:
  """Tokenize, keep alphabetic tokens, lowercase them and drop Indonesian stopwords.

  Returns the surviving tokens joined by single spaces (may be an empty string).
  """
  words = [word.lower() for word in word_tokenize(sentence) if word.isalpha()]
  return ' '.join(word for word in words if word not in indonesian_stopwords)


# Apply row-wise so the result does not depend on the frame having a 0..n-1 index.
df['Sentence'] = df['Sentence'].apply(clean_sentence)
df

Unnamed: 0,Sentence,Sentiment
0,benaran ngomong nih,NETRAL
1,mbg jalan sih perkembangannya gimana ya,NETRAL
2,semoga perubahan pemerintahan,POSITIF
3,pinjaman ya baliknya bisnis,NEGATIF
4,habis omon omon klarifikasi,NEGATIF
...,...,...
1241,semoga,POSITIF
1242,optimis indonesia kepemimpinan,POSITIF
1243,contoh nyata pemimpin peduli rakyat,POSITIF
1244,kerja keras apresiasi,POSITIF


In [None]:
# Split every cleaned sentence back into its tokens (one token list per row).
all_sentences = [word_tokenize(text) for text in df['Sentence']]
all_sentences[:2]

[['benaran', 'ngomong', 'nih'],
 ['mbg', 'jalan', 'sih', 'perkembangannya', 'gimana', 'ya']]

In [None]:
# Sentences that lost every token during cleaning cannot be embedded, so remove them
# from both the token lists and the frame (keeping the two aligned row-for-row).
assert len(all_sentences) == len(df), "all_sentences and df must be aligned row-for-row"

empty_positions = [pos for pos, tokens in enumerate(all_sentences) if len(tokens) == 0]
# Translate positions into index labels so this works even if the index is not 0..n-1.
df = df.drop(index=df.index[empty_positions])
all_sentences = [tokens for tokens in all_sentences if len(tokens) >= 1]

# Variables

In [None]:
embedding_dim = 100  # Word2Vec vector size; also the GRU input size
window_size = 5  # Word2Vec context window
hidden_size = 128  # GRU hidden state size

# Word Embeddings

In [None]:
# Train the embeddings on the cleaned corpus. min_count=1 keeps every token in the
# vocabulary. A fixed seed plus a single worker thread makes the vectors repeatable
# between runs (NOTE(review): gensim also requires PYTHONHASHSEED to be set for fully
# deterministic results — confirm if exact reproducibility matters).
embedding_model = Word2Vec(
    all_sentences,
    vector_size=embedding_dim,
    window=window_size,
    min_count=1,
    seed=42,
    workers=1,
)
embedding_model.wv['joko']

array([-0.0045286 , -0.00287669, -0.00327441, -0.00795131, -0.00164339,
       -0.00441174, -0.00324444,  0.00069526,  0.00708205,  0.00811579,
       -0.0067065 , -0.00550145,  0.00440483,  0.00247138,  0.00187877,
       -0.00208883,  0.00495909,  0.00757344,  0.00688422,  0.00645346,
       -0.00698185, -0.00782422,  0.00688151,  0.00627382, -0.00108973,
        0.00549345, -0.00885701,  0.00308212, -0.00300999,  0.00539954,
       -0.00021437, -0.00547646, -0.00802602,  0.0050889 ,  0.00760772,
       -0.00080364, -0.00548525,  0.00420858,  0.00417563,  0.00017112,
       -0.00681593, -0.00722905, -0.00897463, -0.00771889,  0.00728402,
       -0.00384408, -0.0084662 , -0.00744923, -0.00126926,  0.00866947,
        0.00294147,  0.00164629, -0.00401854, -0.00758189,  0.00066674,
       -0.00896798, -0.00605638,  0.00797303,  0.00135288, -0.00779549,
       -0.0037238 , -0.00223826, -0.00906876, -0.00354703, -0.00721217,
        0.00175369,  0.00011784, -0.00536108,  0.00497196, -0.00

# Padding

In [None]:
# For each sentence, look up the vector of every token known to the embedding model.
embedded_sequence = [
    [
        torch.tensor(embedding_model.wv[token])
        for token in tokens
        if token in embedding_model.wv
    ]
    for tokens in all_sentences
]

In [None]:
# Stack each sentence into a (num_tokens, embedding_dim) tensor, then pad all of them
# with zero vectors to the length of the longest sentence.
sentence_tensors = [torch.stack(vectors) for vectors in embedded_sequence]
padded = pad_sequence(sentence_tensors, batch_first=True, padding_value=0.0)
padded[0].shape

torch.Size([13, 100])

In [None]:
# Padded sequence length (time dimension of the batch-first tensor).
max_pad_len = padded.shape[1]
max_pad_len

13

# Preparing Dataset

In [None]:
# Encode the string labels as integers (per the output below: NEGATIF=0, NETRAL=1, POSITIF=2).
# NOTE(review): this overwrites df['Sentiment'] in place, so re-running this cell would
# refit the encoder on the already-encoded integers and break label decoding.
encoder = LabelEncoder()
df['Sentiment'] = encoder.fit_transform(df['Sentiment'])
df['Sentiment']

Unnamed: 0,Sentiment
0,1
1,1
2,2
3,0
4,0
...,...
1241,2
1242,2
1243,2
1244,2


In [None]:
# Features are the padded embedding tensors; targets are the integer-encoded labels.
X = padded
y = df['Sentiment'].values

# Stratified 80/20 split with a fixed seed so the class balance and the split are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_dataset = TensorDataset(X_train, torch.tensor(y_train))
test_dataset = TensorDataset(X_test, torch.tensor(y_test))

# DataLoader defaults to batch_size=1, i.e. one forward/backward pass per sentence;
# use real mini-batches instead.
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# GRU

In [None]:
class ClassifyUsingGRU(nn.Module):
  """Single-layer GRU sentence classifier with a small feed-forward head.

  The final GRU hidden state is passed through Linear(hidden_size, 64) -> ReLU ->
  Linear(64, 3), producing one logit per sentiment class.

  NOTE(review): the notebook pads sentences at the end with zero vectors and this model
  reads the hidden state after the last time step, so padding steps are also run
  through the GRU.

  Args:
    vector_size: size of each input word vector (GRU input size).
    hidden_size: size of the GRU hidden state.
    lr: learning rate used by the Adam optimizer in `training_steps`.
  """

  def __init__(self, vector_size: int, hidden_size: int, lr: float = 0.001):
    super().__init__()
    self.learning_rate = lr

    self.gru = nn.GRU(
        input_size=vector_size,
        hidden_size=hidden_size,
        batch_first=True
    )

    self.input_to_hidden_linear = nn.Linear(in_features=hidden_size, out_features=64, bias=True)
    self.hidden_to_output_linear = nn.Linear(in_features=64, out_features=3, bias=True)

  def forward(self, inputs: torch.Tensor):
    """Return the class logits for `inputs` (batch-first sequences of word vectors)."""
    # The first return value holds the hidden state of every time step; h_n is the
    # hidden state after the last step, with a leading num_layers dimension.
    _, h_n = self.gru(inputs)
    # Only one GRU layer is used, so h_n[-1] just removes that leading dimension.
    hidden = torch.relu(self.input_to_hidden_linear(h_n[-1]))
    return self.hidden_to_output_linear(hidden)

  def predict(self, inputs: torch.Tensor, encoder: "LabelEncoder"):
    """Predict labels for `inputs` and decode them with `encoder.inverse_transform`.

    Gradients are disabled. The result is a flat sequence with one decoded label per
    input sequence.
    """
    with torch.no_grad():
      logits = self(inputs)
      # argmax of the logits equals argmax of their softmax, so no softmax is needed.
      class_ids = torch.argmax(logits, dim=-1).reshape(-1)
    return encoder.inverse_transform(class_ids.cpu().numpy())

  def get_optimizers(self):
    """Create an Adam optimizer over all model parameters using `self.learning_rate`."""
    return Adam(self.parameters(), lr=self.learning_rate)

  def training_steps(self, dataloader: DataLoader, epochs: int = 100):
    """Train with cross-entropy loss for up to `epochs` epochs.

    Parameters are updated after every batch. The summed batch loss is printed every
    10 epochs, and training stops early once that sum drops below 1e-4.

    Args:
      dataloader: yields `(inputs, targets)` batches; targets are class indices.
      epochs: maximum number of passes over `dataloader`.
    """
    optimizer = self.get_optimizers()
    criterion = nn.CrossEntropyLoss()

    self.train()
    for epoch in range(epochs):
      total_loss = 0.0

      for X, y in dataloader:
        # Clear the gradients of the previous batch, then take one optimizer step per
        # batch; stepping only once per epoch makes training extremely slow.
        optimizer.zero_grad()
        loss = criterion(self(X), y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

      if total_loss < 0.0001:
        break

      if epoch % 10 == 0:
        print(f'Epoch {epoch}: Loss {total_loss}')

## Try Predicting Without Training

In [None]:
# First padded sentence, used as a sample input for the untrained model below.
embedding_first_try = padded[0]
embedding_first_try

tensor([[ 0.0050, -0.0053,  0.0007,  ..., -0.0033, -0.0002,  0.0074],
        [-0.0096,  0.0061,  0.0047,  ...,  0.0068,  0.0044,  0.0076],
        [-0.0080,  0.0053,  0.0028,  ...,  0.0082,  0.0075,  0.0066],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
# Build an untrained model and run one prediction; unsqueeze(0) adds the batch dimension.
gru_model = ClassifyUsingGRU(embedding_dim, hidden_size)
result = gru_model.predict(embedding_first_try.unsqueeze(0), encoder)
print(f'Prediction for {all_sentences[0]}')
print(f'{result}')

Prediction for ['benaran', 'ngomong', 'nih']
['POSITIF']


## Training

In [None]:
# Train on the training split; the summed loss is printed every 10 epochs.
gru_model.training_steps(train_dataloader, epochs=400)

Epoch 0: Loss 1088.105110526085
Epoch 10: Loss 1085.2609467506409
Epoch 20: Loss 1085.130828499794
Epoch 30: Loss 1085.1123563051224
Epoch 40: Loss 1085.0800815820694
Epoch 50: Loss 1084.9342789649963
Epoch 60: Loss 1082.8008357286453
Epoch 70: Loss 1026.8642408251762
Epoch 80: Loss 964.1639813482761
Epoch 90: Loss 939.6406997144222
Epoch 100: Loss 919.8133267015219
Epoch 110: Loss 902.7177403569221
Epoch 120: Loss 887.0452955663204
Epoch 130: Loss 873.9281403124332
Epoch 140: Loss 862.0336077362299
Epoch 150: Loss 893.5498815476894
Epoch 160: Loss 851.2003500461578
Epoch 170: Loss 845.9292049109936
Epoch 180: Loss 832.661196693778
Epoch 190: Loss 816.7138336598873
Epoch 200: Loss 850.9328913390636
Epoch 210: Loss 805.3848039358854
Epoch 220: Loss 802.9336355030537
Epoch 230: Loss 783.0058150440454
Epoch 240: Loss 861.9388611055911
Epoch 250: Loss 781.9992435872555
Epoch 260: Loss 769.4914872348309
Epoch 270: Loss 761.7900388501585
Epoch 280: Loss 871.5771068260074
Epoch 290: Loss 845.

In [None]:
def preprocess(sentence: str, embedding_model: Word2Vec, max_len: int = 20):
  """Turn a raw sentence into a `(max_len, vector_size)` tensor of word vectors.

  The sentence is lowercased, tokenized, reduced to alphabetic non-stopword tokens,
  and each token known to `embedding_model` is replaced by its vector. The result is
  truncated or padded with zero vectors to exactly `max_len` rows; a sentence with no
  known tokens becomes all zeros.

  Args:
    sentence: raw input text.
    embedding_model: trained Word2Vec model providing `.wv` and `.vector_size`.
    max_len: number of rows in the returned tensor.

  Returns:
    A tensor with `max_len` rows of `embedding_model.vector_size` values each.
  """
  # Load the stopword list once instead of once per token.
  indonesian_stopwords = set(stopwords.words('indonesian'))
  words = [
      word for word in word_tokenize(sentence.lower())
      if word.isalpha() and word not in indonesian_stopwords
  ]

  word_vectors = embedding_model.wv
  embedded = [torch.tensor(word_vectors[word]) for word in words if word in word_vectors]

  # Truncate to max_len, then pad with zero vectors up to max_len. This also covers
  # the case where no word was valid (all rows are zero vectors).
  embedded = embedded[:max_len]
  padding = [torch.zeros(embedding_model.vector_size)] * (max_len - len(embedded))

  return torch.stack(embedded + padding)

In [None]:
# Predict the sentiment of one hand-written sentence, padded to the training length.
coba = 'mantap' # for example
coba = preprocess(coba, embedding_model, max_len=max_pad_len).unsqueeze(0)
result = gru_model.predict(coba, encoder)
print(result)

['POSITIF']


# Test Dataset

In [None]:
# Collect true and predicted class ids over the test set as plain Python ints.
# Separate loop names keep the global X_test / y_test split from being overwritten.
real_y = []
y_pred = []
with torch.no_grad():
  for batch_X, batch_y in test_dataloader:
    real_y.extend(batch_y.tolist())
    # Take the arg-max class id directly; no need to decode to labels and re-encode.
    y_pred.extend(gru_model(batch_X).argmax(dim=-1).reshape(-1).tolist())

In [None]:
# Report accuracy plus macro-averaged recall, F1 and precision on the test set.
macro_metrics = (
    ('Recall', recall_score),
    ('F1', f1_score),
    ('Precision', precision_score),
)

print('GRU Result')
print(f'Accuracy: {accuracy_score(real_y, y_pred)}')
for metric_label, metric_fn in macro_metrics:
  print(f'{metric_label}: {metric_fn(real_y, y_pred, average="macro")}')

GRU Result
Accuracy: 0.5344129554655871
Recall: 0.5358146873207114
F1: 0.5293808849724225
Precision: 0.5403481012658228


## Model Parameters

In [None]:
# List every learnable parameter of the model together with its shape.
for name, param in gru_model.named_parameters():
  print(name, param.shape)

gru.weight_ih_l0 torch.Size([384, 100])
gru.weight_hh_l0 torch.Size([384, 128])
gru.bias_ih_l0 torch.Size([384])
gru.bias_hh_l0 torch.Size([384])
input_to_hidden_linear.weight torch.Size([64, 128])
input_to_hidden_linear.bias torch.Size([64])
hidden_to_output_linear.weight torch.Size([3, 64])
hidden_to_output_linear.bias torch.Size([3])


# Save Model

In [None]:
# torch.save(gru_model.state_dict(), '/content/gru_model.pth')

# Load (if you want to reuse a previously saved model)
# NOTE(review): this replaces the model trained above, and the Word2Vec embeddings are
# retrained on every run without being saved next to this checkpoint — confirm the
# loaded weights still match the current embeddings.
gru_model = ClassifyUsingGRU(embedding_dim, hidden_size)
# map_location keeps loading working on a CPU-only runtime even if the checkpoint was
# saved from a GPU. torch.load unpickles the file, so only load checkpoints you trust.
state_dict = torch.load('/content/MyDrive/MyDrive/gru_model.pth', map_location='cpu')
gru_model.load_state_dict(state_dict)
gru_model.eval()  # don't forget to set eval mode

ClassifyUsingGRU(
  (gru): GRU(100, 128, batch_first=True)
  (input_to_hidden_linear): Linear(in_features=128, out_features=64, bias=True)
  (hidden_to_output_linear): Linear(in_features=64, out_features=3, bias=True)
)