<a href="https://colab.research.google.com/github/vatoer/pu-sentiment/blob/main/indoBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Column names used by this notebook.
# NOTE(review): `text_column` is not referenced by any code visible in this
# notebook; the datasets further down are built from the 'clean_text' column.
# Confirm which column is the intended model input.
text_column = "translated_text"
target_column = "Sentiment"

# Read the labelled data from a comma-separated file in the working directory.
df = pd.read_csv("translated_sentiment_data.csv", sep=",")

# Missing sentiment values become 0 before the column is cast to int.
# NOTE(review): downstream code shifts labels by +1 and maps index 1 to
# "neutral", so unlabeled rows are presumably trained as neutral -- confirm
# that this is preferable to dropping them.
df[target_column] = df[target_column].fillna(value=0).astype(int)

# Tokenizer matching the IndoBERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")

# Create a custom dataset
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with one-hot labels.

    Args:
        encodings: Mapping from field name to an indexable sequence holding
            one entry per example (in this notebook: the tokenizer output).
        labels: Indexable sequence of numeric labels, one per example.
        num_classes: Length of the one-hot label vector. Defaults to 3.
        label_offset: Value added to a raw label to obtain its class index.
            The default of 1 maps raw labels -1/0/1 to indices 0/1/2.

    NOTE(review): the labels are emitted as float one-hot vectors. Hugging
    Face sequence-classification heads select a multi-label (BCE-with-logits)
    loss for float labels when num_labels > 1 -- confirm that this is intended
    for a single-label sentiment task.
    """

    def __init__(self, encodings, labels, num_classes=3, label_offset=1):
        self.encodings = encodings
        self.labels = labels
        self.num_classes = num_classes
        self.label_offset = label_offset

    def __getitem__(self, idx):
        """Return the tensors for example *idx* plus its one-hot ``labels``.

        Raises:
            ValueError: If the shifted label falls outside
                ``[0, num_classes)``.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        class_index = int(self.labels[idx] + self.label_offset)
        # A negative index would silently wrap around to the last class, so
        # reject it explicitly. ValueError (not IndexError) is used so that
        # plain iteration over the dataset cannot mistake a bad label for
        # the end of the data.
        if not 0 <= class_index < self.num_classes:
            raise ValueError(
                f"label {self.labels[idx]!r} at index {idx} maps to class "
                f"{class_index}, outside the range [0, {self.num_classes})"
            )

        one_hot_label = torch.zeros(self.num_classes)
        one_hot_label[class_index] = 1
        item['labels'] = one_hot_label
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Split the data into training and testing sets (80/20, fixed seed)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


def _build_dataset(frame):
    """Tokenize the 'clean_text' column of *frame* and pair it with its labels."""
    # NOTE(review): 'clean_text' is used here although `text_column` is set
    # to "translated_text" earlier in the notebook -- confirm which column
    # is the intended model input.
    encodings = tokenizer(
        frame['clean_text'].astype(str).tolist(),
        padding=True,
        truncation=True,
        max_length=512,
    )
    return SentimentDataset(encodings, frame[target_column].tolist())


# Create training and testing datasets
train_dataset = _build_dataset(train_df)
test_dataset = _build_dataset(test_df)

# Create training and testing data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load the pre-trained model with a 3-way classification head
model = AutoModelForSequenceClassification.from_pretrained("indolem/indobert-base-uncased", num_labels=3)

# Define the optimizer. `transformers.AdamW` is deprecated in favour of the
# PyTorch implementation; eps and weight_decay are set to the values the
# deprecated class used by default so the hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Define the device and move the model onto it. This stays the last
# expression so the notebook cell still displays the model summary.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Training loop
def train(model, dataloader, optimizer, device):
    """Run one optimisation pass over every batch in *dataloader*.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes a
            scalar ``loss`` attribute.
        dataloader: Iterable of dicts mapping argument names to tensors.
        optimizer: Optimizer that updates the parameters of *model*.
        device (torch.device): Device each batch tensor is moved to.

    Returns:
        float: Mean of the per-batch losses, or 0.0 if *dataloader* yielded
        no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Clear gradients before the forward/backward pass so that anything
        # left over from an earlier (e.g. interrupted) run cannot leak into
        # the first update.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0


In [None]:
# Training loop: fine-tune for a fixed number of passes over train_loader.
# Only the epoch counter is printed here.
num_epochs = 3
for epoch in range(num_epochs):
    # `epoch` is zero-based, so add one for the human-readable counter.
    print(f"Epoch {epoch+1}/{num_epochs}")
    train(model, train_loader, optimizer, device)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluation function
def evaluate(model, dataloader, device):
    """Compute classification metrics for *model* over *dataloader*.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes
            ``logits`` with one score per class on the last axis.
        dataloader: Iterable of dicts of tensors. Each dict must contain a
            ``'labels'`` entry holding either one-hot rows or class indices.
        device (torch.device): Device each batch tensor is moved to.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``. Precision, recall and
        F1 are weighted averages over the classes.
    """
    model.eval()
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            predicted = torch.argmax(outputs.logits, dim=-1)
            targets = batch['labels']
            # One-hot rows are collapsed to class indices; labels that are
            # already class indices (1-D) are used as they are.
            if targets.dim() > 1:
                targets = torch.argmax(targets, dim=-1)
            true_labels.extend(targets.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

    accuracy = accuracy_score(true_labels, predicted_labels)
    # zero_division=0 scores a class with an undefined metric (e.g. a class
    # that is never predicted) as 0 explicitly instead of relying on the
    # warn-and-use-0 default.
    precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)

    return accuracy, precision, recall, f1

In [None]:
# Evaluate the model on the testing set and print each metric to four
# decimal places. Precision, recall and F1 are the weighted averages
# returned by evaluate().
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")

Test Accuracy: 0.8958
Test Precision: 0.8665
Test Recall: 0.8958
Test F1-Score: 0.8717


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def predict_sentiment(text, model, tokenizer, device):
    """Predicts the sentiment of a given text using the trained model.

    The model is switched to evaluation mode for the forward pass, so
    train-only layers such as dropout cannot make the prediction
    nondeterministic. Its previous mode is restored afterwards.

    Args:
        text (str): The text to analyze.
        model (transformers.AutoModelForSequenceClassification): The trained sentiment analysis model.
        tokenizer (transformers.AutoTokenizer): The tokenizer used for the model.
        device (torch.device): The device to run the model on (CPU or GPU).

    Returns:
        str: The predicted sentiment label ("negative", "neutral", or "positive").
    """
    # Tokenize the input text into a batch of one, as PyTorch tensors
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Move inputs to the device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get model predictions in eval mode, then put the model back into
    # whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_label = torch.argmax(outputs.logits, dim=-1).item()
    finally:
        model.train(was_training)

    # Map the class index to its sentiment name
    sentiment_mapping = {0: "negative", 1: "neutral", 2: "positive"}
    predicted_sentiment = sentiment_mapping[predicted_label]

    return predicted_sentiment



In [None]:
# Example usage: classify a few sample tweets and print the predicted
# sentiment for each, in order.
for tweet in (
    "banyak korupsi #kabinetMerahPutih",
    "Kabinet baru dilantik di istana negara",
    "selamat untuk kabinet yang baru, semoga Indonesia makin maju",
):
    predicted_sentiment = predict_sentiment(tweet, model, tokenizer, device)
    print(f"Predicted sentiment: {predicted_sentiment}")

Predicted sentiment: positive
Predicted sentiment: positive
Predicted sentiment: positive
