In [1]:
# 📦 Install requirements (for Colab)
!pip install transformers datasets accelerate -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# ✅ Import libraries
import pandas as pd
import numpy as np
import re
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch

In [3]:
# 🔄 Load and preprocess dataset
df = pd.read_csv("/content/dataset.csv")

def minimal_clean(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"#", '', text)
    return text.strip()

df['cleaned_tweet'] = df['tweet'].astype(str).apply(minimal_clean)
label2id = {'negatif': 0, 'netral': 1, 'positif': 2}
id2label = {v: k for k, v in label2id.items()}
df['label'] = df['sentimen'].map(label2id)

In [5]:
# ✨ Tokenize with IndoBERTweet
model_name = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

dataset = Dataset.from_pandas(df[['cleaned_tweet', 'label']])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

def tokenize(batch):
    return tokenizer(batch['cleaned_tweet'], truncation=True, padding='max_length', max_length=128)

tokenized_ds = dataset.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/1452 [00:00<?, ? examples/s]

Map:   0%|          | 0/363 [00:00<?, ? examples/s]

In [6]:
# 🧠 Load IndoBERTweet model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# 📏 Define metrics
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro")
    }

In [18]:
training_args = TrainingArguments(
    output_dir="/content/bertweet_sentiment",
    do_train=True,
    do_eval=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    logging_dir="/content/logs"
)


In [19]:
# 🚀 Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [20]:
# 🔥 Train model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtpp354313[0m ([33mtpp354313-weq[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


TrainOutput(global_step=364, training_loss=0.588993659386268, metrics={'train_runtime': 10281.3226, 'train_samples_per_second': 0.565, 'train_steps_per_second': 0.035, 'total_flos': 382040682541056.0, 'train_loss': 0.588993659386268, 'epoch': 4.0})

In [21]:
# 📊 Evaluate final model
trainer.evaluate()

{'eval_loss': 0.8210615515708923,
 'eval_accuracy': 0.6776859504132231,
 'eval_f1': 0.6784105141593063,
 'eval_runtime': 191.5385,
 'eval_samples_per_second': 1.895,
 'eval_steps_per_second': 0.063,
 'epoch': 4.0}

In [23]:
# 💾 Save final model and tokenizer
model.save_pretrained("/content/tweet_sentiment_model")
tokenizer.save_pretrained("/content/tweet_sentiment_model")

('/content/tweet_sentiment_model/tokenizer_config.json',
 '/content/tweet_sentiment_model/special_tokens_map.json',
 '/content/tweet_sentiment_model/vocab.txt',
 '/content/tweet_sentiment_model/added_tokens.json',
 '/content/tweet_sentiment_model/tokenizer.json')

In [24]:
export_path = "/content/cleaned_dataset_transformer_model.csv"
df.to_csv(export_path, index=False)

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load model & tokenizer dari folder hasil fine-tuning
model_path = "/content/tweet_sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Label mapping
id2label = {0: "negatif", 1: "netral", 2: "positif"}


In [26]:
# Fungsi prediksi
def predict_tweet(text):
    # Tokenisasi
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Prediksi
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=1).item()

    return id2label[predicted_class]

In [27]:
# Contoh penggunaan
print(predict_tweet("Saya sangat kecewa dengan janji politik."))
print(predict_tweet("Saya kagum dengan pembangunan infrastruktur yang masif."))
print(predict_tweet("Debat tadi malam cukup menarik dan informatif."))

negatif
positif
positif
