####Library

In [22]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder


####Load data

In [23]:
df = pd.read_csv("/content/drive/MyDrive/PemrosesanTeksTeori/YT_comments_histID.csv")
col_clean = "comment_clean"
df = df.dropna(subset=[col_clean]).reset_index(drop=True)

print("Total data:", len(df))


Total data: 981


####Labeling dengan lexicon

In [25]:
positive_words = [
    "keren", "bagus", "mantap", "terbaik", "wow", "suka", "love", "semangat", "edukasi", "edukatif", "paham", "syukur", "baik"
    "bagus banget", "luar biasa", "mantap banget", "bagus sekali", "terima kasih", "respect", "hormat", "cerdas", "algoritma"
]

negative_words = [
    "jelek", "buruk", "parah", "kesal", "anjing", "bohong", "rusak", "bobrok", "bohong", "hapus", "takut", "bodoh", "gelap",
    "normal", "benci", "kecewa", "hancur", "tidak suka", "nyata", "pejabat", "khianat", "sedih", "miris", "ancam",
    "bahaya", "kejam", "bahlil", "petrus", "propaganda", "brengsek", "prabowo", "seleweng", "dongo", "korup", "korupsi"
]

neutral_words = [
    "sejarah", "kobi", "buku", "pelajaran", "menang", "fakta"
    "pemerintah", "absen", "pertama", "masa", "sekolah", "guru"
]

def lexicon_label(text):
    t = text.lower()
    pos = sum(w in t for w in positive_words)
    neg = sum(w in t for w in negative_words)
    neu = sum(w in t for w in neutral_words)

    if pos > neg and pos > neu:
        return "positif"
    elif neg > pos and neg > neu:
        return "negatif"
    elif neu > pos and neu > neg:
        return "netral"
    else:
        return None

df["seed_label"] = df[col_clean].apply(lexicon_label)

pos_seed = df[df["seed_label"] == "positif"].head(100)
neg_seed = df[df["seed_label"] == "negatif"].head(100)
neu_seed = df[df["seed_label"] == "netral"].head(50)

seed_df = pd.concat([pos_seed, neg_seed, neu_seed]).reset_index(drop=True)

print("Seed label counts:")
print(seed_df["seed_label"].value_counts())


Seed label counts:
seed_label
negatif    100
positif     93
netral      50
Name: count, dtype: int64


####Package

In [26]:
!pip install transformers sentencepiece torch torchvision torchaudio --quiet

####Model

In [27]:
model = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(model)

# Encode labels
le = LabelEncoder()
seed_df["label_id"] = le.fit_transform(seed_df["seed_label"])

class CommentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=128
        )
        enc = {k: torch.tensor(v) for k, v in enc.items()}
        enc["labels"] = torch.tensor(self.labels[idx])
        return enc

train_dataset = CommentDataset(seed_df[col_clean].tolist(), seed_df["label_id"].tolist())

model = AutoModelForSequenceClassification.from_pretrained(
    model,
    num_labels=3
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=20,
    save_total_limit=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
20,0.6337
40,0.1278
60,0.0198
80,0.0067


TrainOutput(global_step=80, training_loss=0.19700159672647716, metrics={'train_runtime': 70.1518, 'train_samples_per_second': 34.639, 'train_steps_per_second': 1.14, 'total_flos': 159841401269760.0, 'train_loss': 0.19700159672647716, 'epoch': 10.0})

####Prediksi

In [28]:
def predict(text):
    input_ids = tokenizer(
        text, truncation=True, padding="max_length", max_length=128, return_tensors="pt"
    )
    # Move input tensors to the same device as the model
    input_ids = {k: v.to(model.device) for k, v in input_ids.items()}

    with torch.no_grad():
        logits = model(**input_ids).logits
    pred = torch.argmax(logits, dim=1).item()
    return le.inverse_transform([pred])[0]

df["final_label"] = df[col_clean].apply(predict)

df.to_csv(f"/content/drive/MyDrive/PemrosesanTeksTeori/YT_histID_labeled.csv", index=False, encoding="utf-8")


####Hitung hasil training

In [29]:
df["seed_label"].value_counts().reset_index()

Unnamed: 0,seed_label,count
0,netral,246
1,negatif,141
2,positif,93


In [30]:
df["final_label"].value_counts().reset_index()


Unnamed: 0,final_label,count
0,negatif,457
1,netral,265
2,positif,259


####Perbandingan hasil training vs LLM

In [64]:
df_model = pd.read_csv("/content/drive/MyDrive/PemrosesanTeksTeori/YT_histID_labeled.csv")
df_llm = pd.read_csv("/content/drive/MyDrive/PemrosesanTeksTeori/labeled_LLM.csv")

df = df_model.merge(df_llm, on="comment_ori", suffixes=("_model", "_llm"))
df.head()

Unnamed: 0,comment_ori,comment_clean_model,seed_label,final_label,comment_clean_llm,llm_label
0,Pemerintah kita tahun ini ngumumin adanya penu...,perintah tahun ngumumin tulis ulang buku sejar...,,netral,perintah tahun ngumumin tulis ulang buku sejar...,Netral
1,Sekarang sudah diangkat menjadi pahlawan Nasional,sekarang angkat pahlawan nasional,,negatif,sekarang angkat pahlawan nasional,Netral
2,Sejarah ditulis oleh pemenang,sejarah tulis menang,netral,netral,sejarah tulis menang,Netral
3,nonton sejarah cina kuno: üòØü§îüòç\nnonton sejarah...,nonton sejarah cina kuno nonton sejarah indonesia,netral,netral,nonton sejarah cina kuno nonton sejarah indonesia,Netral
4,Ku harap video ygkek gini naik sampe ke dengar...,video naik dengar laku,,negatif,video naik dengar laku,Netral


In [68]:
df["llm_label"] = df["llm_label"].str.lower().str.strip()

In [69]:
agreement = (df["final_label"] == df["llm_label"]).mean()
print("Kesamaan label model vs LLM:", agreement)

Kesamaan label model vs LLM: 0.3118609406952965


In [70]:
df["final_label"].value_counts().reset_index()


Unnamed: 0,final_label,count
0,negatif,456
1,netral,264
2,positif,258


In [71]:
df["llm_label"].value_counts().reset_index()


Unnamed: 0,llm_label,count
0,positif,515
1,netral,371
2,negatif,92
