In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments, AdamW, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_new = pd.read_csv('preprocessed_data_news.csv')

In [3]:
df_new["text"] = df_new["text"].astype(str)

In [4]:
print(df_new[df_new["text"].isnull()]) 

Empty DataFrame
Columns: [Unnamed: 0, title, text, label]
Index: []


In [5]:
dataset = Dataset.from_pandas(df_new[['text', 'label']])

# Split into train (80%) and test (20%)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the training set into train (90%) and validation (10%)
train_val_split = train_test_split["train"].train_test_split(test_size=0.1, seed=42)

datasets = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
    "test": train_test_split["test"],
})

In [6]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 360
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

Map: 100%|████████████████████████████████████████████████████████████████████| 360/360 [00:05<00:00, 63.85 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 88.06 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 80.73 examples/s]


In [16]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert raw DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df_new[["text", "label"]])

# Tokenize the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split into train (80%) and test (20%)
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Further split the training set into train (90%) and validation (10%)
train_val_split = train_test_split["train"].train_test_split(test_size=0.1, seed=42)

# Final DatasetDict with train, validation, and test sets
datasets = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
    "test": train_test_split["test"],
})

# Remove the raw text column (optional, but avoids issues)
datasets = datasets.remove_columns(["text"])

print(datasets)

Map: 100%|███████████████████████████████████████████████████████████████████| 500/500 [00:04<00:00, 113.08 examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 360
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})





In [18]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=4,
    weight_decay=0.01,
    fp16=True,
    learning_rate=3e-5  
)

optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    optimizers=(optimizer, None)  # Pass optimizer explicitly
)

trainer.train()
trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6287,0.635945
2,0.3707,0.51585
3,0.13,0.515494


{'eval_loss': 0.5154937505722046,
 'eval_runtime': 23.3543,
 'eval_samples_per_second': 1.713,
 'eval_steps_per_second': 0.214,
 'epoch': 3.8444444444444446}

In [19]:
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

('./final_model\\tokenizer_config.json',
 './final_model\\special_tokens_map.json',
 './final_model\\vocab.txt',
 './final_model\\added_tokens.json')

In [20]:
#dimulai dari sini
model = BertForSequenceClassification.from_pretrained("./final_model")
tokenizer = BertTokenizer.from_pretrained("./final_model")

In [21]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import numpy as np
predictions = trainer.predict(datasets["test"])
logits = predictions.predictions
y_pred = np.argmax(logits, axis=1)

y_true = datasets["test"]["label"]
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_true, y_pred))

Accuracy: 0.8800
Precision: 0.9400
Recall: 0.8393
F1-score: 0.8868

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        44
           1       0.94      0.84      0.89        56

    accuracy                           0.88       100
   macro avg       0.88      0.89      0.88       100
weighted avg       0.89      0.88      0.88       100



In [24]:
input_text = "demonstr gather last night exercis constitut protect right peac protest order rais issu creat chang loretta lynch aka eric holder skirt"

inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)

with torch.no_grad():  
    outputs = model(**inputs)

logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()

print(f"Predicted class: {predicted_class}")

Predicted class: 1
