In [None]:
!pip install optuna
!pip install transformers
import numpy as np
import optuna
import pandas as pd
import torch
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
# Read the labelled training split and the unlabelled test split from the Colab workspace.
train, test = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))

In [None]:
# Per-column flag: does the training frame contain any missing value?
train.isnull().any(axis=0)

Unnamed: 0,0
id,False
text,False
author,False


In [None]:
# Per-column flag: does the test frame contain any missing value?
test.isnull().any(axis=0)

Unnamed: 0,0
id,False
text,False


In [None]:
# Encode each author initial as an integer class id.
author_to_label = {"EAP": 0, "HPL": 1, "MWS": 2}

# Series.map returns NaN for any value missing from the mapping, which would
# silently turn the label column into floats; fail fast instead.
mapped_labels = train["author"].map(author_to_label)
if mapped_labels.isna().any():
    unmapped_authors = sorted(train.loc[mapped_labels.isna(), "author"].astype(str).unique())
    raise ValueError(f"Authors without a label mapping: {unmapped_authors}")
train["label"] = mapped_labels.astype("int64")

# 80:20 train/validation split, stratified so both parts keep the same
# author proportions as the full training set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train["text"],
    train["label"],
    test_size=0.2,
    random_state=42,
    stratify=train["label"],
)

In [None]:
# Load the pretrained RoBERTa tokenizer that matches the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path="roberta-base")

# Wrap tokenizer output (and optional labels) as a PyTorch Dataset.
class DatasetHorror(Dataset):
    """Map-style dataset over tokenizer encodings for sequence classification.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-sample sequence, as returned by calling the tokenizer on a
            list of texts.
        labels: Optional sequence with one integer class id per sample.
            Leave as ``None`` for unlabelled data; items then carry no
            ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and label, when available) of sample ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, size the dataset from the first encoded feature.
        first_feature = next(iter(self.encodings.values()), ())
        return len(first_feature)

# Tokenize both splits into encodings (input ids and attention masks),
# truncating to at most 128 tokens and padding within each split.
train_encodings, val_encodings = (
    tokenizer(split.tolist(), truncation=True, padding=True, max_length=128)
    for split in (train_texts, val_texts)
)

# Pair each split's encodings with its labels in a DatasetHorror.
train_dataset, val_dataset = (
    DatasetHorror(encoded, targets.tolist())
    for encoded, targets in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Run on the first NVIDIA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


class _OptunaPruningCallback(TrainerCallback):
    """Forward each evaluation loss to Optuna so the study's pruner can act on it."""

    def __init__(self, trial):
        self.trial = trial

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Report ``eval_loss`` after an evaluation and prune the trial if asked to."""
        eval_loss = (metrics or {}).get("eval_loss")
        if eval_loss is None:
            return
        # Evaluation is configured once per epoch, so the rounded epoch is the step.
        step = int(round(state.epoch)) if state.epoch is not None else state.global_step
        self.trial.report(eval_loss, step=step)
        # Pruning after the last epoch would discard a finished trial for nothing.
        if step >= args.num_train_epochs:
            return
        if self.trial.should_prune():
            raise optuna.TrialPruned()


# Optuna objective: sample hyperparameters and build the Hugging Face Trainer for one trial.
def objective(trial):
    """Fine-tune ``roberta-base`` with trial-sampled hyperparameters.

    Samples learning rate, batch size, epoch count and weight decay from
    ``trial``, trains on ``train_dataset``, evaluates on ``val_dataset`` after
    every epoch, and reports each validation loss to Optuna so the study's
    pruner can stop unpromising trials.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Multiclass log loss on the validation split, computed from the model
        that is loaded at the end of training (``load_best_model_at_end``).

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial early.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_epochs", 2, 5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)

    # Start every trial from the same pretrained checkpoint with a fresh 3-class head.
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3).to(device)

    training_args = TrainingArguments(
        output_dir=f"./results_trial_{trial.number}",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        # Mixed precision needs CUDA; keep the CPU fallback of `device` usable.
        fp16=torch.cuda.is_available(),
        eval_strategy="epoch",
        save_strategy="epoch",
        # Keep only the checkpoints needed to restore the best model, so the
        # trials do not fill the disk with one checkpoint per epoch.
        save_total_limit=1,
        load_best_model_at_end=True,
        logging_dir=f"./logs_trial_{trial.number}",
        logging_steps=100,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[_OptunaPruningCallback(trial)],
    )
    trainer.train()

    val_preds = trainer.predict(val_dataset)
    # Turn the logits into class probabilities with a softmax.
    val_probs = torch.softmax(torch.tensor(val_preds.predictions), dim=1).numpy()
    # Pass the full label set so log_loss still works if a class is absent from the fold.
    loss = log_loss(val_labels, val_probs, labels=list(range(val_probs.shape[1])))

    return loss


In [None]:
# Create the Optuna study. The sampler is seeded so the sequence of suggested
# hyperparameters is reproducible. Note that MedianPruner only has an effect
# when the objective reports intermediate values to the trial.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)

# Run 10 trials to search for the best hyperparameters; collect garbage after
# each trial so objects left over from a finished trial are released.
study.optimize(objective, n_trials=10, gc_after_trial=True)

# Print the best trial found by the study.
print("Best trial:")
trial = study.best_trial
print(f"  Value (log loss): {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-05-13 10:46:47,101] A new study created in memory with name: no-name-f94f996f-1701-4a91-be0d-0b673013e769
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5049,0.442802
2,0.3543,0.472142
3,0.2282,0.628877
4,0.1987,0.734087
5,0.1392,0.811386


[I 2025-05-13 11:05:06,566] Trial 0 finished with value: 0.44280487279928826 and parameters: {'learning_rate': 1.0067165249824999e-05, 'batch_size': 8, 'num_epochs': 5, 'weight_decay': 0.04799356962738726}. Best is trial 0 with value: 0.44280487279928826.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.465,0.439906
2,0.2916,0.532026


[I 2025-05-13 11:12:30,661] Trial 1 finished with value: 0.43991112365853846 and parameters: {'learning_rate': 1.7743566427211544e-05, 'batch_size': 8, 'num_epochs': 2, 'weight_decay': 0.06389631927697813}. Best is trial 1 with value: 0.43991112365853846.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4945,0.419832
2,0.3105,0.391245
3,0.178,0.429035


[I 2025-05-13 11:18:40,184] Trial 2 finished with value: 0.39124012029894945 and parameters: {'learning_rate': 2.1610102677925056e-05, 'batch_size': 32, 'num_epochs': 3, 'weight_decay': 0.013039594991676173}. Best is trial 2 with value: 0.39124012029894945.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4237,0.466702
2,0.3429,0.603297
3,0.1766,0.493002
4,0.0738,0.630994


[I 2025-05-13 11:29:07,003] Trial 3 finished with value: 0.46670130500031815 and parameters: {'learning_rate': 1.640739794076999e-05, 'batch_size': 16, 'num_epochs': 4, 'weight_decay': 0.06470613280120338}. Best is trial 2 with value: 0.39124012029894945.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4248,0.465008
2,0.3433,0.467658
3,0.1987,0.50207


[I 2025-05-13 11:36:56,207] Trial 4 finished with value: 0.46501121113666055 and parameters: {'learning_rate': 1.5331006355923643e-05, 'batch_size': 16, 'num_epochs': 3, 'weight_decay': 0.00881404136221391}. Best is trial 2 with value: 0.39124012029894945.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4211,0.459909
2,0.295,0.46282
3,0.1344,0.588752
4,0.033,0.747198


[I 2025-05-13 11:47:01,550] Trial 5 finished with value: 0.4599055517793712 and parameters: {'learning_rate': 3.0602433685437927e-05, 'batch_size': 16, 'num_epochs': 4, 'weight_decay': 0.0848892789025408}. Best is trial 2 with value: 0.39124012029894945.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.52,0.440368
2,0.3208,0.450469
3,0.1812,0.440659
4,0.111,0.525063
5,0.0624,0.666088


[I 2025-05-13 11:57:11,756] Trial 6 finished with value: 0.4403628154224407 and parameters: {'learning_rate': 2.4622101926922356e-05, 'batch_size': 32, 'num_epochs': 5, 'weight_decay': 0.05885575187754838}. Best is trial 2 with value: 0.39124012029894945.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4595,0.518699
2,0.2682,0.545564


[I 2025-05-13 12:04:44,262] Trial 7 finished with value: 0.5187008931851165 and parameters: {'learning_rate': 2.1299581317264884e-05, 'batch_size': 8, 'num_epochs': 2, 'weight_decay': 0.03610899958991968}. Best is trial 2 with value: 0.39124012029894945.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4759,0.404844
2,0.2686,0.369933


[I 2025-05-13 12:08:38,159] Trial 8 finished with value: 0.36993438202047907 and parameters: {'learning_rate': 3.399847047530166e-05, 'batch_size': 32, 'num_epochs': 2, 'weight_decay': 0.03463715390643281}. Best is trial 8 with value: 0.36993438202047907.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4505,0.458611
2,0.3577,0.453388
3,0.2386,0.461661


[I 2025-05-13 12:16:16,232] Trial 9 finished with value: 0.4533882551185191 and parameters: {'learning_rate': 1.001747421371885e-05, 'batch_size': 16, 'num_epochs': 3, 'weight_decay': 0.04357818185283763}. Best is trial 8 with value: 0.36993438202047907.


Best trial:
  Value (log loss): 0.36993438202047907
  Params: 
    learning_rate: 3.399847047530166e-05
    batch_size: 32
    num_epochs: 2
    weight_decay: 0.03463715390643281


In [None]:
# Fetch the best hyperparameters found by the study and start again from the
# pretrained checkpoint with a fresh 3-class head.
best_params = study.best_params
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3).to(device)

# Training arguments for the final run, this time using the tuned hyperparameters.
final_args = TrainingArguments(
    output_dir="./best_model",
    report_to="none",
    push_to_hub=False,
    num_train_epochs=best_params["num_epochs"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"] * 2,
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"],
    # Mixed precision needs CUDA; keep the CPU fallback of `device` usable.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep only the checkpoints needed to restore the best model.
    save_total_limit=1,
    load_best_model_at_end=True,
    logging_dir="./logs_best",
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=final_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the final training.
trainer.train()

In [None]:
# Export the fine-tuned model to 'best_roberta_model' and store the tokenizer
# files in the same directory so the two can be reloaded together.
trainer.save_model(output_dir="best_roberta_model")
tokenizer.save_pretrained(save_directory="best_roberta_model")

('best_roberta_model/tokenizer_config.json',
 'best_roberta_model/special_tokens_map.json',
 'best_roberta_model/vocab.json',
 'best_roberta_model/merges.txt',
 'best_roberta_model/added_tokens.json')

In [None]:
# Tokenize the test texts with the same settings used for the training data.
# This cell uses `tokenizer`, `DatasetHorror`, `trainer` and `test` from earlier cells.
test_texts = test["text"].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)
# The test set has no labels; zeros are placeholders required by the dataset class.
placeholder_labels = [0] * len(test)
test_dataset = DatasetHorror(test_encodings, placeholder_labels)

# Predict with the fine-tuned model and convert the logits to probabilities via softmax.
test_preds = trainer.predict(test_dataset)
test_logits = torch.tensor(test_preds.predictions)
test_probs = torch.softmax(test_logits, dim=1).numpy()

# One probability column per author initial, in class-id order, with the row id first.
submission = pd.DataFrame(test_probs, columns=["EAP", "HPL", "MWS"])
submission.insert(0, "id", test["id"].to_numpy())

# Export the predictions.
submission.to_csv("submission_optuna_roberta.csv", index=False)

NameError: name 'tokenizer' is not defined