In [17]:
import os
import shutil
import re
import pandas as pd
import numpy as np
import mlflow
import torch
import pathlib
from datasets import load_dataset, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import shuffle

# --- KONFIGURASI PATH ---
current_dir = os.getcwd()

# Target Utama: root/mlruns
# (Naik 3 level dari root/src/models/nlpModel ke root, lalu masuk mlruns)
mlflow_dir = os.path.abspath(os.path.join(current_dir, "../../../mlruns"))

# Target Output Model: root/mlruns/final_model_bert
# (Kita simpan hasil save model langsung di dalam folder mlruns juga biar ngumpul)
output_dir = os.path.join(mlflow_dir, "final_model_bert")

LOCAL_TEST_FILE = os.path.abspath(os.path.join(current_dir, "../../../data/lyrics/lyrics.csv")) 

# Buat folder mlruns dan output jika belum ada
os.makedirs(mlflow_dir, exist_ok=True)

# --- 2. SETUP URI (Windows Pathlib Fix) ---
# Mengubah "C:\..." menjadi "file:///C:/..." agar MLflow tidak error
mlflow_tracking_uri = pathlib.Path(mlflow_dir).as_uri()
mlflow.set_tracking_uri(mlflow_tracking_uri)

# --- 3. SETUP EXPERIMENT ---
experiment_name = "Roodio_BERT_Finetuning"
mlflow.set_experiment(experiment_name)

print(f"üìÇ Working Directory: {current_dir}")
print(f"üì° MLflow Tracking URI: {mlflow_tracking_uri}")
print(f"üíæ Model akan disimpan di: {output_dir}")
print(f"üß™ Experiment Name: {experiment_name}")

2026/01/26 23:23:56 INFO mlflow.tracking.fluent: Experiment with name 'Roodio_BERT_Finetuning' does not exist. Creating a new experiment.


üìÇ Working Directory: c:\CAWU4GROUP3\projects\projectRoodio\machineLearning\src\models\nlpModel
üì° MLflow Tracking URI: file:///c:/CAWU4GROUP3/projects/projectRoodio/machineLearning/mlruns
üíæ Model akan disimpan di: c:\CAWU4GROUP3\projects\projectRoodio\machineLearning\mlruns\final_model_bert
üß™ Experiment Name: Roodio_BERT_Finetuning


In [3]:
print("‚¨áÔ∏è Menyiapkan Dataset GoEmotions...")
ds = load_dataset("go_emotions", "simplified")
df = ds['train'].to_pandas()

print(f"‚úÖ Data Mentah Terload: {len(df)} baris.")
display(df.head(3)) # Menampilkan sampel data

‚¨áÔ∏è Menyiapkan Dataset GoEmotions...
‚úÖ Data Mentah Terload: 43410 baris.


Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj


In [5]:
# 1. Definisi Mapping
target_map = {
    'anger': 'angry', 'annoyance': 'angry', 'disapproval': 'angry', 'fear': 'angry', 'nervousness': 'angry',
    'joy': 'happy', 'excitement': 'happy', 'love': 'happy', 'admiration': 'happy', 'amusement': 'happy', 'optimism': 'happy',
    'sadness': 'sad', 'disappointment': 'sad', 'grief': 'sad', 'remorse': 'sad',
    'relief': 'relaxed', 'neutral': 'relaxed', 'realization': 'relaxed'
}
labels_list = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", 
    "confusion", "curiosity", "desire", "disappointment", "disapproval", 
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief", 
    "joy", "love", "nervousness", "optimism", "pride", "realization", 
    "relief", "remorse", "sadness", "surprise", "neutral"
]

def map_emotion(row_labels):
    for label_idx in row_labels:
        lbl = labels_list[label_idx]
        if lbl in target_map: return target_map[lbl]
    return None

print("‚öôÔ∏è Melakukan Mapping & Balancing...")
df['label_name'] = df['labels'].apply(map_emotion)
df_clean = df.dropna(subset=['label_name']).copy()

# 2. Undersampling (Agar Adil)
min_sample = df_clean['label_name'].value_counts().min()
df_balanced = df_clean.groupby('label_name').apply(lambda x: x.sample(n=min_sample, random_state=26)).reset_index(drop=True)

# 3. Finalisasi Dataset
label2id = {'angry': 0, 'happy': 1, 'relaxed': 2, 'sad': 3}
id2label = {0: 'angry', 1: 'happy', 2: 'relaxed', 3: 'sad'}
df_balanced['label'] = df_balanced['label_name'].map(label2id)

# Buat Dataset HuggingFace
train_dataset = Dataset.from_pandas(df_balanced[['text', 'label']].rename(columns={"text": "lyrics"}))

print(f"‚úÖ Data Training Siap: {len(train_dataset)} baris.")
print(f"‚öñÔ∏è Jumlah per kelas: {min_sample}")

‚öôÔ∏è Melakukan Mapping & Balancing...
‚úÖ Data Training Siap: 10396 baris.
‚öñÔ∏è Jumlah per kelas: 2599


  df_balanced = df_clean.groupby('label_name').apply(lambda x: x.sample(n=min_sample, random_state=26)).reset_index(drop=True)


In [7]:
if os.path.exists(LOCAL_TEST_FILE):
    print(f"üìÇ Membaca file lirik lokal: {LOCAL_TEST_FILE}")
    
    raw_test_data = []
    current_buffer = []
    valid_moods = ['angry', 'happy', 'relaxed', 'sad']
    
    with open(LOCAL_TEST_FILE, 'r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()
    lines = lines[1:] # Skip header
    
    for line in lines:
        line = line.strip()
        if not line: continue
        is_end = False
        if ';' in line:
            parts = line.rsplit(';', 1)
            if len(parts) == 2:
                pmood = parts[1].lower().strip()
                if pmood in valid_moods:
                    full_lyric = " ".join(current_buffer) + " " + parts[0]
                    # Cleaning simpel
                    full_lyric = re.sub(r'\[.*?\]', '', full_lyric)
                    full_lyric = re.sub(r"[^a-z\s']", '', full_lyric.lower())
                    raw_test_data.append({'lyrics': full_lyric, 'label': label2id[pmood]})
                    current_buffer = []
                    is_end = True
        if not is_end:
            current_buffer.append(line)
    
    test_dataset = Dataset.from_pandas(pd.DataFrame(raw_test_data))
    print(f"‚úÖ Data Test Siap: {len(test_dataset)} lagu.")
    
else:
    print(f"‚ùå WARNING: File {LOCAL_TEST_FILE} tidak ditemukan di {current_dir}.")
    print("‚ö†Ô∏è Training akan berjalan menggunakan dummy test set (sebagian dari data train).")
    test_dataset = train_dataset.select(range(50))

üìÇ Membaca file lirik lokal: c:\CAWU4GROUP3\projects\projectRoodio\machineLearning\data\lyrics\lyrics.csv
‚úÖ Data Test Siap: 100 lagu.


In [8]:
model_ckpt = "bert-base-uncased"
print(f"‚è≥ Loading Tokenizer & Model: {model_ckpt}...")

tokenizer = BertTokenizer.from_pretrained(model_ckpt)
model = BertForSequenceClassification.from_pretrained(
    model_ckpt, 
    num_labels=4, 
    id2label=id2label, 
    label2id=label2id
)

def tokenize_func(examples):
    # Max length 128 cukup untuk GoEmotions
    return tokenizer(examples["lyrics"], padding="max_length", truncation=True, max_length=128)

print("‚öôÔ∏è Melakukan Tokenisasi...")
tokenized_train = train_dataset.map(tokenize_func, batched=True)
tokenized_test = test_dataset.map(tokenize_func, batched=True)

print("‚úÖ Tokenisasi Selesai.")

‚è≥ Loading Tokenizer & Model: bert-base-uncased...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 805.24it/s, Materializing param=bert.pooler.dense.weight]                               
BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias

‚öôÔ∏è Melakukan Tokenisasi...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10396/10396 [00:00<00:00, 18279.67 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 3793.35 examples/s]

‚úÖ Tokenisasi Selesai.





In [18]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    acc = accuracy_score(labels, pred)
    f1 = f1_score(labels, pred, average='macro')
    return {"accuracy": acc, "f1": f1}

# Konfigurasi Training
args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    
    # Batch Size Aman untuk CPU
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,
    
    num_train_epochs=3, 
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="mlflow",
    run_name="Local_CPU_BERT_GoEmotions",
    
    # --- CPU SETTINGS ---
    use_cpu=True,  # Pastikan ini True
)

# Inisialisasi Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    
    # --- PERBAIKAN DI SINI ---
    # Dulu: tokenizer=tokenizer
    # Sekarang: processing_class=tokenizer
    processing_class=tokenizer, 
    
    compute_metrics=compute_metrics,
)

print("\nüöÄ MEMULAI TRAINING...")
print("‚ö†Ô∏è Estimasi waktu: 30 menit - 2 jam (tergantung kecepatan CPU).")
print("‚òï Silakan buat kopi...")

trainer.train()

print(f"\nüíæ Training Selesai! Model disimpan di: {output_dir}")


üöÄ MEMULAI TRAINING...
‚ö†Ô∏è Estimasi waktu: 30 menit - 2 jam (tergantung kecepatan CPU).
‚òï Silakan buat kopi...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.893313,1.973003,0.26,0.224946
2,0.605922,1.941605,0.3,0.271711
3,0.515543,2.257417,0.3,0.262954


Writing model shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.22it/s]
Writing model shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.57it/s]
Writing model shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.47it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.La


üíæ Training Selesai! Model disimpan di: c:\CAWU4GROUP3\projects\projectRoodio\machineLearning\mlruns\final_model_bert
