In [None]:
# --- BLOCK 1: SETUP & AUTO-DETECT PATH ---
!pip install transformers datasets accelerate scikit-learn -q

import os
import shutil

# 1. Unzip (Jika belum)
if not os.path.exists('/content/lyrics.zip'):
    print("⚠️ Pastikan file 'lyrics.zip' sudah diupload ke Files (kiri)!")
else:
    # Hapus folder lama biar bersih
    if os.path.exists('/content/extracted_data'):
        shutil.rmtree('/content/extracted_data')

    print("Sedang mengekstrak dataset...")
    !unzip -q lyrics.zip -d /content/extracted_data
    print("Ekstraksi selesai.")

# 2. AUTO-DETECT: Cari dimana folder Q3 dan Q4 bersembunyi
def find_dataset_root(start_dir):
    for root, dirs, files in os.walk(start_dir):
        if 'Q3' in dirs and 'Q4' in dirs:
            print(f"✅ Ditemukan dataset di: {root}")
            return root
    return None

# Cari root path yang benar
REAL_ROOT_PATH = find_dataset_root('/content/extracted_data')

if REAL_ROOT_PATH is None:
    # Coba cari di root content kalau user salah zip isinya doang
    REAL_ROOT_PATH = find_dataset_root('/content/')

if REAL_ROOT_PATH is None:
    raise ValueError("❌ Gagal menemukan folder Q3 dan Q4. Cek file zip kamu, pastikan isinya folder Q3 dan Q4!")

# --- BLOCK 2: TRAINING CODE (DistilBERT) ---
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

LIMIT_PER_CLASS = 500  # Total 1000 data
MODEL_NAME = 'distilbert-base-uncased'

# 1. LOAD DATASET (Pakai Path Otomatis tadi)
def load_lyrics_data(root_path, limit):
    texts = []
    labels = []
    target_quadrants = {'Q3': 0, 'Q4': 1} # 0: Sad, 1: Relaxed

    for q_folder, label_code in target_quadrants.items():
        folder_path = os.path.join(root_path, q_folder) # Path dinamis

        files = [f for f in os.listdir(folder_path) if f.endswith('.txt')][:limit]
        print(f"Folder {q_folder}: Mengambil {len(files)} lirik...")

        for f in files:
            try:
                with open(os.path.join(folder_path, f), 'r', encoding='utf-8') as tf:
                    lyric = tf.read().replace('\n', ' ').strip()
                    if lyric:
                        texts.append(lyric)
                        labels.append(label_code)
            except: pass

    return texts, labels

# Load Data
texts, labels = load_lyrics_data(REAL_ROOT_PATH, LIMIT_PER_CLASS)
print(f"Total Data Terkumpul: {len(texts)}")

if len(texts) == 0:
    raise ValueError("Data kosong! Pastikan file .txt ada di dalam folder Q3/Q4.")

# Split Data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, stratify=labels, random_state=42)

# 2. TOKENIZATION
print("Sedang Tokenisasi...")
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=512)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

class LyricsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = LyricsDataset(train_encodings, train_labels)
val_dataset = LyricsDataset(val_encodings, val_labels)

# 3. SETUP MODEL & TRAINING
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch", # Sudah diperbaiki
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# 4. MULAI TRAINING
print("\n--- Mulai Training... ---")
trainer.train()

# 5. HASIL AKHIR
print("\n--- Evaluasi Akhir ---")
eval_result = trainer.evaluate()
print(f"Akurasi Final: {eval_result['eval_accuracy']*100:.2f}%")

# Simpan
model.save_pretrained("./my_emotion_model")
tokenizer.save_pretrained("./my_emotion_model")

Sedang mengekstrak dataset...
Ekstraksi selesai.
✅ Ditemukan dataset di: /content/extracted_data
Folder Q3: Mengambil 500 lirik...
Folder Q4: Mengambil 500 lirik...
Total Data Terkumpul: 1000
Sedang Tokenisasi...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.



--- Mulai Training... ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.624595,0.547594,0.725
2,0.329053,0.624905,0.735
3,0.136572,0.697398,0.75


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].



--- Evaluasi Akhir ---


Akurasi Final: 73.00%


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./my_emotion_model/tokenizer_config.json',
 './my_emotion_model/tokenizer.json')

In [None]:
from transformers import EarlyStoppingCallback

# KITA GANTI SETUPNYA:
# 1. Epoch kita naikkan jadi 10
# 2. Kita pasang EarlyStopping (Rem Otomatis)
#    patience=3 artinya: Kalau 3 epoch berturut-turut gak ada perbaikan, STOP.

training_args_agresif = TrainingArguments(
    output_dir='./results_v2',
    num_train_epochs=10,             # GAS TERUS SAMPAI 10!
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,     # Tetap ambil yang terbaik, bukan yang terakhir
    metric_for_best_model="accuracy", # Fokus kita sekarang AKURASI
    report_to="none"
)

trainer_v2 = Trainer(
    model=model, # Kita pakai model yang tadi (lanjutkan belajar)
    args=training_args_agresif,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # INI REM-NYA
)

print("\n--- Melanjutkan Training (Mencari Akurasi Maksimal) ---")
trainer_v2.train()

# Cek hasil akhirnya
print("\n--- Evaluasi Final ---")
final_metrics = trainer_v2.evaluate()
print(f"Akurasi Tertinggi yang Didapat: {final_metrics['eval_accuracy']*100:.2f}%")

# Simpan lagi model versi terbaik ini
trainer_v2.save_model("./my_best_model_10epochs")
tokenizer.save_pretrained("./my_best_model_10epochs")

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.



--- Melanjutkan Training (Mencari Akurasi Maksimal) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.622411,0.620421,0.745
2,0.222601,0.777306,0.77
3,0.181057,1.057034,0.75
4,0.116702,1.243068,0.75
5,0.010471,1.306938,0.76


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].



--- Evaluasi Final ---


Akurasi Tertinggi yang Didapat: 77.00%


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./my_best_model_10epochs/tokenizer_config.json',
 './my_best_model_10epochs/tokenizer.json')

In [None]:
# --- TUNING HYPERPARAMETER ---

# 1. Reset Model dulu biar fresh (Wajib!)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 2. Argument Baru dengan "Rem Pakem"
training_args_tuned = TrainingArguments(
    output_dir='./results_tuned',

    # --- TUNING 1: Learning Rate Lebih Kecil ---
    # Biar belajarnya pelan-pelan & teliti (sebelumnya 5e-5 default)
    learning_rate=2e-5,

    # --- TUNING 2: Epoch Lebih Banyak ---
    # Karena LR kecil, kita butuh waktu lebih lama
    num_train_epochs=5,

    # --- TUNING 3: Batch Size ---
    per_device_train_batch_size=16, # Naikkan dikit biar gradien lebih stabil
    per_device_eval_batch_size=16,

    # --- TUNING 4: Regularisasi (Weight Decay) ---
    # Naikkan dari 0.01 jadi 0.1 (Biar gak overfitting)
    weight_decay=0.1,

    # --- TUNING 5: LABEL SMOOTHING (The Magic Sauce) ---
    # Ini kuncinya! Mencegah loss meledak
    label_smoothing_factor=0.1,

    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

# 3. Trainer Baru
trainer_tuned = Trainer(
    model=model,
    args=training_args_tuned,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Kita tetap pasang Early Stopping buat jaga-jaga
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("\n--- Mulai Training (Versi Tuning) ---")
trainer_tuned.train()

# Cek Hasil
eval_result = trainer_tuned.evaluate()
print(f"\nAkurasi Setelah Tuning: {eval_result['eval_accuracy']*100:.2f}%")

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.



--- Mulai Training (Versi Tuning) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.669873,0.649541,0.705
2,0.590089,0.569686,0.735
3,0.415085,0.534008,0.765
4,0.34872,0.555526,0.765
5,0.292452,0.562221,0.78


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].



Akurasi Setelah Tuning: 78.00%


In [None]:
import re
import torch
from transformers import TrainerCallback

# --- 1. ADVANCED PREPROCESSING (BERSIH-BERSIH) ---
def clean_lyrics(text):
    # Hapus penanda bagian lagu: [Chorus], (Verse 1), dll
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    # Hapus karakter aneh selain huruf, angka, dan tanda baca dasar
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\']', '', text)
    # Hapus spasi berlebih / baris baru ganda
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Reload dataset dengan pembersihan
texts_clean = [clean_lyrics(t) for t in texts] # Menggunakan variabel 'texts' yg sudah di-load sebelumnya

# Split Ulang
train_texts, val_texts, train_labels, val_labels = train_test_split(texts_clean, labels, test_size=0.2, stratify=labels, random_state=42)

# Tokenisasi Ulang
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Buat Dataset Object Baru
train_dataset = LyricsDataset(train_encodings, train_labels)
val_dataset = LyricsDataset(val_encodings, val_labels)

# --- 2. SETUP MODEL DENGAN FREEZING (RAHASIA 80%) ---
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Bekukan (Freeze) 4 Layer pertama DistilBERT
# Biar dia gak "lupa daratan" soal bahasa Inggris dasar
for param in model.distilbert.embeddings.parameters():
    param.requires_grad = False

for i in range(4): # Freeze layer 0 sampai 3
    for param in model.distilbert.transformer.layer[i].parameters():
        param.requires_grad = False

# Layer 4 dan 5 (Top Layers) TETAP DILATIH biar pinter klasifikasi mood
print("Status Model: 4 Layer Bawah Dibekukan, 2 Layer Atas Dilatih.")

# --- 3. TRAINING ARGUMENTS (FINAL TUNING) ---
training_args_final = TrainingArguments(
    output_dir='./results_final',

    # Kita bisa train lebih lama karena layer bawah dibekukan (gak gampang overfit)
    num_train_epochs=8,

    # Learning rate sedikit dinaikkan karena yang dilatih cuma sedikit layer
    learning_rate=3e-5,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Cosine Scheduler: Turun pelan-pelan seperti seluncuran
    lr_scheduler_type="cosine",

    warmup_steps=100,
    weight_decay=0.15, # Perbesar lagi dikit
    label_smoothing_factor=0.1, # Tetap pakai ini

    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

trainer_final = Trainer(
    model=model,
    args=training_args_final,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("\n--- Mulai Training Final (The Push to 80%) ---")
trainer_final.train()

# Cek Hasil
eval_result = trainer_final.evaluate()
print(f"\nAkurasi Final: {eval_result['eval_accuracy']*100:.2f}%")

# Simpan Model
if eval_result['eval_accuracy'] > 0.78:
    print("MANTAP! KITA TEMBUS!")
    trainer_final.save_model("./best_model_80percent")
    tokenizer.save_pretrained("./best_model_80percent")

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


Status Model: 4 Layer Bawah Dibekukan, 2 Layer Atas Dilatih.

--- Mulai Training Final (The Push to 80%) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.677897,0.658252,0.68
2,0.568928,0.55016,0.725
3,0.440165,0.520278,0.785
4,0.453069,0.488703,0.8
5,0.394079,0.495926,0.805
6,0.346635,0.501295,0.815
7,0.308398,0.504746,0.815
8,0.321566,0.503319,0.82


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].



Akurasi Final: 82.00%
MANTAP! KITA TEMBUS!


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
import shutil
from google.colab import files

# 1. Zip folder model terbaik tadi
print("Sedang membungkus model juara 82%...")
shutil.make_archive('roodio_stage2b_model', 'zip', './best_model_80percent')

# 2. Download ke laptop
print("Mengirim ke laptop...")
files.download('roodio_stage2b_model.zip')

Sedang membungkus model juara 82%...
Mengirim ke laptop...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Percobaan kedua menggabungkan mir dan dataset saya sendiri

In [1]:
# 1. Install Library
!pip install transformers datasets accelerate scikit-learn openpyxl -q

# 2. Unzip Dataset MIR (Biar folder Q3/Q4 keluar)
import os
import shutil

if os.path.exists('/content/lyrics.zip'):
    print("Mengekstrak dataset MIR...")
    !unzip -q /content/lyrics.zip -d /content/MERGE_Bimodal_Balanced
    print("Selesai ekstrak.")
else:
    print("⚠️ WARNING: File lyrics.zip belum diupload!")

Mengekstrak dataset MIR...
Selesai ekstrak.


In [13]:
import pandas as pd
import os
import random
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# --- 1. KONFIGURASI ---
MIR_DATA_PATH = '/content/MERGE_Bimodal_Balanced'  # Path folder MIR
USER_EXCEL_PATH = 'lyrics.xlsx'           # GANTI DENGAN NAMA FILE EXCEL KAMU
MODEL_NAME = 'distilbert-base-uncased'

# BOOSTING: Kita duplikat data Excel kamu biar setara dengan data MIR
# MIR = 1000 lagu. Excel = 100 lagu.
# Kita kali 10 biar jadi 1000 vs 1000 (Seimbang)
BOOST_FACTOR = 10

# --- 2. FUNGSI LOAD DATA MIR (FOLDER) ---
def load_mir_dataset(path):
    texts = []
    labels = []
    mapping = {'Q3': 0, 'Q4': 1} # Q3=Sad, Q4=Relaxed

    print(f"📂 Loading MIR Dataset dari: {path}")
    for folder, label in mapping.items():
        # Cek path variasi (langsung atau dalam subfolder lyrics)
        paths_to_check = [os.path.join(path, folder), os.path.join(path, 'lyrics', folder)]

        found_path = None
        for p in paths_to_check:
            if os.path.exists(p):
                found_path = p
                break

        if found_path:
            files = [f for f in os.listdir(found_path) if f.endswith('.txt')]
            print(f"   └─ Folder '{folder}': {len(files)} lagu.")
            for f in files:
                try:
                    with open(os.path.join(found_path, f), 'r', encoding='utf-8') as tf:
                        lyric = tf.read().replace('\n', ' ').strip()
                        if len(lyric) > 10:
                            texts.append(lyric)
                            labels.append(label)
                except: pass
    return texts, labels

# --- 3. FUNGSI LOAD DATA USER (EXCEL) ---
def load_user_excel(path):
    texts = []
    labels = []

    print(f"\n📂 Loading Excel User dari: {path}")
    if not os.path.exists(path):
        raise ValueError(f"❌ File Excel tidak ditemukan: {path}. Upload dulu ke Colab!")

    try:
        df = pd.read_excel(path)

        # Normalisasi nama kolom (jaga-jaga huruf besar/kecil)
        df.columns = [c.lower() for c in df.columns]

        if 'lyrics' not in df.columns or 'mood' not in df.columns:
            raise ValueError("Kolom 'lyrics' atau 'mood' tidak ditemukan di Excel!")

        count_sad = 0
        count_relaxed = 0

        for index, row in df.iterrows():
            lyric = str(row['lyrics']).replace('\n', ' ').strip()
            mood = str(row['mood']).lower().strip() # Ubah ke huruf kecil biar aman

            # Labeling: 0 = Sad, 1 = Relaxed
            label = -1
            if mood in ['sad', 'sedih', 'galau']:
                label = 0
                count_sad += 1
            elif mood in ['relaxed', 'relax', 'santai', 'tenang']:
                label = 1
                count_relaxed += 1

            # Masukkan jika valid
            if label != -1 and len(lyric) > 10:
                texts.append(lyric)
                labels.append(label)

        print(f"   └─ Ditemukan: {count_sad} Sad, {count_relaxed} Relaxed.")
        return texts, labels

    except Exception as e:
        print(f"❌ Error baca Excel: {e}")
        return [], []

# --- 4. EKSEKUSI & PENGGABUNGAN ---

# A. Load Data
mir_texts, mir_labels = load_mir_dataset(MIR_DATA_PATH)
user_texts, user_labels = load_user_excel(USER_EXCEL_PATH)

if len(user_texts) == 0:
    raise ValueError("Data Excel kosong atau salah format kolom!")

# B. Boosting Data Excel
print(f"\n🚀 Boosting Data User {BOOST_FACTOR}x lipat...")
boosted_user_texts = user_texts * BOOST_FACTOR
boosted_user_labels = user_labels * BOOST_FACTOR

# C. Gabung Semua
final_texts = mir_texts + boosted_user_texts
final_labels = mir_labels + boosted_user_labels

# Shuffle (Acak)
combined = list(zip(final_texts, final_labels))
random.shuffle(combined)
final_texts, final_labels = zip(*combined)
final_texts = list(final_texts)
final_labels = list(final_labels)

print(f"\n✅ Total Data Training Siap: {len(final_texts)} Sampel")

# --- 5. PREPARASI TRAINING ---
# Split
train_texts, val_texts, train_labels, val_labels = train_test_split(final_texts, final_labels, test_size=0.2, stratify=final_labels, random_state=42)

# Tokenize
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

class LyricsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = LyricsDataset(train_encodings, train_labels)
val_dataset = LyricsDataset(val_encodings, val_labels)

# --- 6. TRAINING ---
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Freeze layer bawah (Opsional, tapi bagus untuk stabilitas)
for i in range(2):
    for param in model.distilbert.transformer.layer[i].parameters():
        param.requires_grad = False

training_args = TrainingArguments(
    output_dir='./results_excel_hybrid',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=3e-5,
    weight_decay=0.1,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

def compute_metrics(pred):
    from sklearn.metrics import accuracy_score
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("\n--- MULAI TRAINING ---")
trainer.train()

# Hasil Akhir
res = trainer.evaluate()
print(f"Akurasi Final: {res['eval_accuracy']*100:.2f}%")

# Simpan
trainer.save_model("./model_hybrid_excel")
tokenizer.save_pretrained("./model_hybrid_excel")

📂 Loading MIR Dataset dari: /content/MERGE_Bimodal_Balanced
   └─ Folder 'Q3': 500 lagu.
   └─ Folder 'Q4': 500 lagu.

📂 Loading Excel User dari: lyrics.xlsx
   └─ Ditemukan: 50 Sad, 50 Relaxed.

🚀 Boosting Data User 10x lipat...

✅ Total Data Training Siap: 2000 Sampel


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.



--- MULAI TRAINING ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.623152,0.528636,0.75
2,0.297778,0.317633,0.8525
3,0.151168,0.378463,0.8775
4,0.058826,0.471687,0.875


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


Akurasi Final: 85.25%


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./model_hybrid_excel/tokenizer_config.json',
 './model_hybrid_excel/tokenizer.json')

In [14]:
import shutil
from google.colab import files

# 1. Zip folder hasil training
print("Sedang membungkus model Hybrid 85%...")
shutil.make_archive('model_hybrid_final', 'zip', './model_hybrid_excel')

# 2. Download ke laptop
print("Mengirim ke laptop...")
files.download('model_hybrid_final.zip')

Sedang membungkus model Hybrid 85%...
Mengirim ke laptop...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>