In [None]:
import fasttext
from huggingface_hub import hf_hub_download

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# hf_hub_download(repo_id="facebook/fasttext-id-vectors",
#                 filename="model.bin",
#                 cache_dir="./models",
#                 force_download=True)

In [None]:
model_path = "models/models--facebook--fasttext-id-vectors/snapshots/77c30f24dea48d507180be003faad9ebdc070621/model.bin"
model_fasttext = fasttext.load_model(model_path)

In [None]:
model_fasttext.get_nearest_neighbors("cantik")

[(0.7911218404769897, 'canti'),
 (0.7720605134963989, 'cantikdan'),
 (0.7554269433021545, 'cantik.Cantik'),
 (0.7534511089324951, 'cantikan'),
 (0.751379668712616, 'cantiknya'),
 (0.7337489724159241, '.cantik'),
 (0.7271543741226196, 'cantik.Ia'),
 (0.7211070656776428, 'cantk'),
 (0.7113025188446045, 'cantik.Di'),
 (0.7021833062171936, 'menawan')]

# Import Data

In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('punkt' ,quiet=True)
nltk.download('punkt_tab')
nltk.download('stopwords',quiet=True)
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
import pandas as pd

labelled_df = pd.read_csv("clean_label.csv", encoding='utf-8')

# Text Preprocessing

## Drop Duplicates

In [4]:
labelled_df = labelled_df.drop_duplicates(subset=['clean_text'])
print(len(labelled_df))

8603


After applying the drop-duplicates process, the dataset now contains 8,603 remaining entries.

## Cleaning Data

In [5]:
def clean_alpha_numeric(text):
    if pd.isna(text):
        return text
    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text).lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

The `clean_alpha_numeric()` function cleans each text entry by removing zero-width characters (in case of encoding errors), converting the text to lowercase, normalizing whitespace, and trimming unnecessary spaces. Missing values are returned unchanged.

In [6]:
labelled_df['clean_tokens'] = labelled_df['clean_text'].apply(clean_alpha_numeric)

In [7]:
labelled_df['clean_tokens'].head(10)

Unnamed: 0,clean_tokens
0,bandung mau di gimanain juga teteep estetik anjit
1,plis pulang sekarang!! dapat info dari sodara ...
2,"sadar enggak, demo kita sekarang sudah melence..."
3,guys kita fokus sama dpr saja kalo kalian nger...
4,ini kapan ya damainya
5,ada apa dengan negara yang kucintai ini?
6,pak soekarno lihat negaramu pak..
7,agustus kali ini kacau banget ya....
8,gays plis jangan begini kita marah tapi jangan...
9,"loh ini kan jalan tol ya , kok dibakar yang ru..."


# Split Data

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    labelled_df["clean_tokens"], labelled_df["classification"], test_size=0.15, random_state=42
)

In [10]:
print("Train:", len(X_train), " | Test:", len(X_test))

Train: 7312  | Test: 1291


# Feature Extraction for Machine Learning

## Fast Text

In [12]:
import string
import numpy as np

In [None]:
X_train_tokens = X_train.apply(word_tokenize)
X_test_tokens = X_test.apply(word_tokenize)

In [None]:
def sent2vec_fasttext(tokens):
    vectors = [model_fasttext.get_word_vector(w) for w in tokens]

    if len(vectors) == 0:
        return np.zeros((model_fasttext.get_dimension(),), dtype=np.float32)

    return np.mean(vectors, axis=0).astype(np.float32)


In [None]:
X_train_ft = np.vstack([sent2vec_fasttext(t) for t in X_train_tokens])
X_test_ft  = np.vstack([sent2vec_fasttext(t) for t in X_test_tokens])

print("FastText shapes:", X_train_ft.shape, X_test_ft.shape)

FastText shapes: (7312, 300) (1291, 300)


## TF-IDF

In [11]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [13]:
from multiprocessing import Pool, cpu_count
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from functools import lru_cache

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [14]:
custom_stopwords = {
    'ya','yg','yang','aja','sih','nih','loh','kok','kan','pun','lah','nya','pake','punya','trus','sampe','biar','sma','sm',
    'kayak','gitu','gini','jadi','udah','sudah','udh','belum','blum','aq','jg','jd','tau','dgn','krn','karna','gara',
    'iya','oh','eh','deh','mah','si','dong','toh','bun','sya','sy','kalo','klo','tp','ku','tpi','gimana','pa',
    'bunda','bund','sist','gan','ibu','ayah','suami','istri','anak','ortu','orangtua','teman','tmn','dlu','dy','gtu',
    'hari','malam','pagi','siang','besok','kemarin','tadi','sana','sini','situ','daerah','tempat','rumah','kampung','banget',
    'aku','kamu','dia','kami','kita','mereka','saya','diriku','diri','pernah','lagi','untk','nb','lg','tdk','bikin','kali',
    'jdi','rb','dr','ak','blm','liat','tuh','krna','jt','thn','lgi','th','yah','dg','dah','ny','kk','jga','pdhl','apapun',
    'jarang','juta','dn',

    'rt', 'ttp', 'sihh', 'udh', 'udh', 'yaaa', 'yaa', 'yaah', 'yaaah',
    'min', 'admin', 'kak', 'bro', 'sis',
    'hehe', 'huhu', 'haha', 'wkwk', 'wkwkwk', 'wkwwk',
    'yaaampun', 'ampun',
    'pls', 'please',
    'dll', 'dst',
    'mah', 'lahh', 'loh', 'yaampun',
}

nltk_stopwords = set(stopwords.words('indonesian'))
all_stopwords = custom_stopwords.union(nltk_stopwords)

In [15]:
normalization_map = {
    "ga": "gak",
    "g": "gak",
    "gk": "gak",
    "gak": "gak",
    "gaaa": "gak",
    "nggak": "gak",
    "ngga": "gak",
    "nggaaa": "gak",

    "bgt": "banget",
    "bget": "banget",
    "bgttt": "banget",
    "bngt": "banget",

    "sm": "sama",
    "sama2": "sama",

    "dgn": "dengan",
    "dgnn": "dengan",

    "krn": "karena",
    "krna": "karena",

    "tp": "tapi",
    "tpi": "tapi",

    "jg": "juga",
    "jga": "juga",

    "udh": "sudah",
    "udhh": "sudah",

    "blm": "belum",
    "blm2": "belum",

    "skrg": "sekarang",
    "skg": "sekarang",

    "btw": "ngomong-ngomong",

    "pdhl": "padahal",
    "pdahal": "padahal",

    "rmh": "rumah",
    "rmah": "rumah",

    "org": "orang",
    "orang2": "orang",

    "ank": "anak",
    "anak2": "anak",

    "duit": "uang",
    "duwit": "uang",

    "cewe": "perempuan",
    "cewe2": "perempuan",
    "cowo": "laki-laki",
}

In [16]:
@lru_cache(maxsize=50000)
def cached_stem(word):
    return stemmer.stem(word)

def process_single_document(text):
    if not isinstance(text, str):
        text = str(text)

    text = text.replace('#', '')
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    if not text: return ""
    tokens = word_tokenize(text)

    final_tokens = []
    for token in tokens:
        token = normalization_map.get(token, token)
        if token in all_stopwords or len(token) < 2:
            continue

        if not token.isdigit():
            token = cached_stem(token)

        if len(token) > 1:
            final_tokens.append(token)

    return " ".join(final_tokens)

The `process_single_document()` function cleans and normalizes text by removing hashtags, numbers, and stopwords. It then tokenizes, applies normalization and stemming, filters short tokens, and returns the processed text. Non-string inputs are converted to strings.

In [17]:
def preprocess_optimized(texts, n_jobs=None):
    if n_jobs is None:
        n_jobs = max(1, cpu_count() - 1)

    print(f"Mulai preprocessing dengan {n_jobs} CPU cores...")

    with Pool(processes=n_jobs) as pool:
        clean_texts = pool.map(process_single_document, texts, chunksize=100)

    print("Selesai!")
    return clean_texts

In [18]:
X_train_clean = preprocess_optimized(X_train)
X_test_clean  = preprocess_optimized(X_test)

Mulai preprocessing dengan 1 CPU cores...
Selesai!
Mulai preprocessing dengan 1 CPU cores...
Selesai!


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=7000,
    ngram_range=(1, 2),
    stop_words=None
)

In [20]:
X_train_tfidf = tfidf.fit_transform(X_train_clean)
X_test_tfidf = tfidf.transform(X_test_clean)

In [21]:
print("\n===== TF-IDF =====")
print("Shape (train):", X_train_tfidf.shape)
print("Number of Unique Words:", len(tfidf.vocabulary_))
print("List of Vocab:", list(tfidf.vocabulary_.keys())[:20])


===== TF-IDF =====
Shape (train): (7312, 7000)
Number of Unique Words: 7000
List of Vocab: ['mahasiswa', 'demo', 'rusuh', 'polisi', 'jalan', 'tugas', 'keluarga', 'sana', 'rugi', 'rakyat', 'beras', 'langka', 'depan', 'mahasiswa demo', 'demo rusuh', 'rusuh polisi', 'polisi jalan', 'jalan tugas', 'tugas keluarga', 'rugi rakyat']


The TF-IDF matrix has a shape of (7312, 7000), meaning 7,312 documents are represented using 7,000 unique terms. The vocabulary includes key unigrams and bigrams such as 'mahasiswa', 'demo', and 'rusuh polisi', reflecting important patterns in the dataset.

# Modeling - Fast Text

In [24]:
import time
import numpy as np

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import xgboost as xgb

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold
)

import pickle
import os

import warnings
warnings.filterwarnings("ignore")

In [None]:
def run_ml_experiment(X_train, y_train, X_test, y_test,
                      model_type,
                      representation_name="Model",
                      n_iter=10,
                      cv=4,
                      use_class_weight=False,
                      random_state=42,
                      use_tuning=False,
                      cache_dir="saved_models_2_class"):

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    balance_str = "balanced" if use_class_weight else "none"
    model_type_str = model_type.upper()

    safe_rep_name = representation_name.replace(" ", "_")
    filename = f"{model_type_str}_{safe_rep_name}_Tuning{use_tuning}_{balance_str}.pkl"
    filepath = os.path.join(cache_dir, filename)

    print(f"Experiment: {representation_name} + {model_type_str} ({balance_str})")

    best_model = None
    best_params = None
    best_cv_score = None
    tuning_time = 0
    model_name = ""
    balance_status_return = "Class Weight" if use_class_weight else "None"

    if os.path.exists(filepath):
        print(f"Found cached model at: {filepath}")
        print("   Loading model from pickle... (Skipping training)")

        try:
            with open(filepath, 'rb') as f:
                cached_data = pickle.load(f)

            # Ambil data penting dari pickle
            best_model = cached_data['best_model']
            best_params = cached_data['best_params']
            best_cv_score = cached_data['best_cv_score_f1_macro']
            tuning_time = cached_data['tuning_time_seconds']
            model_name = cached_data['model_name']

        except Exception as e:
            print(f"Error loading pickle ({e}). Will re-train model.")
            os.remove(filepath)

    if best_model is None:
        print("No cache found. Starting training...")
        start_time = time.time()

        class_weight_param = 'balanced' if use_class_weight else None

        if model_type.lower() == "svm":
            model_name = "Support Vector Machine"
            model = SVC(C=1.0, kernel="rbf", gamma="scale", probability=True,
                        random_state=random_state, class_weight=class_weight_param)
            param_dist = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale", "auto"]}

        elif model_type.lower() == "rf":
            model_name = "Random Forest"
            model = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2,
                                           random_state=random_state, n_jobs=-1, class_weight=class_weight_param)
            param_dist = {"n_estimators": [50, 100, 150], "max_depth": [None, 3, 5, 10], "min_samples_split": [2, 5, 10]}

        elif model_type.lower() == "dt":
            model_name = "Decision Tree"
            model = DecisionTreeClassifier(criterion="gini", max_depth=None, min_samples_split=2,
                                           random_state=random_state, class_weight=class_weight_param)
            param_dist = {"max_depth": [None, 3, 5, 10], "min_samples_split": [2, 5, 10], "criterion": ["gini", "entropy"]}

        elif model_type.lower() == "xgb":
            model_name = "XGBoost"
            scale_pos_weight = 1
            try:
                classes, counts = np.unique(y_train, return_counts=True)
                if len(classes) == 2:
                    neg = counts.max()
                    pos = counts.min()
                    if pos > 0: scale_pos_weight = float(neg) / float(pos)
            except: pass

            model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.01, max_depth=5, subsample=0.8,
                                      random_state=random_state, n_jobs=-1, eval_metric="logloss",
                                      tree_method="hist", scale_pos_weight=scale_pos_weight)
            param_dist = {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1], "max_depth": [3, 5]}
        else:
            raise ValueError("Choose model_type = 'svm', 'rf', 'dt', or 'xgb'")

        if not use_tuning:
            model.fit(X_train, y_train)
            best_model = model
            best_params = getattr(model, "get_params", lambda: {})()
            print(f"Fitted {model_name} with default parameters.")
        else:
            stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
            random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                               n_iter=n_iter, cv=stratified_kfold, verbose=1,
                                               random_state=random_state, n_jobs=-1, scoring="f1_macro")
            random_search.fit(X_train, y_train)
            best_model = random_search.best_estimator_
            best_cv_score = random_search.best_score_
            best_params = random_search.best_params_
            print(f"Best CV F1-Macro: {best_cv_score:.4f}")

        tuning_time = time.time() - start_time
        print(f"Training done in {tuning_time:.2f} seconds.")

    y_pred = best_model.predict(X_test)

    test_acc = accuracy_score(y_test, y_pred)
    test_report = classification_report(y_test, y_pred, output_dict=True)
    test_cm = confusion_matrix(y_test, y_pred)

    print(f"\nEvaluation on TEST SET")
    print(f"Accuracy: {test_acc:.4f}")
    if "macro avg" in test_report:
        print(f"F1-Macro: {test_report['macro avg']['f1-score']:.4f}")
    else:
        print("F1-Macro: (not available)")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    experiment = {
        "representation_name": representation_name,
        "model_name": model_name,
        "balance_strategy": balance_status_return,
        "best_model": best_model,
        "best_cv_score_f1_macro": best_cv_score,
        "best_params": best_params,
        "test_accuracy": test_acc,
        "test_f1_macro": test_report.get("macro avg", {}).get("f1-score", None),
        "test_classification_report_dict": test_report,
        "test_confusion_matrix": test_cm,
        "tuning_time_seconds": tuning_time,
        "tuning": use_tuning
    }

    if not os.path.exists(filepath):
        print(f"Saving model to {filepath} ...")
        with open(filepath, 'wb') as f:
            pickle.dump(experiment, f)
    else:
        print("Model loaded from cache, no overwrite needed.")

    return experiment

## Baseline

### SVM

In [None]:
exp_svm = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "svm")

Experiment: Model + SVM (none)
Found cached model at: saved_models_2_class\SVM_Model_TuningFalse_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9775
F1-Macro: 0.4943

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.00      0.00      0.00        27
  NON_BUZZER       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.49      0.50      0.49      1291
weighted avg       0.96      0.98      0.97      1291

Model loaded from cache, no overwrite needed.


### Random Forest

In [None]:
exp_rf  = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "rf")

Experiment: Model + RF (none)
Found cached model at: saved_models_2_class\RF_Model_TuningFalse_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9837
F1-Macro: 0.7121

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.80      0.30      0.43        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.89      0.65      0.71      1291
weighted avg       0.98      0.98      0.98      1291

Model loaded from cache, no overwrite needed.


### Decision Tree

In [None]:
exp_dt  = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "dt")

Experiment: Model + DT (none)
Found cached model at: saved_models_2_class\DT_Model_TuningFalse_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9582
F1-Macro: 0.6142

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.20      0.33      0.25        27
  NON_BUZZER       0.99      0.97      0.98      1264

    accuracy                           0.96      1291
   macro avg       0.59      0.65      0.61      1291
weighted avg       0.97      0.96      0.96      1291

Model loaded from cache, no overwrite needed.


### XGBoost

In [31]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [None]:
exp_xgb = run_ml_experiment(X_train_ft, y_train_enc, X_test_ft, y_test_enc, "xgb")

Experiment: Model + XGB (none)
Found cached model at: saved_models_2_class\XGB_Model_TuningFalse_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9791
F1-Macro: 0.4947

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.49      0.50      0.49      1291
weighted avg       0.96      0.98      0.97      1291

Model loaded from cache, no overwrite needed.


## Fine-Tuning

### SVM

In [None]:
exp_svm_ft = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "svm", use_tuning=True)

Experiment: Model + SVM (none)
Found cached model at: saved_models_2_class\SVM_Model_TuningTrue_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9853
F1-Macro: 0.7753

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.75      0.44      0.56        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.99      1291
   macro avg       0.87      0.72      0.78      1291
weighted avg       0.98      0.99      0.98      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_svm_ft['best_params']

{'kernel': 'rbf', 'gamma': 'scale', 'C': 10}

### Random Forest

In [None]:
exp_rf_ft = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "rf", use_tuning=True)

Experiment: Model + RF (none)
Found cached model at: saved_models_2_class\RF_Model_TuningTrue_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9830
F1-Macro: 0.6722

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.86      0.22      0.35        27
  NON_BUZZER       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.92      0.61      0.67      1291
weighted avg       0.98      0.98      0.98      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_rf_ft['best_params']

{'n_estimators': 150, 'min_samples_split': 10, 'max_depth': None}

### Decision Tree

In [None]:
exp_dt_ft = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "dt", use_tuning=True)

Experiment: Model + DT (none)
Found cached model at: saved_models_2_class\DT_Model_TuningTrue_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9752
F1-Macro: 0.6127

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.33      0.19      0.24        27
  NON_BUZZER       0.98      0.99      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.66      0.59      0.61      1291
weighted avg       0.97      0.98      0.97      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_dt_ft['best_params']

{'min_samples_split': 2, 'max_depth': 5, 'criterion': 'entropy'}

### XGBoost

In [None]:
exp_xgb_ft = run_ml_experiment(X_train_ft, y_train_enc, X_test_ft, y_test_enc, "xgb", use_tuning=True)

Experiment: Model + XGB (none)
Found cached model at: saved_models_2_class\XGB_Model_TuningTrue_none.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9845
F1-Macro: 0.7020

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.26      0.41        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.99      0.63      0.70      1291
weighted avg       0.98      0.98      0.98      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_xgb_ft['best_params']

{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}

## Class Weight

### SVM

In [None]:
exp_svm_cw = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "svm", use_class_weight=True)

Experiment: Model + SVM (balanced)
Found cached model at: saved_models_2_class\SVM_Model_TuningFalse_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9342
F1-Macro: 0.6427

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.20      0.74      0.32        27
  NON_BUZZER       0.99      0.94      0.97      1264

    accuracy                           0.93      1291
   macro avg       0.60      0.84      0.64      1291
weighted avg       0.98      0.93      0.95      1291

Model loaded from cache, no overwrite needed.


### Random Forest

In [None]:
exp_rf_cw  = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "rf", use_class_weight=True)

Experiment: Model + RF (balanced)
Found cached model at: saved_models_2_class\RF_Model_TuningFalse_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9837
F1-Macro: 0.6777

Classification Report:
              precision    recall  f1-score   support

      BUZZER       1.00      0.22      0.36        27
  NON_BUZZER       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.99      0.61      0.68      1291
weighted avg       0.98      0.98      0.98      1291

Model loaded from cache, no overwrite needed.


### Decision Tree

In [None]:
exp_dt_cw  = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "dt", use_class_weight=True)

Experiment: Model + DT (balanced)
Found cached model at: saved_models_2_class\DT_Model_TuningFalse_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9729
F1-Macro: 0.6499

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.33      0.30      0.31        27
  NON_BUZZER       0.99      0.99      0.99      1264

    accuracy                           0.97      1291
   macro avg       0.66      0.64      0.65      1291
weighted avg       0.97      0.97      0.97      1291

Model loaded from cache, no overwrite needed.


### XGBoost

In [None]:
exp_xgb_cw = run_ml_experiment(X_train_ft, y_train_enc, X_test_ft, y_test_enc, "xgb", use_class_weight=True)

Experiment: Model + XGB (balanced)
Found cached model at: saved_models_2_class\XGB_Model_TuningFalse_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9791
F1-Macro: 0.4947

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.49      0.50      0.49      1291
weighted avg       0.96      0.98      0.97      1291

Model loaded from cache, no overwrite needed.


## Fine Tuning - Class Weight

### SVM

In [None]:
exp_svm_cw_ft = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "svm", use_class_weight=True, use_tuning=True)

Experiment: Model + SVM (balanced)
Found cached model at: saved_models_2_class\SVM_Model_TuningTrue_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9737
F1-Macro: 0.7357

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.41      0.59      0.48        27
  NON_BUZZER       0.99      0.98      0.99      1264

    accuracy                           0.97      1291
   macro avg       0.70      0.79      0.74      1291
weighted avg       0.98      0.97      0.98      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_svm_cw_ft["best_params"]

{'kernel': 'rbf', 'gamma': 'scale', 'C': 10}

### Random Forest

In [None]:
exp_rf_cw_ft = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "rf", use_class_weight=True, use_tuning=True)

Experiment: Model + RF (balanced)
Found cached model at: saved_models_2_class\RF_Model_TuningTrue_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9845
F1-Macro: 0.7580

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.73      0.41      0.52        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.86      0.70      0.76      1291
weighted avg       0.98      0.98      0.98      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_rf_cw_ft["best_params"]

{'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 10}

### Decision Tree

In [None]:
exp_dt_cw_ft = run_ml_experiment(X_train_ft, y_train, X_test_ft, y_test, "dt", use_class_weight=True, use_tuning=True)

Experiment: Model + DT (balanced)
Found cached model at: saved_models_2_class\DT_Model_TuningTrue_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9729
F1-Macro: 0.6499

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.33      0.30      0.31        27
  NON_BUZZER       0.99      0.99      0.99      1264

    accuracy                           0.97      1291
   macro avg       0.66      0.64      0.65      1291
weighted avg       0.97      0.97      0.97      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_dt_cw_ft["best_params"]

{'min_samples_split': 2, 'max_depth': None, 'criterion': 'gini'}

### XGBoost

In [None]:
exp_xgb_cw_ft = run_ml_experiment(X_train_ft, y_train_enc, X_test_ft, y_test_enc, "xgb", use_class_weight=True, use_tuning=True)

Experiment: Model + XGB (balanced)
Found cached model at: saved_models_2_class\XGB_Model_TuningTrue_balanced.pkl
   Loading model from pickle... (Skipping training)

Evaluation on TEST SET
Accuracy: 0.9845
F1-Macro: 0.7020

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.26      0.41        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.99      0.63      0.70      1291
weighted avg       0.98      0.98      0.98      1291

Model loaded from cache, no overwrite needed.


In [None]:
exp_xgb_cw_ft["best_params"]

{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}

## Table Comparison

In [46]:
def create_comparison_table(experiment_results_list):
    rows = []

    for exp in experiment_results_list:
        report = exp.get('test_classification_report_dict', {})
        buzzer_metrics = report.get('0') or report.get('BUZZER') or {}

        row = {
            'Model': exp['model_name'],
            'Balance Strategy': exp['balance_strategy'],
            'Tuning': exp['tuning'],
            'Precision (Buzzer)': buzzer_metrics.get('precision', 0),
            'Recall (Buzzer)': buzzer_metrics.get('recall', 0),
            'F1 Score (Buzzer)': buzzer_metrics.get('f1-score', 0),
            'F1 Score (Macro)': exp['test_f1_macro']
        }
        rows.append(row)

    df_comparison = pd.DataFrame(rows)

    df_comparison = df_comparison.sort_values(by='F1 Score (Macro)', ascending=False).reset_index(drop=True)

    return df_comparison

In [None]:
all_experiments = [exp_svm, exp_svm_cw, exp_svm_cw_ft, exp_svm_ft,
                   exp_rf, exp_rf_cw, exp_rf_cw_ft, exp_rf_ft,
                   exp_dt_ft, exp_dt_cw, exp_dt_cw_ft, exp_dt_ft,
                   exp_xgb, exp_xgb_cw, exp_xgb_cw_ft, exp_xgb_ft]

In [None]:
df_result = create_comparison_table(all_experiments)

print("\n=== FINAL COMPARISON TABLE - FAST TEXT ===")
pd.set_option('display.float_format', lambda x: '%.3f' % x)
df_result


=== FINAL COMPARISON TABLE - FAST TEXT ===


Unnamed: 0,Model,Balance Strategy,Tuning,Precision (Buzzer),Recall (Buzzer),F1 Score (Buzzer),F1 Score (Macro)
0,Support Vector Machine,,True,0.75,0.444,0.558,0.775
1,Random Forest,Class Weight,True,0.733,0.407,0.524,0.758
2,Support Vector Machine,Class Weight,True,0.41,0.593,0.485,0.736
3,Random Forest,,False,0.8,0.296,0.432,0.712
4,XGBoost,,True,1.0,0.259,0.412,0.702
5,XGBoost,Class Weight,True,1.0,0.259,0.412,0.702
6,Random Forest,Class Weight,False,1.0,0.222,0.364,0.678
7,Random Forest,,True,0.857,0.222,0.353,0.672
8,Decision Tree,Class Weight,False,0.333,0.296,0.314,0.65
9,Decision Tree,Class Weight,True,0.333,0.296,0.314,0.65


The comparison table shows that Support Vector Machine (SVM) with hyperparameter tuning and no class-weighting achieves the best overall performance, with an F1-Macro score of 0.775, the highest among all tested models. This indicates that SVM is the most effective classifier for distinguishing between the two classes using FastText embeddings. This model strikes the best balance of Precision (0.75) and Recall (0.44) in predicting Buzzer class. It shows that while it detects fewer than half of the actual "Buzzer", it is highly reliable when it does make a prediction (75% of its predictions are correct).

SVM with tuning achieves the highest macro-F1 score because it effectively utilizes FastText’s dense semantic embeddings and benefits from optimized hyperparameters (e.g., C, kernel). These adjustments improve its ability to separate classes in high-dimensional space, outperforming Random Forest, XGBoost, and Decision Tree models.

Random Forest and XGBoost also achieve high accuracy, but their macro-recall and macro-F1 scores are consistently lower, suggesting difficulty in balancing performance across both classes, especially the minority class. On the other side, overall Decision Tree models show the weakest performance, which is expected because they tend to overfit on dense text embeddings like FastText.

Our comparison table reveals a trade-offs between Precision and Recall. Row 4 and row 6 achieved a perfect Precision (1.0), but this came at the cost of the Recall (~0.22 - 0.26). They missed over 75% of the actual buzzer class, meaning these models are too conservative at predicting "Buzzer". On the other hand, applying Class Weight to SVM successfully shifted the focus into the minority class (Buzzer), raising the Recall to 0.593, but this caused the Precision to drop to 0.41.

# Modeling - TF-IDF

In [None]:
!unzip saved_models_tfidf_2_class.zip

Archive:  saved_models_tfidf_2_class.zip
   creating: saved_models_tfidf_2_class/
  inflating: saved_models_tfidf_2_class/DT_TFIDF_Model_TuningFalse_balanced.pkl  
  inflating: saved_models_tfidf_2_class/DT_TFIDF_Model_TuningFalse_none.pkl  
  inflating: saved_models_tfidf_2_class/DT_TFIDF_Model_TuningTrue_balanced.pkl  
  inflating: saved_models_tfidf_2_class/DT_TFIDF_Model_TuningTrue_none.pkl  
  inflating: saved_models_tfidf_2_class/RF_TFIDF_Model_TuningFalse_balanced.pkl  
  inflating: saved_models_tfidf_2_class/RF_TFIDF_Model_TuningFalse_none.pkl  
  inflating: saved_models_tfidf_2_class/RF_TFIDF_Model_TuningTrue_balanced.pkl  
  inflating: saved_models_tfidf_2_class/RF_TFIDF_Model_TuningTrue_none.pkl  
  inflating: saved_models_tfidf_2_class/SVM_TFIDF_Model_TuningFalse_balanced.pkl  
  inflating: saved_models_tfidf_2_class/SVM_TFIDF_Model_TuningFalse_none.pkl  
  inflating: saved_models_tfidf_2_class/SVM_TFIDF_Model_TuningTrue_balanced.pkl  
  inflating: saved_models_tfidf_2_clas

In [22]:
def run_ml_experiment_tfidf(X_train, y_train, X_test, y_test,
                      model_type,
                      representation_name="TFIDF_Model",
                      n_iter=10,
                      cv=4,
                      use_class_weight=False,
                      random_state=42,
                      use_tuning=False,
                      cache_dir="saved_models_tfidf_2_class"):

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    balance_str = "balanced" if use_class_weight else "none"
    model_type_str = model_type.upper()

    safe_rep_name = representation_name.replace(" ", "_")
    filename = f"{model_type_str}_{safe_rep_name}_Tuning{use_tuning}_{balance_str}.pkl"
    filepath = os.path.join(cache_dir, filename)

    print(f"Experiment: {representation_name} + {model_type_str} ({balance_str})")

    best_model = None
    best_params = None
    best_cv_score = None
    tuning_time = 0
    model_name = ""
    balance_status_return = "Class Weight" if use_class_weight else "None"

    if os.path.exists(filepath):
        print(f"Found cached model at: {filepath}")
        print("   Loading model from pickle... (Skipping training)")

        try:
            with open(filepath, 'rb') as f:
                cached_data = pickle.load(f)

            best_model = cached_data['best_model']
            best_params = cached_data['best_params']
            best_cv_score = cached_data['best_cv_score_f1_macro']
            tuning_time = cached_data['tuning_time_seconds']
            model_name = cached_data['model_name']

        except Exception as e:
            print(f"Error loading pickle ({e}). Will re-train model.")
            os.remove(filepath)

    if best_model is None:
        print("No cache found. Starting training...")
        start_time = time.time()

        class_weight_param = 'balanced' if use_class_weight else None

        if model_type.lower() == "svm":
            model_name = "Support Vector Machine"
            model = SVC(C=1.0, kernel="rbf", gamma="scale", probability=True,
                        random_state=random_state, class_weight=class_weight_param)
            param_dist = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale", "auto"]}

        elif model_type.lower() == "rf":
            model_name = "Random Forest"
            model = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2,
                                           random_state=random_state, n_jobs=-1, class_weight=class_weight_param)
            param_dist = {"n_estimators": [50, 100, 150], "max_depth": [None, 3, 5, 10], "min_samples_split": [2, 5, 10]}

        elif model_type.lower() == "dt":
            model_name = "Decision Tree"
            model = DecisionTreeClassifier(criterion="gini", max_depth=None, min_samples_split=2,
                                           random_state=random_state, class_weight=class_weight_param)
            param_dist = {"max_depth": [None, 3, 5, 10], "min_samples_split": [2, 5, 10], "criterion": ["gini", "entropy"]}

        elif model_type.lower() == "xgb":
            model_name = "XGBoost"
            scale_pos_weight = 1
            try:
                classes, counts = np.unique(y_train, return_counts=True)
                if len(classes) == 2:
                    neg = counts.max()
                    pos = counts.min()
                    if pos > 0: scale_pos_weight = float(neg) / float(pos)
            except: pass

            model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.01, max_depth=5, subsample=0.8,
                                      random_state=random_state, n_jobs=-1, eval_metric="logloss",
                                      tree_method="hist", scale_pos_weight=scale_pos_weight)
            param_dist = {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1], "max_depth": [3, 5]}
        else:
            raise ValueError("Choose model_type = 'svm', 'rf', 'dt', or 'xgb'")

        if not use_tuning:
            model.fit(X_train, y_train)
            best_model = model
            best_params = getattr(model, "get_params", lambda: {})()
            print(f"Fitted {model_name} with default parameters.")
        else:
            stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
            random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                               n_iter=n_iter, cv=stratified_kfold, verbose=1,
                                               random_state=random_state, n_jobs=-1, scoring="f1_macro")
            random_search.fit(X_train, y_train)
            best_model = random_search.best_estimator_
            best_cv_score = random_search.best_score_
            best_params = random_search.best_params_
            print(f"Best CV F1-Macro: {best_cv_score:.4f}")

        tuning_time = time.time() - start_time
        print(f"Training done in {tuning_time:.2f} seconds.")


    y_pred = best_model.predict(X_test)

    test_acc = accuracy_score(y_test, y_pred)
    test_report = classification_report(y_test, y_pred, output_dict=True)
    test_cm = confusion_matrix(y_test, y_pred)

    print(f"\nEvaluation on TEST SET")
    print(f"Accuracy: {test_acc:.4f}")
    if "macro avg" in test_report:
        print(f"F1-Macro: {test_report['macro avg']['f1-score']:.4f}")
    else:
        print("F1-Macro: (not available)")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    experiment = {
        "representation_name": representation_name,
        "model_name": model_name,
        "balance_strategy": balance_status_return,
        "best_model": best_model,
        "best_cv_score_f1_macro": best_cv_score,
        "best_params": best_params,
        "test_accuracy": test_acc,
        "test_f1_macro": test_report.get("macro avg", {}).get("f1-score", None),
        "test_classification_report_dict": test_report,
        "test_confusion_matrix": test_cm,
        "tuning_time_seconds": tuning_time,
        "tuning": use_tuning
    }

    if not os.path.exists(filepath):
        print(f"Saving model to {filepath} ...")
        with open(filepath, 'wb') as f:
            pickle.dump(experiment, f)
    else:
        print("Model loaded from cache, no overwrite needed.")

    return experiment

## Baseline

### SVM

In [25]:
exp_svm_tfidf = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "svm")

Experiment: TFIDF_Model + SVM (none)
No cache found. Starting training...
Fitted Support Vector Machine with default parameters.
Training done in 7.59 seconds.

Evaluation on TEST SET
Accuracy: 0.9861
F1-Macro: 0.7715

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.85      0.41      0.55        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.99      1291
   macro avg       0.92      0.70      0.77      1291
weighted avg       0.98      0.99      0.98      1291

Saving model to saved_models_tfidf_2_class/SVM_TFIDF_Model_TuningFalse_none.pkl ...


### Random Forest

In [26]:
exp_rf_tfidf = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "rf")

Experiment: TFIDF_Model + RF (none)
No cache found. Starting training...
Fitted Random Forest with default parameters.
Training done in 5.02 seconds.

Evaluation on TEST SET
Accuracy: 0.9884
F1-Macro: 0.8226

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.88      0.52      0.65        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.99      1291
   macro avg       0.93      0.76      0.82      1291
weighted avg       0.99      0.99      0.99      1291

Saving model to saved_models_tfidf_2_class/RF_TFIDF_Model_TuningFalse_none.pkl ...


### Decision Tree

In [27]:
exp_dt_tfidf = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "dt")

Experiment: TFIDF_Model + DT (none)
No cache found. Starting training...
Fitted Decision Tree with default parameters.
Training done in 1.19 seconds.

Evaluation on TEST SET
Accuracy: 0.9837
F1-Macro: 0.7816

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.64      0.52      0.57        27
  NON_BUZZER       0.99      0.99      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.81      0.76      0.78      1291
weighted avg       0.98      0.98      0.98      1291

Saving model to saved_models_tfidf_2_class/DT_TFIDF_Model_TuningFalse_none.pkl ...


### XGBoost

In [32]:
exp_xgb_tfidf = run_ml_experiment_tfidf(X_train_tfidf, y_train_enc, X_test_tfidf, y_test_enc, "xgb")

Experiment: TFIDF_Model + XGB (none)
No cache found. Starting training...
Fitted XGBoost with default parameters.
Training done in 5.18 seconds.

Evaluation on TEST SET
Accuracy: 0.9806
F1-Macro: 0.5641

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.07      0.14        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.99      0.54      0.56      1291
weighted avg       0.98      0.98      0.97      1291

Saving model to saved_models_tfidf_2_class/XGB_TFIDF_Model_TuningFalse_none.pkl ...


## Fine-Tuning

### SVM

In [33]:
exp_svm_tfidf_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "svm", use_tuning=True)

Experiment: TFIDF_Model + SVM (none)
No cache found. Starting training...
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best CV F1-Macro: 0.7263
Training done in 88.01 seconds.

Evaluation on TEST SET
Accuracy: 0.9861
F1-Macro: 0.8008

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.74      0.52      0.61        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.99      1291
   macro avg       0.86      0.76      0.80      1291
weighted avg       0.98      0.99      0.98      1291

Saving model to saved_models_tfidf_2_class/SVM_TFIDF_Model_TuningTrue_none.pkl ...


### Random Forest

In [34]:
exp_rf_tfidf_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "rf", use_tuning=True)

Experiment: TFIDF_Model + RF (none)
No cache found. Starting training...
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best CV F1-Macro: 0.7082
Training done in 39.94 seconds.

Evaluation on TEST SET
Accuracy: 0.9884
F1-Macro: 0.8226

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.88      0.52      0.65        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.99      1291
   macro avg       0.93      0.76      0.82      1291
weighted avg       0.99      0.99      0.99      1291

Saving model to saved_models_tfidf_2_class/RF_TFIDF_Model_TuningTrue_none.pkl ...


### Decision Tree

In [35]:
exp_dt_tfidf_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "dt", use_tuning=True)

Experiment: TFIDF_Model + DT (none)
No cache found. Starting training...
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best CV F1-Macro: 0.7361
Training done in 11.01 seconds.

Evaluation on TEST SET
Accuracy: 0.9853
F1-Macro: 0.7753

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.75      0.44      0.56        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.99      1291
   macro avg       0.87      0.72      0.78      1291
weighted avg       0.98      0.99      0.98      1291

Saving model to saved_models_tfidf_2_class/DT_TFIDF_Model_TuningTrue_none.pkl ...


### XGBoost

In [36]:
exp_xgb_tfidf_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train_enc, X_test_tfidf, y_test_enc, "xgb", use_tuning=True)

Experiment: TFIDF_Model + XGB (none)
No cache found. Starting training...
Fitting 4 folds for each of 8 candidates, totalling 32 fits
Best CV F1-Macro: 0.5932
Training done in 32.62 seconds.

Evaluation on TEST SET
Accuracy: 0.9837
F1-Macro: 0.6959

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.26      0.40        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.93      0.63      0.70      1291
weighted avg       0.98      0.98      0.98      1291

Saving model to saved_models_tfidf_2_class/XGB_TFIDF_Model_TuningTrue_none.pkl ...


## Class Weight

### SVM

In [37]:
exp_svm_tfidf_cw = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "svm", use_class_weight=True)

Experiment: TFIDF_Model + SVM (balanced)
No cache found. Starting training...
Fitted Support Vector Machine with default parameters.
Training done in 8.60 seconds.

Evaluation on TEST SET
Accuracy: 0.9342
F1-Macro: 0.6195

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.18      0.59      0.27        27
  NON_BUZZER       0.99      0.94      0.97      1264

    accuracy                           0.93      1291
   macro avg       0.58      0.77      0.62      1291
weighted avg       0.97      0.93      0.95      1291

Saving model to saved_models_tfidf_2_class/SVM_TFIDF_Model_TuningFalse_balanced.pkl ...


### Random Forest

In [38]:
exp_rf_tfidf_cw = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "rf", use_class_weight=True)

Experiment: TFIDF_Model + RF (balanced)
No cache found. Starting training...
Fitted Random Forest with default parameters.
Training done in 5.76 seconds.

Evaluation on TEST SET
Accuracy: 0.9187
F1-Macro: 0.5897

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.14      0.56      0.22        27
  NON_BUZZER       0.99      0.93      0.96      1264

    accuracy                           0.92      1291
   macro avg       0.56      0.74      0.59      1291
weighted avg       0.97      0.92      0.94      1291

Saving model to saved_models_tfidf_2_class/RF_TFIDF_Model_TuningFalse_balanced.pkl ...


### Decision Tree

In [39]:
exp_dt_tfidf_cw = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "dt", use_class_weight=True)

Experiment: TFIDF_Model + DT (balanced)
No cache found. Starting training...
Fitted Decision Tree with default parameters.
Training done in 0.91 seconds.

Evaluation on TEST SET
Accuracy: 0.8675
F1-Macro: 0.5429

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.09      0.59      0.16        27
  NON_BUZZER       0.99      0.87      0.93      1264

    accuracy                           0.87      1291
   macro avg       0.54      0.73      0.54      1291
weighted avg       0.97      0.87      0.91      1291

Saving model to saved_models_tfidf_2_class/DT_TFIDF_Model_TuningFalse_balanced.pkl ...


### XGBoost

In [40]:
exp_xgb_tfidf_cw = run_ml_experiment_tfidf(X_train_tfidf, y_train_enc, X_test_tfidf, y_test_enc, "xgb", use_class_weight=True)

Experiment: TFIDF_Model + XGB (balanced)
No cache found. Starting training...
Fitted XGBoost with default parameters.
Training done in 2.53 seconds.

Evaluation on TEST SET
Accuracy: 0.9806
F1-Macro: 0.5641

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.07      0.14        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.99      0.54      0.56      1291
weighted avg       0.98      0.98      0.97      1291

Saving model to saved_models_tfidf_2_class/XGB_TFIDF_Model_TuningFalse_balanced.pkl ...


## Fine Tuning - Class Weight

### SVM

In [41]:
exp_svm_tfidf_cw_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "svm", use_class_weight=True, use_tuning=True)

Experiment: TFIDF_Model + SVM (balanced)
No cache found. Starting training...
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best CV F1-Macro: 0.6783
Training done in 258.63 seconds.

Evaluation on TEST SET
Accuracy: 0.9613
F1-Macro: 0.6924

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.30      0.63      0.40        27
  NON_BUZZER       0.99      0.97      0.98      1264

    accuracy                           0.96      1291
   macro avg       0.65      0.80      0.69      1291
weighted avg       0.98      0.96      0.97      1291

Saving model to saved_models_tfidf_2_class/SVM_TFIDF_Model_TuningTrue_balanced.pkl ...


### Random Forest

In [42]:
exp_rf_tfidf_cw_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "rf", use_class_weight=True, use_tuning=True)

Experiment: TFIDF_Model + RF (balanced)
No cache found. Starting training...
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best CV F1-Macro: 0.7034
Training done in 37.25 seconds.

Evaluation on TEST SET
Accuracy: 0.9868
F1-Macro: 0.7893

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.86      0.44      0.59        27
  NON_BUZZER       0.99      1.00      0.99      1264

    accuracy                           0.99      1291
   macro avg       0.92      0.72      0.79      1291
weighted avg       0.99      0.99      0.98      1291

Saving model to saved_models_tfidf_2_class/RF_TFIDF_Model_TuningTrue_balanced.pkl ...


### Decision Tree

In [43]:
exp_dt_tfidf_cw_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train, X_test_tfidf, y_test, "dt", use_class_weight=True, use_tuning=True)

Experiment: TFIDF_Model + DT (balanced)
No cache found. Starting training...
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best CV F1-Macro: 0.6585
Training done in 10.22 seconds.

Evaluation on TEST SET
Accuracy: 0.9620
F1-Macro: 0.6546

Classification Report:
              precision    recall  f1-score   support

      BUZZER       0.26      0.44      0.33        27
  NON_BUZZER       0.99      0.97      0.98      1264

    accuracy                           0.96      1291
   macro avg       0.62      0.71      0.65      1291
weighted avg       0.97      0.96      0.97      1291

Saving model to saved_models_tfidf_2_class/DT_TFIDF_Model_TuningTrue_balanced.pkl ...


### XGBoost

In [44]:
exp_xgb_tfidf_cw_ft = run_ml_experiment_tfidf(X_train_tfidf, y_train_enc, X_test_tfidf, y_test_enc, "xgb", use_class_weight=True, use_tuning=True)

Experiment: TFIDF_Model + XGB (balanced)
No cache found. Starting training...
Fitting 4 folds for each of 8 candidates, totalling 32 fits
Best CV F1-Macro: 0.5932
Training done in 32.12 seconds.

Evaluation on TEST SET
Accuracy: 0.9837
F1-Macro: 0.6959

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.26      0.40        27
           1       0.98      1.00      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.93      0.63      0.70      1291
weighted avg       0.98      0.98      0.98      1291

Saving model to saved_models_tfidf_2_class/XGB_TFIDF_Model_TuningTrue_balanced.pkl ...


## Table Comparison

In [45]:
all_experiments_tfidf = [exp_svm_tfidf, exp_svm_tfidf_cw, exp_svm_tfidf_cw_ft, exp_svm_tfidf_ft,
                   exp_rf_tfidf, exp_rf_tfidf_cw, exp_rf_tfidf_cw_ft, exp_rf_tfidf_ft,
                   exp_dt_tfidf, exp_dt_tfidf_cw, exp_dt_tfidf_cw_ft, exp_dt_tfidf_ft,
                   exp_xgb_tfidf, exp_xgb_tfidf_cw, exp_xgb_tfidf_cw_ft, exp_xgb_tfidf_ft]

In [47]:
df_result = create_comparison_table(all_experiments_tfidf)

print("\n=== FINAL COMPARISON TABLE - TF-IDF ===")
pd.set_option('display.float_format', lambda x: '%.3f' % x)
df_result


=== FINAL COMPARISON TABLE - TF-IDF ===


Unnamed: 0,Model,Balance Strategy,Tuning,Precision (Buzzer),Recall (Buzzer),F1 Score (Buzzer),F1 Score (Macro)
0,Random Forest,,True,0.875,0.519,0.651,0.823
1,Random Forest,,False,0.875,0.519,0.651,0.823
2,Support Vector Machine,,True,0.737,0.519,0.609,0.801
3,Random Forest,Class Weight,True,0.857,0.444,0.585,0.789
4,Decision Tree,,False,0.636,0.519,0.571,0.782
5,Decision Tree,,True,0.75,0.444,0.558,0.775
6,Support Vector Machine,,False,0.846,0.407,0.55,0.771
7,XGBoost,Class Weight,True,0.875,0.259,0.4,0.696
8,XGBoost,,True,0.875,0.259,0.4,0.696
9,Support Vector Machine,Class Weight,True,0.298,0.63,0.405,0.692


The results show that Random Forest without imbalance handling, both with and without tuning, achieves the strongest performance with an F1-Macro score of 0.823 and 0.651 F1-Score (Buzzer), making it the best-performing model under the TF-IDF representation. This indicates that Random Forest is highly effective at capturing patterns from the sparse, high-dimensional TF-IDF features.

Random Forest with tuning achieves the highest performance (F1-Macro 0.823) on TF-IDF features because tree-based models handle sparse, high-dimensional data effectively. Hyperparameter tuning further improves the model's ability to capture class patterns, outperforming SVM, XGBoost, and Decision Tree models.

Support Vector Machine (SVM) also performs well, but its F1-Macro scores remain slightly lower than Random Forest, suggesting that the TF-IDF feature space may be less linearly separable compared to FastText, reducing SVM’s advantage.

XGBoost and Decision Tree models generally show weaker macro-F1, with Decision Tree being the least stable model, likely due to overfitting on sparse TF-IDF vectors.

In [None]:
from google.colab import files
import shutil

shutil.make_archive('saved_models_tfidf_2_class', 'zip', 'saved_models_tfidf_2_class')

files.download('saved_models_tfidf_2_class.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# IndoBERT

In [None]:
from transformers import AutoModel, AutoTokenizer

model_name = "indolem/indobertweet-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="./models/indobert")
model = AutoModel.from_pretrained(model_name, cache_dir="./models/indobert")

print("Model saved in ./models/indobert")

Model saved in ./models/indobert


## Split Data

In [None]:
val_size = 15/85

X_train_indobert, X_val_indobert, y_train_indobert, y_val_indobert = train_test_split(
    X_train, y_train, test_size=val_size, random_state=42, stratify=y_train
)

In [None]:
train_texts = set(X_train_indobert)
val_texts = set(X_val_indobert)
test_texts = set(X_test)

print(f"Overlap train-val: {len(train_texts.intersection(val_texts))}")
print(f"Overlap train-test: {len(train_texts.intersection(test_texts))}")
print(f"Overlap val-test: {len(val_texts.intersection(test_texts))}")

Overlap train-val: 0
Overlap train-test: 0
Overlap val-test: 0


## Make Dataset

In [None]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    "train": Dataset.from_dict({"text": X_train_indobert, "label": y_train_indobert}),
    "validation": Dataset.from_dict({"text": X_val_indobert, "label": y_val_indobert}),
    "test": Dataset.from_dict({"text": X_test, "label": y_test}),
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6021
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1291
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1291
    })
})

## Y Encode

In [None]:
from sklearn.preprocessing import LabelEncoder

train_labels = dataset["train"]["label"]
le = LabelEncoder()
le.fit(train_labels)

def encode_labels(batch):
    batch["label"] = [le.transform([l])[0] if l in le.classes_ else -1 for l in batch["label"]]
    return batch

dataset = dataset.map(encode_labels, batched=True)

Map: 100%|██████████| 6021/6021 [00:00<00:00, 12150.48 examples/s]
Map: 100%|██████████| 1291/1291 [00:00<00:00, 9851.32 examples/s] 
Map: 100%|██████████| 1291/1291 [00:00<00:00, 11541.80 examples/s]


## Tokenization

In [None]:
def preprocess_indobert(data):
    return tokenizer(data["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(preprocess_indobert, batched=True)
dataset = dataset.remove_columns(["text"])
dataset.set_format(type="torch", columns=["input_ids","attention_mask","label"])

Map:   0%|          | 0/6021 [00:00<?, ? examples/s]

Map: 100%|██████████| 6021/6021 [00:00<00:00, 12463.40 examples/s]
Map: 100%|██████████| 1291/1291 [00:00<00:00, 14342.75 examples/s]
Map: 100%|██████████| 1291/1291 [00:00<00:00, 15805.66 examples/s]


## Class Weight IndoBERT

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np

labels_array = np.array(y_train_indobert)

classes = np.unique(labels_array)
class_weights = compute_class_weight("balanced", classes=classes, y=labels_array)
class_weights = torch.tensor(class_weights, dtype=torch.float).cuda()
print("Class Weights:", class_weights)

Class Weights: tensor([25.7308,  0.5099], device='cuda:0')


## Custom IndoBERT Trainer

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F


model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(classes),
    cache_dir="./models/indobert"
)


class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)

        pt = torch.exp(-ce_loss)

        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

class FocalLossTrainer(Trainer):
    def __init__(self, alpha=None, gamma=2.0, **kwargs):
        super().__init__(**kwargs)
        self.focal_loss = FocalLoss(alpha=alpha, gamma=gamma)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels") if "labels" in inputs else inputs.get("label")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss = self.focal_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

training_args = TrainingArguments(
    output_dir="./results/indobert",
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=50
)

trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.0311,0.021305,0.984508,0.732911
2,0.0129,0.018593,0.986832,0.798978
3,0.0051,0.023471,0.985283,0.752675
4,0.0038,0.021592,0.986832,0.798978
5,0.0013,0.024392,0.986832,0.789338


TrainOutput(global_step=1885, training_loss=0.010520842414477776, metrics={'train_runtime': 445.1956, 'train_samples_per_second': 67.622, 'train_steps_per_second': 4.234, 'total_flos': 1980239580403200.0, 'train_loss': 0.010520842414477776, 'epoch': 5.0})

The model shows consistent performance improvement over the 5 training epochs. Training loss decreases steadily from 0.0276 to 0.0016, indicating that the model is learning effectively. Validation loss remains stable across epochs, with only minor fluctuations, showing no signs of overfitting.

Accuracy increases slightly from 0.9837 to 0.9876, while the F1-Macro score improves more substantially from 0.6959 to 0.7968, suggesting that IndoBERT becomes better at correctly handling both majority and minority classes as training progresses. The stable gap between training and validation metrics indicates a well-regularized model with good generalization.

In [None]:
test_results = trainer.predict(dataset["test"])

preds = np.argmax(test_results.predictions, axis=-1)

labels = test_results.label_ids

print("Classification Report IndoBERT")
print(classification_report(labels, preds, target_names=le.classes_.astype(str), digits=2))

Classification Report IndoBERT
              precision    recall  f1-score   support

      BUZZER       0.54      0.52      0.53        27
  NON_BUZZER       0.99      0.99      0.99      1264

    accuracy                           0.98      1291
   macro avg       0.76      0.75      0.76      1291
weighted avg       0.98      0.98      0.98      1291



IndoBERT failed to beat our machine learning models in predicting the minority class. It struggle to detect the "Buzzer", showing only 0.53 F1-Score in "Buzzer" class.

# **Conclusion**

Based on the evaluation using Precision, Recall, and F1-Score, the TF-IDF representation delivers the strongest overall performance across all tested models. Random Forest paired with TF-IDF consistently achieves the highest macro scores, showing superior ability to distinguish between buzzer and non-buzzer comments even under class imbalance. FastText performs reasonably well but remains below TF-IDF, as its subword semantic features do not match the lexical clarity offered by TF-IDF for this task. Meanwhile, IndoBERT achieves very high accuracy, but its F1-Macro remains slightly lower than TF-IDF, indicating that the model still struggles to fully capture the minority class despite its contextual understanding. Overall, TF-IDF provides the most effective and balanced classification performance for buzzer detection in this study.