In [2]:
# 3_ClassImbalance_FNN_XGBoost.ipynb
# Experiments with:
# - only POW.* features
# - baseline vs class_weight='balanced' vs SMOTE
# - FNN (Keras) and XGBoost

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# -------------------------------------------------------
# 1. LOAD DATA 
# -------------------------------------------------------

raw = pd.read_csv("data/raw/EEG_data.csv")


# -------------------------------------------------------
# 2. SELECT FEATURES: ONLY POW.* + LABEL + GROUP
# -------------------------------------------------------

subjects = raw["subject_id"]
y = raw["subject_understood"].astype(int)
pow_cols = [c for c in raw.columns if c.startswith("POW")]
X = raw[pow_cols]

print("Number of POW features:", len(pow_cols))
print("Label distribution:", np.bincount(y))


# -------------------------------------------------------
# 3. SUBJECT-WISE TRAIN / TEST SPLIT
# -------------------------------------------------------

gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups=subjects))

X_train_raw, X_test_raw = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

print("Train size:", X_train_raw.shape, "Test size:", X_test_raw.shape)
print("Train label distribution:", np.bincount(y_train))
print("Test label distribution:", np.bincount(y_test))

# -------------------------------------------------------
# 4. HELPERS: SCALING, CLASS WEIGHTS, FNN BUILDER, RUNNER
# -------------------------------------------------------

def scale_data(X_train_raw, X_test_raw):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw)
    return X_train_scaled, X_test_scaled

def compute_class_weights_vector(y_train):
    """Tak samo jak w Notebooku 2: wagi w zależności od klasy."""
    pos = (y_train == 1).sum()
    neg = (y_train == 0).sum()
    w_pos = neg / (pos + neg)
    w_neg = pos / (pos + neg)
    sample_weight = np.where(y_train == 1, w_pos, w_neg)
    return sample_weight, (w_neg, w_pos)

def evaluate_binary(y_true, y_pred, y_prob=None, verbose=True):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    if y_prob is not None:
        try:
            auc = roc_auc_score(y_true, y_prob)
        except ValueError:
            auc = np.nan
    else:
        auc = np.nan

    if verbose:
        print(f"Accuracy: {acc:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"ROC-AUC:  {auc:.4f}")
        print("\nClassification report:")
        print(classification_report(y_true, y_pred, zero_division=0))

    return {"accuracy": acc, "f1": f1, "auc": auc}

results = []

# -------------------------------------------------------
# 5. Define experiment variants (FNN = MLPClassifier)
#    - baseline: no reweighting, no SMOTE
#    - balanced: class_weight / scale_pos_weight
#    - smote: SMOTE on training (roughly half-half)
# -------------------------------------------------------

def make_fnn():
    return MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation='relu',
        solver='adam',
        alpha=1e-4,
        batch_size=256,
        learning_rate_init=1e-3,
        max_iter=50,
        random_state=RANDOM_STATE,
        verbose=False
    )

# 5a) baseline: bez wag, bez SMOTE
print("\n========== FNN (MLPClassifier) - baseline ==========")
X_train_scaled, X_test_scaled = scale_data(X_train_raw, X_test_raw)

fnn_baseline = make_fnn()
fnn_baseline.fit(X_train_scaled, y_train)
y_pred = fnn_baseline.predict(X_test_scaled)
y_prob = fnn_baseline.predict_proba(X_test_scaled)[:, 1]
metrics = evaluate_binary(y_test, y_pred, y_prob)

results.append({"model": "FNN", "variant": "baseline", **metrics})

# 5b) balanced_only: sample_weight jak w Notebooku 2
print("\n========== FNN (MLPClassifier) - balanced_only (sample_weight) ==========")
X_train_scaled, X_test_scaled = scale_data(X_train_raw, X_test_raw)
sample_weight, (w_neg, w_pos) = compute_class_weights_vector(y_train)
print("w_neg (class 0):", w_neg, "w_pos (class 1):", w_pos)

fnn_balanced = make_fnn()
fnn_balanced.fit(X_train_scaled, y_train, sample_weight=sample_weight)
y_pred = fnn_balanced.predict(X_test_scaled)
y_prob = fnn_balanced.predict_proba(X_test_scaled)[:, 1]
metrics = evaluate_binary(y_test, y_pred, y_prob)

results.append({"model": "FNN", "variant": "balanced_only", **metrics})

# 5c) smote_only: SMOTE na train, bez dodatkowych wag
print("\n========== FNN (MLPClassifier) - smote_only ==========")
sm = SMOTE(sampling_strategy="auto", random_state=RANDOM_STATE)
X_train_sm_raw, y_train_sm = sm.fit_resample(X_train_raw, y_train)
print("After SMOTE train distribution:", np.bincount(y_train_sm))

X_train_scaled_sm, X_test_scaled = scale_data(X_train_sm_raw, X_test_raw)

fnn_smote = make_fnn()
fnn_smote.fit(X_train_scaled_sm, y_train_sm)
y_pred = fnn_smote.predict(X_test_scaled)
y_prob = fnn_smote.predict_proba(X_test_scaled)[:, 1]
metrics = evaluate_binary(y_test, y_pred, y_prob)

results.append({"model": "FNN", "variant": "smote_only", **metrics})

# -------------------------------------------------------
# 6. XGBoost – three variants
# -------------------------------------------------------

def make_xgb(scale_pos_weight=1.0):
    return xgb.XGBClassifier(
        n_estimators=5,           # tak jak w Notebooku 2
        learning_rate=0.1,
        random_state=RANDOM_STATE,
        scale_pos_weight=scale_pos_weight,
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist"
    )

# 6a) baseline: bez wag, bez SMOTE
print("\n========== XGBoost - baseline ==========")
X_train_scaled, X_test_scaled = scale_data(X_train_raw, X_test_raw)

xgb_baseline = make_xgb(scale_pos_weight=1.0)
xgb_baseline.fit(X_train_scaled, y_train)
y_pred = xgb_baseline.predict(X_test_scaled)
y_prob = xgb_baseline.predict_proba(X_test_scaled)[:, 1]
metrics = evaluate_binary(y_test, y_pred, y_prob)

results.append({"model": "XGBoost", "variant": "baseline", **metrics})

# 6b) balanced_only: scale_pos_weight zgodnie z proporcjami
print("\n========== XGBoost - balanced_only (scale_pos_weight) ==========")
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
scale_pos_weight = neg / max(pos, 1)
print("scale_pos_weight:", scale_pos_weight)

X_train_scaled, X_test_scaled = scale_data(X_train_raw, X_test_raw)

xgb_balanced = make_xgb(scale_pos_weight=scale_pos_weight)
xgb_balanced.fit(X_train_scaled, y_train)
y_pred = xgb_balanced.predict(X_test_scaled)
y_prob = xgb_balanced.predict_proba(X_test_scaled)[:, 1]
metrics = evaluate_binary(y_test, y_pred, y_prob)

results.append({"model": "XGBoost", "variant": "balanced_only", **metrics})

# 6c) smote_only: SMOTE na train, scale_pos_weight = 1
print("\n========== XGBoost - smote_only ==========")
sm = SMOTE(sampling_strategy="auto", random_state=RANDOM_STATE)
X_train_sm_raw, y_train_sm = sm.fit_resample(X_train_raw, y_train)
print("After SMOTE train distribution:", np.bincount(y_train_sm))

X_train_scaled_sm, X_test_scaled = scale_data(X_train_sm_raw, X_test_raw)

xgb_smote = make_xgb(scale_pos_weight=1.0)
xgb_smote.fit(X_train_scaled_sm, y_train_sm)
y_pred = xgb_smote.predict(X_test_scaled)
y_prob = xgb_smote.predict_proba(X_test_scaled)[:, 1]
metrics = evaluate_binary(y_test, y_pred, y_prob)

results.append({"model": "XGBoost", "variant": "smote_only", **metrics})

# -------------------------------------------------------
# 7. SUMMARY TABLE
# -------------------------------------------------------

results_df = pd.DataFrame(results).sort_values(["model", "variant"])
results_df

Number of POW features: 70
Label distribution: [14461 54370]
Train size: (58110, 70) Test size: (10721, 70)
Train label distribution: [12072 46038]
Test label distribution: [2389 8332]





Accuracy: 0.7344
F1-score: 0.8360
ROC-AUC:  0.7037

Classification report:
              precision    recall  f1-score   support

           0       0.36      0.26      0.30      2389
           1       0.80      0.87      0.84      8332

    accuracy                           0.73     10721
   macro avg       0.58      0.56      0.57     10721
weighted avg       0.71      0.73      0.72     10721


w_neg (class 0): 0.7922560660815694 w_pos (class 1): 0.20774393391843057




Accuracy: 0.7317
F1-score: 0.8379
ROC-AUC:  0.6774

Classification report:
              precision    recall  f1-score   support

           0       0.31      0.17      0.22      2389
           1       0.79      0.89      0.84      8332

    accuracy                           0.73     10721
   macro avg       0.55      0.53      0.53     10721
weighted avg       0.68      0.73      0.70     10721


After SMOTE train distribution: [46038 46038]




Accuracy: 0.7178
F1-score: 0.8320
ROC-AUC:  0.6267

Classification report:
              precision    recall  f1-score   support

           0       0.19      0.08      0.12      2389
           1       0.77      0.90      0.83      8332

    accuracy                           0.72     10721
   macro avg       0.48      0.49      0.47     10721
weighted avg       0.64      0.72      0.67     10721


Accuracy: 0.7629
F1-score: 0.8651
ROC-AUC:  0.4601

Classification report:
              precision    recall  f1-score   support

           0       0.13      0.01      0.02      2389
           1       0.78      0.98      0.87      8332

    accuracy                           0.76     10721
   macro avg       0.45      0.49      0.44     10721
weighted avg       0.63      0.76      0.68     10721


scale_pos_weight: 0.2622181676006777
Accuracy: 0.5771
F1-score: 0.7097
ROC-AUC:  0.4448

Classification report:
              precision    recall  f1-score   support

           0       0.19    

Unnamed: 0,model,variant,accuracy,f1,auc
1,FNN,balanced_only,0.731741,0.837935,0.677431
0,FNN,baseline,0.734446,0.836031,0.703672
2,FNN,smote_only,0.71775,0.832001,0.626722
4,XGBoost,balanced_only,0.577092,0.709694,0.444811
3,XGBoost,baseline,0.762895,0.865103,0.460108
5,XGBoost,smote_only,0.609272,0.732759,0.489045
