In [1]:
import os
import sys

# Directory added to the import path; presumably where the local
# `moms_generate` / `moms_losses` modules imported below live (verify).
# The MOMS_PATH environment variable overrides the location so the notebook
# is not tied to a single machine; the default keeps the original path.
MOMS_PATH = os.environ.get(
    "MOMS_PATH", "/home/oldrain123/IMBALANCED_CLASSIFICATION/MOMs"
)

# Guard the append so re-running this cell does not stack duplicate entries.
if MOMS_PATH not in sys.path:
    sys.path.append(MOMS_PATH)

In [5]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from imblearn.metrics import geometric_mean_score
from ctgan import CTGAN
from moms_generate import transform
from moms_losses import MMD_est_torch
import torch
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN

from tqdm import tqdm 

In [6]:
# ----------------------------------------
# Experimental Parameters
# ----------------------------------------
# One seed drives the Python and NumPy global RNGs and every seeded estimator.
SEED = 1203
np.random.seed(SEED)
random.seed(SEED)

# Samples generated per class for each training pool, and per class for the test set
MAJORITY_COUNT, MINORITY_COUNT = 2000, 200
N_TEST_PER_CLASS = 2000

# Classifiers to evaluate (insertion order is the evaluation order)
CLASSIFIERS = dict(
    SVM=SVC(probability=True, random_state=SEED),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=SEED),
    DecisionTree=DecisionTreeClassifier(random_state=SEED),
    MLP=MLPClassifier(hidden_layer_sizes=(100,), max_iter=10000, random_state=SEED),
    kNN=KNeighborsClassifier(),
)

# Feature dimensions to sweep, and number of repeated trials per dimension
DIM_LIST = [50, 100, 200, 500, 1000]
TRIALS = 10

In [7]:
# ----------------------------------------
# Oversamplers to compare
# ----------------------------------------
# 'Original' maps to None: the baseline that trains on un-resampled data.
# Every real sampler is built with the same fixed random_state.
oversamplers = {'Original': None}
oversamplers.update(
    (label, sampler_cls(random_state=SEED))
    for label, sampler_cls in (
        ('SMOTE', SMOTE),
        ('bSMOTE', BorderlineSMOTE),
        ('ADASYN', ADASYN),
    )
)

In [9]:
# ----------------------------------------
# Main Experiment Loop
# ----------------------------------------
# For every dimension: draw one balanced test set, then for each trial draw an
# imbalanced training pool, keep a stratified 70% of it, optionally oversample
# it, and score every classifier on the fixed test set. One dict per
# (dimension, trial, method, classifier) is appended to `records`.

# All sampling below uses NumPy's global RNG. Re-seeding here makes the cell
# idempotent: re-running it alone reproduces the same data as a fresh Run All.
np.random.seed(SEED)

# Label vectors depend only on the configured counts, so build them once.
y_maj = np.zeros(MAJORITY_COUNT)
y_min = np.ones(MINORITY_COUNT)
y = np.hstack([y_maj, y_min])
y_test = np.hstack([np.zeros(N_TEST_PER_CLASS), np.ones(N_TEST_PER_CLASS)])

records = []
for DIM in DIM_LIST:
    # Generate fixed test set for this dimension:
    # majority ~ N(0, I), minority ~ N(0.5 * 1, I)
    mu_maj = np.zeros(DIM)
    mu_min = np.ones(DIM) * 0.5
    cov = np.eye(DIM)

    X_test_maj = np.random.multivariate_normal(mu_maj, cov, size=N_TEST_PER_CLASS)
    X_test_min = np.random.multivariate_normal(mu_min, cov, size=N_TEST_PER_CLASS)
    X_test = np.vstack([X_test_maj, X_test_min])

    for trial in tqdm(range(TRIALS), desc=f"DIM={DIM}"):
        # Generate a fresh imbalanced training pool for this trial
        X_maj = np.random.multivariate_normal(mu_maj, cov, size=MAJORITY_COUNT)
        X_min = np.random.multivariate_normal(mu_min, cov, size=MINORITY_COUNT)
        X = np.vstack([X_maj, X_min])

        # NOTE(review): only a stratified 70% of the pool is kept, so the
        # training set has fewer than MINORITY_COUNT minority samples --
        # confirm this is intended.
        X_train, _, y_train, _ = train_test_split(
            X, y, stratify=y, train_size=0.7, random_state=SEED + trial
        )

        # Evaluate each oversampling method
        for method, sampler in oversamplers.items():
            # Default to the untouched training data; replaced only when the
            # sampler succeeds.
            X_res, y_res = X_train, y_train
            resample_failed = False
            if sampler is not None:
                try:
                    X_res, y_res = sampler.fit_resample(X_train, y_train)
                except (RuntimeError, ValueError) as e:
                    # imbalanced-learn samplers may raise ValueError as well as
                    # RuntimeError when they cannot generate samples.
                    print(f"Warning: {method} failed with error '{e}'. Falling back to original training data.")
                    resample_failed = True

            # Train and evaluate classifier
            for clf_name, clf in CLASSIFIERS.items():
                # fit() returns the (re-fitted) estimator itself
                model = clf.fit(X_res, y_res)
                y_prob = model.predict_proba(X_test)[:, 1]
                # Hard labels from a fixed 0.5 probability threshold
                y_pred = (y_prob >= 0.5).astype(int)

                records.append({
                    'dimension': DIM,
                    'trial': trial,
                    'method': method,
                    'classifier': clf_name,
                    'AUROC': roc_auc_score(y_test, y_prob),
                    'F1': f1_score(y_test, y_pred),
                    # True when the sampler failed and these scores actually
                    # come from the un-resampled training data.
                    'resample_failed': resample_failed,
                })

DIM=50: 100%|██████████| 10/10 [02:59<00:00, 17.97s/it]
DIM=100: 100%|██████████| 10/10 [03:21<00:00, 20.17s/it]
DIM=200: 100%|██████████| 10/10 [04:35<00:00, 27.59s/it]
DIM=500: 100%|██████████| 10/10 [07:17<00:00, 43.72s/it]
DIM=1000:   0%|          | 0/10 [00:00<?, ?it/s]



DIM=1000:  30%|███       | 3/10 [03:36<08:31, 73.06s/it]



DIM=1000:  50%|█████     | 5/10 [06:10<06:18, 75.61s/it]



DIM=1000:  60%|██████    | 6/10 [07:16<04:49, 72.40s/it]



DIM=1000:  90%|█████████ | 9/10 [10:54<01:12, 72.86s/it]



DIM=1000: 100%|██████████| 10/10 [12:04<00:00, 72.48s/it]


In [14]:
# Compile results
# One summary row per (dimension, classifier, method); mean and std of each
# metric are taken over the remaining rows of each group (the trials).
results_df = pd.DataFrame(records)
group_keys = ['dimension', 'classifier', 'method']
metric_stats = {metric: ['mean', 'std'] for metric in ('AUROC', 'F1')}
results = results_df.groupby(group_keys).agg(metric_stats).reset_index()

In [17]:
# Display the last 30 rows of the aggregated summary table
results.iloc[-30:]

Unnamed: 0_level_0,dimension,classifier,method,AUROC,AUROC,F1,F1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,std,mean,std
70,500,RandomForest,SMOTE,0.999878,3.494286e-05,0.428012,0.016538
71,500,RandomForest,bSMOTE,0.997924,0.0007182985,0.001199,0.001228
72,500,SVM,ADASYN,1.0,0.0,1.0,0.0
73,500,SVM,Original,1.0,0.0,1.0,0.0
74,500,SVM,SMOTE,1.0,0.0,1.0,0.0
75,500,SVM,bSMOTE,1.0,0.0,1.0,0.0
76,500,kNN,ADASYN,0.603875,0.0358297,0.675793,0.005293
77,500,kNN,Original,0.999975,7.907018e-05,0.998647,0.000955
78,500,kNN,SMOTE,0.6181,0.02446233,0.675238,0.003715
79,500,kNN,bSMOTE,0.96395,0.07782034,0.955457,0.092294
