In [1]:
import sys
# Register the project-local source directories on the import path in one call
# (same entries, same order as before) so the local modules imported in the
# next cell can be resolved.
sys.path.extend(
    [
        "/home/oldrain123/IMBALANCED_CLASSIFICATION/MOMs",
        "/home/oldrain123/IMBALANCED_CLASSIFICATION/boost",
        "/home/oldrain123/IMBALANCED_CLASSIFICATION/",
        "/home/oldrain123/IMBALANCED_CLASSIFICATION/SMOTE_variants/",
    ]
)

In [2]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, matthews_corrcoef, f1_score
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler
from imblearn.metrics import geometric_mean_score
from collections import Counter
from moms_generate import apply_transformation
from moms_losses import MMD_est_torch
from boost import AdaBoostClassifier, SMOTEBoost, RUSBoost, OUBoost
from moms_utils import set_seed
from sklearn.svm import SVC
from SMOTE_variants.sm_variants.oversampling.mwmote import MWMOTE

In [3]:
# Preferred accelerator is GPU index 3. That index only exists when CUDA is
# usable and at least four GPUs are visible; otherwise fall back to the CPU so
# the notebook still runs instead of failing later inside the experiment.
device = "cuda:3" if torch.cuda.is_available() and torch.cuda.device_count() > 3 else "cpu"
print(f"Using device: {device}")

Using device: cuda:3


In [None]:
def _empty_metric_store(method_names):
    """Create a ``{method: {metric: []}}`` mapping used to accumulate scores."""
    return {
        name: {"AUC": [], "G-Mean": [], "MCC": [], "F1-score": []}
        for name in method_names
    }


def _fit_and_record(model, X_fit, Y_fit, X_eval, Y_eval, store):
    """Fit ``model``, predict on the evaluation split and append the four metrics to ``store``."""
    model.fit(X_fit, Y_fit)
    y_pred = model.predict(X_eval)
    # Column 1 of predict_proba is used as the score of the positive class (label 1).
    y_prob = model.predict_proba(X_eval)[:, 1]
    store["AUC"].append(roc_auc_score(Y_eval, y_prob))
    store["G-Mean"].append(np.mean(geometric_mean_score(Y_eval, y_pred, average=None)))
    store["MCC"].append(matthews_corrcoef(Y_eval, y_pred))
    store["F1-score"].append(f1_score(Y_eval, y_pred))


def run_experiment(data, categorical_indices, device, h_dim, num_layers, beta, lr, n_runs=10, n_splits=10, save_path = '/results', data_name = 'wine'):
    """
    Run repeated stratified k-fold experiments comparing several imbalance-handling methods.

    For every fold the features are standardised (scaler fitted on the training
    split only), the transformation-based oversampler is applied via
    ``apply_transformation``, and each resampled training set is scored on the
    untouched test split. Boosting methods are trained on the original training
    data; all other methods use an RBF-kernel SVC.

    Parameters:
    - data: pandas DataFrame with the last column as labels. The label value
      'negative' is mapped to class 0, every other value to class 1.
    - categorical_indices: positional indices of the feature columns to one-hot
      encode. ``None`` or an empty list means all features are used as-is.
    - device, h_dim, num_layers, beta, lr: forwarded unchanged to
      ``apply_transformation``.
    - n_runs: Number of repeated experiments.
    - n_splits: Number of splits for Stratified K-Fold.
    - save_path: directory that receives the summary CSV and the t-SNE figures.
    - data_name: prefix of the CSV file name.

    Returns:
    - final_results: ``{method: {metric: [per-run mean, ...]}}`` where each list
      holds one fold-averaged value (rounded to 4 decimals) per run.

    Note: ``plot_tsne`` is looked up when this function is called, so the cell
    defining it must have been executed before calling ``run_experiment``.
    """
    method_names = ["AdaBoost", "SMOTEBoost", "RUSBoost", "OUBoost", "SVM", "SMOTE", "ADASYN", "bSMOTE", "ROS", "MWMOTE", "Trans(Direct)"]

    X = data.iloc[:, :-1]
    Y = np.where(data.iloc[:, -1].values == 'negative', 0, 1)  # Convert labels
    print(f"X shape: {X.shape}")
    print(f"Class distribution: {Counter(Y)}")

    # One-Hot Encoding for Categorical Features.
    # X must end up as a NumPy array in every case: the fold loop indexes it
    # positionally with X[train_index], which does column lookup on a DataFrame.
    if categorical_indices is not None and len(categorical_indices) > 0:
        encoder = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")
        categorical_data = encoder.fit_transform(X.iloc[:, categorical_indices])
        numeric_data = X.drop(X.columns[categorical_indices], axis=1).values
        X = np.hstack((numeric_data, categorical_data))
    else:
        X = X.to_numpy()

    # Parameters passed to apply_transformation as n_epochs and h
    n_epochs, h = 2000, 1.0

    # Initialize final results storage (one entry per run and metric)
    final_results = _empty_metric_store(method_names)

    for run in range(n_runs):
        kf = StratifiedKFold(n_splits=n_splits, random_state=1203 + run, shuffle=True)
        print(f"\nStarting experiment {run + 1}/{n_runs}")
        results = _empty_metric_store(method_names)

        for fold, (train_index, test_index) in enumerate(kf.split(X, Y)):
            print(f"  Fold {fold + 1}/{n_splits} - Experiment {run + 1}/{n_runs}")
            seed = 1203 + fold + 10 * run
            set_seed(seed)
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]

            # Fit the scaler on the training split only, then reuse it for the test split.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            X_train = np.ascontiguousarray(X_train, dtype=np.float64)
            X_test = np.ascontiguousarray(X_test, dtype=np.float64)

            X_maj = X_train[Y_train == 0]
            X_min = X_train[Y_train == 1]

            input_dim = X_train.shape[1]

            # Apply transformations
            X_maj_direct, X_min_direct, X_trans_direct = apply_transformation(
                X_maj,
                X_min,
                in_dim=input_dim,
                h_dim=h_dim,
                num_layers=num_layers,
                loss_fn=MMD_est_torch,
                device=device,
                method='direct',
                n_epochs=n_epochs,
                h=h,
                beta=beta,
                lr=lr,
                seed=seed,
                batch_size=128,
                k=3,
                undersample=False
            )

            # Training sets per method. "Boost" is a placeholder key: the four
            # boosting classifiers are all trained on the un-resampled data.
            datasets = {
                "SVM": (X_train, Y_train),
                "Boost": (X_train, Y_train),
                "SMOTE": SMOTE(random_state=seed).fit_resample(X_train, Y_train),
                "ADASYN": ADASYN(random_state=seed).fit_resample(X_train, Y_train),
                "bSMOTE": BorderlineSMOTE(random_state=seed).fit_resample(X_train, Y_train),
                "ROS": RandomOverSampler(random_state=seed).fit_resample(X_train, Y_train),
                "MWMOTE": MWMOTE(random_state=seed).sample(X_train, Y_train),
                "Trans(Direct)": (
                    np.vstack((X_maj_direct, X_min_direct, X_trans_direct)),
                    np.hstack((np.zeros(len(X_maj_direct)), np.ones(len(X_min_direct) + len(X_trans_direct)))),
                ),
            }

            for method, (X_resampled, Y_resampled) in datasets.items():
                if method == "Boost":
                    # Factories keep construction lazy so each model is built
                    # right before it is fitted, in the original order.
                    boosters = (
                        (
                            "AdaBoost",
                            lambda: AdaBoostClassifier(
                                DecisionTreeClassifier(max_depth=5),
                                n_estimators=100,
                                algorithm="SAMME",
                                learning_rate=0.1,
                                random_state=seed,
                            ),
                        ),
                        (
                            "SMOTEBoost",
                            lambda: SMOTEBoost(
                                learning_rate=0.1, n_samples=5, n_estimators=100, random_state=seed
                            ),
                        ),
                        (
                            "RUSBoost",
                            lambda: RUSBoost(
                                learning_rate=0.1, n_samples=5, n_estimators=100, random_state=seed
                            ),
                        ),
                        (
                            "OUBoost",
                            lambda: OUBoost(
                                learning_rate=0.1, n_samples=5, n_estimators=100, random_state=seed
                            ),
                        ),
                    )
                    for booster_name, make_booster in boosters:
                        _fit_and_record(
                            make_booster(), X_resampled, Y_resampled, X_test, Y_test, results[booster_name]
                        )
                else:
                    svm = SVC(kernel='rbf', probability=True, random_state=seed)
                    _fit_and_record(svm, X_resampled, Y_resampled, X_test, Y_test, results[method])

            # Print running means over the folds completed so far in this run
            print(f"    Intermediate Fold Results for Fold {fold + 1}:")
            for method, metrics in results.items():
                print(f"      {method}: AUC = {np.mean(metrics['AUC']):.4f}, "
                      f"G-Mean = {np.mean(metrics['G-Mean']):.4f}, "
                      f"MCC = {np.mean(metrics['MCC']):.4f}, "
                      f"F1-score = {np.mean(metrics['F1-score']):.4f}")

            # t-SNE visualisation of the transformed samples for this fold.
            # NOTE(review): the call carries no run or dataset identifier, so
            # figures written for the same fold number by another run or
            # dataset using the same save_path appear to overwrite each other.
            plot_tsne(X_train, X_trans_direct, Y_train, fold, "Trans(Direct)", save_path)

        # Aggregate results across folds for this experiment.
        # NOTE(review): values are rounded to 4 decimals before being averaged
        # across runs; kept as-is to preserve previously reported numbers.
        for method, metrics in results.items():
            for metric, values in metrics.items():
                final_results[method][metric].append(np.round(np.mean(values), 4))

    # Print final averaged results
    print("\nFinal Averaged Results Across Experiments:")
    for method, metrics in final_results.items():
        print(f"  {method}:")
        for metric, values in metrics.items():
            print(f"    {metric}: {np.mean(values):.4f}")

    # Convert final_results to a long-format pandas DataFrame
    result_data = {
        "Method": [],
        "Metric": [],
        "Value": [],
    }

    for method, metrics in final_results.items():
        for metric, values in metrics.items():
            avg_value = np.mean(values) if values else "N/A"
            result_data["Method"].append(method)
            result_data["Metric"].append(metric)
            result_data["Value"].append(avg_value)

    results_df = pd.DataFrame(result_data)

    # Save results as CSV
    os.makedirs(save_path, exist_ok=True)
    save_file = os.path.join(save_path, f"{data_name}_final_results.csv")
    results_df.to_csv(save_file, index=False)
    print(f"\nFinal results saved to {save_file}")

    return final_results


In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# t-SNE visualization helper
def plot_tsne(X_original, X_trans, Y_original, fold, method, save_path):
    """
    Embed original and transformed samples together with t-SNE and save a scatter plot.

    Parameters:
    - X_original: feature matrix of the original samples.
    - X_trans: feature matrix of the transformed (synthetic) samples.
    - Y_original: labels of ``X_original``; 0 is drawn as "Majority", 1 as "Minority".
    - fold: zero-based fold index, shown one-based in the title and file name.
    - method: method label used in the title and file name.
    - save_path: directory (created if missing) that receives the PNG file.

    Returns:
    - None. The figure is written to ``tSNE_Fold{fold+1}_{method}.png``, shown, and closed.
    """
    X_combined = np.vstack((X_original, X_trans))
    # Transformed samples get their own label (2) so they can be drawn separately.
    Y_combined = np.hstack((Y_original, np.full(len(X_trans), 2)))

    # t-SNE requires perplexity < n_samples; keep the default of 30 whenever
    # possible and shrink it only for very small inputs.
    perplexity = min(30.0, max(1.0, len(X_combined) - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    X_embedded = tsne.fit_transform(X_combined)

    fig = plt.figure(figsize=(8, 6))
    plt.scatter(X_embedded[Y_combined == 0, 0], X_embedded[Y_combined == 0, 1], label="Majority", alpha=0.5)
    plt.scatter(X_embedded[Y_combined == 1, 0], X_embedded[Y_combined == 1, 1], label="Minority", alpha=0.5)
    plt.scatter(X_embedded[Y_combined == 2, 0], X_embedded[Y_combined == 2, 1], label="Transformed", alpha=0.7, marker='x', c='red')

    plt.legend()
    plt.title(f"t-SNE Visualization (Fold {fold+1}, {method})")
    plt.xlabel("t-SNE Dim 1")
    plt.ylabel("t-SNE Dim 2")
    plt.grid(True)

    os.makedirs(save_path, exist_ok=True)
    plt.savefig(os.path.join(save_path, f"tSNE_Fold{fold+1}_{method}.png"))
    plt.show()
    # This function is called once per fold; close the figure explicitly so
    # figures do not pile up in memory on backends where show() keeps them open.
    plt.close(fig)


In [5]:
# Output directory handed to run_experiment as `save_path`; the summary CSV and
# the per-fold t-SNE figures are written here.
save_path = "/data4/oldrain123/oldrain123/results/real_results"

In [None]:
# abalone9-18 (feature column 0 is one-hot encoded as categorical)
data_name = 'abalone9-18'
dataframe = pd.read_csv(f"/data4/oldrain123/oldrain123/dataset/{data_name}.dat")
results = run_experiment(
    dataframe,
    categorical_indices=[0],
    device=device,
    h_dim=64,
    num_layers=10,
    beta=0.1,
    lr=0.001,
    save_path=save_path,
    data_name=data_name,
)

# Report mean ± std of the per-run averages returned by run_experiment.
for method, metrics in results.items():
    print(f"\nMethod: {method}")
    for metric, values in metrics.items():
        mean_value, std_value = np.mean(values), np.std(values)
        print(f"  {metric}: {mean_value:.4f} ± {std_value:.4f}")

In [None]:
# abalone19 (feature column 0 is one-hot encoded as categorical)
data_name = 'abalone19'
dataframe = pd.read_csv(f"/data4/oldrain123/oldrain123/dataset/{data_name}.dat")
results = run_experiment(
    dataframe,
    categorical_indices=[0],
    device=device,
    h_dim=256,
    num_layers=10,
    beta=0.1,
    lr=0.01,
    save_path=save_path,
    data_name=data_name,
)

# Report mean ± std of the per-run averages returned by run_experiment.
for method, metrics in results.items():
    print(f"\nMethod: {method}")
    for metric, values in metrics.items():
        mean_value, std_value = np.mean(values), np.std(values)
        print(f"  {metric}: {mean_value:.4f} ± {std_value:.4f}")

In [None]:
# ionosphere (no categorical columns; note the ".data" file extension)
data_name = 'ionosphere'
dataframe = pd.read_csv(f"/data4/oldrain123/oldrain123/dataset/{data_name}.data")
results = run_experiment(
    dataframe,
    categorical_indices=[],
    device=device,
    h_dim=256,
    num_layers=10,
    beta=0.1,
    lr=0.1,
    save_path=save_path,
    data_name=data_name,
)

# Report mean ± std of the per-run averages returned by run_experiment.
for method, metrics in results.items():
    print(f"\nMethod: {method}")
    for metric, values in metrics.items():
        mean_value, std_value = np.mean(values), np.std(values)
        print(f"  {metric}: {mean_value:.4f} ± {std_value:.4f}")

In [6]:
# ecoli3 (no categorical columns)
# NOTE(review): the recorded run below reports 335 rows, while this dataset is
# commonly listed with 336 instances. Confirm the file really has a header
# row; if not, read_csv's default header=0 consumes the first sample.
data_name = 'ecoli3'
dataframe = pd.read_csv(f"/data4/oldrain123/oldrain123/dataset/{data_name}.dat")
results = run_experiment(
    dataframe,
    categorical_indices=[],
    device=device,
    h_dim=8,
    num_layers=4,
    beta=0.1,
    lr=0.1,
    save_path=save_path,
    data_name=data_name,
)

# Report mean ± std of the per-run averages returned by run_experiment.
for method, metrics in results.items():
    print(f"\nMethod: {method}")
    for metric, values in metrics.items():
        mean_value, std_value = np.mean(values), np.std(values)
        print(f"  {metric}: {mean_value:.4f} ± {std_value:.4f}")

X shape: (335, 7)
Class distribution: Counter({np.int64(0): 300, np.int64(1): 35})

Starting experiment 1/10
  Fold 1/10 - Experiment 1/10
Epoch 100/2000, Avg Loss: 0.09307, Reg Loss: 0.12153
Epoch 200/2000, Avg Loss: 0.09030, Reg Loss: 0.10551
Epoch 300/2000, Avg Loss: 0.09192, Reg Loss: 0.12006
Epoch 400/2000, Avg Loss: 0.08843, Reg Loss: 0.11127
Epoch 500/2000, Avg Loss: 0.08948, Reg Loss: 0.10938
Epoch 600/2000, Avg Loss: 0.08902, Reg Loss: 0.11043
Epoch 700/2000, Avg Loss: 0.08964, Reg Loss: 0.11260
Epoch 800/2000, Avg Loss: 0.08972, Reg Loss: 0.11226
Epoch 900/2000, Avg Loss: 0.08806, Reg Loss: 0.11102
Epoch 1000/2000, Avg Loss: 0.08894, Reg Loss: 0.11074
Epoch 1100/2000, Avg Loss: 0.08767, Reg Loss: 0.10184
Epoch 1200/2000, Avg Loss: 0.08810, Reg Loss: 0.10963
Epoch 1300/2000, Avg Loss: 0.09044, Reg Loss: 0.11260
Epoch 1400/2000, Avg Loss: 0.08799, Reg Loss: 0.10812
Epoch 1500/2000, Avg Loss: 0.08762, Reg Loss: 0.10545
Epoch 1600/2000, Avg Loss: 0.08975, Reg Loss: 0.10914
Epoch 



predict_proba
_compute_proba_from_decision
    Intermediate Fold Results for Fold 1:
      AdaBoost: AUC = 0.9000, G-Mean = 0.4916, MCC = 0.2967, F1-score = 0.3333
      SMOTEBoost: AUC = 0.9833, G-Mean = 0.8515, MCC = 0.7167, F1-score = 0.7500
      RUSBoost: AUC = 0.9417, G-Mean = 0.8367, MCC = 0.4641, F1-score = 0.4706
      OUBoost: AUC = 0.9833, G-Mean = 0.7071, MCC = 0.6847, F1-score = 0.6667
      SVM: AUC = 0.9833, G-Mean = 0.8515, MCC = 0.7167, F1-score = 0.7500
      SMOTE: AUC = 0.9750, G-Mean = 0.9487, MCC = 0.7171, F1-score = 0.7273
      ADASYN: AUC = 0.9667, G-Mean = 0.9487, MCC = 0.7171, F1-score = 0.7273
      bSMOTE: AUC = 0.9750, G-Mean = 0.9487, MCC = 0.7171, F1-score = 0.7273
      ROS: AUC = 0.9750, G-Mean = 0.9487, MCC = 0.7171, F1-score = 0.7273
      MWMOTE: AUC = 0.9750, G-Mean = 0.9487, MCC = 0.7171, F1-score = 0.7273
      Trans(Direct): AUC = 0.9750, G-Mean = 0.9487, MCC = 0.7171, F1-score = 0.7273
  Fold 2/10 - Experiment 1/10
Epoch 100/2000, Avg Loss: 0.0

KeyboardInterrupt: 

In [None]:
# wisconsin (no categorical columns)
data_name = 'wisconsin'
dataframe = pd.read_csv(f"/data4/oldrain123/oldrain123/dataset/{data_name}.dat")
results = run_experiment(
    dataframe,
    categorical_indices=[],
    device=device,
    h_dim=256,
    num_layers=10,
    beta=0.5,
    lr=0.01,
    save_path=save_path,
    data_name=data_name,
)

# Report mean ± std of the per-run averages returned by run_experiment.
for method, metrics in results.items():
    print(f"\nMethod: {method}")
    for metric, values in metrics.items():
        mean_value, std_value = np.mean(values), np.std(values)
        print(f"  {metric}: {mean_value:.4f} ± {std_value:.4f}")

In [None]:
# cleveland-0_vs_4 (no categorical columns; this run is pinned to the CPU
# rather than the notebook-wide `device`)
data_name = 'cleveland-0_vs_4'
dataframe = pd.read_csv(f"/data4/oldrain123/oldrain123/dataset/{data_name}.dat")
results = run_experiment(
    dataframe,
    categorical_indices=[],
    device="cpu",
    h_dim=256,
    num_layers=10,
    beta=0.5,
    lr=0.01,
    save_path=save_path,
    data_name=data_name,
)

# Report mean ± std of the per-run averages returned by run_experiment.
for method, metrics in results.items():
    print(f"\nMethod: {method}")
    for metric, values in metrics.items():
        mean_value, std_value = np.mean(values), np.std(values)
        print(f"  {metric}: {mean_value:.4f} ± {std_value:.4f}")