In [None]:
import sys
sys.path.append("/home/oldrain123/IMBALANCED_CLASSIFICATION/MOMs")

import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from experiment import run_exp
from imblearn.datasets import fetch_datasets

In [None]:
# Device string handed to run_exp by the experiment cells below.
device = "cuda"
# Restrict this process to GPU index 0 through the CUDA_VISIBLE_DEVICES variable.
os.environ.update(CUDA_VISIBLE_DEVICES="0")
print(f"Using device: {device}")

In [None]:
def load_data(filepath, target_variable = '+1'):
    """
    Load a LIBSVM-format text file into a dense feature matrix and binary labels.

    Every non-blank line is read as ``<label> <index>:<value> <index>:<value> ...``.
    Each value is written to the column given by its index, so sparse rows
    (where zero-valued features are omitted) stay aligned with the other rows;
    features that are not listed on a line are left at 0.

    Indices are treated as 1-based (the LIBSVM convention) unless an index 0
    appears somewhere in the file, in which case they are taken as 0-based.

    Parameters:
    - filepath: str, path to the LIBSVM-format file (e.g. australian_scale.txt).
    - target_variable: str, raw label token encoded as class 1; every other
      label is encoded as 0. Tokens are compared as strings first and then
      numerically, so '+1' also matches '1' and '4' also matches '4.0'.

    Returns:
    - X: np.ndarray, float64 feature matrix of shape (n_samples, n_features).
    - Y: np.ndarray, int64 binary labels of shape (n_samples,).

    Raises:
    - ValueError: if the file contains no data lines, if a feature index is
      negative, or if an index/value token cannot be parsed as a number.
    """
    def _is_target(token):
        # Exact text match first; fall back to a numeric comparison so that
        # sign/format variants of the same label are treated as equal.
        if token == target_variable:
            return True
        try:
            return float(token) == float(target_variable)
        except (TypeError, ValueError):
            return False

    samples = []  # one list of (index, value) tuples per data line
    labels = []
    with open(filepath, 'r') as fh:
        for line in fh:
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            labels.append(1 if _is_target(parts[0]) else 0)
            entries = []
            for pair in parts[1:]:
                if ':' not in pair:
                    continue
                fields = pair.split(':')
                entries.append((int(fields[0]), float(fields[1])))
            samples.append(entries)

    if not samples:
        raise ValueError(f"No samples found in {filepath!r}")

    indices = [idx for entries in samples for idx, _ in entries]
    if indices and min(indices) < 0:
        raise ValueError(f"Negative feature index found in {filepath!r}")

    # An explicit index 0 can only occur in a 0-based file.
    offset = 0 if (indices and min(indices) == 0) else 1
    n_features = (max(indices) - offset + 1) if indices else 0

    X = np.zeros((len(samples), n_features), dtype=np.float64)
    for row, entries in enumerate(samples):
        for idx, value in entries:
            X[row, idx - offset] = value

    return X, np.array(labels, dtype=np.int64)


In [None]:
# Directory the experiment cells write their "<data_name>_results.csv" files into.
save_path = "/data4/oldrain123/oldrain123/results/ablation_results"
# Directory holding the "<data_name>_scale.txt" LIBSVM files read by the experiment cells.
data_path = '/data4/oldrain123/oldrain123/dataset/LIBSVM'

In [None]:
import os
import numpy as np
import pandas as pd
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from experiment import run_exp

# Silence these warning categories for the rest of the session.
# Filters are registered in the same order as before.
for _warning_cls in (ConvergenceWarning, FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_warning_cls)

In [None]:
# Names of the sampling strategies passed to run_exp as `methods`.
methods = [
    "Original",
    "SMOTE",
    "ADASYN",
    "bSMOTE",
    "ROS",
    "MWMOTE",
    "CTGAN",
    "Ours",
]

# Classifiers passed to run_exp as `base_model`, keyed by display name.
# Every estimator that accepts a random_state shares the same seed.
_model_seed = 1203
base_models = {
    "SVM": SVC(
        kernel='rbf',
        probability=True,
        random_state=_model_seed,
    ),
    "DecisionTree": DecisionTreeClassifier(
        max_depth=6,
        random_state=_model_seed,
    ),
    "kNN": KNeighborsClassifier(n_neighbors=5),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=1000,
        early_stopping=True,
        random_state=_model_seed,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=500,
        random_state=_model_seed,
    ),
}

In [None]:
# ---- Dataset: australian ----
# Lines whose raw label equals `target_variable` are encoded as class 1.
data_name = 'australian'
target_variable = '+1'
australian_path = f'{data_path}/{data_name}_scale.txt'
australian, label = load_data(australian_path, target_variable=target_variable)
# Features and encoded labels go into one frame; the label is the last column.
australian = pd.DataFrame(np.column_stack([australian, label]))

# Hyperparameters forwarded to run_exp for this dataset.
beta = 0.01
lr = 0.005
latent_dim = 256
hidden_dims = [16, 32, 64, 128]
n_epochs = 2000

# Run the experiment for every method / classifier combination.
final_results = run_exp(
    data=australian,
    data_name=data_name,
    cat_idx=[0, 6, 7, 12, 13],
    maj_target_name=0,
    methods=methods,
    base_model=base_models,
    device=device,
    n_epochs=n_epochs,
    hidden_dims=hidden_dims,
    latent_dim=latent_dim,
    lr=lr,
    beta=beta,
    seed=1203,
    visualize=True,
    save_path=save_path,
)

# Result keys are expected to look like "Original - DecisionTree", "SMOTE - kNN", etc.
# Each value is iterated as a metric-name -> scores mapping and the scores are averaged.
records = []
for key, metrics in final_results.items():
    # Split on the first " - " only; keys without it get the "Default" classifier.
    method_name, sep, clf_name = key.partition(" - ")
    if not sep:
        clf_name = "Default"
    for metric, values in metrics.items():
        if values:
            avg_val = np.mean(values)
        else:
            avg_val = "N/A"
        records.append((clf_name, method_name, metric, avg_val))

column_names = ("Classifier", "Method", "Metric", "Value")
res_data = {
    name: [record[pos] for record in records]
    for pos, name in enumerate(column_names)
}
res_df = pd.DataFrame(res_data)

# Write the long-format summary (one row per classifier / method / metric) to CSV.
os.makedirs(save_path, exist_ok=True)
save_file = os.path.join(save_path, f"{data_name}_results.csv")
res_df.to_csv(save_file, index=False)
print(f"\n[Saved] Final results are saved to {save_file}")

In [None]:
# ---- Dataset: breast-cancer ----
# Lines whose raw label equals `target_variable` are encoded as class 1.
data_name = 'breast-cancer'
target_variable = '4'
breast_path = f'{data_path}/{data_name}_scale.txt'
breast_cancer, label = load_data(breast_path, target_variable=target_variable)
# Features and encoded labels go into one frame; the label is the last column.
bc = pd.DataFrame(np.column_stack([breast_cancer, label]))

# Hyperparameters forwarded to run_exp for this dataset.
beta = 0.001
lr = 0.005
latent_dim = 256
hidden_dims = [16, 32, 64, 128]
n_epochs = 2000

# Run the experiment for every method / classifier combination.
final_results = run_exp(
    data=bc,
    data_name=data_name,
    cat_idx=[],
    maj_target_name=0,
    methods=methods,
    base_model=base_models,
    device=device,
    n_epochs=n_epochs,
    hidden_dims=hidden_dims,
    latent_dim=latent_dim,
    lr=lr,
    beta=beta,
    seed=1203,
    visualize=True,
    save_path=save_path,
)

# Result keys are expected to look like "Original - DecisionTree", "SMOTE - kNN", etc.
# Each value is iterated as a metric-name -> scores mapping and the scores are averaged.
records = []
for key, metrics in final_results.items():
    # Split on the first " - " only; keys without it get the "Default" classifier.
    method_name, sep, clf_name = key.partition(" - ")
    if not sep:
        clf_name = "Default"
    for metric, values in metrics.items():
        if values:
            avg_val = np.mean(values)
        else:
            avg_val = "N/A"
        records.append((clf_name, method_name, metric, avg_val))

column_names = ("Classifier", "Method", "Metric", "Value")
res_data = {
    name: [record[pos] for record in records]
    for pos, name in enumerate(column_names)
}
res_df = pd.DataFrame(res_data)

# Write the long-format summary (one row per classifier / method / metric) to CSV.
os.makedirs(save_path, exist_ok=True)
save_file = os.path.join(save_path, f"{data_name}_results.csv")
res_df.to_csv(save_file, index=False)
print(f"\n[Saved] Final results are saved to {save_file}")

In [None]:
# ---- Dataset: diabetes ----
# Lines whose raw label equals `target_variable` are encoded as class 1.
# NOTE: this cell used to assign '-1' here but never passed it to load_data, so
# the loader's default '+1' was what actually took effect. The effective value
# is now stated and passed explicitly, so the encoding (and therefore the
# meaning of maj_target_name=1 below) is unchanged.
# TODO confirm that the '+1' class really is the majority class in this file.
data_name = 'diabetes'
target_variable = '+1'
diabetes_path = f'{data_path}/{data_name}_scale.txt'
diabetes, label = load_data(diabetes_path, target_variable=target_variable)
# Features and encoded labels go into one frame; the label is the last column.
diabetes = pd.DataFrame(np.column_stack([diabetes, label]))

# Hyperparameters forwarded to run_exp for this dataset.
n_epochs = 2000
hidden_dims = [16, 32, 64, 128]
latent_dim = 256
lr = 0.005
beta = 0.001

# Run the experiment for every method / classifier combination.
final_results = run_exp(
    data=diabetes,
    data_name=data_name,
    cat_idx=[],
    maj_target_name=1,
    methods=methods,
    base_model=base_models,
    device=device,
    n_epochs=n_epochs,
    hidden_dims=hidden_dims,
    latent_dim=latent_dim,
    lr=lr,
    beta=beta,
    seed=1203,
    visualize=True,
    save_path=save_path,
)

# Result keys are expected to look like "Original - DecisionTree", "SMOTE - kNN", etc.
# Each value is iterated as a metric-name -> scores mapping and the scores are averaged.
res_data = {
    "Classifier": [],
    "Method": [],
    "Metric": [],
    "Value": [],
}

for key, metrics in final_results.items():
    # Split on the first " - " only; keys without it get the "Default" classifier.
    if " - " in key:
        method_name, clf_name = key.split(" - ", 1)
    else:
        method_name, clf_name = key, "Default"
    for metric, values in metrics.items():
        avg_val = np.mean(values) if values else "N/A"
        res_data["Classifier"].append(clf_name)
        res_data["Method"].append(method_name)
        res_data["Metric"].append(metric)
        res_data["Value"].append(avg_val)

res_df = pd.DataFrame(res_data)

# Write the long-format summary (one row per classifier / method / metric) to CSV.
os.makedirs(save_path, exist_ok=True)
save_file = os.path.join(save_path, f"{data_name}_results.csv")
res_df.to_csv(save_file, index=False)
print(f"\n[Saved] Final results are saved to {save_file}")

In [None]:
# ---- Dataset: german ----
# Lines whose raw label equals `target_variable` are encoded as class 1.
# The value is passed to load_data explicitly (as the australian and
# breast-cancer cells do) instead of relying on the loader's default.
data_name = 'german'
target_variable = '+1'
german_path = f'{data_path}/{data_name}_scale.txt'
german, label = load_data(german_path, target_variable=target_variable)
# Features and encoded labels go into one frame; the label is the last column.
german = pd.DataFrame(np.column_stack([german, label]))

# Hyperparameters forwarded to run_exp for this dataset.
n_epochs = 2000
hidden_dims = [32, 64, 128, 256]
latent_dim = 512
lr = 0.005
beta = 0.001

# Run the experiment for every method / classifier combination.
final_results = run_exp(
    data=german,
    data_name=data_name,
    cat_idx=[],
    maj_target_name=0,
    methods=methods,
    base_model=base_models,
    device=device,
    n_epochs=n_epochs,
    hidden_dims=hidden_dims,
    latent_dim=latent_dim,
    lr=lr,
    beta=beta,
    seed=1203,
    visualize=True,
    save_path=save_path,
)

# Result keys are expected to look like "Original - DecisionTree", "SMOTE - kNN", etc.
# Each value is iterated as a metric-name -> scores mapping and the scores are averaged.
res_data = {
    "Classifier": [],
    "Method": [],
    "Metric": [],
    "Value": [],
}

for key, metrics in final_results.items():
    # Split on the first " - " only; keys without it get the "Default" classifier.
    if " - " in key:
        method_name, clf_name = key.split(" - ", 1)
    else:
        method_name, clf_name = key, "Default"
    for metric, values in metrics.items():
        avg_val = np.mean(values) if values else "N/A"
        res_data["Classifier"].append(clf_name)
        res_data["Method"].append(method_name)
        res_data["Metric"].append(metric)
        res_data["Value"].append(avg_val)

res_df = pd.DataFrame(res_data)

# Write the long-format summary (one row per classifier / method / metric) to CSV.
os.makedirs(save_path, exist_ok=True)
save_file = os.path.join(save_path, f"{data_name}_results.csv")
res_df.to_csv(save_file, index=False)
print(f"\n[Saved] Final results are saved to {save_file}")