In [1]:
import os

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler

from moms_losses import MMD_est_torch
from moms_generate import transform
from moms_visualize import visualize_samples
from moms_metrics import Metrics

ModuleNotFoundError: No module named 'moms_losses'

In [None]:
# Restrict PyTorch to a single physical GPU. The variable has to be set before
# the first CUDA call, which is why it sits right after the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
# Fall back to the CPU when no CUDA device is visible (e.g. on a machine that
# has no GPU with that index) instead of failing later on the first
# `device=device` call.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
# Synthetic 2-D imbalanced datasets (majority labelled 0, minority labelled 1).
def _gd_even_sizes(total, n_parts):
    """Split ``total`` into ``n_parts`` near-equal sizes that sum exactly to ``total``."""
    base, extra = divmod(total, n_parts)
    return [base + (1 if part < extra else 0) for part in range(n_parts)]


def _gd_gaussian_blobs(centers, sizes):
    """Draw one unit-covariance Gaussian blob per ``(center, size)`` pair and stack them."""
    identity = [[1, 0], [0, 1]]
    return np.vstack([
        np.random.multivariate_normal(center, identity, size)
        for center, size in zip(centers, sizes)
    ])


def _gd_append_outliers(X_major, n_major):
    """Append uniform outliers on [-10, 10]^2, about 5% of ``n_major`` (at least one)."""
    n_outliers = max(1, n_major // 20)
    outliers = np.random.uniform(-10, 10, size=(n_outliers, 2))
    return np.vstack([X_major, outliers])


def generate_datasets(n_samples, imbalance_ratio, dataset_type="gaussian_vs_ring",
                      random_state=1203, add_outliers=False):
    """Generate a two-class, two-feature synthetic dataset.

    Parameters
    ----------
    n_samples : int
        Total number of samples (majority + minority, outliers excluded).
    imbalance_ratio : float
        Majority:minority ratio; the majority class receives
        ``round(n_samples * r / (1 + r))`` samples and the minority the rest.
    dataset_type : str
        One of ``"gaussian_vs_ring"``, ``"gaussian_vs_exponential"``, ``"xo"``
        or ``"clustered_minority"``.
    random_state : int
        Seed passed to ``np.random.seed`` (this reseeds NumPy's global RNG).
    add_outliers : bool, default False
        When True, extra majority points drawn uniformly from [-10, 10]^2 are
        appended to ``X_major`` (``max(1, n_major // 20)`` of them). This is the
        variant that previously sat as unreachable code after the ``return``.

    Returns
    -------
    X_major, X_minor : ndarray of shape (n, 2)
    labels : ndarray
        Zeros for every row of ``X_major`` followed by ones for ``X_minor``.

    Raises
    ------
    ValueError
        If ``dataset_type`` is not one of the supported names.
    """
    np.random.seed(random_state)
    n_major = round(n_samples * imbalance_ratio / (1 + imbalance_ratio))
    n_minor = n_samples - n_major
    identity = [[1, 0], [0, 1]]

    if dataset_type == "gaussian_vs_ring":
        # gaussian : majority samples
        X_major = np.random.multivariate_normal([0, 0], identity, n_major)
        if add_outliers:
            X_major = _gd_append_outliers(X_major, n_major)

        # ring : minority samples (radius 2 +/- 0.5)
        angles = np.random.uniform(0, 2 * np.pi, n_minor)
        radii = 2 + np.random.uniform(-0.5, 0.5, n_minor)
        X_minor = np.stack([radii * np.cos(angles), radii * np.sin(angles)], axis=1)
    elif dataset_type == "gaussian_vs_exponential":
        # gaussian : majority samples
        X_major = np.random.multivariate_normal([0, 0], identity, n_major)
        if add_outliers:
            X_major = _gd_append_outliers(X_major, n_major)

        # exponential : minority samples
        X_minor = np.random.exponential(scale=1.0, size=(n_minor, 2))
    elif dataset_type == "xo":
        # Two blobs per class on opposite diagonals. The sizes are split so
        # that they add up to the class total; plain `n // 2` twice silently
        # dropped one sample whenever the class count was odd.
        X_minor = _gd_gaussian_blobs([(-5, -5), (5, 5)], _gd_even_sizes(n_minor, 2))
        X_major = _gd_gaussian_blobs([(3, -3), (-3, 3)], _gd_even_sizes(n_major, 2))
        if add_outliers:
            X_major = _gd_append_outliers(X_major, n_major)
    elif dataset_type == "clustered_minority":
        # Majority samples follow a 2d uniform distribution
        X_major = np.random.uniform(-4, 4, size=(n_major, 2))
        if add_outliers:
            X_major = _gd_append_outliers(X_major, n_major)

        # Minority samples form four clusters with different sample sizes;
        # the third cluster absorbs the rounding remainder.
        third, quarter, fifth = n_minor // 3, n_minor // 4, n_minor // 5
        cluster_sizes = [third, quarter, n_minor - third - quarter - fifth, fifth]
        cluster_centers = [
            (-4, 4),   # Left top
            (4, -4),   # Right bottom
            (-4, -4),  # Left bottom
            (4, 4),    # Right top
        ]
        X_minor = _gd_gaussian_blobs(cluster_centers, cluster_sizes)
    else:
        raise ValueError("Invalid dataset type.")

    labels = np.hstack((np.zeros(len(X_major)), np.ones(len(X_minor))))
    return X_major, X_minor, labels


# ---- Experiment configuration ------------------------------------------------
# Data
input_dim = 2
imb_ratio = 30
n_train = 2000  # Total sample size
n_test = n_train // 10
te_imb_ratio = imb_ratio
# Optimisation
n_epochs = 2000
lr = 0.01
bs = n_train
bw = 1.0
eps = 0.2
# Repetition / reproducibility
n_runs = 1
seed = 1203

# Dataset types to visualize (narrow this list, e.g. to ["xo"], for a quick look)
dataset_types = ["gaussian_vs_ring", "gaussian_vs_exponential", "xo", "clustered_minority"]

# ---- Preview: one scatter plot per dataset type ------------------------------
for dataset_type in dataset_types:
    print(f"Visualizing {dataset_type} dataset...")
    X_maj, X_min, labels = generate_datasets(
        n_samples=n_train, imbalance_ratio=imb_ratio,
        dataset_type=dataset_type, random_state=seed,
    )

    plt.figure(figsize=(8, 6))
    for class_points, class_color, class_label in (
        (X_maj, 'red', "Majority"),
        (X_min, 'blue', "Minority"),
    ):
        plt.scatter(class_points[:, 0], class_points[:, 1],
                    color=class_color, alpha=0.6, label=class_label)
    pretty_name = dataset_type.replace('_', ' ').capitalize()
    plt.title(f"{pretty_name} Dataset")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# Classifiers evaluated on every resampled training set. All stochastic
# estimators share the same fixed random state.
clf_seed = 1203

rf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=clf_seed)
knn = KNeighborsClassifier(n_neighbors=5)
log_reg = LogisticRegression(max_iter=1000, random_state=clf_seed)
xgb = XGBClassifier(n_estimators=100, max_depth=None, eval_metric="logloss", random_state=clf_seed)
svm = SVC(kernel="rbf", probability=True, random_state=clf_seed)

# Display name -> estimator, in the order they are reported.
models = dict(zip(
    ["Random Forest", "KNN", "Logistic Regression", "XGBoost", "SVM"],
    [rf, knn, log_reg, xgb, svm],
))

# Datasets to benchmark (narrow this list, e.g. to ["xo"], for a quick run)
dataset_types = ["clustered_minority", "gaussian_vs_ring", "xo", "gaussian_vs_exponential"]

# Resampling strategies compared in the experiment
methods = [
    "Original",
    "SMOTE",
    "ADASYN",
    "Borderline-SMOTE",
    "Random Oversampling",
    "Trans(Direct)",
    "Trans(Noise)",
]

# Test-set imbalance ratios to evaluate
te_imb_ratios = [imb_ratio]

In [None]:
from sklearn.neighbors import NearestNeighbors

k = 5  # number of neighbours inspected around each minority sample

# Everything below works on plain NumPy arrays: `X_min` / `X_maj` come from
# `generate_datasets` as ndarrays, so tensor-only attributes such as `.device`
# and tensor-based indexing must not be used on them.
X_min_np = np.asarray(X_min)
X_maj_np = np.asarray(X_maj)
n_min = len(X_min_np)

# Minority rows come first, so a neighbour index >= n_min is a majority sample.
X_all_np = np.vstack([X_min_np, X_maj_np])
X_all = torch.tensor(X_all_np)  # tensor copy kept under the original name

# Step 1: k nearest neighbours of every minority sample within the full data.
nn = NearestNeighbors(n_neighbors=k + 1, algorithm='auto', n_jobs=-1)
nn.fit(X_all_np)
_, nn_idx = nn.kneighbors(X_min_np)
# Drop the closest hit: each query point is part of the fitted data, so
# (barring exact duplicates) that hit is the point itself.
nn_idx = nn_idx[:, 1:]

# Step 2: Identify DANGER set and refined DANGER set
majority_counts = (nn_idx >= n_min).sum(axis=1)
# DANGER set: more than half of the k neighbours are majority class
danger_mask = np.flatnonzero(majority_counts > (k // 2))
# Refined DANGER set: all k neighbours are majority class
refined_danger_mask = np.flatnonzero(majority_counts == k)

# Integer index arrays select correctly even when a set is empty
# (the result then has shape (0, n_features)).
danger_set = X_min_np[danger_mask]
refined_danger_set = X_min_np[refined_danger_mask]

# Define borderline DANGER set: DANGER set minus refined DANGER set
borderline_danger_mask = np.setdiff1d(danger_mask, refined_danger_mask)
borderline_danger_set = X_min_np[borderline_danger_mask]


In [None]:
# Overlay the danger / borderline subsets on top of both classes.
# Layers are drawn in list order, so later entries sit on top.
scatter_layers = [
    (X_maj, dict(color='red', alpha=0.5, label='Majority')),
    (X_min, dict(color='blue', alpha=0.7, label='Minority')),
    (danger_set, dict(color='skyblue', alpha=0.5, label='Danger Set')),
    (borderline_danger_set, dict(color='yellow', alpha=0.7, edgecolor='black',
                                 linewidth=1, label='Borderline Set')),
]
for layer_points, layer_style in scatter_layers:
    plt.scatter(layer_points[:, 0], layer_points[:, 1], **layer_style)
plt.xlabel('X1')
plt.ylabel('X2')
plt.title("Borderline Set")
plt.legend()

In [None]:
# Benchmark every resampling method with every classifier on each synthetic
# dataset, plot the resampled training sets and save the averaged metrics.
#
# NOTE(review): `apply_transformation` is not defined or imported anywhere in
# the visible part of this notebook (only `transform` is imported from
# `moms_generate`) -- confirm where it is supposed to come from.
metric_names = ["G-mean", "F1-score", "MCC", "Accuracy", "AUROC"]

# Hyper-parameters shared by the "direct" and "noise" transformation variants.
transform_kwargs = dict(
    in_dim=input_dim,
    h_dim=256,
    num_layers=10,
    loss_fn=MMD_est_torch,
    device=device,
    selection="random",
    n_epochs=n_epochs,
    beta=0.0,
    lr=lr,
    k=5,
)


def _with_generated(X_majority, X_minority, X_generated):
    """Stack majority, minority and generated rows; generated rows get the minority label (0)."""
    X_stacked = np.vstack((X_majority, X_minority, X_generated))
    y_stacked = np.hstack((
        np.ones(len(X_majority)),
        np.zeros(len(X_minority)),
        np.zeros(len(X_generated)),
    ))
    return X_stacked, y_stacked


for dataset_type in dataset_types:
    print(f"Processing dataset type: {dataset_type}")

    for te_imb_ratio in te_imb_ratios:
        print(f"Processing experiments with test imbalance ratio: {te_imb_ratio}")

        # Fresh accumulators for every (dataset, test ratio) pair, so that the
        # CSV written for one ratio never contains scores from another ratio.
        results = {
            method: {model_name: {metric: [] for metric in metric_names} for model_name in models.keys()}
            for method in methods
        }

        for run in range(n_runs):
            print(f"Run {run + 1}/{n_runs} - Generating {dataset_type} dataset...")
            seed = 1203 * te_imb_ratio + run

            # Generate training data. In this experiment the majority class is
            # labelled 1 and the minority class 0 (the labels returned by
            # generate_datasets are discarded).
            X_tr_maj, X_tr_min, _ = generate_datasets(
                n_samples=n_train,
                imbalance_ratio=imb_ratio,
                dataset_type=dataset_type,
                random_state=seed
            )
            n_maj = len(X_tr_maj)
            n_min = len(X_tr_min)
            X_tr = np.vstack((X_tr_maj, X_tr_min))
            tr_labels = np.hstack((np.ones(n_maj), np.zeros(n_min)))

            # Generate test data with the specified imbalance ratio; the seed is
            # offset so that the test draws differ from the training draws.
            X_te_maj, X_te_min, _ = generate_datasets(
                n_samples=n_test,
                imbalance_ratio=te_imb_ratio,
                dataset_type=dataset_type,
                random_state=seed + n_test
            )
            X_te = np.vstack((X_te_maj, X_te_min))
            te_labels = np.hstack((np.ones(len(X_te_maj)), np.zeros(len(X_te_min))))

            # Apply transformations
            X_tr_maj_direct, X_tr_min_direct, X_tr_trans_direct = apply_transformation(
                X_tr_maj, X_tr_min, method="direct", seed=seed, **transform_kwargs
            )
            X_tr_maj_noise, X_tr_min_noise, X_tr_trans_noise = apply_transformation(
                X_tr_maj, X_tr_min, method="noise", seed=seed, **transform_kwargs
            )

            # Training set produced by each method.
            datasets = {
                "Original": (X_tr, tr_labels),
                "SMOTE": SMOTE(random_state=seed).fit_resample(X_tr, tr_labels),
                "ADASYN": ADASYN(random_state=seed).fit_resample(X_tr, tr_labels),
                "Borderline-SMOTE": BorderlineSMOTE(random_state=seed).fit_resample(X_tr, tr_labels),
                "Random Oversampling": RandomOverSampler(random_state=seed).fit_resample(X_tr, tr_labels),
                "Trans(Direct)": _with_generated(X_tr_maj_direct, X_tr_min_direct, X_tr_trans_direct),
                "Trans(Noise)": _with_generated(X_tr_maj_noise, X_tr_min_noise, X_tr_trans_noise),
            }

            # Evaluate each model and method
            for method, (X_train, y_train) in datasets.items():
                for model_name, model in models.items():
                    model.fit(X_train, y_train)

                    y_pred = model.predict(X_te)
                    y_pred_prob = model.predict_proba(X_te)[:, 1] if hasattr(model, "predict_proba") else None

                    # Calculate metrics
                    metrics = Metrics(te_labels, y_pred, y_pred_prob)
                    scores = results[method][model_name]
                    scores["G-mean"].append(metrics.g_mean())
                    scores["F1-score"].append(metrics.f1_score())
                    scores["MCC"].append(metrics.mcc())
                    scores["Accuracy"].append(metrics.accuracy())
                    if y_pred_prob is not None:
                        scores["AUROC"].append(metrics.roc_auc())

            # Visualize the resampled training sets of this run, one panel per method.
            fig, axes = plt.subplots(2, 4, figsize=(24, 12))
            axes = axes.ravel()  # Flatten the 2x4 grid to a 1D array for easier iteration

            # zip stops at the shorter sequence, so surplus methods are skipped.
            for ax, (method, (X_method, _)) in zip(axes, datasets.items()):
                panel_title = f"{method} - {dataset_type}"
                if method == "Trans(Direct)":
                    visualize_samples(ax, X_tr_maj_direct, X_tr_min_direct, X_tr_trans_direct, panel_title)
                elif method == "Trans(Noise)":
                    # Use the noise variant's own majority samples here
                    # (the direct variant's were plotted by mistake before).
                    visualize_samples(ax, X_tr_maj_noise, X_tr_min_noise, X_tr_trans_noise, panel_title)
                else:
                    # Resamplers return the original rows first, then the synthetic ones.
                    generated_samples = X_method[len(X_tr):]
                    visualize_samples(ax, X_tr_maj, X_tr_min, generated_samples, panel_title)

            # Hide the panels that have no method assigned.
            for unused_ax in axes[len(datasets):]:
                unused_ax.set_visible(False)

            # Adjust layout and display the plot
            plt.tight_layout()
            plt.show()

        # Summarize results: average every metric across runs.
        result_summary = {}
        for method, method_results in results.items():
            summary = {}
            for model_name, metric_values in method_results.items():
                for metric_name, values in metric_values.items():
                    avg_value = np.round(np.mean(values), 4) if values else "Not available"
                    summary[f"{model_name}_{metric_name}"] = avg_value
            result_summary[method] = summary

        # Convert result_summary to a DataFrame (one row per method)
        df = pd.DataFrame.from_dict(result_summary, orient="index")

        # Save to CSV, creating the output directory on first use.
        output_path = f"results/simulation/{dataset_type}_te_imb_{te_imb_ratio}_results.csv"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df.to_csv(output_path, index=True)
        print(f"Results saved to {output_path}")

In [None]:
# Number of distinct values in each column of the directly transformed samples
# left over from the last experiment run (presumably a check for collapsed /
# duplicated generator outputs).
pd.DataFrame(X_tr_trans_direct).nunique()

In [None]:
# Toy example of the neighbour-based filter on hand-written index lists.
# Each row is [sample itself, neighbour 1, ..., neighbour 4].
indices = [
    [0, 101, 102, 200, 201],  # Sample 0
    [1, 100, 201, 202, 200],  # Sample 1
    [2, 200, 201, 202, 100],  # Sample 2
]

maj_indices = [100, 101, 102]    # ids that stand for majority samples
trans_indices = [200, 201, 202]  # ids that stand for transformed samples

filtered_trans_samples = []
for sample_id, neighbor_row in enumerate(indices):
    others = set(neighbor_row[1:])  # skip the first entry (the sample itself)
    majority_count = len(others.intersection(maj_indices))
    n_transformed = len(others.intersection(trans_indices))
    # Reject a sample with more than one majority neighbour, or whose
    # neighbours are all transformed samples.
    if majority_count > 1 or n_transformed == len(others):
        continue
    filtered_trans_samples.append(sample_id)

print("Filtered indices:", filtered_trans_samples)


In [None]:
# Leftover inspection: displays whatever value the most recently executed cell
# assigned to `majority_count` (it is a loop variable, so only the value from
# the last loop iteration survives).
majority_count

In [None]:
from sklearn.neighbors import NearestNeighbors

# Row layout of the combined data: [minority | majority | transformed].
X_trans_candidates = np.asarray(X_tr_trans_direct)
combined_data = np.vstack([X_tr_min, X_tr_maj, X_trans_candidates])  # Include existing X_trans samples
nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm='auto').fit(combined_data)
_, indices = nbrs.kneighbors(X_trans_candidates)

# Index ranges of the three groups inside `combined_data`.
min_indices = np.arange(len(X_tr_min))
maj_indices = np.arange(len(X_tr_min), len(X_tr_min) + len(X_tr_maj))
trans_indices = np.arange(len(X_tr_min) + len(X_tr_maj), combined_data.shape[0])

# Skip the first hit of every row: each query point is part of the fitted data,
# so (barring exact duplicates) that hit is the sample itself.
neighbor_idx = indices[:, 1:]

# Reject a transformed sample when its neighbours are exclusively majority
# samples or exclusively other transformed samples. np.isin checks all rows at
# once instead of rebuilding a Python set from the index arrays for every row.
only_majority = np.isin(neighbor_idx, maj_indices).all(axis=1)
only_transformed = np.isin(neighbor_idx, trans_indices).all(axis=1)
keep_mask = ~(only_majority | only_transformed)

# Boolean indexing keeps the (n_kept, n_features) shape even when nothing is kept.
X_trans_samples = X_trans_candidates[keep_mask]
filtered_trans_samples = list(X_trans_samples)

In [None]:
# Filtering samples: keep a transformed sample only if at least one of its
# nearest neighbours is an original minority sample.
from sklearn.neighbors import NearestNeighbors

# Row layout of the combined data: [majority | minority | transformed].
combined_data = np.vstack([X_tr_maj, X_tr_min, X_tr_trans_direct])
nbrs = NearestNeighbors(n_neighbors=5 + 1, algorithm='auto').fit(combined_data)
_, indices = nbrs.kneighbors(X_tr_trans_direct)

# Minority rows sit directly after the majority rows. The offsets must come
# from the same arrays that were stacked above (X_tr_maj / X_tr_min); the
# preview-cell arrays X_maj / X_min can have different lengths.
min_indices = np.arange(len(X_tr_min)) + len(X_tr_maj)

# Retain samples where at least one neighbor belongs to the minority rows
# (column 0 is skipped because it is normally the query sample itself).
keep_mask = np.any(np.isin(indices[:, 1:], min_indices), axis=1)
X_trans_samples = X_tr_trans_direct[keep_mask]

In [None]:
# Number of transformed samples produced by the "direct" method in the last run.
print(len(X_tr_trans_direct))

In [None]:
# Class-size gap of the last training set (majority minus minority) ...
print(len(X_tr_maj) - len(X_tr_min))
# ... next to the number of transformed samples that survived the filtering.
len(X_trans_samples)

In [None]:
# Define the generate_datasets function
# NOTE(review): this redefinition replaces any `generate_datasets` defined
# earlier in the kernel. Here the majority class is labelled 1 and the
# minority class 0.
def generate_datasets(n_samples, imbalance_ratio, dataset_type="gaussian_vs_ring", random_state=1203):
    """Generate a two-class, two-feature synthetic dataset.

    Parameters
    ----------
    n_samples : int
        Total number of samples.
    imbalance_ratio : float
        Values in [0, 1] are the *fraction* of samples given to the majority
        class (``n_major = round(n_samples * imbalance_ratio)``). Values above
        1 are read as a majority:minority ratio
        (``n_major = round(n_samples * r / (1 + r))``); such values previously
        produced a negative minority count and crashed inside NumPy.
    dataset_type : str
        One of ``"gaussian_vs_ring"``, ``"gaussian_vs_spiral"``,
        ``"gaussian_vs_exponential"``, ``"clustered_gaussian"`` or
        ``"clustered_minority"``.
    random_state : int
        Seed passed to ``np.random.seed`` (this reseeds NumPy's global RNG).

    Returns
    -------
    X_major, X_minor : ndarray of shape (n, 2)
    labels : ndarray
        Ones for every row of ``X_major`` followed by zeros for ``X_minor``.

    Raises
    ------
    ValueError
        If ``imbalance_ratio`` is negative or ``dataset_type`` is unknown.
    """
    if imbalance_ratio < 0:
        raise ValueError("imbalance_ratio must be non-negative.")

    np.random.seed(random_state)
    if imbalance_ratio > 1:
        n_major = round(n_samples * imbalance_ratio / (1 + imbalance_ratio))
    else:
        n_major = round(n_samples * imbalance_ratio)
    n_minor = n_samples - n_major
    identity = [[1, 0], [0, 1]]

    if dataset_type == "gaussian_vs_ring":
        # Minority: standard Gaussian; majority: ring of radius 3 +/- 0.5.
        X_minor = np.random.multivariate_normal([0, 0], identity, n_minor)
        angles = np.random.uniform(0, 2 * np.pi, n_major)
        radii = 3 + np.random.uniform(-0.5, 0.5, n_major)
        X_major = np.stack([radii * np.cos(angles), radii * np.sin(angles)], axis=1)
    elif dataset_type == "gaussian_vs_spiral":
        # Minority: standard Gaussian; majority: spiral whose radius equals its angle.
        X_minor = np.random.multivariate_normal([0, 0], identity, n_minor)
        theta = np.sqrt(np.random.uniform(0, 4 * np.pi, n_major)) * 2
        X_major = np.stack([theta * np.cos(theta), theta * np.sin(theta)], axis=1)
    elif dataset_type == "gaussian_vs_exponential":
        X_minor = np.random.multivariate_normal([0, 0], identity, n_minor)
        X_major = np.random.exponential(scale=1.0, size=(n_major, 2))
    elif dataset_type == "clustered_gaussian":
        # Two blobs per class on opposite diagonals. The first blob takes the
        # odd sample, so the class totals are respected (plain `n // 2` twice
        # dropped one sample whenever the class count was odd).
        major_half, major_extra = divmod(n_major, 2)
        X_major = np.vstack([
            np.random.multivariate_normal([-5, -5], identity, major_half + major_extra),
            np.random.multivariate_normal([5, 5], identity, major_half),
        ])
        minor_half, minor_extra = divmod(n_minor, 2)
        X_minor = np.vstack([
            np.random.multivariate_normal([5, -5], identity, minor_half + minor_extra),
            np.random.multivariate_normal([-5, 5], identity, minor_half),
        ])
    elif dataset_type == "clustered_minority":
        # Majority samples follow a uniform distribution
        X_major = np.random.uniform(-5, 5, size=(n_major, 2))

        # Minority samples form three clusters with different sample sizes;
        # the last cluster absorbs the rounding remainder.
        third, quarter = n_minor // 3, n_minor // 4
        cluster_sizes = [third, quarter, n_minor - third - quarter]
        cluster_centers = [
            [-5, 5],   # Left top
            [5, -5],   # Right bottom
            [-5, -5],  # Left bottom
        ]
        X_minor = np.vstack([
            np.random.multivariate_normal(center, identity, cluster_size)
            for center, cluster_size in zip(cluster_centers, cluster_sizes)
        ])
    else:
        raise ValueError("Invalid dataset type.")

    labels = np.hstack((np.ones(len(X_major)), np.zeros(len(X_minor))))
    return X_major, X_minor, labels

In [None]:
# Classifiers for the second experiment (larger, depth-limited ensembles).
# All stochastic estimators share the same fixed random state.
clf_seed = 1203

rf = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=clf_seed)
knn = KNeighborsClassifier(n_neighbors=5)
log_reg = LogisticRegression(max_iter=1000, random_state=clf_seed)
xgb = XGBClassifier(n_estimators=500, max_depth=10, eval_metric="logloss", random_state=clf_seed)
svm = SVC(kernel="rbf", probability=True, random_state=clf_seed)

# Display name -> estimator, in the order they are reported.
models = dict(zip(
    ["Random Forest", "KNN", "Logistic Regression", "XGBoost", "SVM"],
    [rf, knn, log_reg, xgb, svm],
))

# Datasets to benchmark
dataset_types = [
    "clustered_minority",
    "clustered_gaussian",
    "gaussian_vs_ring",
    "gaussian_vs_spiral",
    "gaussian_vs_exponential",
]

# Resampling strategies compared in the experiment
methods = [
    "Original",
    "SMOTE",
    "Borderline-SMOTE",
    "Random Oversampling",
    "Trans(Direct)",
    "Trans(Sampling)",
]

# Second benchmark: every resampling method x every classifier on each dataset,
# followed by a plot of the last run's training sets and a CSV of mean metrics.
#
# NOTE(review): `apply_transformation_with_drop` is not defined or imported
# anywhere in the visible part of this notebook -- confirm where it comes from.
metric_names = ["G-mean", "F1-score", "MCC", "Accuracy", "AUROC"]

for dataset_type in dataset_types:
    print(f"Processing dataset type: {dataset_type}")

    results = {
        method: {model_name: {metric: [] for metric in metric_names} for model_name in models.keys()}
        for method in methods
    }

    for run in range(n_runs):
        print(f"Run {run + 1}/{n_runs} - Generating {dataset_type} dataset...")
        run_seed = 1203 + run

        X_major, X_minor, labels = generate_datasets(
            n_samples=n_train,
            imbalance_ratio=imb_ratio,
            dataset_type=dataset_type,
            random_state=run_seed
        )
        # Original training set: majority labelled 1, minority labelled 0.
        X_tr = np.vstack((X_major, X_minor))
        y_tr = np.hstack((np.ones(len(X_major)), np.zeros(len(X_minor))))

        # Create test set. The seed is offset from the training seed: with an
        # identical seed the test set repeats the leading random draws of the
        # training set, i.e. the evaluation data leaks from the training data.
        test_X_major, test_X_minor, test_labels = generate_datasets(
            n_samples=n_test,
            imbalance_ratio=te_imb_ratio,
            dataset_type=dataset_type,
            random_state=run_seed + n_test
        )
        test_samples = np.vstack((test_X_major, test_X_minor))
        test_labels = np.hstack((np.ones(len(test_X_major)), np.zeros(len(test_X_minor))))

        # Apply transformations. `n_epochs` is the name defined in the
        # configuration cell; `epochs` does not exist.
        transform_kwargs = dict(
            in_dim=input_dim,
            loss_fn=MMD_est_torch,
            device="cpu",
            n_epochs=n_epochs,
            lr=lr,
            seed=run_seed,
        )
        directly_transformed = apply_transformation_with_drop(
            X_major, X_minor, method="direct", selection="random", **transform_kwargs
        )
        sampling_transformed = apply_transformation_with_drop(
            X_major, X_minor, method="sampling", **transform_kwargs
        )

        # Training set produced by each method. For the Trans variants the
        # generated rows sit between the majority and the minority rows and
        # carry the minority label (0).
        datasets = {
            "Original": (X_tr, y_tr),
            "SMOTE": SMOTE(random_state=1203).fit_resample(X_tr, y_tr),
            # "ADASYN": ADASYN(random_state=1203).fit_resample(X_tr, y_tr),
            "Borderline-SMOTE": BorderlineSMOTE(random_state=1203).fit_resample(X_tr, y_tr),
            "Random Oversampling": RandomOverSampler(random_state=1203).fit_resample(X_tr, y_tr),
            "Trans(Direct)": (np.vstack((X_major, directly_transformed, X_minor)),
                              np.hstack((np.ones(len(X_major)), np.zeros(len(directly_transformed)), np.zeros(len(X_minor))))),
            "Trans(Sampling)": (np.vstack((X_major, sampling_transformed, X_minor)),
                                np.hstack((np.ones(len(X_major)), np.zeros(len(sampling_transformed)), np.zeros(len(X_minor)))))
        }

        # Evaluate each model and method
        for method, (X_train, y_train) in datasets.items():
            for model_name, model in models.items():
                model.fit(X_train, y_train)
                y_pred = model.predict(test_samples)
                y_pred_prob = model.predict_proba(test_samples)[:, 1] if hasattr(model, "predict_proba") else None

                # Calculate metrics
                metrics = Metrics(test_labels, y_pred, y_pred_prob)
                scores = results[method][model_name]
                scores["G-mean"].append(metrics.g_mean())
                scores["F1-score"].append(metrics.f1_score())
                scores["MCC"].append(metrics.mcc())
                scores["Accuracy"].append(metrics.accuracy())
                if y_pred_prob is not None:
                    scores["AUROC"].append(metrics.roc_auc())

    # Visualize the training sets of the last run, one panel per method.
    # NOTE(review): assumes `visualize_samples` takes the target axes as its
    # first argument, as in the call `visualize_samples(ax, ...)` of the other
    # experiment cell of this notebook -- confirm against moms_visualize.
    generated_by_method = {
        "Trans(Direct)": directly_transformed,
        "Trans(Sampling)": sampling_transformed,
    }
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()
    for ax, (method, (X_method, _)) in zip(axes, datasets.items()):
        if method in generated_by_method:
            # Layout is [major | generated | minor], so a tail slice would mix
            # generated rows with original minority rows; use the generated
            # array itself.
            generated_samples = generated_by_method[method]
        else:
            # Resamplers return the original rows first, then the synthetic ones.
            generated_samples = X_method[len(X_major) + len(X_minor):]
        visualize_samples(ax, X_major, X_minor, generated_samples, f"{method} - {dataset_type}")
    # Hide the panels that have no method assigned.
    for unused_ax in axes[len(datasets):]:
        unused_ax.set_visible(False)
    plt.tight_layout()
    plt.show()

    # Average every metric across runs.
    result_summary = {}
    for method, method_results in results.items():
        summary = {}
        for model_name, metric_values in method_results.items():
            for metric_name, values in metric_values.items():
                avg_value = np.mean(values) if values else "Not available"
                summary[f"{model_name}_{metric_name}"] = avg_value
        result_summary[method] = summary

    # Convert result_summary to a DataFrame (one row per method)
    df = pd.DataFrame.from_dict(result_summary, orient="index")

    # Save to CSV, creating the output directory on first use.
    os.makedirs("results/simulation", exist_ok=True)
    df.to_csv(f"results/simulation/{dataset_type}_results2.csv", index=True)