In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Identifying the target class


In [None]:
# Name of the label column; every other column is used as a predictor
target_col = "Class"

# Work on a copy so the frame loaded as `data` is never modified by later cells
df = data.copy()

# Feature matrix: all columns except the label (no error is raised if the label is absent)
features = df.drop(target_col, axis=1, errors='ignore')


# Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset into training (80%) and testing (20%) sets
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    df[target_col],
    test_size=0.2,
    random_state=42,
    stratify=df[target_col],
)

# Overlap detection and removal of majority samples from the overlapping region

In [None]:
from itertools import combinations
from sklearn.svm import OneClassSVM
from sklearn.neighbors import NearestNeighbors


# Sorted class labels present in the full dataset
unique_classes = sorted(df[target_col].unique())

# Prepare result storage
overlap_results = []

# Adjacent class pairs (c1, c2), (c2, c3), ...; a two-class target yields exactly one pair
class_pairs = list(zip(unique_classes[:-1], unique_classes[1:]))

# Merge target column back into X_train to ensure alignment
X_train[target_col] = y_train.values

# Work on a copy so rows can be dropped without touching X_train
X_cleaned = X_train.copy()
feature_cols = [col for col in X_cleaned.columns if col != target_col]

# Flag candidate overlap instances. The detector is fitted on the feature columns only,
# so the class label does not influence which samples are flagged.
oc_svm = OneClassSVM(nu=0.07, kernel="rbf", gamma="auto")
X_cleaned["Overlap_Label"] = oc_svm.fit_predict(X_cleaned[feature_cols])

# Modified Tomek links: a majority sample is removed when it and a minority sample
# are each among the other's k nearest neighbours, and the majority sample was flagged above.
k_neighbors = 3  # Set k for nearest neighbors
for class_a, class_b in class_pairs:
    size_a = int((X_cleaned[target_col] == class_a).sum())
    size_b = int((X_cleaned[target_col] == class_b).sum())

    if size_a == 0 or size_b == 0:
        continue

    # Determine majority (negative) and minority (positive) classes
    majority_class, minority_class = (class_a, class_b) if size_a > size_b else (class_b, class_a)

    # Neighbours must be searched across BOTH classes; searching inside a single
    # class can never reveal a cross-class link.
    pair_rows = X_cleaned[X_cleaned[target_col].isin([class_a, class_b])]
    pair_features = pair_rows[feature_cols].to_numpy()
    is_majority = (pair_rows[target_col] == majority_class).to_numpy()
    # fit_predict marks samples it treats as outliers with -1.
    # NOTE(review): these are used here as the "overlap region" candidates - confirm
    # this matches the intended definition of overlap.
    in_overlap = (pair_rows["Overlap_Label"] == -1).to_numpy()

    # Calling kneighbors() without a query excludes each point from its own
    # neighbour list, so at most n - 1 neighbours can be requested.
    n_query = min(k_neighbors, len(pair_rows) - 1)
    nn_pair = NearestNeighbors(n_neighbors=n_query).fit(pair_features)
    neighbor_idx = nn_pair.kneighbors(return_distance=False)
    neighbor_sets = [set(row) for row in neighbor_idx]

    to_remove = set()
    for i in np.flatnonzero(is_majority & in_overlap):
        for j in neighbor_idx[i]:
            # Mutual neighbourhood with a minority sample => Tomek-style link
            if not is_majority[j] and i in neighbor_sets[j]:
                to_remove.add(pair_rows.index[i])
                break

    X_cleaned = X_cleaned.drop(index=list(to_remove))

# Preserve the 'Class' column in the final dataset; only the helper flag is dropped
df_without_overlap = X_cleaned.drop(columns=["Overlap_Label"], errors='ignore')

# CTGAN


In [None]:
# %pip installs into the environment of the running kernel; `!pip` may target a different interpreter
%pip install ctgan

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from ctgan import CTGAN


# Identify class distribution
class_distribution = df_without_overlap[target_col].value_counts()

# Identify the majority class and its sample count
majority_class = class_distribution.idxmax()
majority_count = class_distribution.max()

# Sampling strategy: every class smaller than 50% of the majority class
# is raised to 80% of the majority count.
sampling_strategy = {}
for cls, count in class_distribution.items():
    imbalance_ratio = count / majority_count  # Class size relative to the majority class
    if imbalance_ratio < 0.50:  # Class has fewer than half as many samples as the majority
        sampling_strategy[cls] = int(0.80 * majority_count)

# Apply CTGAN-based augmentation only where needed
if sampling_strategy:
    print("Training CTGAN for Imbalance Correction...")
    feature_frame = df_without_overlap.drop(columns=[target_col])
    # For a DataFrame, CTGAN expects the discrete columns by NAME, taken from the
    # frame that is actually passed to fit (i.e. without the target column).
    categorical_columns = list(feature_frame.select_dtypes(include=['category', 'object']).columns)

    df_synthetic_ctgan_list = []
    for cls, new_count in sampling_strategy.items():
        num_samples = int(new_count - class_distribution[cls])
        if num_samples <= 0:
            continue

        # Train on the rows of this class only, so the generated samples follow the
        # minority-class distribution. A model trained on all classes would mostly
        # produce majority-like rows that would then be mislabelled as `cls`.
        # NOTE(review): a very small class gives CTGAN little to learn from - check sample quality.
        class_rows = feature_frame[df_without_overlap[target_col] == cls]
        ctgan = CTGAN(epochs=300, batch_size=32, pac=8, generator_dim=(256, 512, 256), discriminator_dim=(256, 128, 64))
        ctgan.fit(class_rows, categorical_columns)
        synthetic_data = ctgan.sample(num_samples)

        # Align to the original column layout and assign the class label
        df_synthetic_ctgan = pd.DataFrame(synthetic_data, columns=df_without_overlap.columns)
        df_synthetic_ctgan[target_col] = cls  # Assign class label
        df_synthetic_ctgan_list.append(df_synthetic_ctgan)

    # Combine all synthetic data
    df_synthetic_ctgan_final = pd.concat(df_synthetic_ctgan_list, ignore_index=True) if df_synthetic_ctgan_list else pd.DataFrame()

    # Merge synthetic data with the original dataset
    df_balanced_ctgan = pd.concat([df_without_overlap, df_synthetic_ctgan_final], ignore_index=True)
else:
    # Nothing to augment: later cells still need df_balanced_ctgan to exist
    df_balanced_ctgan = df_without_overlap.reset_index(drop=True)


Training CTGAN for Imbalance Correction...


# Evaluation metrics


In [None]:
from sklearn.metrics import classification_report, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
from imblearn.metrics import geometric_mean_score
from scipy.stats import entropy

# Function to calculate G-Mean
def g_mean_score(y_true, y_pred):
    """Geometric mean of the per-class sensitivities (recalls).

    For two classes this is sqrt(recall_0 * recall_1); for C classes it is the
    C-th root of the product of the C recalls.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        G-Mean in [0, 1]; NaN if no class has any true sample.
    """
    cm = confusion_matrix(y_true, y_pred)
    support = cm.sum(axis=1)

    # A label that only occurs in the predictions has no true samples, so its
    # recall is undefined; leave it out instead of dividing by zero.
    has_samples = support > 0
    if not has_samples.any():
        return float("nan")

    sensitivity = cm.diagonal()[has_samples] / support[has_samples]
    # n-th root of the product, not a fixed square root, so multi-class input is handled
    g_mean = np.prod(sensitivity) ** (1.0 / sensitivity.size)
    return g_mean

# Compute Class Balance Accuracy (CBA)
def class_balance_accuracy(y_true, y_pred):
    """Class Balance Accuracy: mean over classes of c_ii / max(row_sum_i, col_sum_i).

    The numerator is the number of correctly classified samples of class i; the
    denominator is the larger of "samples that truly are class i" and "samples
    predicted as class i", so both missed and spurious predictions lower the score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        CBA in [0, 1]; NaN if there are no classes.
    """
    cm = confusion_matrix(y_true, y_pred)
    num_classes = cm.shape[0]
    if num_classes == 0:
        return float("nan")

    correct = np.diag(cm)
    # Using the diagonal (not the row maximum) ensures a class that is always
    # confused with another class contributes 0 rather than 1.
    denominator = np.maximum(cm.sum(axis=1), cm.sum(axis=0))
    per_class = np.divide(
        correct,
        denominator,
        out=np.zeros(num_classes, dtype=float),
        where=denominator > 0,
    )
    cba = per_class.sum() / num_classes
    return cba

def compute_mean_sensitivity(y_true, y_pred):
    """Return the unweighted mean of the per-class recall values.

    Recall of a class is its true positives divided by the number of samples
    that truly belong to it; a tiny constant in the denominator avoids a
    division by zero for classes without true samples.
    """
    matrix = confusion_matrix(y_true, y_pred)
    true_pos = np.diag(matrix)
    # Row total minus the diagonal = samples of the class that were missed
    false_neg = matrix.sum(axis=1) - true_pos
    per_class_recall = true_pos / (true_pos + false_neg + 1e-10)
    return per_class_recall.mean()


def confusion_entropy(y_true, y_pred):
    """Confusion Entropy (CEN) of a classification result.

    For each class j the misclassification mass is spread over the off-diagonal
    cells of row j and column j of the confusion matrix, each normalised by the
    total of row j plus column j. The entropy of those probabilities (log base
    2 * (C - 1)) is then averaged with weights proportional to the same totals.
    0 means no confusion; larger values mean more, and more evenly spread, errors.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        CEN clipped to [0, 1]; 0.0 when fewer than two classes are present.
    """
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred).astype(float)
    C = cm.shape[0]  # Number of classes
    total = cm.sum()
    if C < 2 or total == 0:
        # The logarithm base 2 * (C - 1) is undefined for a single class
        return 0.0

    # Row total + column total of each class: everything that involves class j
    class_mass = cm.sum(axis=1) + cm.sum(axis=0)

    # Class weights P_j; they sum to 1 because every cell is counted twice in class_mass
    Pj = class_mass / (2.0 * total)

    safe_mass = np.where(class_mass > 0, class_mass, 1.0)
    P_out = cm / safe_mass[:, None]    # row j: true class j predicted as k
    P_in = cm.T / safe_mass[:, None]   # row j: true class k predicted as j

    log_base = np.log(2.0 * (C - 1))

    def _neg_p_log_p(p):
        # -p * log(p) in the CEN base, with the convention 0 * log(0) = 0
        out = np.zeros_like(p)
        positive = p > 0
        out[positive] = -p[positive] * np.log(p[positive]) / log_base
        return out

    # Only misclassifications (off-diagonal cells) contribute to the entropy
    off_diagonal = ~np.eye(C, dtype=bool)
    CEN_j = ((_neg_p_log_p(P_out) + _neg_p_log_p(P_in)) * off_diagonal).sum(axis=1)

    # Compute final Confusion Entropy score
    CEN = np.sum(Pj * CEN_j)

    # With two classes the raw value can exceed 1, so keep the reported range at [0, 1]
    return np.clip(CEN, 0, 1)

# Cross-validation

In [None]:
# ---- Cross-Validation ----
def evaluate_with_cv(X, y, n_splits=5):
    """Evaluate a random forest with stratified k-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; folds are selected positionally with ``.iloc``.
    y : pandas.Series
        Class labels aligned row-by-row with ``X``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'Average'`` with the mean of each metric over all folds.
    """
    # Imported here so the function works regardless of which cells were run before it
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import StratifiedKFold

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    results = {
        'Class Balance Accuracy': [],
        'G-Mean': [],
        'Mean Sensitivity': [],
        'CEN': []
    }

    # Random Forest
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
        y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

        rf_classifier = RandomForestClassifier(n_estimators=500, random_state=42)
        rf_classifier.fit(X_fold_train, y_fold_train)
        # Predict with the classifier fitted on this fold's training part
        y_pred = rf_classifier.predict(X_fold_test)

        results['Class Balance Accuracy'].append(class_balance_accuracy(y_fold_test, y_pred))
        results['G-Mean'].append(geometric_mean_score(y_fold_test, y_pred, average='macro'))
        results['Mean Sensitivity'].append(compute_mean_sensitivity(y_fold_test, y_pred))
        results['CEN'].append(confusion_entropy(y_fold_test, y_pred))

        print(f"Fold {fold} completed.")

    return pd.DataFrame(results).mean().to_frame(name='Average').T

# ---- Run it ----
target_col = 'Class'  # change if needed

# Separate the label from the predictors of the balanced dataset
y = df_balanced_ctgan[target_col]
X = df_balanced_ctgan.drop(target_col, axis=1)

# Cross-validated averages of all metrics
final_metrics = evaluate_with_cv(X, y)
print(final_metrics)