<a href="https://colab.research.google.com/github/vibhuverma17/RECXGB/blob/main/RECXGB%20Execution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")
!git clone https://github.com/vibhuverma17/RECXGB.git

In [None]:
cd RECXGB

In [None]:
!pip install -r requirements.txt

In [None]:
# Standard Libraries
import os

# Scientific Libraries
import numpy as np
import pandas as pd
import scipy.io as scio
import matplotlib.pyplot as plt
from scipy.stats import scoreatpercentile

# Sklearn Libraries
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, roc_auc_score, roc_curve, f1_score

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

# Imbalanced Learning Libraries
from imblearn.ensemble import BalancedBaggingClassifier

# UMAP and HDBSCAN
import umap
import hdbscan

# JSON Library (assuming it's needed elsewhere)
import json


In [None]:
# Load one benchmark dataset from ./datasets (MATLAB .mat files).
# Dataset names referenced by this notebook: 'mnist', 'speech', 'arrhythmia',
# 'cardio', 'letter', 'mammography'. Change DATASET_NAME to switch.
DATASET_NAME = 'letter'
mat = scio.loadmat(os.path.join('datasets', f'{DATASET_NAME}.mat'))

X = mat['X']
# loadmat hands MATLAB matrices back as 2-D arrays, so the label vector arrives
# as a column; flatten it to the 1-D target shape scikit-learn estimators expect.
y = mat['y'].ravel()

In [None]:
def get_precn(y, y_pred):
    '''
    Utility function to calculate precision@n.

    The share of non-zero entries in ``y`` is taken as the outlier fraction. The
    cut-off is the score at the ``100 * (1 - fraction)`` percentile of ``y_pred``,
    and every sample scoring strictly above it is flagged as an outlier.

    :param y: ground truth labels (non-zero marks an outlier)
    :param y_pred: outlier scores, one per sample (a NumPy array)
    :return: precision of the flagged samples against ``y``
    '''
    # Fraction of samples that are true outliers
    contamination = np.count_nonzero(y) / len(y)

    # Score above which a sample is flagged
    cutoff = scoreatpercentile(y_pred, 100 * (1 - contamination))
    flagged = (y_pred > cutoff).astype('int')
    return precision_score(y, flagged)


class XGBoostModelEvaluator:
    """
    Repeatedly train a default ``xgb.XGBClassifier`` on random train/test splits and
    report precision@n, ROC AUC and a classification report for every run, plus the
    averages over all runs.
    """

    def __init__(self, X, y, test_size=0.40, random_seed=None):
        """
        :param X: Features of the dataset
        :param y: Target variable of the dataset
        :param test_size: Proportion of the dataset to be used for testing (default is 0.40)
        :param random_seed: If given, NumPy's global random generator is seeded with it.
            ``evaluate`` calls ``train_test_split`` without a ``random_state``, so the
            splits are drawn from that global generator.
        """
        self.X = X
        self.y = y
        self.test_size = test_size
        self.random_seed = random_seed
        if self.random_seed is not None:
            np.random.seed(self.random_seed)

    def evaluate(self, n=1):
        """
        Run ``n`` independent split/train/score rounds and print the metrics of each.

        :param n: Number of iterations to run (default is 1)
        :return: Tuple ``(results, averages)``. ``results['iterations']`` holds one dict
            per run with keys ``iteration``, ``precision`` (in percent), ``roc_auc`` and
            ``classification_report``; ``averages`` holds ``average_precision``
            (in percent) and ``average_roc_auc``.
        """
        results = {'iterations': []}

        for i in range(n):
            print(f"--- Iteration {i+1}/{n} ---")

            # Fresh random split for this run
            X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size)

            # XGBoost with default parameters
            model = xgb.XGBClassifier()
            model.fit(X_train, y_train)

            # Hard labels feed the classification report; class-1 probabilities are
            # the ranking scores for precision@n and ROC AUC.
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]

            # precision@n thresholds a continuous score at a percentile. Passing the
            # hard 0/1 labels (as this class used to) makes that threshold degenerate,
            # so the probabilities are used, matching ModelEvaluator.
            precision = get_precn(y_test, y_pred_proba)
            roc_auc = roc_auc_score(y_test, y_pred_proba)

            results['iterations'].append({
                'iteration': i + 1,
                'precision': precision * 100,
                'roc_auc': roc_auc,
                'classification_report': classification_report(y_test, y_pred, output_dict=True)
            })

            # Print results for the current iteration
            print(f"Iteration {i+1} Results:")
            print(f"  Precision: {precision * 100:.4f}%")
            print(f"  ROC AUC: {roc_auc:.4f}")
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))

        # Average each metric over the runs (precision is stored in percent)
        avg_precision = np.mean([result['precision'] / 100 for result in results['iterations']])
        avg_roc_auc = np.mean([result['roc_auc'] for result in results['iterations']])

        averages = {
            'average_precision': avg_precision * 100,
            'average_roc_auc': avg_roc_auc
        }

        print("\n--- Average Metrics After All Iterations ---")
        print(f"Average Precision: {avg_precision * 100:.4f}%")
        print(f"Average ROC AUC: {avg_roc_auc:.4f}")

        return results, averages


class ModelEvaluator:
    """
    Evaluate a classifier wrapped in a ``BalancedBaggingClassifier`` over repeated
    train/test splits, reporting precision@n, ROC AUC and a classification report.
    """

    def __init__(self, X, y, classifier, test_size=0.40, random_seed=None):
        """
        Initialize the ModelEvaluator with dataset, classifier, and optional parameters.

        :param X: Features of the dataset
        :param y: Target variable of the dataset
        :param classifier: The classifier used as base estimator of the balanced bagging ensemble
        :param test_size: Proportion of the dataset to be used for testing (default is 0.40)
        :param random_seed: Random seed for reproducibility (default is None)
        """
        self.X = X
        self.y = y
        self.classifier = classifier
        self.test_size = test_size
        self.random_seed = random_seed
        if self.random_seed is not None:
            np.random.seed(self.random_seed)

    def evaluate(self, n=1):
        """
        Evaluate the classifier over multiple iterations and print the metrics of each.

        When ``random_seed`` is set, iteration ``i`` (0-based) uses ``random_seed + i``
        for both the split and the bagging ensemble, so the run is reproducible while
        every iteration still sees a different split.

        :param n: Number of iterations to run (default is 1)
        :return: Tuple of results dictionary and averages dictionary
        """
        results = {'iterations': []}

        for i in range(n):
            print(f"--- Iteration {i+1}/{n} ---")

            # Reusing the same seed on every iteration would repeat the identical
            # split and model n times, making the averages meaningless.
            iteration_seed = None if self.random_seed is None else self.random_seed + i

            # Split the data into training and testing sets
            X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=iteration_seed)

            # Initialize and train the balanced bagging ensemble around the classifier
            model = BalancedBaggingClassifier(estimator=self.classifier, sampling_strategy='auto', replacement=False, random_state=iteration_seed)
            model.fit(X_train, y_train)

            # Hard labels for the report, class-1 probabilities as ranking scores
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]

            # Calculate evaluation metrics
            precision = get_precn(y_test, y_pred_proba)
            roc_auc = roc_auc_score(y_test, y_pred_proba)

            results['iterations'].append({
                'iteration': i + 1,
                'precision': precision * 100,
                'roc_auc': roc_auc,
                'classification_report': classification_report(y_test, y_pred, output_dict=True)
            })

            # Print results for the current iteration
            print(f"Iteration {i+1} Results:")
            print(f"  Precision: {precision * 100:.2f}%")
            print(f"  ROC AUC: {roc_auc:.2f}")
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))

        # Average each metric over the runs (precision is stored in percent)
        avg_precision = np.mean([result['precision'] / 100 for result in results['iterations']])
        avg_roc_auc = np.mean([result['roc_auc'] for result in results['iterations']])

        averages = {
            'average_precision': avg_precision * 100,
            'average_roc_auc': avg_roc_auc
        }

        print("\n--- Average Metrics After All Iterations ---")
        print(f"Average Precision: {avg_precision * 100:.4f}%")
        print(f"Average ROC AUC: {avg_roc_auc:.4f}")

        return results, averages




class RecursiveClustering:
    """
    A class to perform recursive clustering on high-dimensional data using UMAP and HDBSCAN.

    The `RecursiveClustering` class performs dimensionality reduction using UMAP in two stages and applies HDBSCAN
    clustering at each stage. Initially, it reduces the dimensionality of the input data and applies HDBSCAN
    clustering. For each cluster found, it further reduces dimensionality using UMAP again (on the original
    features of that cluster's points) and applies HDBSCAN clustering on the resulting embedding. Points that the
    initial clustering marks as noise (label -1) are embedded again but not re-clustered; their sub-cluster
    label is -1.

    Parameters:
    -----------
    initial_umap_components : int, optional, default=2
        The number of dimensions for the initial UMAP embedding.

    sub_umap_components : int, optional, default=2
        The number of dimensions for the UMAP embedding applied to sub-clusters.

    n_neighbors : int, optional, default=15
        The number of neighbors to use when computing the UMAP embedding. This parameter controls the balance
        between local versus global structure in the data.

    min_dist : float, optional, default=0.1
        The effective minimum distance between embedded points. This parameter controls how tightly UMAP is allowed
        to pack points together.

    min_cluster_size : int, optional, default=10
        The minimum size of clusters for the initial HDBSCAN clustering.

    sub_cluster_size : int, optional, default=5
        The minimum size of clusters for the sub-clustering HDBSCAN step.

    random_state : int or None, optional, default=None
        Forwarded to every UMAP instance. Leave as None for UMAP's default behaviour; set it to make the
        embeddings repeatable.

    Methods:
    --------
    fit_transform(X):
        Applies UMAP dimensionality reduction and HDBSCAN clustering in a recursive manner. The method first performs
        dimensionality reduction on the entire dataset and clusters the data using HDBSCAN. It then applies UMAP and
        HDBSCAN recursively on each initial cluster.

    get_final_data():
        Returns the final DataFrame containing the clustering results after recursive UMAP and HDBSCAN clustering.

    Example:
    ---------
    >>> rc = RecursiveClustering(initial_umap_components=3, sub_umap_components=2, n_neighbors=15, min_dist=0.1, min_cluster_size=10, sub_cluster_size=5)
    >>> rc.fit_transform(X)  # where X is your high-dimensional data
    >>> final_data = rc.get_final_data()
    """

    def __init__(self, initial_umap_components=2, sub_umap_components=2, n_neighbors=15, min_dist=0.1, min_cluster_size=10, sub_cluster_size=5, random_state=None):
        self.initial_umap_components = initial_umap_components
        self.sub_umap_components = sub_umap_components
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.min_cluster_size = min_cluster_size
        self.sub_cluster_size = sub_cluster_size
        self.random_state = random_state
        self.final_data = pd.DataFrame()

    def fit_transform(self, X):
        """
        Applies UMAP for dimensionality reduction and HDBSCAN for clustering recursively.

        This method first reduces the dimensionality of the input data `X` using UMAP and clusters the data with HDBSCAN.
        For each cluster found in the initial clustering, it performs UMAP reduction again and applies HDBSCAN clustering
        to the resulting sub-clusters.

        The result is stored in `self.final_data` with one row per input sample, **in the same order as the rows
        of `X`** (index 0..n_samples-1), so it can be placed side by side with `X` or its labels. Any result of a
        previous call is discarded.

        Parameters:
        -----------
        X : array-like, shape (n_samples, n_features)
            The input data to be clustered. Must support boolean-mask row selection (e.g. a NumPy array).

        Returns:
        --------
        None
        """
        # Discard results of any earlier call instead of appending to them
        self.final_data = pd.DataFrame()

        # Apply initial UMAP for dimensionality reduction
        initial_reducer = umap.UMAP(n_neighbors=self.n_neighbors, min_dist=self.min_dist, n_components=self.initial_umap_components, random_state=self.random_state)
        initial_embedding = initial_reducer.fit_transform(X)

        # Apply HDBSCAN for clustering on initial UMAP embeddings
        initial_clusterer = hdbscan.HDBSCAN(min_cluster_size=self.min_cluster_size)
        initial_labels = np.asarray(initial_clusterer.fit_predict(initial_embedding))

        columns = (
            [f'Initial_UMAP_{i+1}' for i in range(self.initial_umap_components)] +
            ['Initial_Cluster', 'Sub_Cluster_Label'] +
            [f'Sub_UMAP_{i+1}' for i in range(self.sub_umap_components)]
        )

        # One frame per initial cluster, each indexed by the ORIGINAL row positions of
        # its members so the pieces can be put back into input order afterwards.
        frames = []
        for label in np.unique(initial_labels):
            member_mask = initial_labels == label
            member_rows = np.flatnonzero(member_mask)

            # Extract points from the original data (X) based on initial cluster label
            cluster_points = X[member_mask]

            # Apply UMAP again to the points within this cluster
            sub_reducer = umap.UMAP(n_neighbors=self.n_neighbors, min_dist=self.min_dist, n_components=self.sub_umap_components, random_state=self.random_state)
            sub_embedding = sub_reducer.fit_transform(cluster_points)

            if label == -1:
                # Noise points are not clustered further; mark their sub-cluster as -1
                sub_labels = np.full(len(member_rows), -1)
            else:
                # Apply HDBSCAN again for clustering within this cluster's embedding
                sub_clusterer = hdbscan.HDBSCAN(min_cluster_size=self.sub_cluster_size)
                sub_labels = np.asarray(sub_clusterer.fit_predict(sub_embedding))

            frames.append(pd.DataFrame(
                np.hstack([
                    initial_embedding[member_mask],           # Initial UMAP embeddings
                    np.full((len(member_rows), 1), label),    # Initial Cluster label
                    sub_labels.reshape(-1, 1),                # Sub-cluster labels
                    sub_embedding                             # Sub-cluster UMAP embeddings
                ]),
                columns=columns,
                index=member_rows
            ))

        if frames:
            # Single concat, then restore the input row order
            self.final_data = pd.concat(frames).sort_index()

    def get_final_data(self):
        """
        Returns the final DataFrame containing the clustering results after recursive UMAP and HDBSCAN clustering.

        Returns:
        --------
        final_data : pandas.DataFrame
            The DataFrame containing the initial and sub-cluster UMAP embeddings and labels
            (empty until `fit_transform` has been called).
        """
        return self.final_data




In [None]:
### AUGMENTED DATA CREATION

# Two-stage UMAP + HDBSCAN over the raw feature matrix
rc = RecursiveClustering(
    initial_umap_components=3,
    sub_umap_components=2,
    n_neighbors=15,
    min_dist=0.1,
    min_cluster_size=10,
    sub_cluster_size=5,
)
rc.fit_transform(X)
final_data = rc.get_final_data()

# Number every distinct (initial cluster, sub-cluster) pair 0, 1, 2, ... and attach
# that number to each row as 'Unique_ID'.
unique_data = final_data[['Initial_Cluster', 'Sub_Cluster_Label']].drop_duplicates()
unique_data = unique_data.reset_index(drop=True)
unique_data = unique_data.reset_index(names='Unique_ID')
final_data = final_data.merge(unique_data, how='left', on=['Initial_Cluster', 'Sub_Cluster_Label'])

In [None]:
# Plot the first two components of the initial UMAP embedding, coloured by the
# Unique_ID of each (initial cluster, sub-cluster) pair.
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(final_data['Initial_UMAP_1'], final_data['Initial_UMAP_2'], c=final_data['Unique_ID'].values, cmap='Spectral', s=10, alpha=0.5)
ax.set_title('Initial UMAP Embedding with HDBSCAN Sub-Clusters')
# The axes show the Initial_UMAP_* columns (the labels previously said Sub_UMAP_*)
ax.set_xlabel('Initial_UMAP_1')
ax.set_ylabel('Initial_UMAP_2')
fig.colorbar(points, ax=ax, label='Unique_ID')
plt.show()

In [None]:
# Build the augmented feature matrix: initial-UMAP coordinates and one-hot encoded
# cluster / sub-cluster labels placed next to the raw features.
label_cols = ['Initial_Cluster', 'Sub_Cluster_Label']

# The column-wise concat below pairs rows positionally, so the clustering output must
# hold exactly one row per sample. (Rows are deliberately NOT de-duplicated here:
# dropping rows would leave gaps that turn into NaN rows after the concat.)
assert len(final_data) == len(X), "final_data must contain exactly one row per sample of X"
# NOTE(review): the pairing also assumes final_data rows are in the same order as the
# rows of X -- confirm that RecursiveClustering preserves sample order.

cluster_features = (
    final_data[[col for col in final_data.columns if 'Initial_UMAP' in col or col in label_cols]]
    .astype({col: 'category' for col in label_cols})
    .reset_index(drop=True)
)
X_aug = pd.concat([cluster_features, pd.DataFrame(X)], axis=1)
X_aug = pd.get_dummies(X_aug, columns=label_cols, drop_first=True)
X_aug.columns = X_aug.columns.astype(str)
assert len(X_aug) == len(X)
print(X_aug.shape)


### CHECK FOR AUGMENTED DATA -- RESULT SHOULD BE TRUE
# Within every initial cluster the sub-cluster labels must be consecutive integers,
# i.e. (max - min + 1) equals the number of distinct labels.
label_values = final_data[label_cols].astype(float)
check = label_values.groupby('Initial_Cluster')['Sub_Cluster_Label'].agg(['min', 'max', 'nunique']).reset_index(drop=True)
check.columns = ['1', '2', '3']

check['4'] = check['2'] - check['1'] + 1
labels_contiguous = bool((check['4'] == check['3']).all())
assert labels_contiguous, "sub-cluster labels are not contiguous within every initial cluster"
labels_contiguous

In [None]:
# XGBOOST EVAL
# Plain XGBoost on the raw features, 50 random train/test splits.
evaluator = XGBoostModelEvaluator(X=X, y=y)
results, averages = evaluator.evaluate(50)

In [11]:
# AUG DATA XGBOOST EVAL
# Plain XGBoost on the cluster-augmented features, 50 random train/test splits.
evaluator = XGBoostModelEvaluator(X=X_aug, y=y)
results, averages = evaluator.evaluate(50)

KeyboardInterrupt: 

In [None]:
# Balanced Bagging
# XGBoost inside a balanced bagging ensemble on the raw features, 50 iterations.
base_classifier = xgb.XGBClassifier()
evaluator = ModelEvaluator(X=X, y=y, classifier=base_classifier)
results, averages = evaluator.evaluate(50)

In [None]:
# Aug Balanced Bagging
# XGBoost inside a balanced bagging ensemble on the cluster-augmented features.
base_classifier = xgb.XGBClassifier()
evaluator = ModelEvaluator(X=X_aug, y=y, classifier=base_classifier)

# Run evaluation (50 iterations)
results, averages = evaluator.evaluate(50)