<a href="https://colab.research.google.com/github/tejash09/K-means/blob/main/bottom_up.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
# 1st try
```



In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

@njit(parallel=True)
def _fast_distances(X, centroids):
    """Compute squared Euclidean distances with Numba - highly optimized version."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids (constant for all data points)
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Pre-compute squared norms of X
    x_norms = np.zeros(n_samples, dtype=np.float64)
    for i in range(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    # Use ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩ identity for faster computation
    for i in prange(n_samples):
        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norms[i] + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit
def _assign_labels_numba(X, centroids):
    """Assign labels to data points based on nearest centroid."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    labels = np.zeros(n_samples, dtype=np.int32)
    min_distances = np.full(n_samples, np.inf)

    for i in range(n_samples):
        for j in range(n_clusters):
            dist = 0.0
            for k in range(X.shape[1]):
                diff = X[i, k] - centroids[j, k]
                dist += diff * diff
            if dist < min_distances[i]:
                min_distances[i] = dist
                labels[i] = j

    return labels, min_distances

@njit
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels."""
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Sum points in each cluster
    for i in range(X.shape[0]):
        cluster_id = labels[i]
        counts[cluster_id] += 1
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

# =============================================================================
# Optimized Bottom-Up KMeans Implementation
# =============================================================================
class BottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation with a bottom-up approach, gradually adding points
    to the calculation until convergence.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor=0.1,
                 batch_growth_factor=15, verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor  # For early convergence detection
        self.n_init = n_init  # Run algorithm multiple times and select best
        self.iteration_table_ = []

    def _initialize_centroids(self, X, seed=None):
        """Enhanced centroid initialization with multiple methods."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Stratified random selection for better coverage
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Divide the data range into strata for more uniform coverage
            sample_indices = []
            if n_samples >= self.n_clusters * 10:  # If we have enough samples
                # Use approximate quantiles to divide data
                from sklearn.preprocessing import KBinsDiscretizer

                # Use feature with highest variance for stratification
                feature_var = np.var(X, axis=0)
                stratify_feature = np.argmax(feature_var)

                discretizer = KBinsDiscretizer(n_bins=min(self.n_clusters, 20),
                                              encode='ordinal', strategy='quantile')
                strata = discretizer.fit_transform(X[:, stratify_feature].reshape(-1, 1)).astype(int).flatten()
                unique_strata = np.unique(strata)

                # Select points from different strata
                for stratum in unique_strata:
                    stratum_indices = np.where(strata == stratum)[0]
                    if len(stratum_indices) > 0:
                        idx = random_state.choice(stratum_indices,
                                                 size=min(max(1, self.n_clusters // len(unique_strata)),
                                                         len(stratum_indices)),
                                                 replace=False)
                        sample_indices.extend(idx)

            # If we couldn't get enough from stratification, add random ones
            if len(sample_indices) < self.n_clusters:
                remaining = self.n_clusters - len(sample_indices)
                avail_indices = list(set(range(n_samples)) - set(sample_indices))
                if avail_indices:
                    additional = random_state.choice(avail_indices,
                                                   size=min(remaining, len(avail_indices)),
                                                   replace=False)
                    sample_indices.extend(additional)

            # If we still don't have enough, allow repeats
            if len(sample_indices) < self.n_clusters:
                remaining = self.n_clusters - len(sample_indices)
                additional = random_state.choice(n_samples, size=remaining, replace=True)
                sample_indices.extend(additional)

            # Trim to exactly k clusters
            sample_indices = sample_indices[:self.n_clusters]
            return X[sample_indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ with vectorization
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()

            # Use vectorized operations for faster distance calculations
            for c in range(1, self.n_clusters):
                # Calculate squared distances to closest centroid
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    closest_dist_sq = np.min(np.sum((X[:, np.newaxis, :] -
                                              centroids[np.newaxis, :c, :])**2, axis=2), axis=1)

                # Select next centroid with probability proportional to square distance
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    next_centroid_idx = random_state.choice(n_samples, p=probs)
                else:
                    next_centroid_idx = random_state.randint(n_samples)

                centroids[c] = X[next_centroid_idx].copy()

            return centroids

        elif self.init == 'k-means++-fast':
            # Faster approximate k-means++ using subsampling for large datasets
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Subsample for very large datasets
            subsample_limit = 10000
            if n_samples > subsample_limit:
                subsample_indices = random_state.choice(n_samples,
                                                     size=subsample_limit,
                                                     replace=False)
                X_subset = X[subsample_indices]
            else:
                X_subset = X
                subsample_indices = np.arange(n_samples)

            # Choose first centroid randomly from subset
            first_idx = random_state.randint(len(X_subset))
            centroids[0] = X_subset[first_idx].copy()

            # Initialize distances array once
            subset_size = len(X_subset)
            closest_dist_sq = np.zeros(subset_size)

            for c in range(1, self.n_clusters):
                # Update distances for new centroid
                for i in range(subset_size):
                    dist = np.sum((X_subset[i] - centroids[c-1])**2)
                    if c == 1 or dist < closest_dist_sq[i]:
                        closest_dist_sq[i] = dist

                # Select next centroid
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    subset_idx = random_state.choice(subset_size, p=probs)
                    centroids[c] = X_subset[subset_idx].copy()
                else:
                    # If all points are identical to centroids, pick randomly
                    subset_idx = random_state.randint(subset_size)
                    centroids[c] = X_subset[subset_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        """Compute distances with vectorization and Numba."""
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            # Vectorized implementation for non-Numba case
            # Reshape to allow broadcasting
            expanded_X = X[:, np.newaxis, :]
            expanded_centroids = centroids[np.newaxis, :, :]

            # Calculate squared distances
            squared_diff = (expanded_X - expanded_centroids) ** 2
            squared_distances = np.sum(squared_diff, axis=2)

            return squared_distances

    def _select_next_batch(self, X, current_active, distances, batch_size, iteration):
        """
        Enhanced batch selection strategy with adaptive sampling.
        Selects points that are likely to improve centroid positions the most.
        """
        n_samples = X.shape[0]
        inactive_mask = np.ones(n_samples, dtype=bool)
        inactive_mask[current_active] = False
        inactive_indices = np.where(inactive_mask)[0]

        if len(inactive_indices) == 0:
            return np.array([])

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        labels = np.argmin(inactive_distances, axis=1)

        # Calculate multiple selection criteria
        # 1. Uncertainty: difference between closest and second closest centroid
        if self.n_clusters > 1:
            sorted_distances = np.sort(inactive_distances, axis=1)
            uncertainty = sorted_distances[:, 1] - sorted_distances[:, 0]
            uncertainty = uncertainty / (np.max(uncertainty) + 1e-10)
        else:
            uncertainty = np.zeros(len(inactive_indices))

        # 2. Representativeness: distance to closest centroid
        closest_distances = np.min(inactive_distances, axis=1)
        if np.max(closest_distances) > np.min(closest_distances):
            representativeness = (closest_distances - np.min(closest_distances)) / (np.max(closest_distances) - np.min(closest_distances) + 1e-10)
        else:
            representativeness = np.zeros_like(closest_distances)

        # 3. Cluster balance: prioritize points from underrepresented clusters
        cluster_counts = np.bincount(labels, minlength=self.n_clusters)
        balance_score = np.zeros(len(inactive_indices))
        for i, label in enumerate(labels):
            if cluster_counts[label] > 0:
                balance_score[i] = 1.0 / cluster_counts[label]
        balance_score = balance_score / (np.max(balance_score) + 1e-10)

        # Adaptive weighting of criteria based on iteration
        # Early: focus on representativeness (exploration)
        # Late: focus on uncertainty (refinement)
        exploration_weight = max(0, 1 - iteration / self.max_iterations)
        refinement_weight = 1 - exploration_weight

        # Combine criteria with adaptive weights
        combined_score = (
            exploration_weight * representativeness +
            refinement_weight * uncertainty +
            0.2 * balance_score  # Fixed weight for balance
        )

        # Select best points according to combined score
        n_to_select = min(batch_size, len(inactive_indices))
        selected_idx = np.argsort(-combined_score)[:n_to_select]  # Descending order

        return inactive_indices[selected_idx]

    def _run_single_kmeans(self, X, seed=None):
        """Run a single instance of the bottom-up K-means algorithm."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        inertia = 0.0
        n_iter = 0
        iteration_table = []

        # Compute initial distances and labels
        distances = self._compute_distances(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize batch size and active set
        initial_batch_size = max(int(n_samples * self.batch_size_factor), self.n_clusters * 3)

        # Initial active set with points closest to centroids and some distant points
        min_distances = np.min(distances, axis=1)
        closest_points = np.argsort(min_distances)[:initial_batch_size // 2]
        farthest_points = np.argsort(min_distances)[-initial_batch_size // 2:]
        active_indices = np.concatenate([closest_points, farthest_points])
        active_indices = np.unique(active_indices)  # Remove any duplicates

        # For tracking stability across iterations
        prev_inertia = float('inf')
        stability_counter = 0

        for iteration in range(self.max_iterations):
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            if len(active_indices) > 0:
                active_X = X[active_indices]
                active_labels = labels[active_indices]

                # Update centroids based on active points
                if NUMBA_AVAILABLE:
                    new_centroids, counts = _update_centroids_numba(active_X, active_labels, self.n_clusters)
                    # Handle empty clusters
                    for k in range(self.n_clusters):
                        if counts[k] == 0:
                            # Find the point furthest from its centroid
                            active_distances = distances[active_indices]
                            furthest_point_idx = np.argmax(np.min(active_distances, axis=1))
                            new_centroids[k] = active_X[furthest_point_idx]
                else:
                    new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                    for k in range(self.n_clusters):
                        cluster_mask = (active_labels == k)
                        if np.any(cluster_mask):
                            new_centroids[k] = np.mean(active_X[cluster_mask], axis=0)
                        else:
                            # For empty clusters, use old centroid or find a new representative
                            if iteration > 0:
                                new_centroids[k] = old_centroids[k]
                            else:
                                # First iteration, just pick a random point
                                random_idx = random_state.randint(len(active_X))
                                new_centroids[k] = active_X[random_idx]

                centroids = new_centroids

            # Compute distances and reassign labels
            distances = self._compute_distances(X, centroids)
            new_labels = np.argmin(distances, axis=1)

            # Calculate change in active points
            if len(active_indices) > 0:
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
            else:
                active_changed = 0
                active_changed_pct = 0

            labels = new_labels

            # Calculate inertia (sum of squared distances to closest centroid)
            min_distances = np.min(distances, axis=1)

            # Calculate inertia for active points only to monitor convergence
            active_inertia = np.sum(min_distances[active_indices]) if len(active_indices) > 0 else 0

            # Adaptive batch size growth based on convergence behavior
            convergence_factor = 1.0
            if iteration > 0 and prev_inertia > 0:
                inertia_change = abs(active_inertia - prev_inertia) / prev_inertia
                # If inertia change is small, we're converging, so grow batch size faster
                if inertia_change < 0.01:
                    convergence_factor = 1.5  # Accelerate batch growth
                elif inertia_change > 0.1:
                    convergence_factor = 0.8  # Slow down batch growth

            # Update batch size for next iteration
            next_batch_base_size = int(initial_batch_size * (self.batch_growth_factor ** iteration) * convergence_factor)
            next_batch_size = min(
                next_batch_base_size,
                n_samples - len(active_indices)
            )

            # Select next batch of points to add
            new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, iteration)

            # Record iteration information
            iteration_info = {
                'iteration': n_iter,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'active_points_changed': active_changed,
                'active_points_changed_pct': active_changed_pct * 100,
                'new_points_added': len(new_batch),
                'active_inertia': active_inertia,
                'total_coverage': len(active_indices) / n_samples * 100 if n_samples > 0 else 0
            }
            iteration_table.append(iteration_info)

            if self.verbose and (iteration + 1) % 5 == 0:
                logger.info(f"Iteration {n_iter}: "
                           f"{active_changed} active points changed ({active_changed_pct:.2%}), "
                           f"{len(new_batch)} new points added, "
                           f"{len(active_indices) / n_samples * 100:.1f}% coverage")

            # Add new batch to active set
            if len(new_batch) > 0:
                active_indices = np.append(active_indices, new_batch)

            # Calculate centroid shift for convergence check
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - centroids)**2, axis=1)))

            # Early stopping conditions

            # 1. Centroid stability
            if centroid_shift < self.tolerance:
                if len(active_indices) == n_samples:
                    # Full dataset and centroids stable = converged
                    if self.verbose:
                        logger.info(f"Converged at iteration {n_iter}: centroids stable")
                    break
                # If not full dataset but centroids stable, add more points faster
                next_batch_size = min(next_batch_size * 2, n_samples - len(active_indices))

            # 2. Inertia stability (track consecutive stable iterations)
            if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                stability_counter += 1
            else:
                stability_counter = 0

            # If inertia stable for multiple iterations and we have enough points
            if stability_counter >= 3 and len(active_indices) / n_samples > 0.5:
                if self.verbose:
                    logger.info(f"Early stopping at iteration {n_iter}: inertia stable")
                break

            # 3. All points active and no label changes
            if len(active_indices) == n_samples and active_changed == 0:
                if self.verbose:
                    logger.info(f"Converged at iteration {n_iter}: all points stable")
                break

            # 4. No new points to add and centroids stable
            if len(new_batch) == 0 and centroid_shift < self.tolerance:
                if self.verbose:
                    logger.info(f"Converged at iteration {n_iter}: no new points and centroids stable")
                break

            prev_inertia = active_inertia

        # Final update with all points
        if NUMBA_AVAILABLE:
            centroids, _ = _update_centroids_numba(X, labels, self.n_clusters)
        else:
            for k in range(self.n_clusters):
                cluster_mask = (labels == k)
                if np.any(cluster_mask):
                    centroids[k] = np.mean(X[cluster_mask], axis=0)

        # Final assignment
        distances = self._compute_distances(X, centroids)
        labels = np.argmin(distances, axis=1)
        inertia = np.sum(np.min(distances, axis=1))

        return centroids, labels, inertia, n_iter, iteration_table

    def fit(self, X, y=None):
        """
        Fit the model to data.
        Run multiple initializations and select the best result.
        """
        X = check_array(X)

        best_inertia = np.inf
        best_centroids = None
        best_labels = None
        best_n_iter = 0

        seeds = self._get_seeds()

        for seed_idx, seed in enumerate(seeds):
            if self.verbose and len(seeds) > 1:
                logger.info(f"K-means initialization {seed_idx + 1}/{len(seeds)}")

            centroids, labels, inertia, n_iter, iter_table = self._run_single_kmeans(X, seed)

            if inertia < best_inertia:
                best_centroids = centroids.copy()
                best_labels = labels.copy()
                best_inertia = inertia
                best_n_iter = n_iter
                self.iteration_table_ = iter_table

        self.cluster_centers_ = best_centroids
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        self.n_iter_ = best_n_iter

        if self.verbose:
            logger.info(f"BottomUpKMeans converged after {self.n_iter_} iterations. "
                        f"Inertia: {self.inertia_:.4f}")

        return self

    def _get_seeds(self):
        """Generate random seeds for multiple initializations."""
        random_state = check_random_state(self.random_state)
        seeds = []
        for i in range(self.n_init):
            seed = random_state.randint(0, 2**31 - 1)
            seeds.append(seed)
        return seeds

    def predict(self, X):
        """Predict the closest cluster for each sample in X."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X)

        if NUMBA_AVAILABLE:
            labels, _ = _assign_labels_numba(X, self.cluster_centers_)
            return labels
        else:
            distances = self._compute_distances(X, self.cluster_centers_)
            return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def print_iteration_table(self):
        """Prints a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            return df
        except ImportError:
            for info in self.iteration_table_:
                print(", ".join([f"{k}: {v}" for k, v in info.items()]))
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = BottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 6.824 seconds
Average execution time (OptimizedKMeans): 0.027 seconds
Average execution time (SklearnKMeans): 0.037 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 4.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available

@njit(parallel=True, fastmath=True)
def _fast_distances(X, centroids):
    """Ultra-optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Pre-compute squared norms of X
    x_norms = np.zeros(n_samples, dtype=np.float64)
    for i in prange(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    # Use ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩ identity for faster computation
    for i in prange(n_samples):
        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norms[i] + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(fastmath=True)
def _fast_distances_block(X, centroids, start_idx, end_idx):
    """Compute distances for a block of samples."""
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    block_size = end_idx - start_idx
    distances = np.empty((block_size, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Process block
    for i in range(block_size):
        x_idx = start_idx + i
        # Compute x_norm for this sample
        x_norm = 0.0
        for k in range(n_features):
            x_norm += X[x_idx, k] * X[x_idx, k]

        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norm + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[x_idx, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True)
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels - fully optimized."""
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count points in clusters - separate loop for better cache performance
    for i in range(X.shape[0]):
        counts[labels[i]] += 1

    # Sum points in each cluster - vectorized across features for each sample
    for i in range(X.shape[0]):
        cluster_id = labels[i]
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 1.224 seconds
Average execution time (OptimizedKMeans): 0.022 seconds
Average execution time (SklearnKMeans): 0.020 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


In [None]:
! pip install line_profiler
%load_ext line_profiler
%lprun -f run_bench_evaluation run_bench_evaluation()



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.462 seconds
Average execution time (OptimizedKMeans): 0.027 seconds
Average execution time (SklearnKMeans): 0.022 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


In [None]:
X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
n_clusters = 3
n_runs = 3
bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42, verbose=True)
#bu_kmeans.fit(X)
%lprun -f HybridBottomUpKMeans.fit bu_kmeans.fit(X)


In [None]:
import cProfile, pstats

# Profile your function
cProfile.run('run_bench_evaluation()', 'profile_output')

# Load the profiling data
p = pstats.Stats('profile_output')
p.strip_dirs().sort_stats('cumulative').print_stats(15)  # Top 10 functions by cumulative time

# For a recursive breakdown, check the callers and callees of a specific function:
p.print_callers('run_bench_evaluation')
p.print_callees('run_bench_evaluation')



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.463 seconds
Average execution time (OptimizedKMeans): 0.028 seconds
Average execution time (SklearnKMeans): 0.021 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations
Fri Mar 14 10:27:37 2025    profile_output

         678370 function 

<pstats.Stats at 0x7d3a85252690>

In [None]:
p.strip_dirs().sort_stats('cumulative').print_stats(100)
p.print_callers('run_bench_evaluation')
p.print_callees('run_bench_evaluation')


Fri Mar 14 10:27:37 2025    profile_output

         678370 function calls (677501 primitive calls) in 118.214 seconds

   Ordered by: cumulative time
   List reduced from 650 to 100 due to restriction <100>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  118.215  118.215 {built-in method builtins.exec}
        1    0.000    0.000  118.214  118.214 <string>:1(<module>)
        1    0.001    0.001  118.214  118.214 <ipython-input-3-d31acea3e1da>:1170(run_bench_evaluation)
   360/63    0.004    0.000  116.702    1.852 _param_validation.py:185(wrapper)
        9    0.000    0.000  116.142   12.905 _unsupervised.py:42(silhouette_score)
        9    0.005    0.001  116.141   12.905 _unsupervised.py:196(silhouette_samples)
       72    0.196    0.003  116.114    1.613 pairwise.py:2082(pairwise_distances_chunked)
      117    0.001    0.000   71.929    0.615 pairwise.py:2266(pairwise_distances)
      117    0.001    0.000   71.928    0.615

<pstats.Stats at 0x7d3a85252690>

In [None]:
!pip install -U scalene

Collecting scalene
  Downloading scalene-1.5.51-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (23 kB)
Downloading scalene-1.5.51-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scalene
Successfully installed scalene-1.5.51


In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
from scalene import scalene_profiler

# Turn profiling on
scalene_profiler.start()

@njit(parallel=True, fastmath=True)
def _fast_distances(X, centroids):
    """Ultra-optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Pre-compute squared norms of X
    x_norms = np.zeros(n_samples, dtype=np.float64)
    for i in prange(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    # Use ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩ identity for faster computation
    for i in prange(n_samples):
        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norms[i] + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(fastmath=True)
def _fast_distances_block(X, centroids, start_idx, end_idx):
    """Compute distances for a block of samples."""
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    block_size = end_idx - start_idx
    distances = np.empty((block_size, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Process block
    for i in range(block_size):
        x_idx = start_idx + i
        # Compute x_norm for this sample
        x_norm = 0.0
        for k in range(n_features):
            x_norm += X[x_idx, k] * X[x_idx, k]

        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norm + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[x_idx, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True)
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels - fully optimized."""
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count points in clusters - separate loop for better cache performance
    for i in range(X.shape[0]):
        counts[labels[i]] += 1

    # Sum points in each cluster - vectorized across features for each sample
    for i in range(X.shape[0]):
        cluster_id = labels[i]
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    scalene_profiler.stop()



ERROR: Do not try to invoke `start` if you have not called Scalene using one of the methods
in https://github.com/plasma-umass/scalene#using-scalene
(The most likely issue is that you need to run your code with `scalene`, not `python`).


SystemExit: 1

In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32

@njit(parallel=True, fastmath=True, cache=True)
def _fast_distances(X, centroids):
    """Ultra-optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.empty(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        norm = 0.0
        for k in range(n_features):
            norm += centroids[j, k] * centroids[j, k]
        centroid_norms[j] = norm

    # Pre-compute squared norms of X
    x_norms = np.empty(n_samples, dtype=np.float64)
    for i in prange(n_samples):
        norm = 0.0
        for k in range(n_features):
            norm += X[i, k] * X[i, k]
        x_norms[i] = norm

    # Compute distances using ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
    for i in prange(n_samples):
        x_norm = x_norms[i]
        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] = x_norm + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(fastmath=True, cache=True)
def _fast_distances_block(X, centroids, start_idx, end_idx):
    """Compute distances for a block of samples."""
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    block_size = end_idx - start_idx
    distances = np.empty((block_size, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.empty(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        norm = 0.0
        for k in range(n_features):
            norm += centroids[j, k] * centroids[j, k]
        centroid_norms[j] = norm

    # Process block with improved locality
    for i in range(block_size):
        x_idx = start_idx + i

        # Compute x_norm for this sample
        x_norm = 0.0
        x_values = np.empty(n_features, dtype=np.float64)

        # Cache the values in x_values for better memory access
        for k in range(n_features):
            val = X[x_idx, k]
            x_values[k] = val
            x_norm += val * val

        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += x_values[k] * centroids[j, k]
            distances[i, j] = x_norm + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True, cache=True)
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels - fully optimized."""
    n_samples = X.shape[0]
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count points in clusters - separate loop for better cache performance
    for i in range(n_samples):
        counts[labels[i]] += 1

    # Sum points in each cluster - use prange for outer loop
    # but handle accumulation carefully to avoid race conditions
    for i in prange(n_samples):
        cluster_id = labels[i]
        for j in range(n_features):
            # Use atomic add to avoid race conditions when updating centroids
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]
        else:
            # Handle empty clusters - set to a random point
            idx = np.random.randint(0, n_samples)
            for j in range(n_features):
                centroids[i, j] = X[idx, j]

    return centroids, counts

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3


In [5]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32
from numba import njit, prange, cuda
import numpy as np
from numba import njit, prange
import numpy as np

# Check if CuPy is available
try:
    import cupy as cp
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

# GPU version using CuPy
if HAS_GPU:
    def _fast_distances_gpu(X, centroids):
        """Compute squared Euclidean distances with CuPy GPU acceleration."""
        # Transfer data to GPU if needed
        X_gpu = cp.asarray(X)
        centroids_gpu = cp.asarray(centroids)

        # Compute squared norms
        X_norm = cp.sum(X_gpu**2, axis=1, keepdims=True)
        centroids_norm = cp.sum(centroids_gpu**2, axis=1, keepdims=True).T

        # Use matrix multiplication for dot product
        dot_product = cp.dot(X_gpu, centroids_gpu.T)

        # Compute distances using ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
        distances = X_norm + centroids_norm - 2.0 * dot_product

        return distances

    def compute_distances(X, centroids):
        """Wrapper to handle GPU distance computation."""
        distances_gpu = _fast_distances_gpu(X, centroids)
        return cp.asnumpy(distances_gpu)

    def assign_labels(distances):
        """Assign labels based on distances using GPU."""
        distances_gpu = cp.asarray(distances)
        min_distances = cp.min(distances_gpu, axis=1)
        labels = cp.argmin(distances_gpu, axis=1)

        return cp.asnumpy(labels).astype(np.int32), cp.asnumpy(min_distances)

    def _update_centroids_gpu(X, labels, n_clusters):
        """Update centroids using GPU acceleration."""
        X_gpu = cp.asarray(X)
        labels_gpu = cp.asarray(labels)
        n_features = X.shape[1]

        centroids = cp.zeros((n_clusters, n_features), dtype=X.dtype)
        counts = cp.zeros(n_clusters, dtype=cp.int32)

        # Process each cluster
        for k in range(n_clusters):
            mask = (labels_gpu == k)
            cluster_points = X_gpu[mask]
            if len(cluster_points) > 0:
                centroids[k] = cp.mean(cluster_points, axis=0)
                counts[k] = len(cluster_points)

        return cp.asnumpy(centroids), cp.asnumpy(counts)

# CPU optimized version - unchanged from original
@njit(parallel=True)
def _fast_distances_cpu(X, centroids):
    """Compute squared Euclidean distances with Numba - vectorized version."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms - parallelized
    x_norms = np.zeros(n_samples, dtype=np.float64)
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)

    for i in prange(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    for j in prange(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Compute distances in parallel
    for i in prange(n_samples):
        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] = x_norms[i] + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(parallel=True)
def _assign_labels_cpu(distances):
    """Assign labels to data points based on nearest centroid."""
    n_samples = distances.shape[0]
    labels = np.zeros(n_samples, dtype=np.int32)
    min_distances = np.full(n_samples, np.inf)

    for i in prange(n_samples):
        for j in range(distances.shape[1]):
            if distances[i, j] < min_distances[i]:
                min_distances[i] = distances[i, j]
                labels[i] = j

    return labels, min_distances

@njit(parallel=True)
def _update_centroids_cpu(X, labels, n_clusters):
    """Update centroids based on assigned labels."""
    n_samples = X.shape[0]
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count cluster members and sum values
    for i in range(n_samples):
        cluster_id = labels[i]
        counts[cluster_id] += 1
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Calculate means
    for i in prange(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

# Unified interface
def _fast_distances(X, centroids):
    if HAS_GPU:
        return compute_distances(X, centroids)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    if HAS_GPU:
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    if HAS_GPU:
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

# Main kmeans function
class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.668 seconds
Average execution time (OptimizedKMeans): 0.022 seconds
Average execution time (SklearnKMeans): 0.038 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


fastest is above this


In [4]:
import cProfile, pstats

# Profile your function
cProfile.run('run_bench_evaluation()', 'profile_output')

# Load the profiling data
p = pstats.Stats('profile_output')
p.strip_dirs().sort_stats('cumulative').print_stats(15)  # Top 10 functions by cumulative time

# For a recursive breakdown, check the callers and callees of a specific function:
p.print_callers('run_bench_evaluation')
p.print_callees('run_bench_evaluation')



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3


KeyboardInterrupt: 

In [None]:
p.strip_dirs().sort_stats('cumulative').print_stats(100)
p.print_callers('run_bench_evaluation')
p.print_callees('run_bench_evaluation')


In [1]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import warnings
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (silhouette_score, calinski_harabasz_score,
                             davies_bouldin_score, adjusted_rand_score,
                             adjusted_mutual_info_score)
from scipy import sparse
from sklearn.metrics import pairwise_distances
from joblib import Parallel, delayed

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit, prange, config, set_num_threads
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

# Try to import CuPy for GPU acceleration
try:
    import cupy as cp
    from cupyx.scipy import sparse as cp_sparse
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

# ------------------------------------------------------------------------------
# GPU and CPU optimized functions (adapted from gpubottomup.py :contentReference[oaicite:4]{index=4})
# ------------------------------------------------------------------------------

if HAS_GPU:
    def _fast_distances_gpu(X, centroids):
        # Use float32 for better GPU performance
        X_gpu = cp.asarray(X, dtype=np.float32)
        centroids_gpu = cp.asarray(centroids, dtype=np.float32)
        n_samples = X_gpu.shape[0]
        n_clusters = centroids_gpu.shape[0]
        distances = cp.empty((n_samples, n_clusters), dtype=np.float32)
        batch_size = 10000  # adjust batch size as needed
        for i in range(0, n_samples, batch_size):
            end_idx = min(i + batch_size, n_samples)
            batch = X_gpu[i:end_idx]
            X_norm = cp.sum(batch * batch, axis=1, keepdims=True)
            centroids_norm = cp.sum(centroids_gpu * centroids_gpu, axis=1, keepdims=True).T
            dot_product = cp.dot(batch, centroids_gpu.T)
            distances[i:end_idx] = X_norm + centroids_norm - 2.0 * dot_product
        return distances

    def assign_labels(distances):
        distances_gpu = cp.asarray(distances, dtype=np.float32)
        min_distances = cp.min(distances_gpu, axis=1)
        labels = cp.argmin(distances_gpu, axis=1)
        return cp.asnumpy(labels).astype(np.int32), cp.asnumpy(min_distances)

    def _update_centroids_gpu(X, labels, n_clusters):
        X_gpu = cp.asarray(X, dtype=np.float32)
        labels_gpu = cp.asarray(labels, dtype=np.int32)
        n_features = X.shape[1]
        centroids = cp.zeros((n_clusters, n_features), dtype=cp.float32)
        counts = cp.zeros(n_clusters, dtype=cp.int32)
        for k in range(n_clusters):
            mask = (labels_gpu == k)
            cluster_points = X_gpu[mask]
            if cluster_points.shape[0] > 0:
                centroids[k] = cp.mean(cluster_points, axis=0)
                counts[k] = cluster_points.shape[0]
        return cp.asnumpy(centroids), cp.asnumpy(counts)

@njit(parallel=True, fastmath=True)
def _fast_distances_cpu(X, centroids):
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float32)
    x_norms = np.empty(n_samples, dtype=np.float32)
    centroid_norms = np.empty(n_clusters, dtype=np.float32)
    for i in prange(n_samples):
        s = 0.0
        for j in range(n_features):
            s += X[i, j] * X[i, j]
        x_norms[i] = s
    for k in prange(n_clusters):
        s = 0.0
        for j in range(n_features):
            s += centroids[k, j] * centroids[k, j]
        centroid_norms[k] = s
    for i in prange(n_samples):
        for k in range(n_clusters):
            dot = 0.0
            for j in range(n_features):
                dot += X[i, j] * centroids[k, j]
            distances[i, k] = x_norms[i] + centroid_norms[k] - 2.0 * dot
    return distances

@njit(parallel=True, fastmath=True)
def _assign_labels_cpu(distances):
    n_samples = distances.shape[0]
    n_clusters = distances.shape[1]
    labels = np.empty(n_samples, dtype=np.int32)
    min_distances = np.empty(n_samples, dtype=np.float32)
    for i in prange(n_samples):
        min_val = 1e10
        min_idx = 0
        for k in range(n_clusters):
            if distances[i, k] < min_val:
                min_val = distances[i, k]
                min_idx = k
        labels[i] = min_idx
        min_distances[i] = min_val
    return labels, min_distances

@njit(parallel=True, fastmath=True)
def _update_centroids_cpu(X, labels, n_clusters):
    n_samples = X.shape[0]
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)
    for i in range(n_samples):
        c = labels[i]
        counts[c] += 1
        for j in range(n_features):
            centroids[c, j] += X[i, j]
    for k in prange(n_clusters):
        if counts[k] > 0:
            for j in range(n_features):
                centroids[k, j] /= counts[k]
    return centroids, counts

def _fast_distances(X, centroids):
    if HAS_GPU:
        # For very large datasets, you might add additional logic here
        distances_gpu = _fast_distances_gpu(X, centroids)
        return cp.asnumpy(distances_gpu)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    if HAS_GPU and distances.shape[0] > 10000:
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    if HAS_GPU and X.shape[0] > 10000:
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

# ------------------------------------------------------------------------------
# GPU-optimized HybridBottomUpKMeans class (integrated from gpubottomup.py :contentReference[oaicite:5]{index=5})
# ------------------------------------------------------------------------------

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []
        if n_jobs is not None and n_jobs > 0:
            set_num_threads(n_jobs)

    def _set_dynamic_parameters(self, X):
        n_samples, n_features = X.shape
        if sparse.issparse(X):
            sparsity = 1.0 - (X.count_nonzero() / (n_samples * n_features))
        else:
            if n_samples > 10000:
                sample = X[np.random.choice(n_samples, 1000, replace=False)]
                sparsity = np.sum(sample == 0) / sample.size
            else:
                sparsity = np.sum(X == 0) / X.size
        is_large = n_samples > 50000
        is_high_dim = n_features > 100
        if is_large:
            self._batch_size_factor = 0.02 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
        elif is_high_dim:
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
        else:
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
        self._hybrid_threshold = self.hybrid_threshold
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)
        if self.verbose:
            logger.info(f"Data: samples={n_samples}, features={n_features}, initial_batch={self._initial_batch_size}, growth_factor={self._batch_growth_factor}, hybrid_threshold={self._hybrid_threshold}")

    def _initialize_centroids(self, X, seed=None):
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)
        if self.init == 'random':
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            return X[indices].toarray() if sparse.issparse(X) else X[indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].toarray().flatten() if sparse.issparse(X) else X[first_idx].copy()
            if n_samples > 10000:
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)
                X_sample = X[sample_indices].toarray() if sparse.issparse(X) else X[sample_indices]
                for c in range(1, self.n_clusters):
                    min_dists = np.zeros(X_sample.shape[0])
                    for i in range(X_sample.shape[0]):
                        min_val = np.inf
                        for k in range(c):
                            dist = np.sum((X_sample[i] - centroids[k]) ** 2)
                            if dist < min_val:
                                min_val = dist
                        min_dists[i] = min_val
                    sum_dists = min_dists.sum()
                    if sum_dists > 0:
                        probs = min_dists / sum_dists
                        next_idx = random_state.choice(sample_size, p=probs)
                        centroids[c] = X_sample[next_idx].copy()
                    else:
                        next_idx = random_state.randint(sample_size)
                        centroids[c] = X_sample[next_idx].copy()
            else:
                if sparse.issparse(X):
                    X_dense = X.toarray()
                    for c in range(1, self.n_clusters):
                        min_dists = np.zeros(n_samples)
                        for i in range(n_samples):
                            min_val = np.inf
                            for k in range(c):
                                dist = np.sum((X_dense[i] - centroids[k]) ** 2)
                                if dist < min_val:
                                    min_val = dist
                            min_dists[i] = min_val
                        sum_dists = min_dists.sum()
                        if sum_dists > 0:
                            probs = min_dists / sum_dists
                            next_idx = random_state.choice(n_samples, p=probs)
                            centroids[c] = X_dense[next_idx].copy()
                        else:
                            next_idx = random_state.randint(n_samples)
                            centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = np.zeros(n_samples)
                        for i in range(n_samples):
                            min_val = np.inf
                            for k in range(c):
                                dist = np.sum((X[i] - centroids[k]) ** 2)
                                if dist < min_val:
                                    min_val = dist
                            min_dists[i] = min_val
                        sum_dists = min_dists.sum()
                        if sum_dists > 0:
                            probs = min_dists / sum_dists
                            next_idx = random_state.choice(n_samples, p=probs)
                            centroids[c] = X[next_idx].copy()
                        else:
                            next_idx = random_state.randint(n_samples)
                            centroids[c] = X[next_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1
        if sparse.issparse(X):
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)
        if n_jobs <= 1 or n_samples < 1000:
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size
        blocks = [(i * block_size, min(i * block_size + block_size, n_samples)) for i in range(n_blocks)]
        results = Parallel(n_jobs=n_jobs)(delayed(_fast_distances_cpu)(X[start:end], centroids) for start, end in blocks)
        return np.vstack(results)

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        n_samples = X.shape[0]
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]
        batch_indices = []
        for k in range(self.n_clusters):
            cluster_mask = (inactive_labels == k)
            cluster_indices = inactive_indices[cluster_mask]
            if len(cluster_indices) > 0:
                cluster_dists = distances[cluster_indices][:, k]
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest_idx = np.argsort(cluster_dists)[:num_to_take]
                batch_indices.extend(cluster_indices[closest_idx])
        if self.n_clusters > 1:
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, batch_size // 4)
            boundary_idx = np.argsort(margins)[:num_boundary]
            batch_indices.extend(inactive_indices[boundary_idx])
        min_dists = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_idx = np.argsort(-min_dists)[:num_outliers]
        batch_indices.extend(inactive_indices[outlier_idx])
        batch_indices = list(set(batch_indices))
        if len(batch_indices) > batch_size:
            batch_distances = np.min(distances[batch_indices], axis=1)
            priority_idx = np.argsort(batch_distances)[:batch_size]
            batch_indices = [batch_indices[i] for i in priority_idx]
        if len(batch_indices) < batch_size:
            remaining = batch_size - len(batch_indices)
            available = list(set(inactive_indices) - set(batch_indices))
            if available:
                random_indices = np.random.choice(available, size=min(remaining, len(available)), replace=False)
                batch_indices.extend(random_indices)
        return np.array(batch_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        distances = self._compute_distances_parallel(X, centroids)
        if NUMBA_AVAILABLE:
            labels, min_distances = _assign_labels_numba(distances)
        else:
            labels = np.argmin(distances, axis=1)
            min_distances = np.min(distances, axis=1)
        new_centroids = np.zeros_like(centroids)
        if NUMBA_AVAILABLE and not sparse.issparse(X):
            new_centroids, counts = _update_centroids_numba(X, labels, self.n_clusters)
            for k in range(self.n_clusters):
                if counts[k] == 0:
                    new_centroids[k] = centroids[k]
        else:
            for k in range(self.n_clusters):
                cluster_mask = (labels == k)
                if np.any(cluster_mask):
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
                else:
                    new_centroids[k] = centroids[k]
        inertia = np.sum(min_distances)
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids) ** 2))
        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)
        self._set_dynamic_parameters(X)
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1
        distances = self._compute_distances_parallel(X, centroids)
        if NUMBA_AVAILABLE:
            labels, _ = _assign_labels_numba(distances)
        else:
            labels = np.argmin(distances, axis=1)
        initial_indices = []
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            cluster_points = np.where(cluster_mask)[0]
            if len(cluster_points) > 0:
                cluster_dists = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest_idx = np.argsort(cluster_dists)[:num_to_take]
                initial_indices.extend(cluster_points[closest_idx])
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_idx = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_idx)
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()
            coverage_ratio = len(active_indices) / n_samples
            if coverage_ratio < self._hybrid_threshold:
                if len(active_indices) > 0:
                    X_active = X[active_indices].toarray() if sparse.issparse(X) else X[active_indices]
                    active_labels = labels[active_indices]
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                new_centroids[k] = old_centroids[k] if iteration > 0 else X_active[np.random.randint(len(X_active))]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                new_centroids[k] = old_centroids[k] if iteration > 0 else X_active[np.random.randint(len(X_active))]
                    centroids = new_centroids
                distances = self._compute_distances_parallel(X, centroids)
                if NUMBA_AVAILABLE:
                    new_labels, min_distances = _assign_labels_numba(distances)
                else:
                    new_labels = np.argmin(distances, axis=1)
                    min_distances = np.min(distances, axis=1)
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels
                active_inertia = np.sum(min_distances[active_indices])
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5
                next_batch_size = min(int(self._initial_batch_size * (growth_factor ** iteration)), n_samples - len(active_indices))
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids) ** 2))
                iter_time = time.time() - iter_start
                iteration_info = {'iteration': n_iter, 'phase': 'bottom-up', 'active_points': len(active_indices),
                                  'coverage': len(active_indices) / n_samples * 100, 'active_changed': active_changed,
                                  'active_changed_pct': active_changed_pct * 100, 'centroid_shift': centroid_shift,
                                  'new_points_added': len(new_batch), 'inertia': active_inertia, 'time': iter_time}
                iteration_table.append(iteration_info)
                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices) / n_samples * 100:.1f}% points, {active_changed_pct * 100:.1f}% changed, {iter_time:.3f}s")
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    break
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0
                if stability_counter >= 3:
                    break
                prev_inertia = active_inertia
                prev_active_size = len(active_indices)
            else:
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} with {len(active_indices)/n_samples*100:.1f}% coverage")
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)
                iter_time = time.time() - iter_start
                iteration_info = {'iteration': n_iter, 'phase': 'standard', 'active_points': n_samples,
                                  'coverage': 100.0, 'active_changed': np.nan, 'active_changed_pct': np.nan,
                                  'centroid_shift': centroid_shift, 'new_points_added': 0, 'inertia': inertia, 'time': iter_time}
                iteration_table.append(iteration_info)
                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, shift={centroid_shift:.6f}, {iter_time:.3f}s")
                if centroid_shift < self.tolerance:
                    break
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0
                if stability_counter >= 2:
                    break
                prev_inertia = inertia
        if len(active_indices) / n_samples < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))
        total_time = time.time() - start_time
        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter + 1}")
        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []
            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]
            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i + 1}/{len(seeds)}")
                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)
                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table
            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, self.random_state)
            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iter_table
        return self

    def fit_predict(self, X, y=None):
        self.fit(X)
        return self.labels_

    def predict(self, X):
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=True)
        if HAS_GPU and X.shape[0] > 10000:
            batch_size = 10000
            n_samples = X.shape[0]
            labels = np.empty(n_samples, dtype=np.int32)
            for i in range(0, n_samples, batch_size):
                end = min(i + batch_size, n_samples)
                X_batch = X[i:end]
                distances = _fast_distances(X_batch, self.cluster_centers_)
                batch_labels, _ = _assign_labels_numba(distances)
                labels[i:end] = batch_labels
            return labels
        else:
            if NUMBA_AVAILABLE:
                distances = _fast_distances(X, self.cluster_centers_)
                labels, _ = _assign_labels_numba(distances)
                return labels
            else:
                distances = self._compute_distances_parallel(X, self.cluster_centers_)
                return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            if 'time' in df.columns and 'active_points' in df.columns:
                df['throughput'] = df['active_points'] / df['time']
                df['cumulative_time'] = df['time'].cumsum()
            df.name = "Iteration History (GPU optimized)"
            if 'phase' in df.columns:
                df['phase'] = df['phase'].str.capitalize()
            for col in df.select_dtypes(include=['float']).columns:
                df[col] = df[col].round(3)
            return df
        except ImportError:
            print("Iteration History:")
            for i, info in enumerate(self.iteration_table_):
                print(f"Iteration {i + 1}: " + ", ".join([f"{k}: {v:.3f}" if isinstance(v, float) else f"{k}: {v}" for k, v in info.items()]))

# ------------------------------------------------------------------------------
# Benchmark and Evaluation Functions (adapted from bottomup.py :contentReference[oaicite:6]{index=6})
# ------------------------------------------------------------------------------

def load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3):
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=42)
    return X, y

def load_wine_data():
    from sklearn.datasets import load_wine
    data = load_wine()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['target'] = data.target
    return df

def preprocess_data(df):
    X = df.drop('target', axis=1).values
    y = df['target'].values
    return X, {'original_shape': df.shape, 'cleaned_shape': X.shape}, y

def print_results(results, y_true_available=False):
    print("Benchmark Results:")
    for method in results['times']:
        times = results['times'][method]
        print(f"{method}: Avg Time = {np.mean(times):.3f}s, Avg Iterations = {np.mean(results['iterations'][method]):.1f}")
        metrics = results['metrics'][method]
        print("Metrics:")
        for m in metrics:
            print(m)

def visualize_results(X, results, optim_k_results, preprocessing_info, y_true=None):
    try:
        import matplotlib.pyplot as plt
        from sklearn.decomposition import PCA
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        plt.figure(figsize=(8, 6))
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=results['labels'], cmap='viridis', alpha=0.6)
        plt.title("Cluster Visualization")
        plt.xlabel("PCA 1")
        plt.ylabel("PCA 2")
        plt.show()
    except ImportError:
        print("Matplotlib not installed. Skipping visualization.")

def run_bench_evaluation():
    print("\n" + "=" * 80)
    print("BENCHMARK EVALUATION: HybridBottomUpKMeans vs SklearnKMeans")
    print("=" * 80)
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3
    results = {'times': {'hybrid': [], 'sklearn': []},
               'metrics': {'hybrid': [], 'sklearn': []},
               'iterations': {'hybrid': [], 'sklearn': []},
               'labels': None}
    for run in range(n_runs):
        print(f"\nRun {run + 1}/{n_runs}")
        start_time = time.time()
        hybrid_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42 + run, verbose=True)
        hybrid_kmeans.fit(X)
        hybrid_time = time.time() - start_time
        results['times']['hybrid'].append(hybrid_time)
        hybrid_metrics = {
            'silhouette': silhouette_score(X, hybrid_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, hybrid_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, hybrid_kmeans.labels_),
            'inertia': hybrid_kmeans.inertia_
        }
        if y_true is not None:
            hybrid_metrics['adjusted_rand'] = adjusted_rand_score(y_true, hybrid_kmeans.labels_)
            hybrid_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, hybrid_kmeans.labels_)
        results['metrics']['hybrid'].append(hybrid_metrics)
        results['iterations']['hybrid'].append(hybrid_kmeans.n_iter_)
        if run == 0:
            print("\nIteration table for first run:")
            print(hybrid_kmeans.print_iteration_table())
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42 + run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)
    results['labels'] = hybrid_kmeans.labels_
    print("\nFinal Benchmark Results:")
    print(f"HybridBottomUpKMeans Avg Time: {np.mean(results['times']['hybrid']):.3f}s")
    print(f"SklearnKMeans Avg Time: {np.mean(results['times']['sklearn']):.3f}s")
    return results

# ------------------------------------------------------------------------------
# Main entry point
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # Run the benchmark evaluation; additional evaluations (e.g. on wine data) can be added similarly.
    bench_results = run_bench_evaluation()



BENCHMARK EVALUATION: HybridBottomUpKMeans vs SklearnKMeans

Run 1/3

Iteration table for first run:
   iteration      phase  active_points  coverage  active_changed  \
0          1  Bottom-up           2249     7.497           392.0   
1          2  Bottom-up           5249    17.497           401.0   
2          3  Bottom-up          11249    37.497           210.0   
3          4   Standard          30000   100.000             NaN   
4          5   Standard          30000   100.000             NaN   
5          6   Standard          30000   100.000             NaN   
6          7   Standard          30000   100.000             NaN   

   active_changed_pct  centroid_shift  new_points_added        inertia   time  \
0              17.430           2.128              3000    5981.881836  0.027   
1               7.640           3.352              6000   64804.453125  0.032   
2               1.867           1.190             12000  169974.281250  0.042   
3                 NaN        

In [3]:
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
from numba import njit, prange, float64, int32
from numba import njit, prange, cuda
import numpy as np


try:
    import cupy as cp
    from cupyx.scipy import sparse as cp_sparse
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state
from sklearn.metrics import pairwise_distances
from joblib import Parallel, delayed
import time
import logging
from numba import njit, prange, config, set_num_threads

# Set up logging
logger = logging.getLogger(__name__)

# Configure Numba for optimal performance
config.THREADING_LAYER = 'threadsafe'
NUMBA_AVAILABLE = True  # Assume Numba is available

# GPU optimized functions
if HAS_GPU:
    # Improved GPU distance calculation with memory optimization
    def _fast_distances_gpu(X, centroids):
        """Optimized Euclidean distance computation with CuPy."""
        # Transfer data to GPU if needed (only once)
        X_gpu = cp.asarray(X, dtype=np.float32)  # Use float32 for better GPU performance
        centroids_gpu = cp.asarray(centroids, dtype=np.float32)

        # Pre-allocate output array
        n_samples = X_gpu.shape[0]
        n_clusters = centroids_gpu.shape[0]
        distances = cp.empty((n_samples, n_clusters), dtype=np.float32)

        # Compute in batches to avoid GPU memory issues on large datasets
        batch_size = 10000  # Adjust based on your GPU memory

        for i in range(0, n_samples, batch_size):
            end_idx = min(i + batch_size, n_samples)
            batch = X_gpu[i:end_idx]

            # Compute squared norms (use inplace operations where possible)
            X_norm = cp.sum(batch * batch, axis=1, keepdims=True)
            centroids_norm = cp.sum(centroids_gpu * centroids_gpu, axis=1, keepdims=True).T

            # Use matrix multiplication for dot product
            dot_product = cp.dot(batch, centroids_gpu.T)

            # Compute distances using ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
            distances[i:end_idx] = X_norm + centroids_norm - 2.0 * dot_product

        return distances

    # Optimized label assignment on GPU
    def assign_labels(distances):
        """Optimized label assignment using GPU."""
        distances_gpu = cp.asarray(distances, dtype=np.float32)
        min_distances = cp.min(distances_gpu, axis=1)
        labels = cp.argmin(distances_gpu, axis=1)

        # Transfer results back to CPU at once to minimize transfers
        return cp.asnumpy(labels).astype(np.int32), cp.asnumpy(min_distances)

    # Streamlined centroid update on GPU
    def _update_centroids_gpu(X, labels, n_clusters):
        """Optimized GPU centroid update using atomics and parallel reduction."""
        X_gpu = cp.asarray(X, dtype=np.float32)
        labels_gpu = cp.asarray(labels, dtype=np.int32)
        n_features = X.shape[1]

        # Use one-hot encoding for efficient parallel reduction
        # This is faster for large datasets than iterating through clusters
        centroids = cp.zeros((n_clusters, n_features), dtype=np.float32)
        counts = cp.zeros(n_clusters, dtype=np.int32)

        # Custom CUDA kernel for parallel reduction would be ideal here
        # For now, use a simpler approach with existing CuPy functions
        for k in range(n_clusters):
            mask = (labels_gpu == k)
            cluster_points = X_gpu[mask]
            cluster_size = cp.sum(mask)

            if cluster_size > 0:
                # Use parallel reduction for sum
                centroids[k] = cp.sum(cluster_points, axis=0) / cluster_size
                counts[k] = cluster_size

        # Transfer results back to CPU at once
        return cp.asnumpy(centroids), cp.asnumpy(counts)

    # GPU sparse matrix support
    def _handle_sparse_gpu(X, centroids=None):
        """Convert sparse matrices to GPU format efficiently."""
        if not sparse.issparse(X):
            return cp.asarray(X, dtype=np.float32)

        # Convert to CSR for efficient GPU transfer
        X_csr = X.tocsr() if not isinstance(X, sparse.csr_matrix) else X
        X_gpu = cp_sparse.csr_matrix(X_csr)

        return X_gpu

# Optimized CPU functions with Numba
@njit(parallel=True, fastmath=True)
def _fast_distances_cpu(X, centroids):
    """Highly optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float32)

    # Pre-compute squared norms with better vectorization
    x_norms = np.zeros(n_samples, dtype=np.float32)
    centroid_norms = np.zeros(n_clusters, dtype=np.float32)

    # Vectorized norm computation
    for i in prange(n_samples):
        x_norms[i] = np.sum(X[i] * X[i])

    for j in prange(n_clusters):
        centroid_norms[j] = np.sum(centroids[j] * centroids[j])

    # Optimize cache locality by computing all distances for one sample at a time
    for i in prange(n_samples):
        x = X[i]  # Cache the row for better memory access
        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += x[k] * centroids[j, k]
            distances[i, j] = x_norms[i] + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True)
def _assign_labels_cpu(distances):
    """Optimized label assignment with Numba."""
    n_samples = distances.shape[0]
    n_clusters = distances.shape[1]
    labels = np.zeros(n_samples, dtype=np.int32)
    min_distances = np.zeros(n_samples, dtype=np.float32)

    # Use parallelization with better memory access
    for i in prange(n_samples):
        min_dist = np.inf
        min_idx = 0
        for j in range(n_clusters):
            if distances[i, j] < min_dist:
                min_dist = distances[i, j]
                min_idx = j
        labels[i] = min_idx
        min_distances[i] = min_dist

    return labels, min_distances

@njit(parallel=True, fastmath=True)
def _update_centroids_cpu(X, labels, n_clusters):
    """Optimized centroid update with Numba."""
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Pre-allocate arrays for better memory management
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count cluster members and sum values - parallelize by cluster
    for i in range(n_samples):
        cluster_id = labels[i]
        counts[cluster_id] += 1
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Calculate means - parallelize by cluster
    for i in prange(n_clusters):
        if counts[i] > 0:
            inv_count = 1.0 / counts[i]
            for j in range(n_features):
                centroids[i, j] *= inv_count

    return centroids, counts

# Optimized block processing for parallel distance computation
@njit(parallel=True, fastmath=True)
def _fast_distances_block(X, centroids, start, end):
    """Process a block of data for parallel distance computation."""
    X_block = X[start:end]
    return _fast_distances_cpu(X_block, centroids)

# Unified optimized interface
def _fast_distances(X, centroids):
    """Optimized distance computation with automatic CPU/GPU selection."""
    if HAS_GPU:
        # Use batched GPU implementation for very large datasets
        if X.shape[0] > 100000:
            return compute_distances(X, centroids)
        else:
            # For smaller datasets, transfer everything at once
            distances_gpu = _fast_distances_gpu(X, centroids)
            return cp.asnumpy(distances_gpu)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    """Optimized label assignment with GPU/CPU selection."""
    if HAS_GPU and distances.shape[0] > 10000:  # Only use GPU for larger datasets
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    """Optimized centroid update with GPU/CPU selection."""
    if HAS_GPU and X.shape[0] > 10000:  # Only use GPU for larger datasets
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation with hybrid bottom-up approach.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

        # Set Numba threading if n_jobs is specified
        if n_jobs is not None and n_jobs > 0:
            set_num_threads(n_jobs)

    def _set_dynamic_parameters(self, X):
        """Optimized parameter selection based on data characteristics."""
        n_samples, n_features = X.shape

        # Fast analysis based on size and simple sampling
        is_large = n_samples > 50000
        is_high_dim = n_features > 100

        # Calculate sparsity without full matrix computation
        if sparse.issparse(X):
            sparsity = 1.0 - (X.count_nonzero() / (n_samples * n_features))
        else:
            # Sample a subset for large datasets
            if n_samples > 10000:
                random_idx = np.random.choice(n_samples, 1000, replace=False)
                sample = X[random_idx]
                sparsity = np.sum(sample == 0) / sample.size
            else:
                sparsity = np.sum(X == 0) / X.size

        is_sparse = sparsity > 0.5

        # Optimize parameters based on data characteristics
        if is_large:
            self._batch_size_factor = 0.02 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
        elif is_high_dim:
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
        else:
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor

        self._hybrid_threshold = self.hybrid_threshold

        # Adjust for very small datasets
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data characteristics: samples={n_samples}, features={n_features}, "
                      f"sparsity={sparsity:.2f}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                      f"growth_factor={self._batch_growth_factor:.1f}, "
                      f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Optimized centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ with sampling for large datasets
            is_large = n_samples > 10000

            # Initialize array for centroids
            if sparse.issparse(X):
                dtype = X.dtype
            else:
                dtype = X.dtype

            centroids = np.zeros((self.n_clusters, n_features), dtype=dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, use sampling with lower memory footprint
            if is_large:
                # Use a smaller sample for initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample with optimized distance computation
                for c in range(1, self.n_clusters):
                    # Calculate squared distances to closest centroid efficiently
                    if HAS_GPU and X_sample.shape[0] > 5000:
                        # Use GPU for larger samples
                        X_gpu = cp.asarray(X_sample)
                        centroids_gpu = cp.asarray(centroids[:c])

                        # Compute distances efficiently
                        distances = cp.zeros((X_sample.shape[0], c))
                        for k in range(c):
                            diff = X_gpu - centroids_gpu[k]
                            distances[:, k] = cp.sum(diff * diff, axis=1)

                        min_dists = cp.min(distances, axis=1).get()
                    else:
                        # Use vectorized CPU computation
                        min_dists = np.zeros(X_sample.shape[0])
                        for i in range(X_sample.shape[0]):
                            min_dist = float('inf')
                            for k in range(c):
                                dist = np.sum((X_sample[i] - centroids[k]) ** 2)
                                if dist < min_dist:
                                    min_dist = dist
                            min_dists[i] = min_dist

                    # Select next centroid with probability proportional to squared distance
                    sum_dists = min_dists.sum()
                    if sum_dists > 0:
                        probs = min_dists / sum_dists
                        next_idx = random_state.choice(sample_size, p=probs)
                        centroids[c] = X_sample[next_idx].copy()
                    else:
                        # Fallback for numerical issues
                        next_idx = random_state.randint(sample_size)
                        centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        # Use efficient vectorized distance calculation
                        min_dists = np.zeros(n_samples)
                        for i in range(n_samples):
                            min_dist = float('inf')
                            for k in range(c):
                                dist = np.sum((X_dense[i] - centroids[k]) ** 2)
                                if dist < min_dist:
                                    min_dist = dist
                            min_dists[i] = min_dist

                        sum_dists = min_dists.sum()
                        if sum_dists > 0:
                            probs = min_dists / sum_dists
                            next_idx = random_state.choice(n_samples, p=probs)
                            centroids[c] = X_dense[next_idx].copy()
                        else:
                            next_idx = random_state.randint(n_samples)
                            centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        # Use efficient vectorized distance calculation
                        if c == 1:
                            # First iteration - simple vector operations
                            diff = X - centroids[0]
                            min_dists = np.sum(diff * diff, axis=1)
                        else:
                            # Compute only new distances
                            new_dists = np.sum((X - centroids[c-1]) ** 2, axis=1)
                            min_dists = np.minimum(min_dists, new_dists)

                        sum_dists = min_dists.sum()
                        if sum_dists > 0:
                            probs = min_dists / sum_dists
                            next_idx = random_state.choice(n_samples, p=probs)
                            centroids[c] = X[next_idx].copy()
                        else:
                            next_idx = random_state.randint(n_samples)
                            centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Highly optimized parallel distance computation."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        # For sparse matrices, use specialized handling
        if sparse.issparse(X):
            if HAS_GPU:
                # Use GPU for sparse computation if available
                try:
                    X_gpu = _handle_sparse_gpu(X)
                    centroids_gpu = cp.asarray(centroids)

                    # Use specialized sparse GPU functions
                    # For large sparse matrices, process in batches
                    if X.shape[0] > 50000:
                        batch_size = 10000
                        distances = np.zeros((X.shape[0], centroids.shape[0]))

                        for i in range(0, X.shape[0], batch_size):
                            end = min(i + batch_size, X.shape[0])
                            X_batch = X_gpu[i:end]

                            # Convert to dense for distance computation (if needed)
                            if isinstance(X_batch, cp_sparse.spmatrix):
                                X_batch = X_batch.toarray()

                            # Compute distances efficiently
                            batch_distances = _fast_distances_gpu(X_batch, centroids_gpu)
                            distances[i:end] = cp.asnumpy(batch_distances)

                        return distances
                    else:
                        # Convert to dense for smaller matrices
                        X_dense = X_gpu.toarray() if isinstance(X_gpu, cp_sparse.spmatrix) else X_gpu
                        distances = _fast_distances_gpu(X_dense, centroids_gpu)
                        return cp.asnumpy(distances)

                except Exception as e:
                    # Fallback to CPU if GPU fails
                    logger.warning(f"GPU sparse computation failed, falling back to CPU: {e}")
                    return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)
            else:
                # CPU sparse computation
                return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        # For dense matrices, choose optimal implementation based on size
        if n_samples < 1000 or n_jobs <= 1:
            # Use optimized Numba for small datasets or single-threaded
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        if NUMBA_AVAILABLE:
            # Use Numba with parallel processing
            # Determine optimal block size based on data size
            if n_samples > 100000:
                block_size = max(1000, n_samples // (n_jobs * 2))
            else:
                block_size = max(100, n_samples // n_jobs)

            n_blocks = (n_samples + block_size - 1) // block_size

            # Prepare blocks for better load balancing
            blocks = []
            for i in range(n_blocks):
                start_idx = i * block_size
                end_idx = min(start_idx + block_size, n_samples)
                blocks.append((start_idx, end_idx))

            # Process blocks in parallel with joblib
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results efficiently
            return np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized strategic batch selection algorithm."""
        n_samples = X.shape[0]

        # Fast path for edge cases
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask more efficiently using boolean operations
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Optimized batch selection using vectorized operations
        batch_indices = []

        # 1. Take points closest to centroids using vectorized operations
        for k in range(self.n_clusters):
            cluster_mask = (inactive_labels == k)
            cluster_indices = inactive_indices[cluster_mask]

            if len(cluster_indices) > 0:
                # Get distances to centroid k
                cluster_dists = distances[cluster_indices][:, k]

                # Take closest points using argsort (more efficient)
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest_idx = np.argsort(cluster_dists)[:num_to_take]
                batch_indices.extend(cluster_indices[closest_idx])

        # 2. Take boundary points (vectorized)
        if self.n_clusters > 1:
            # Get margins between closest and second closest centroids
            sorted_idx = np.argsort(inactive_distances, axis=1)
            closest_dists = np.take_along_axis(inactive_distances, sorted_idx[:, 0:1], axis=1)
            second_closest_dists = np.take_along_axis(inactive_distances, sorted_idx[:, 1:2], axis=1)
            margins = second_closest_dists.ravel() - closest_dists.ravel()

            # Get points with smallest margins (near boundaries)
            num_boundary = max(1, batch_size // 4)
            boundary_idx = np.argsort(margins)[:num_boundary]
            batch_indices.extend(inactive_indices[boundary_idx])

        # 3. Add outliers (vectorized)
        min_dists = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_idx = np.argsort(-min_dists)[:num_outliers]
        batch_indices.extend(inactive_indices[outlier_idx])

        # Ensure uniqueness efficiently using sets
        batch_indices = list(set(batch_indices))

        # If we have too many points, use the most important ones
        if len(batch_indices) > batch_size:
            # Prioritize by distance
            batch_distances = np.min(distances[batch_indices], axis=1)
            priority_idx = np.argsort(batch_distances)[:batch_size]
            batch_indices = [batch_indices[i] for i in priority_idx]

        # If we need more points, add random ones
        if len(batch_indices) < batch_size:
            remaining = batch_size - len(batch_indices)
            # Use set difference for efficiency
            available = list(set(inactive_indices) - set(batch_indices))
            if available:
                random_indices = np.random.choice(available,
                                                size=min(remaining, len(available)),
                                                replace=False)
                batch_indices.extend(random_indices)

        return np.array(batch_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Optimized standard k-means iteration."""
        # Compute distances efficiently
        distances = self._compute_distances_parallel(X, centroids)

        # Assign labels using optimized function
        if NUMBA_AVAILABLE:
            labels, min_distances = _assign_labels_numba(distances)
        else:
            labels = np.argmin(distances, axis=1)
            min_distances = np.min(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)

        if NUMBA_AVAILABLE and not sparse.issparse(X):
            # Use optimized Numba function for dense data
            new_centroids, counts = _update_centroids_numba(X, labels, self.n_clusters)

            # Handle empty clusters
            for k in range(self.n_clusters):
                if counts[k] == 0:
                    new_centroids[k] = centroids[k]
        else:
            # Standard update logic
            for k in range(self.n_clusters):
                cluster_mask = (labels == k)
                if np.any(cluster_mask):
                    if sparse.issparse(X):
                        new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                    else:
                        new_centroids[k] = np.mean(X[cluster_mask], axis=0)
                else:
                    new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(min_distances)
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Optimized hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)

        # Initial setup
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels efficiently
        distances = self._compute_distances_parallel(X, centroids)
        if NUMBA_AVAILABLE:
            labels, _ = _assign_labels_numba(distances)
        else:
            labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        # Use vectorized operations for efficiency
        initial_indices = []

        # Include points closest to centroids
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            cluster_points = np.where(cluster_mask)[0]

            if len(cluster_points) > 0:
                # Use vectorized operations
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest_idx = np.argsort(cluster_distances)[:num_to_take]
                initial_indices.extend(cluster_points[closest_idx])

        # Include boundary points for better separation
        if self.n_clusters > 1:
            # Vectorized boundary computation
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_idx = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_idx)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Convergence tracking
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop - optimized
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data efficiently
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using optimized functions
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)

                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        # Standard update with vectorized operations
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points efficiently
                distances = self._compute_distances_parallel(X, centroids)

                # Assign labels efficiently
                if NUMBA_AVAILABLE:
                    new_labels, min_distances = _assign_labels_numba(distances)
                else:
                    new_labels = np.argmin(distances, axis=1)
                    min_distances = np.min(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia efficiently
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth based on stability
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size with bounds
                next_batch_size = min(
                    int(self._initial_batch_size * (growth_factor ** iteration)),
                    n_samples - len(active_indices)
                )

                # Select next batch strategically
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration stats
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping optimizations
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing when it should
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    break

                # 3. Inertia stability with counter
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration with optimizations
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration stats
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration if needed
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter
    def fit(self, X, y=None):

        """
        Optimized model fitting with multiple initializations for Google Colab T4 environment.
        """
        # Start timing for performance monitoring
        start_time = time.time()

        # Input validation with optimal data type for T4 GPU
        if HAS_GPU:
            # For T4 GPU, float32 offers better performance
            X = check_array(X, dtype=np.float32, accept_sparse=True)
        else:
            X = check_array(X, accept_sparse=True)

        best_inertia = np.inf
        best_centroids = None
        best_labels = None
        best_n_iter = 0

        # Generate seeds efficiently
        seeds = self._get_seeds()

        # For Colab's 2-core system, limit parallelism and optimize single runs
        if self.n_jobs is None or self.n_jobs > 2:
            # Override n_jobs for Colab environment
            self.n_jobs = 2

        # Progress tracking
        if self.verbose:
            logger.info(f"Starting K-means with {len(seeds)} initializations on "
                    f"{'GPU (T4)' if HAS_GPU else 'CPU (2 cores)'}")

        # Run initializations
        for seed_idx, seed in enumerate(seeds):
            if self.verbose and len(seeds) > 1:
                logger.info(f"K-means initialization {seed_idx + 1}/{len(seeds)}")

            # Run single K-means efficiently
            try:
                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                # Check if this is the best run
                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    self.iteration_table_ = iter_table

            except Exception as e:
                # Robust error handling
                logger.warning(f"Initialization {seed_idx + 1} failed: {str(e)}. Continuing with next seed.")
                continue

        # Store results
        self.cluster_centers_ = best_centroids
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        self.n_iter_ = best_n_iter

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"HybridBottomUpKMeans converged in {total_time:.2f}s after {self.n_iter_} iterations. "
                    f"Final inertia: {self.inertia_:.4f}")

        return self

    def _get_seeds(self):
        """Generate optimized random seeds for multiple initializations."""
        # If only one initialization is needed, don't generate extra random numbers
        if self.n_init == 1:
            return [self.random_state if isinstance(self.random_state, int) else None]

        # Generate seeds efficiently
        random_state = check_random_state(self.random_state)

        # For T4 GPU in Colab, optimal n_init is usually 3-5 due to time constraints
        # Adjust n_init if it's too large for interactive use
        effective_n_init = min(self.n_init, 5) if HAS_GPU else self.n_init

        # Generate all seeds at once (more efficient than one at a time)
        if effective_n_init < self.n_init and self.verbose:
            logger.info(f"Reducing initializations from {self.n_init} to {effective_n_init} for optimal GPU usage")

        return random_state.randint(0, 2**31 - 1, size=effective_n_init).tolist()

    def predict(self, X):
        """Optimized prediction for T4 GPU and dual-core CPU."""
        check_is_fitted(self, ['cluster_centers_'])

        # Convert input to appropriate format
        if HAS_GPU:
            X = check_array(X, dtype=np.float32, accept_sparse=True)
        else:
            X = check_array(X, accept_sparse=True)

        # For large datasets, process in batches to optimize GPU memory
        if HAS_GPU and X.shape[0] > 10000:
            batch_size = 10000  # Optimal batch size for T4 GPU
            n_samples = X.shape[0]
            labels = np.empty(n_samples, dtype=np.int32)

            for i in range(0, n_samples, batch_size):
                end = min(i + batch_size, n_samples)
                X_batch = X[i:end]

                # Use optimized GPU distance calculation
                distances = _fast_distances(X_batch, self.cluster_centers_)
                batch_labels, _ = _assign_labels_numba(distances)
                labels[i:end] = batch_labels

            return labels
        else:
            # For smaller datasets, process all at once
            if NUMBA_AVAILABLE:
                distances = _fast_distances(X, self.cluster_centers_)
                labels, _ = _assign_labels_numba(distances)
                return labels
            else:
                distances = self._compute_distances_parallel(X, self.cluster_centers_)
                return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Optimized fit and predict for T4 GPU environment."""
        # For Colab/T4 environment, do this in one pass to minimize memory transfers
        self.fit(X)
        return self.labels_  # Use stored labels rather than recomputing

    def print_iteration_table(self):
        """Enhanced iteration table with performance insights."""
        check_is_fitted(self, ['iteration_table_'])

        try:
            import pandas as pd

            # Create DataFrame
            df = pd.DataFrame(self.iteration_table_)

            # Add more useful columns for performance analysis
            if 'time' in df.columns:
                # Calculate throughput (points processed per second)
                if 'active_points' in df.columns:
                    df['throughput'] = df['active_points'] / df['time']

                # Calculate cumulative time
                df['cumulative_time'] = df['time'].cumsum()

            # Add hardware info
            if HAS_GPU:
                df.name = "Iteration History (GPU: T4, CPU: 2-core Xeon)"
            else:
                df.name = "Iteration History (CPU: 2-core Xeon)"

            # Format for better readability
            if 'phase' in df.columns:
                df['phase'] = df['phase'].str.capitalize()

            # Round numeric columns
            for col in df.select_dtypes(include=['float']).columns:
                df[col] = df[col].round(3)

            return df

        except ImportError:
            # Fallback to simple text output
            print("Iteration History:")
            for i, info in enumerate(self.iteration_table_):
                print(f"Iteration {i+1}: " + ", ".join([f"{k}: {v:.3f}" if isinstance(v, float) else f"{k}: {v}"
                                                    for k, v in info.items()]))

    def transform(self, X):
        """Transform X to cluster-distance space (optimized for T4 GPU)."""
        check_is_fitted(self, ['cluster_centers_'])

        # Optimize data type for GPU
        if HAS_GPU:
            X = check_array(X, dtype=np.float32, accept_sparse=True)
        else:
            X = check_array(X, accept_sparse=True)

        # For large datasets on T4, process in batches
        if HAS_GPU and X.shape[0] > 10000:
            batch_size = 10000
            n_samples = X.shape[0]
            n_clusters = self.cluster_centers_.shape[0]
            distances = np.empty((n_samples, n_clusters), dtype=np.float32)

            for i in range(0, n_samples, batch_size):
                end = min(i + batch_size, n_samples)
                X_batch = X[i:end]

                # Use GPU accelerated distance computation
                distances[i:end] = _fast_distances(X_batch, self.cluster_centers_)

            return distances
        else:
            # For smaller datasets, compute all distances at once
            return self._compute_distances_parallel(X, self.cluster_centers_)
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=300000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()





BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3


KeyboardInterrupt: 

In [12]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32
from numba import njit, prange, cuda
import numpy as np
from numba import njit, prange
import numpy as np

# Check if CuPy is available
try:
    import cupy as cp
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

# GPU version using CuPy
if HAS_GPU:
    def _fast_distances_gpu(X, centroids):
        """Compute squared Euclidean distances with CuPy GPU acceleration."""
        # Transfer data to GPU if needed
        X_gpu = cp.asarray(X)
        centroids_gpu = cp.asarray(centroids)

        # Compute squared norms
        X_norm = cp.sum(X_gpu**2, axis=1, keepdims=True)
        centroids_norm = cp.sum(centroids_gpu**2, axis=1, keepdims=True).T

        # Use matrix multiplication for dot product
        dot_product = cp.dot(X_gpu, centroids_gpu.T)

        # Compute distances using ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
        distances = X_norm + centroids_norm - 2.0 * dot_product

        return distances

    def compute_distances(X, centroids):
        """Wrapper to handle GPU distance computation."""
        distances_gpu = _fast_distances_gpu(X, centroids)
        return cp.asnumpy(distances_gpu)

    def assign_labels(distances):
        """Assign labels based on distances using GPU."""
        distances_gpu = cp.asarray(distances)
        min_distances = cp.min(distances_gpu, axis=1)
        labels = cp.argmin(distances_gpu, axis=1)

        return cp.asnumpy(labels).astype(np.int32), cp.asnumpy(min_distances)

    def _update_centroids_gpu(X, labels, n_clusters):
        """Update centroids using GPU acceleration."""
        X_gpu = cp.asarray(X)
        labels_gpu = cp.asarray(labels)
        n_features = X.shape[1]

        centroids = cp.zeros((n_clusters, n_features), dtype=X.dtype)
        counts = cp.zeros(n_clusters, dtype=cp.int32)

        # Process each cluster
        for k in range(n_clusters):
            mask = (labels_gpu == k)
            cluster_points = X_gpu[mask]
            if len(cluster_points) > 0:
                centroids[k] = cp.mean(cluster_points, axis=0)
                counts[k] = len(cluster_points)

        return cp.asnumpy(centroids), cp.asnumpy(counts)

# CPU optimized version - unchanged from original
@njit(parallel=True,fastmath=True,cache=True)
def _fast_distances_cpu(X, centroids):
    """Compute squared Euclidean distances with Numba - vectorized version."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms - parallelized
    x_norms = np.zeros(n_samples, dtype=np.float64)
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)

    for i in prange(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    for j in prange(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Compute distances in parallel
    for i in prange(n_samples):
        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] = x_norms[i] + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(parallel=True,cache=True)
def _assign_labels_cpu(distances):
    """Assign labels to data points based on nearest centroid."""
    n_samples = distances.shape[0]
    labels = np.zeros(n_samples, dtype=np.int32)
    min_distances = np.full(n_samples, np.inf)

    for i in prange(n_samples):
        for j in range(distances.shape[1]):
            if distances[i, j] < min_distances[i]:
                min_distances[i] = distances[i, j]
                labels[i] = j

    return labels, min_distances

@njit(parallel=True,cache=True)
def _update_centroids_cpu(X, labels, n_clusters):
    """Update centroids based on assigned labels."""
    n_samples = X.shape[0]
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count cluster members and sum values
    for i in range(n_samples):
        cluster_id = labels[i]
        counts[cluster_id] += 1
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Calculate means
    for i in prange(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

# Unified interface
def _fast_distances(X, centroids):
    if HAS_GPU:
        return compute_distances(X, centroids)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    if HAS_GPU:
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    if HAS_GPU:
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

# Main kmeans function
class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.434 seconds
Average execution time (OptimizedKMeans): 0.025 seconds
Average execution time (SklearnKMeans): 0.019 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


In [11]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32
from numba import njit, prange, cuda
import numpy as np
from numba import njit, prange
import numpy as np

# Check if CuPy is available
import numpy as np
import numba as nb
from numba import prange, cuda
import math

# Check if CuPy is available
try:
    import cupy as cp
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

# GPU version using CuPy - Highly optimized
if HAS_GPU:
    def _fast_distances_gpu(X, centroids):
        """Optimized squared Euclidean distances with CuPy GPU acceleration."""
        # Transfer data to GPU if needed (detect if already on GPU)
        X_gpu = cp.asarray(X)
        centroids_gpu = cp.asarray(centroids)

        # Use more efficient matrix operations and broadcast
        # ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
        # Use fp32 precision for better performance when appropriate
        if X_gpu.dtype == np.float64 and X_gpu.shape[1] < 100:
            X_gpu = X_gpu.astype(cp.float32)
            centroids_gpu = centroids_gpu.astype(cp.float32)

        # Efficient squared norm computation
        X_norm = cp.sum(X_gpu**2, axis=1, keepdims=True)
        centroids_norm = cp.sum(centroids_gpu**2, axis=1)

        # Use optimized BLAS for matrix multiplication
        # cuBLAS is automatically used by CuPy
        distances = X_norm + centroids_norm - 2 * cp.dot(X_gpu, centroids_gpu.T)

        # Ensure non-negative distances (numerical stability)
        cp.maximum(distances, 0, out=distances)

        return distances

    def compute_distances(X, centroids):
        """Wrapper with improved memory management."""
        # Avoid unnecessary transfers for large datasets
        if X.shape[0] > 100000:
            # Process in batches to avoid OOM
            batch_size = 50000
            n_batches = (X.shape[0] + batch_size - 1) // batch_size
            distances = np.empty((X.shape[0], centroids.shape[0]), dtype=np.float32)

            for i in range(n_batches):
                start, end = i * batch_size, min((i + 1) * batch_size, X.shape[0])
                batch_distances = _fast_distances_gpu(X[start:end], centroids)
                distances[start:end] = cp.asnumpy(batch_distances)

            return distances
        else:
            # Process in one go for smaller datasets
            distances_gpu = _fast_distances_gpu(X, centroids)
            return cp.asnumpy(distances_gpu)

    def assign_labels(distances):
        """Optimized label assignment using GPU."""
        # Keep data on GPU if already there
        if isinstance(distances, cp.ndarray):
            distances_gpu = distances
        else:
            distances_gpu = cp.asarray(distances)

        # Use specialized kernel for argmin operation
        labels = cp.argmin(distances_gpu, axis=1)
        min_distances = cp.take_along_axis(distances_gpu,
                                          cp.expand_dims(labels, axis=1),
                                          axis=1).squeeze()

        return cp.asnumpy(labels).astype(np.int32), cp.asnumpy(min_distances)

    def _update_centroids_gpu(X, labels, n_clusters):
        """Optimized centroid update using reduction operations."""
        X_gpu = cp.asarray(X)
        labels_gpu = cp.asarray(labels)
        n_features = X.shape[1]

        # Use specialized CuPy functions for performance
        new_centroids = cp.zeros((n_clusters, n_features), dtype=X_gpu.dtype)
        counts = cp.zeros(n_clusters, dtype=cp.int32)

        # Use one-hot encoding and matrix multiplication for faster reduction
        if n_clusters < 100:  # For reasonable number of clusters
            one_hot = cp.eye(n_clusters)[labels_gpu]
            counts = cp.sum(one_hot, axis=0)
            new_centroids = cp.dot(one_hot.T, X_gpu)

            # Handle non-empty clusters
            mask = counts > 0
            new_centroids[mask] = new_centroids[mask] / counts[mask, cp.newaxis]
        else:
            # Fall back to loop for many clusters to save memory
            for k in range(n_clusters):
                mask = (labels_gpu == k)
                cluster_size = cp.sum(mask)
                counts[k] = cluster_size

                if cluster_size > 0:
                    new_centroids[k] = cp.sum(X_gpu[mask], axis=0) / cluster_size

        # Handle empty clusters on GPU
        empty_clusters = cp.where(counts == 0)[0]
        if len(empty_clusters) > 0:
            # Find points farthest from their centroids
            non_empty_mask = cp.where(counts > 0)[0]
            if len(non_empty_mask) > 0:
                random_centroid = new_centroids[non_empty_mask[0]]

                for k in empty_clusters:
                    # Add random perturbation to an existing centroid
                    noise = cp.random.randn(n_features).astype(X_gpu.dtype) * 0.1
                    new_centroids[k] = random_centroid + noise
                    counts[k] = 1

        return cp.asnumpy(new_centroids), cp.asnumpy(counts)

# CPU optimized version - significantly improved
@nb.njit(parallel=True, fastmath=True, cache=True)
def _fast_distances_cpu(X, centroids):
    """Highly optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float32)

    # Pre-compute squared norms - optimized for cache locality
    x_norms = np.empty(n_samples, dtype=np.float32)
    centroid_norms = np.empty(n_clusters, dtype=np.float32)

    # Compute norms with better vectorization
    for i in prange(n_samples):
        norm = 0.0
        for k in range(n_features):
            val = X[i, k]
            norm += val * val
        x_norms[i] = norm

    for j in prange(n_clusters):
        norm = 0.0
        for k in range(n_features):
            val = centroids[j, k]
            norm += val * val
        centroid_norms[j] = norm

    # Compute distances with better memory access pattern
    # Process in blocks for better cache utilization
    block_size = min(32, n_samples)

    for block_start in prange(0, n_samples, block_size):
        block_end = min(block_start + block_size, n_samples)

        # Prefetch centroid data for this block
        for j in range(n_clusters):
            centroid_norm_j = centroid_norms[j]

            for i in range(block_start, block_end):
                x_norm_i = x_norms[i]
                dot_product = 0.0

                # Manual loop unrolling for better instruction-level parallelism
                for k in range(0, n_features - 3, 4):
                    dot_product += (X[i, k] * centroids[j, k] +
                                  X[i, k+1] * centroids[j, k+1] +
                                  X[i, k+2] * centroids[j, k+2] +
                                  X[i, k+3] * centroids[j, k+3])

                # Handle remaining elements
                for k in range((n_features // 4) * 4, n_features):
                    dot_product += X[i, k] * centroids[j, k]

                # ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
                distances[i, j] = x_norm_i + centroid_norm_j - 2.0 * dot_product

    return distances

@nb.njit(parallel=True, cache=True, fastmath=True)
def _assign_labels_cpu(distances):
    """Optimized label assignment for CPU."""
    n_samples = distances.shape[0]
    n_clusters = distances.shape[1]
    labels = np.empty(n_samples, dtype=np.int32)
    min_distances = np.empty(n_samples, dtype=np.float32)

    # Process in parallel with better vectorization
    for i in prange(n_samples):
        min_idx = 0
        min_dist = distances[i, 0]

        # Manual vectorization helps Numba generate better code
        for j in range(1, n_clusters):
            dist = distances[i, j]
            # Branchless min operation
            is_smaller = dist < min_dist
            min_idx = j if is_smaller else min_idx
            min_dist = dist if is_smaller else min_dist

        labels[i] = min_idx
        min_distances[i] = min_dist

    return labels, min_distances

@nb.njit(parallel=True, cache=True, fastmath=True)
def _update_centroids_cpu(X, labels, n_clusters):
    """Optimized centroid update with improved memory patterns."""
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Use float32 for better SIMD utilization when appropriate
    dtype = np.float32 if X.dtype == np.float32 else np.float64

    # Initialize centroids and counts
    new_centroids = np.zeros((n_clusters, n_features), dtype=dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Process in thread-local manner to avoid synchronization
    n_threads = nb.get_num_threads()
    local_centroids = np.zeros((n_threads, n_clusters, n_features), dtype=dtype)
    local_counts = np.zeros((n_threads, n_clusters), dtype=np.int32)

    # Accumulate in thread-local storage
    for i in prange(n_samples):
        thread_id = nb.get_thread_id()
        cluster_id = labels[i]
        local_counts[thread_id, cluster_id] += 1

        # Accumulate values with better cache pattern
        for j in range(n_features):
            local_centroids[thread_id, cluster_id, j] += X[i, j]

    # Combine thread-local results
    for t in range(n_threads):
        for k in range(n_clusters):
            counts[k] += local_counts[t, k]
            for j in range(n_features):
                new_centroids[k, j] += local_centroids[t, k, j]

    # Calculate means
    for k in prange(n_clusters):
        if counts[k] > 0:
            inv_count = 1.0 / counts[k]
            for j in range(n_features):
                new_centroids[k, j] *= inv_count

    return new_centroids, counts
# Unified interface
def _fast_distances(X, centroids):
    if HAS_GPU:
        return compute_distances(X, centroids)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    if HAS_GPU:
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    if HAS_GPU:
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

# Main kmeans function
class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 1.313 seconds
Average execution time (OptimizedKMeans): 0.019 seconds
Average execution time (SklearnKMeans): 0.016 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.062
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.594
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


multiprocessing cpu

In [13]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32
from numba import njit, prange, cuda
import numpy as np
from numba import njit, prange
import numpy as np

# Check if CuPy is available
import numpy as np
import numba as nb
from numba import prange, cuda
import math

# Check if CuPy is available
try:
    import cupy as cp
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

# GPU version using CuPy - Highly optimized
if HAS_GPU:
    def _fast_distances_gpu(X, centroids):
        """Optimized squared Euclidean distances with CuPy GPU acceleration."""
        # Transfer data to GPU if needed (detect if already on GPU)
        X_gpu = cp.asarray(X)
        centroids_gpu = cp.asarray(centroids)

        # Use more efficient matrix operations and broadcast
        # ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
        # Use fp32 precision for better performance when appropriate
        if X_gpu.dtype == np.float64 and X_gpu.shape[1] < 100:
            X_gpu = X_gpu.astype(cp.float32)
            centroids_gpu = centroids_gpu.astype(cp.float32)

        # Efficient squared norm computation
        X_norm = cp.sum(X_gpu**2, axis=1, keepdims=True)
        centroids_norm = cp.sum(centroids_gpu**2, axis=1)

        # Use optimized BLAS for matrix multiplication
        # cuBLAS is automatically used by CuPy
        distances = X_norm + centroids_norm - 2 * cp.dot(X_gpu, centroids_gpu.T)

        # Ensure non-negative distances (numerical stability)
        cp.maximum(distances, 0, out=distances)

        return distances

    def compute_distances(X, centroids):
        """Wrapper with improved memory management."""
        # Avoid unnecessary transfers for large datasets
        if X.shape[0] > 100000:
            # Process in batches to avoid OOM
            batch_size = 50000
            n_batches = (X.shape[0] + batch_size - 1) // batch_size
            distances = np.empty((X.shape[0], centroids.shape[0]), dtype=np.float32)

            for i in range(n_batches):
                start, end = i * batch_size, min((i + 1) * batch_size, X.shape[0])
                batch_distances = _fast_distances_gpu(X[start:end], centroids)
                distances[start:end] = cp.asnumpy(batch_distances)

            return distances
        else:
            # Process in one go for smaller datasets
            distances_gpu = _fast_distances_gpu(X, centroids)
            return cp.asnumpy(distances_gpu)

    def assign_labels(distances):
        """Optimized label assignment using GPU."""
        # Keep data on GPU if already there
        if isinstance(distances, cp.ndarray):
            distances_gpu = distances
        else:
            distances_gpu = cp.asarray(distances)

        # Use specialized kernel for argmin operation
        labels = cp.argmin(distances_gpu, axis=1)
        min_distances = cp.take_along_axis(distances_gpu,
                                          cp.expand_dims(labels, axis=1),
                                          axis=1).squeeze()

        return cp.asnumpy(labels).astype(np.int32), cp.asnumpy(min_distances)

    def _update_centroids_gpu(X, labels, n_clusters):
        """Optimized centroid update using reduction operations."""
        X_gpu = cp.asarray(X)
        labels_gpu = cp.asarray(labels)
        n_features = X.shape[1]

        # Use specialized CuPy functions for performance
        new_centroids = cp.zeros((n_clusters, n_features), dtype=X_gpu.dtype)
        counts = cp.zeros(n_clusters, dtype=cp.int32)

        # Use one-hot encoding and matrix multiplication for faster reduction
        if n_clusters < 100:  # For reasonable number of clusters
            one_hot = cp.eye(n_clusters)[labels_gpu]
            counts = cp.sum(one_hot, axis=0)
            new_centroids = cp.dot(one_hot.T, X_gpu)

            # Handle non-empty clusters
            mask = counts > 0
            new_centroids[mask] = new_centroids[mask] / counts[mask, cp.newaxis]
        else:
            # Fall back to loop for many clusters to save memory
            for k in range(n_clusters):
                mask = (labels_gpu == k)
                cluster_size = cp.sum(mask)
                counts[k] = cluster_size

                if cluster_size > 0:
                    new_centroids[k] = cp.sum(X_gpu[mask], axis=0) / cluster_size

        # Handle empty clusters on GPU
        empty_clusters = cp.where(counts == 0)[0]
        if len(empty_clusters) > 0:
            # Find points farthest from their centroids
            non_empty_mask = cp.where(counts > 0)[0]
            if len(non_empty_mask) > 0:
                random_centroid = new_centroids[non_empty_mask[0]]

                for k in empty_clusters:
                    # Add random perturbation to an existing centroid
                    noise = cp.random.randn(n_features).astype(X_gpu.dtype) * 0.1
                    new_centroids[k] = random_centroid + noise
                    counts[k] = 1

        return cp.asnumpy(new_centroids), cp.asnumpy(counts)
@njit(parallel=True, fastmath=True, cache=True)
def _fast_distances_cpu(X, centroids):
    """Highly optimized squared Euclidean distances with Numba.
    Specifically tuned for 2-core CPU on Google Colab."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float32)

    # Precompute centroids squared norms (small array, fast)
    centroid_norms = np.empty(n_clusters, dtype=np.float32)
    for j in range(n_clusters):
        norm = 0.0
        for k in range(n_features):
            val = centroids[j, k]
            norm += val * val
        centroid_norms[j] = norm

    # Process samples in parallel chunks optimized for 2 cores
    chunk_size = max(1, n_samples // 2)  # Split work evenly for 2 cores

    for chunk_idx in prange(2):  # Explicitly use 2 cores
        start = chunk_idx * chunk_size
        end = min(n_samples, (chunk_idx + 1) * chunk_size)

        # Process samples in this chunk
        for i in range(start, end):
            # Compute sample norm once
            x_norm = 0.0
            for k in range(n_features):
                val = X[i, k]
                x_norm += val * val

            # Compute distances to all centroids
            for j in range(n_clusters):
                # Start with precomputed norms
                distance = x_norm + centroid_norms[j]

                # Subtract 2 * dot product with manual unrolling for better vectorization
                dot_product = 0.0
                # Process 4 elements at a time when possible (SIMD friendly)
                for k in range(0, n_features - 3, 4):
                    dot_product += X[i, k] * centroids[j, k]
                    dot_product += X[i, k+1] * centroids[j, k+1]
                    dot_product += X[i, k+2] * centroids[j, k+2]
                    dot_product += X[i, k+3] * centroids[j, k+3]

                # Handle remaining elements
                for k in range((n_features // 4) * 4, n_features):
                    dot_product += X[i, k] * centroids[j, k]

                distances[i, j] = distance - 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True, cache=True)
def _assign_labels_cpu(distances):
    """Optimized label assignment with Numba.
    Tuned for 2-core processing."""
    n_samples = distances.shape[0]
    n_clusters = distances.shape[1]
    labels = np.empty(n_samples, dtype=np.int32)
    min_distances = np.empty(n_samples, dtype=np.float32)

    # Divide work for 2 cores
    chunk_size = max(1, n_samples // 2)

    for chunk_idx in prange(2):  # Explicitly use 2 cores
        start = chunk_idx * chunk_size
        end = min(n_samples, (chunk_idx + 1) * chunk_size)

        for i in range(start, end):
            min_idx = 0
            min_dist = distances[i, 0]

            # Manual unrolling to help compiler vectorize
            j = 1
            while j < n_clusters - 1:  # Process 2 at a time when possible
                dist1 = distances[i, j]
                dist2 = distances[i, j+1]

                # Update if first distance is smaller
                if dist1 < min_dist:
                    min_dist = dist1
                    min_idx = j

                # Update if second distance is smaller
                if dist2 < min_dist:
                    min_dist = dist2
                    min_idx = j+1

                j += 2

            # Handle last element if n_clusters is odd
            if j < n_clusters:
                dist = distances[i, j]
                if dist < min_dist:
                    min_dist = dist
                    min_idx = j

            labels[i] = min_idx
            min_distances[i] = min_dist

    return labels, min_distances

@njit(parallel=True, fastmath=True, cache=True)
def _update_centroids_cpu(X, labels, n_clusters):
    """Optimized centroid update for 2-core CPU.
    Uses thread-local storage to avoid synchronization."""
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Use thread-local storage to avoid locks (optimized for 2 cores)
    local_sums = np.zeros((2, n_clusters, n_features), dtype=np.float64)
    local_counts = np.zeros((2, n_clusters), dtype=np.int32)

    # Chunk size for 2 threads
    chunk_size = max(1, n_samples // 2)

    # Accumulate in thread-local storage
    for thread_id in prange(2):  # Explicitly use 2 cores
        start = thread_id * chunk_size
        end = min(n_samples, (thread_id + 1) * chunk_size)

        # Process points assigned to this thread
        for i in range(start, end):
            cluster_id = labels[i]
            local_counts[thread_id, cluster_id] += 1

            # Accumulate sum of points in thread-local storage
            for j in range(n_features):
                local_sums[thread_id, cluster_id, j] += X[i, j]

    # Combine thread-local results
    centroids = np.zeros((n_clusters, n_features), dtype=np.float64)
    counts = np.zeros(n_clusters, dtype=np.int32)

    for t in range(2):  # Combine from 2 threads
        for k in range(n_clusters):
            counts[k] += local_counts[t, k]
            for j in range(n_features):
                centroids[k, j] += local_sums[t, k, j]

    # Calculate means - avoid division by zero
    for k in range(n_clusters):
        if counts[k] > 0:
            inv_count = 1.0 / counts[k]
            for j in range(n_features):
                centroids[k, j] *= inv_count

    return centroids, counts

# Unified interface
def _fast_distances(X, centroids):
    if HAS_GPU:
        return compute_distances(X, centroids)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    if HAS_GPU:
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    if HAS_GPU:
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

# Main kmeans function
class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.492 seconds
Average execution time (OptimizedKMeans): 0.027 seconds
Average execution time (SklearnKMeans): 0.016 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.062
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.594
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


trying gpu optimisation

In [14]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32
from numba import njit, prange, cuda
import numpy as np
from numba import njit, prange
import numpy as np

# Check if CuPy is available
import numpy as np
import numba as nb
from numba import prange, cuda
import math

# Check if CuPy is available
try:
    import cupy as cp
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

# GPU version using CuPy - Highly optimized
import numpy as np
import cupy as cp
import math
import time
from cupy import prof

# Custom CUDA kernel for efficient distance calculation
_distance_kernel = cp.RawKernel(r'''
extern "C" __global__
void compute_distances(const float* X, const float* centroids,
                      float* distances, int n_samples, int n_features, int n_clusters) {
    // Get global thread ID
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Check if this thread should compute a distance
    if (tid < n_samples * n_clusters) {
        int sample_idx = tid / n_clusters;
        int centroid_idx = tid % n_clusters;

        // Compute squared distance
        float dist = 0.0f;
        for (int j = 0; j < n_features; j++) {
            float diff = X[sample_idx * n_features + j] - centroids[centroid_idx * n_features + j];
            dist += diff * diff;
        }

        // Store result
        distances[sample_idx * n_clusters + centroid_idx] = dist;
    }
}
''', 'compute_distances')

# Custom CUDA kernel for label assignment
_label_kernel = cp.RawKernel(r'''
extern "C" __global__
void assign_labels(const float* distances, int* labels, float* min_distances,
                 int n_samples, int n_clusters) {
    // Get global thread ID
    int sample_idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Check if this thread should process a sample
    if (sample_idx < n_samples) {
        int best_cluster = 0;
        float min_dist = distances[sample_idx * n_clusters];

        // Find minimum distance and corresponding cluster
        for (int j = 1; j < n_clusters; j++) {
            float dist = distances[sample_idx * n_clusters + j];
            if (dist < min_dist) {
                min_dist = dist;
                best_cluster = j;
            }
        }

        // Store results
        labels[sample_idx] = best_cluster;
        min_distances[sample_idx] = min_dist;
    }
}
''', 'assign_labels')

# Custom CUDA kernel for centroid update (reduction)
_centroid_update_kernel = cp.RawKernel(r'''
extern "C" __global__
void update_centroids(const float* X, const int* labels, float* centroids,
                     int* counts, int n_samples, int n_features, int n_clusters) {
    // Get feature and cluster for this thread
    int feature_idx = blockIdx.x;
    int cluster_idx = blockIdx.y;

    if (feature_idx < n_features && cluster_idx < n_clusters) {
        float sum = 0.0f;
        int count = 0;

        // Sum all points in this cluster (for this feature)
        for (int i = 0; i < n_samples; i++) {
            if (labels[i] == cluster_idx) {
                sum += X[i * n_features + feature_idx];
                count++;
            }
        }

        // Store results - atomic updates not needed since each thread handles a unique cluster-feature pair
        centroids[cluster_idx * n_features + feature_idx] = (count > 0) ? (sum / count) : 0.0f;
        if (feature_idx == 0) {  // Only update count once per cluster
            counts[cluster_idx] = count;
        }
    }
}
''', 'update_centroids')

def _fast_distances_gpu(X, centroids, use_raw_kernel=True):
    """Highly optimized distance calculation using CuPy.

    Args:
        X: Input data array (n_samples, n_features)
        centroids: Centroid array (n_clusters, n_features)
        use_raw_kernel: Whether to use custom CUDA kernel (faster for smaller datasets)

    Returns:
        distances: Distance matrix (n_samples, n_clusters)
    """
    # Transfer to GPU if needed (avoid copies if already on GPU)
    X_gpu = cp.asarray(X, dtype=cp.float32)
    centroids_gpu = cp.asarray(centroids, dtype=cp.float32)

    # Get dimensions
    n_samples = X_gpu.shape[0]
    n_features = X_gpu.shape[1]
    n_clusters = centroids_gpu.shape[0]

    # For small dimensions, use raw CUDA kernel
    if use_raw_kernel and n_features <= 128:
        # Allocate output memory
        distances = cp.empty((n_samples, n_clusters), dtype=cp.float32)

        # Configure kernel launch parameters
        threads_per_block = 256
        blocks_per_grid = (n_samples * n_clusters + threads_per_block - 1) // threads_per_block

        # Launch kernel
        _distance_kernel(
            (blocks_per_grid,), (threads_per_block,),
            (X_gpu, centroids_gpu, distances, n_samples, n_features, n_clusters)
        )

        return distances

    # For larger dimensions, use optimized matrix operations
    else:
        # Use the ||x-y||² = ||x||² + ||y||² - 2<x,y> identity
        # Pre-compute squared norms
        X_squared_norms = cp.sum(X_gpu**2, axis=1, keepdims=True)
        centroids_squared_norms = cp.sum(centroids_gpu**2, axis=1, keepdims=True).T

        # Compute dot products efficiently using BLAS
        # Use batching for very large matrices to avoid memory issues
        if n_samples > 50000 and n_clusters > 100:
            batch_size = 10000
            distances = cp.empty((n_samples, n_clusters), dtype=cp.float32)

            for i in range(0, n_samples, batch_size):
                end_idx = min(i + batch_size, n_samples)
                batch_dots = cp.dot(X_gpu[i:end_idx], centroids_gpu.T)
                batch_dists = X_squared_norms[i:end_idx] + centroids_squared_norms - 2.0 * batch_dots
                distances[i:end_idx] = cp.maximum(batch_dists, 0)  # Ensure non-negative
        else:
            # Compute all at once
            dot_products = cp.dot(X_gpu, centroids_gpu.T)
            distances = X_squared_norms + centroids_squared_norms - 2.0 * dot_products
            # Ensure non-negative distances (numerical stability)
            cp.maximum(distances, 0, out=distances)

        return distances

def assign_labels_gpu(distances, use_raw_kernel=True):
    """Optimized label assignment using CuPy.

    Args:
        distances: Distance matrix (n_samples, n_clusters)
        use_raw_kernel: Whether to use custom CUDA kernel

    Returns:
        labels: Cluster assignments for each sample
        min_distances: Minimum distance for each sample
    """
    # Ensure data is on GPU
    distances_gpu = cp.asarray(distances, dtype=cp.float32)
    n_samples, n_clusters = distances_gpu.shape

    # For smaller datasets, use custom kernel
    if use_raw_kernel and n_clusters <= 256:
        # Allocate output memory
        labels = cp.empty(n_samples, dtype=cp.int32)
        min_distances = cp.empty(n_samples, dtype=cp.float32)

        # Configure kernel launch parameters - 1 thread per sample
        threads_per_block = 256
        blocks_per_grid = (n_samples + threads_per_block - 1) // threads_per_block

        # Launch kernel
        _label_kernel(
            (blocks_per_grid,), (threads_per_block,),
            (distances_gpu, labels, min_distances, n_samples, n_clusters)
        )
    else:
        # For larger datasets, use optimized CuPy operations
        labels = cp.argmin(distances_gpu, axis=1).astype(cp.int32)
        # Extract minimum distances efficiently
        min_distances = cp.take_along_axis(
            distances_gpu,
            cp.expand_dims(labels, axis=1),
            axis=1
        ).squeeze()

    return labels, min_distances

def update_centroids_gpu(X, labels, n_clusters, use_raw_kernel=False):
    """Optimized centroid update using CuPy.

    Args:
        X: Input data array (n_samples, n_features)
        labels: Cluster assignments (n_samples,)
        n_clusters: Number of clusters
        use_raw_kernel: Whether to use custom CUDA kernel

    Returns:
        centroids: Updated cluster centroids
        counts: Number of points in each cluster
    """
    # Transfer to GPU if needed
    X_gpu = cp.asarray(X, dtype=cp.float32)
    labels_gpu = cp.asarray(labels, dtype=cp.int32)

    n_samples, n_features = X_gpu.shape

    # For small datasets with many features, use raw kernel
    if use_raw_kernel and n_features > 32 and n_clusters < 64:
        # Allocate output memory
        centroids = cp.zeros((n_clusters, n_features), dtype=cp.float32)
        counts = cp.zeros(n_clusters, dtype=cp.int32)

        # Configure kernel - one thread per feature per cluster
        # Each thread computes one element of the result
        blocks_per_grid = (n_features, n_clusters)
        threads_per_block = 1  # Simple reduction

        # Launch kernel
        _centroid_update_kernel(
            blocks_per_grid, (threads_per_block,),
            (X_gpu, labels_gpu, centroids, counts, n_samples, n_features, n_clusters)
        )
    else:
        # Use optimized matrix operations for larger datasets
        # This method is much faster for typical datasets

        # Method 1: One-hot encoding (fastest for moderate n_clusters)
        if n_clusters <= 256:
            # Create one-hot encoding of labels
            one_hot = cp.zeros((n_samples, n_clusters), dtype=cp.float32)
            cp.scatter_add(one_hot, 1, labels_gpu.reshape(-1, 1), 1)

            # Count points per cluster
            counts = cp.sum(one_hot, axis=0, dtype=cp.int32)

            # Compute new centroids (matrix multiplication)
            centroids = cp.dot(one_hot.T, X_gpu)

            # Normalize by counts (avoiding division by zero)
            valid_mask = counts > 0
            if cp.any(valid_mask):
                centroids[valid_mask] = centroids[valid_mask] / counts[valid_mask, cp.newaxis]

        # Method 2: For very large number of clusters, use scatter_add
        else:
            centroids = cp.zeros((n_clusters, n_features), dtype=cp.float32)
            counts = cp.zeros(n_clusters, dtype=cp.int32)

            # Count points per cluster
            cp.scatter_add(counts, labels_gpu, cp.ones(n_samples, dtype=cp.int32))

            # For each feature, sum values for each cluster
            for j in range(n_features):
                feature_values = X_gpu[:, j]
                cp.scatter_add(centroids[:, j], labels_gpu, feature_values)

            # Normalize
            valid_mask = counts > 0
            if cp.any(valid_mask):
                centroids[valid_mask] = centroids[valid_mask] / counts[valid_mask, cp.newaxis]

    # Handle empty clusters by setting to random point
    empty_clusters = cp.where(counts == 0)[0]
    if len(empty_clusters) > 0:
        # Get points furthest from their centroids
        if len(centroids) > 0:  # Ensure there's at least one valid centroid
            # Calculate distances to assigned centroids
            X_reshaped = X_gpu.reshape(n_samples, 1, n_features)
            valid_centroids = centroids.reshape(1, n_clusters, n_features)

            # Get assigned centroid for each point
            assigned_centroids = valid_centroids[:, labels_gpu, :]

            # Calculate distances
            dists = cp.sum((X_reshaped - assigned_centroids) ** 2, axis=2).squeeze()

            # Find furthest points
            furthest_indices = cp.argsort(dists)[-len(empty_clusters):]

            # Assign these points to empty clusters
            empty_centroids = X_gpu[furthest_indices]
            centroids[empty_clusters] = empty_centroids
            counts[empty_clusters] = 1

    return centroids, counts
@njit(parallel=True, fastmath=True, cache=True)
def _fast_distances_cpu(X, centroids):
    """Highly optimized squared Euclidean distances with Numba.
    Specifically tuned for 2-core CPU on Google Colab."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float32)

    # Precompute centroids squared norms (small array, fast)
    centroid_norms = np.empty(n_clusters, dtype=np.float32)
    for j in range(n_clusters):
        norm = 0.0
        for k in range(n_features):
            val = centroids[j, k]
            norm += val * val
        centroid_norms[j] = norm

    # Process samples in parallel chunks optimized for 2 cores
    chunk_size = max(1, n_samples // 2)  # Split work evenly for 2 cores

    for chunk_idx in prange(2):  # Explicitly use 2 cores
        start = chunk_idx * chunk_size
        end = min(n_samples, (chunk_idx + 1) * chunk_size)

        # Process samples in this chunk
        for i in range(start, end):
            # Compute sample norm once
            x_norm = 0.0
            for k in range(n_features):
                val = X[i, k]
                x_norm += val * val

            # Compute distances to all centroids
            for j in range(n_clusters):
                # Start with precomputed norms
                distance = x_norm + centroid_norms[j]

                # Subtract 2 * dot product with manual unrolling for better vectorization
                dot_product = 0.0
                # Process 4 elements at a time when possible (SIMD friendly)
                for k in range(0, n_features - 3, 4):
                    dot_product += X[i, k] * centroids[j, k]
                    dot_product += X[i, k+1] * centroids[j, k+1]
                    dot_product += X[i, k+2] * centroids[j, k+2]
                    dot_product += X[i, k+3] * centroids[j, k+3]

                # Handle remaining elements
                for k in range((n_features // 4) * 4, n_features):
                    dot_product += X[i, k] * centroids[j, k]

                distances[i, j] = distance - 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True, cache=True)
def _assign_labels_cpu(distances):
    """Optimized label assignment with Numba.
    Tuned for 2-core processing."""
    n_samples = distances.shape[0]
    n_clusters = distances.shape[1]
    labels = np.empty(n_samples, dtype=np.int32)
    min_distances = np.empty(n_samples, dtype=np.float32)

    # Divide work for 2 cores
    chunk_size = max(1, n_samples // 2)

    for chunk_idx in prange(2):  # Explicitly use 2 cores
        start = chunk_idx * chunk_size
        end = min(n_samples, (chunk_idx + 1) * chunk_size)

        for i in range(start, end):
            min_idx = 0
            min_dist = distances[i, 0]

            # Manual unrolling to help compiler vectorize
            j = 1
            while j < n_clusters - 1:  # Process 2 at a time when possible
                dist1 = distances[i, j]
                dist2 = distances[i, j+1]

                # Update if first distance is smaller
                if dist1 < min_dist:
                    min_dist = dist1
                    min_idx = j

                # Update if second distance is smaller
                if dist2 < min_dist:
                    min_dist = dist2
                    min_idx = j+1

                j += 2

            # Handle last element if n_clusters is odd
            if j < n_clusters:
                dist = distances[i, j]
                if dist < min_dist:
                    min_dist = dist
                    min_idx = j

            labels[i] = min_idx
            min_distances[i] = min_dist

    return labels, min_distances

@njit(parallel=True, fastmath=True, cache=True)
def _update_centroids_cpu(X, labels, n_clusters):
    """Optimized centroid update for 2-core CPU.
    Uses thread-local storage to avoid synchronization."""
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Use thread-local storage to avoid locks (optimized for 2 cores)
    local_sums = np.zeros((2, n_clusters, n_features), dtype=np.float64)
    local_counts = np.zeros((2, n_clusters), dtype=np.int32)

    # Chunk size for 2 threads
    chunk_size = max(1, n_samples // 2)

    # Accumulate in thread-local storage
    for thread_id in prange(2):  # Explicitly use 2 cores
        start = thread_id * chunk_size
        end = min(n_samples, (thread_id + 1) * chunk_size)

        # Process points assigned to this thread
        for i in range(start, end):
            cluster_id = labels[i]
            local_counts[thread_id, cluster_id] += 1

            # Accumulate sum of points in thread-local storage
            for j in range(n_features):
                local_sums[thread_id, cluster_id, j] += X[i, j]

    # Combine thread-local results
    centroids = np.zeros((n_clusters, n_features), dtype=np.float64)
    counts = np.zeros(n_clusters, dtype=np.int32)

    for t in range(2):  # Combine from 2 threads
        for k in range(n_clusters):
            counts[k] += local_counts[t, k]
            for j in range(n_features):
                centroids[k, j] += local_sums[t, k, j]

    # Calculate means - avoid division by zero
    for k in range(n_clusters):
        if counts[k] > 0:
            inv_count = 1.0 / counts[k]
            for j in range(n_features):
                centroids[k, j] *= inv_count

    return centroids, counts

# Unified interface
def _fast_distances(X, centroids):
    if HAS_GPU:
        return compute_distances(X, centroids)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    if HAS_GPU:
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    if HAS_GPU:
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

# Main kmeans function
class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.392 seconds
Average execution time (OptimizedKMeans): 0.021 seconds
Average execution time (SklearnKMeans): 0.016 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.141
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.594
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


**potential**

In [5]:
!conda install -c numba icc_rt


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968