<a href="https://colab.research.google.com/github/tejash09/K-means/blob/main/bottom_up.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
# 1st try
```



In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

@njit(parallel=True)
def _fast_distances(X, centroids):
    """Compute squared Euclidean distances with Numba - highly optimized version."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids (constant for all data points)
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Pre-compute squared norms of X
    x_norms = np.zeros(n_samples, dtype=np.float64)
    for i in range(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    # Use ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩ identity for faster computation
    for i in prange(n_samples):
        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norms[i] + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit
def _assign_labels_numba(X, centroids):
    """Assign labels to data points based on nearest centroid."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    labels = np.zeros(n_samples, dtype=np.int32)
    min_distances = np.full(n_samples, np.inf)

    for i in range(n_samples):
        for j in range(n_clusters):
            dist = 0.0
            for k in range(X.shape[1]):
                diff = X[i, k] - centroids[j, k]
                dist += diff * diff
            if dist < min_distances[i]:
                min_distances[i] = dist
                labels[i] = j

    return labels, min_distances

@njit
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels."""
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Sum points in each cluster
    for i in range(X.shape[0]):
        cluster_id = labels[i]
        counts[cluster_id] += 1
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

# =============================================================================
# Optimized Bottom-Up KMeans Implementation
# =============================================================================
class BottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation with a bottom-up approach, gradually adding points
    to the calculation until convergence.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor=0.1,
                 batch_growth_factor=15, verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor  # For early convergence detection
        self.n_init = n_init  # Run algorithm multiple times and select best
        self.iteration_table_ = []

    def _initialize_centroids(self, X, seed=None):
        """Enhanced centroid initialization with multiple methods."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Stratified random selection for better coverage
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Divide the data range into strata for more uniform coverage
            sample_indices = []
            if n_samples >= self.n_clusters * 10:  # If we have enough samples
                # Use approximate quantiles to divide data
                from sklearn.preprocessing import KBinsDiscretizer

                # Use feature with highest variance for stratification
                feature_var = np.var(X, axis=0)
                stratify_feature = np.argmax(feature_var)

                discretizer = KBinsDiscretizer(n_bins=min(self.n_clusters, 20),
                                              encode='ordinal', strategy='quantile')
                strata = discretizer.fit_transform(X[:, stratify_feature].reshape(-1, 1)).astype(int).flatten()
                unique_strata = np.unique(strata)

                # Select points from different strata
                for stratum in unique_strata:
                    stratum_indices = np.where(strata == stratum)[0]
                    if len(stratum_indices) > 0:
                        idx = random_state.choice(stratum_indices,
                                                 size=min(max(1, self.n_clusters // len(unique_strata)),
                                                         len(stratum_indices)),
                                                 replace=False)
                        sample_indices.extend(idx)

            # If we couldn't get enough from stratification, add random ones
            if len(sample_indices) < self.n_clusters:
                remaining = self.n_clusters - len(sample_indices)
                avail_indices = list(set(range(n_samples)) - set(sample_indices))
                if avail_indices:
                    additional = random_state.choice(avail_indices,
                                                   size=min(remaining, len(avail_indices)),
                                                   replace=False)
                    sample_indices.extend(additional)

            # If we still don't have enough, allow repeats
            if len(sample_indices) < self.n_clusters:
                remaining = self.n_clusters - len(sample_indices)
                additional = random_state.choice(n_samples, size=remaining, replace=True)
                sample_indices.extend(additional)

            # Trim to exactly k clusters
            sample_indices = sample_indices[:self.n_clusters]
            return X[sample_indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ with vectorization
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()

            # Use vectorized operations for faster distance calculations
            for c in range(1, self.n_clusters):
                # Calculate squared distances to closest centroid
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    closest_dist_sq = np.min(np.sum((X[:, np.newaxis, :] -
                                              centroids[np.newaxis, :c, :])**2, axis=2), axis=1)

                # Select next centroid with probability proportional to square distance
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    next_centroid_idx = random_state.choice(n_samples, p=probs)
                else:
                    next_centroid_idx = random_state.randint(n_samples)

                centroids[c] = X[next_centroid_idx].copy()

            return centroids

        elif self.init == 'k-means++-fast':
            # Faster approximate k-means++ using subsampling for large datasets
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Subsample for very large datasets
            subsample_limit = 10000
            if n_samples > subsample_limit:
                subsample_indices = random_state.choice(n_samples,
                                                     size=subsample_limit,
                                                     replace=False)
                X_subset = X[subsample_indices]
            else:
                X_subset = X
                subsample_indices = np.arange(n_samples)

            # Choose first centroid randomly from subset
            first_idx = random_state.randint(len(X_subset))
            centroids[0] = X_subset[first_idx].copy()

            # Initialize distances array once
            subset_size = len(X_subset)
            closest_dist_sq = np.zeros(subset_size)

            for c in range(1, self.n_clusters):
                # Update distances for new centroid
                for i in range(subset_size):
                    dist = np.sum((X_subset[i] - centroids[c-1])**2)
                    if c == 1 or dist < closest_dist_sq[i]:
                        closest_dist_sq[i] = dist

                # Select next centroid
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    subset_idx = random_state.choice(subset_size, p=probs)
                    centroids[c] = X_subset[subset_idx].copy()
                else:
                    # If all points are identical to centroids, pick randomly
                    subset_idx = random_state.randint(subset_size)
                    centroids[c] = X_subset[subset_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        """Compute distances with vectorization and Numba."""
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            # Vectorized implementation for non-Numba case
            # Reshape to allow broadcasting
            expanded_X = X[:, np.newaxis, :]
            expanded_centroids = centroids[np.newaxis, :, :]

            # Calculate squared distances
            squared_diff = (expanded_X - expanded_centroids) ** 2
            squared_distances = np.sum(squared_diff, axis=2)

            return squared_distances

    def _select_next_batch(self, X, current_active, distances, batch_size, iteration):
        """
        Enhanced batch selection strategy with adaptive sampling.
        Selects points that are likely to improve centroid positions the most.
        """
        n_samples = X.shape[0]
        inactive_mask = np.ones(n_samples, dtype=bool)
        inactive_mask[current_active] = False
        inactive_indices = np.where(inactive_mask)[0]

        if len(inactive_indices) == 0:
            return np.array([])

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        labels = np.argmin(inactive_distances, axis=1)

        # Calculate multiple selection criteria
        # 1. Uncertainty: difference between closest and second closest centroid
        if self.n_clusters > 1:
            sorted_distances = np.sort(inactive_distances, axis=1)
            uncertainty = sorted_distances[:, 1] - sorted_distances[:, 0]
            uncertainty = uncertainty / (np.max(uncertainty) + 1e-10)
        else:
            uncertainty = np.zeros(len(inactive_indices))

        # 2. Representativeness: distance to closest centroid
        closest_distances = np.min(inactive_distances, axis=1)
        if np.max(closest_distances) > np.min(closest_distances):
            representativeness = (closest_distances - np.min(closest_distances)) / (np.max(closest_distances) - np.min(closest_distances) + 1e-10)
        else:
            representativeness = np.zeros_like(closest_distances)

        # 3. Cluster balance: prioritize points from underrepresented clusters
        cluster_counts = np.bincount(labels, minlength=self.n_clusters)
        balance_score = np.zeros(len(inactive_indices))
        for i, label in enumerate(labels):
            if cluster_counts[label] > 0:
                balance_score[i] = 1.0 / cluster_counts[label]
        balance_score = balance_score / (np.max(balance_score) + 1e-10)

        # Adaptive weighting of criteria based on iteration
        # Early: focus on representativeness (exploration)
        # Late: focus on uncertainty (refinement)
        exploration_weight = max(0, 1 - iteration / self.max_iterations)
        refinement_weight = 1 - exploration_weight

        # Combine criteria with adaptive weights
        combined_score = (
            exploration_weight * representativeness +
            refinement_weight * uncertainty +
            0.2 * balance_score  # Fixed weight for balance
        )

        # Select best points according to combined score
        n_to_select = min(batch_size, len(inactive_indices))
        selected_idx = np.argsort(-combined_score)[:n_to_select]  # Descending order

        return inactive_indices[selected_idx]

    def _run_single_kmeans(self, X, seed=None):
        """Run a single instance of the bottom-up K-means algorithm."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        inertia = 0.0
        n_iter = 0
        iteration_table = []

        # Compute initial distances and labels
        distances = self._compute_distances(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize batch size and active set
        initial_batch_size = max(int(n_samples * self.batch_size_factor), self.n_clusters * 3)

        # Initial active set with points closest to centroids and some distant points
        min_distances = np.min(distances, axis=1)
        closest_points = np.argsort(min_distances)[:initial_batch_size // 2]
        farthest_points = np.argsort(min_distances)[-initial_batch_size // 2:]
        active_indices = np.concatenate([closest_points, farthest_points])
        active_indices = np.unique(active_indices)  # Remove any duplicates

        # For tracking stability across iterations
        prev_inertia = float('inf')
        stability_counter = 0

        for iteration in range(self.max_iterations):
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            if len(active_indices) > 0:
                active_X = X[active_indices]
                active_labels = labels[active_indices]

                # Update centroids based on active points
                if NUMBA_AVAILABLE:
                    new_centroids, counts = _update_centroids_numba(active_X, active_labels, self.n_clusters)
                    # Handle empty clusters
                    for k in range(self.n_clusters):
                        if counts[k] == 0:
                            # Find the point furthest from its centroid
                            active_distances = distances[active_indices]
                            furthest_point_idx = np.argmax(np.min(active_distances, axis=1))
                            new_centroids[k] = active_X[furthest_point_idx]
                else:
                    new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                    for k in range(self.n_clusters):
                        cluster_mask = (active_labels == k)
                        if np.any(cluster_mask):
                            new_centroids[k] = np.mean(active_X[cluster_mask], axis=0)
                        else:
                            # For empty clusters, use old centroid or find a new representative
                            if iteration > 0:
                                new_centroids[k] = old_centroids[k]
                            else:
                                # First iteration, just pick a random point
                                random_idx = random_state.randint(len(active_X))
                                new_centroids[k] = active_X[random_idx]

                centroids = new_centroids

            # Compute distances and reassign labels
            distances = self._compute_distances(X, centroids)
            new_labels = np.argmin(distances, axis=1)

            # Calculate change in active points
            if len(active_indices) > 0:
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
            else:
                active_changed = 0
                active_changed_pct = 0

            labels = new_labels

            # Calculate inertia (sum of squared distances to closest centroid)
            min_distances = np.min(distances, axis=1)

            # Calculate inertia for active points only to monitor convergence
            active_inertia = np.sum(min_distances[active_indices]) if len(active_indices) > 0 else 0

            # Adaptive batch size growth based on convergence behavior
            convergence_factor = 1.0
            if iteration > 0 and prev_inertia > 0:
                inertia_change = abs(active_inertia - prev_inertia) / prev_inertia
                # If inertia change is small, we're converging, so grow batch size faster
                if inertia_change < 0.01:
                    convergence_factor = 1.5  # Accelerate batch growth
                elif inertia_change > 0.1:
                    convergence_factor = 0.8  # Slow down batch growth

            # Update batch size for next iteration
            next_batch_base_size = int(initial_batch_size * (self.batch_growth_factor ** iteration) * convergence_factor)
            next_batch_size = min(
                next_batch_base_size,
                n_samples - len(active_indices)
            )

            # Select next batch of points to add
            new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, iteration)

            # Record iteration information
            iteration_info = {
                'iteration': n_iter,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'active_points_changed': active_changed,
                'active_points_changed_pct': active_changed_pct * 100,
                'new_points_added': len(new_batch),
                'active_inertia': active_inertia,
                'total_coverage': len(active_indices) / n_samples * 100 if n_samples > 0 else 0
            }
            iteration_table.append(iteration_info)

            if self.verbose and (iteration + 1) % 5 == 0:
                logger.info(f"Iteration {n_iter}: "
                           f"{active_changed} active points changed ({active_changed_pct:.2%}), "
                           f"{len(new_batch)} new points added, "
                           f"{len(active_indices) / n_samples * 100:.1f}% coverage")

            # Add new batch to active set
            if len(new_batch) > 0:
                active_indices = np.append(active_indices, new_batch)

            # Calculate centroid shift for convergence check
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - centroids)**2, axis=1)))

            # Early stopping conditions

            # 1. Centroid stability
            if centroid_shift < self.tolerance:
                if len(active_indices) == n_samples:
                    # Full dataset and centroids stable = converged
                    if self.verbose:
                        logger.info(f"Converged at iteration {n_iter}: centroids stable")
                    break
                # If not full dataset but centroids stable, add more points faster
                next_batch_size = min(next_batch_size * 2, n_samples - len(active_indices))

            # 2. Inertia stability (track consecutive stable iterations)
            if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                stability_counter += 1
            else:
                stability_counter = 0

            # If inertia stable for multiple iterations and we have enough points
            if stability_counter >= 3 and len(active_indices) / n_samples > 0.5:
                if self.verbose:
                    logger.info(f"Early stopping at iteration {n_iter}: inertia stable")
                break

            # 3. All points active and no label changes
            if len(active_indices) == n_samples and active_changed == 0:
                if self.verbose:
                    logger.info(f"Converged at iteration {n_iter}: all points stable")
                break

            # 4. No new points to add and centroids stable
            if len(new_batch) == 0 and centroid_shift < self.tolerance:
                if self.verbose:
                    logger.info(f"Converged at iteration {n_iter}: no new points and centroids stable")
                break

            prev_inertia = active_inertia

        # Final update with all points
        if NUMBA_AVAILABLE:
            centroids, _ = _update_centroids_numba(X, labels, self.n_clusters)
        else:
            for k in range(self.n_clusters):
                cluster_mask = (labels == k)
                if np.any(cluster_mask):
                    centroids[k] = np.mean(X[cluster_mask], axis=0)

        # Final assignment
        distances = self._compute_distances(X, centroids)
        labels = np.argmin(distances, axis=1)
        inertia = np.sum(np.min(distances, axis=1))

        return centroids, labels, inertia, n_iter, iteration_table

    def fit(self, X, y=None):
        """
        Fit the model to data.
        Run multiple initializations and select the best result.
        """
        X = check_array(X)

        best_inertia = np.inf
        best_centroids = None
        best_labels = None
        best_n_iter = 0

        seeds = self._get_seeds()

        for seed_idx, seed in enumerate(seeds):
            if self.verbose and len(seeds) > 1:
                logger.info(f"K-means initialization {seed_idx + 1}/{len(seeds)}")

            centroids, labels, inertia, n_iter, iter_table = self._run_single_kmeans(X, seed)

            if inertia < best_inertia:
                best_centroids = centroids.copy()
                best_labels = labels.copy()
                best_inertia = inertia
                best_n_iter = n_iter
                self.iteration_table_ = iter_table

        self.cluster_centers_ = best_centroids
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        self.n_iter_ = best_n_iter

        if self.verbose:
            logger.info(f"BottomUpKMeans converged after {self.n_iter_} iterations. "
                        f"Inertia: {self.inertia_:.4f}")

        return self

    def _get_seeds(self):
        """Generate random seeds for multiple initializations."""
        random_state = check_random_state(self.random_state)
        seeds = []
        for i in range(self.n_init):
            seed = random_state.randint(0, 2**31 - 1)
            seeds.append(seed)
        return seeds

    def predict(self, X):
        """Predict the closest cluster for each sample in X."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X)

        if NUMBA_AVAILABLE:
            labels, _ = _assign_labels_numba(X, self.cluster_centers_)
            return labels
        else:
            distances = self._compute_distances(X, self.cluster_centers_)
            return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def print_iteration_table(self):
        """Prints a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            return df
        except ImportError:
            for info in self.iteration_table_:
                print(", ".join([f"{k}: {v}" for k, v in info.items()]))
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = BottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 6.824 seconds
Average execution time (OptimizedKMeans): 0.027 seconds
Average execution time (SklearnKMeans): 0.037 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 4.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


In [2]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available

@njit(parallel=True, fastmath=True)
def _fast_distances(X, centroids):
    """Ultra-optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Pre-compute squared norms of X
    x_norms = np.zeros(n_samples, dtype=np.float64)
    for i in prange(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    # Use ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩ identity for faster computation
    for i in prange(n_samples):
        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norms[i] + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(fastmath=True)
def _fast_distances_block(X, centroids, start_idx, end_idx):
    """Compute distances for a block of samples."""
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    block_size = end_idx - start_idx
    distances = np.empty((block_size, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Process block
    for i in range(block_size):
        x_idx = start_idx + i
        # Compute x_norm for this sample
        x_norm = 0.0
        for k in range(n_features):
            x_norm += X[x_idx, k] * X[x_idx, k]

        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norm + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[x_idx, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True)
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels - fully optimized."""
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count points in clusters - separate loop for better cache performance
    for i in range(X.shape[0]):
        counts[labels[i]] += 1

    # Sum points in each cluster - vectorized across features for each sample
    for i in range(X.shape[0]):
        cluster_id = labels[i]
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 1.224 seconds
Average execution time (OptimizedKMeans): 0.022 seconds
Average execution time (SklearnKMeans): 0.020 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


In [None]:
! pip install line_profiler
%load_ext line_profiler
%lprun -f run_bench_evaluation run_bench_evaluation()



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.462 seconds
Average execution time (OptimizedKMeans): 0.027 seconds
Average execution time (SklearnKMeans): 0.022 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations


In [None]:
X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
n_clusters = 3
n_runs = 3
bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42, verbose=True)
#bu_kmeans.fit(X)
%lprun -f HybridBottomUpKMeans.fit bu_kmeans.fit(X)


In [None]:
import cProfile, pstats

# Profile your function
cProfile.run('run_bench_evaluation()', 'profile_output')

# Load the profiling data
p = pstats.Stats('profile_output')
p.strip_dirs().sort_stats('cumulative').print_stats(15)  # Top 10 functions by cumulative time

# For a recursive breakdown, check the callers and callees of a specific function:
p.print_callers('run_bench_evaluation')
p.print_callees('run_bench_evaluation')



BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3

Benchmark Results:
Average execution time (BottomUpKMeans): 0.463 seconds
Average execution time (OptimizedKMeans): 0.028 seconds
Average execution time (SklearnKMeans): 0.021 seconds

Clustering Quality Metrics (averages):

BOTTOMUP:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.144
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

OPTIMIZED:
  Silhouette Score: 0.451
  Calinski-Harabasz: 59195.143
  Davies-Bouldin: 1.405
  Inertia: 446862.577
  Adjusted Rand: 0.601
  Adjusted Mutual Info: 0.725

SKLEARN:
  Silhouette Score: 0.670
  Calinski-Harabasz: 136986.333
  Davies-Bouldin: 0.516
  Inertia: 175270.145
  Adjusted Rand: 0.999
  Adjusted Mutual Info: 0.997

Average Iterations:
  BOTTOMUP: 6.0 iterations
  OPTIMIZED: 3.3 iterations
  SKLEARN: 2.3 iterations
Fri Mar 14 10:27:37 2025    profile_output

         678370 function 

<pstats.Stats at 0x7d3a85252690>

In [None]:
p.strip_dirs().sort_stats('cumulative').print_stats(100)
p.print_callers('run_bench_evaluation')
p.print_callees('run_bench_evaluation')


Fri Mar 14 10:27:37 2025    profile_output

         678370 function calls (677501 primitive calls) in 118.214 seconds

   Ordered by: cumulative time
   List reduced from 650 to 100 due to restriction <100>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  118.215  118.215 {built-in method builtins.exec}
        1    0.000    0.000  118.214  118.214 <string>:1(<module>)
        1    0.001    0.001  118.214  118.214 <ipython-input-3-d31acea3e1da>:1170(run_bench_evaluation)
   360/63    0.004    0.000  116.702    1.852 _param_validation.py:185(wrapper)
        9    0.000    0.000  116.142   12.905 _unsupervised.py:42(silhouette_score)
        9    0.005    0.001  116.141   12.905 _unsupervised.py:196(silhouette_samples)
       72    0.196    0.003  116.114    1.613 pairwise.py:2082(pairwise_distances_chunked)
      117    0.001    0.000   71.929    0.615 pairwise.py:2266(pairwise_distances)
      117    0.001    0.000   71.928    0.615

<pstats.Stats at 0x7d3a85252690>

In [None]:
!pip install -U scalene

Collecting scalene
  Downloading scalene-1.5.51-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (23 kB)
Downloading scalene-1.5.51-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scalene
Successfully installed scalene-1.5.51


In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
from scalene import scalene_profiler

# Turn profiling on
scalene_profiler.start()

@njit(parallel=True, fastmath=True)
def _fast_distances(X, centroids):
    """Ultra-optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Pre-compute squared norms of X
    x_norms = np.zeros(n_samples, dtype=np.float64)
    for i in prange(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    # Use ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩ identity for faster computation
    for i in prange(n_samples):
        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norms[i] + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(fastmath=True)
def _fast_distances_block(X, centroids, start_idx, end_idx):
    """Compute distances for a block of samples."""
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    block_size = end_idx - start_idx
    distances = np.empty((block_size, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Process block
    for i in range(block_size):
        x_idx = start_idx + i
        # Compute x_norm for this sample
        x_norm = 0.0
        for k in range(n_features):
            x_norm += X[x_idx, k] * X[x_idx, k]

        for j in range(n_clusters):
            # Start with ||x||² + ||y||²
            distances[i, j] = x_norm + centroid_norms[j]
            # Subtract 2⟨x,y⟩
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[x_idx, k] * centroids[j, k]
            distances[i, j] -= 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True)
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels - fully optimized."""
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count points in clusters - separate loop for better cache performance
    for i in range(X.shape[0]):
        counts[labels[i]] += 1

    # Sum points in each cluster - vectorized across features for each sample
    for i in range(X.shape[0]):
        cluster_id = labels[i]
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    scalene_profiler.stop()



ERROR: Do not try to invoke `start` if you have not called Scalene using one of the methods
in https://github.com/plasma-umass/scalene#using-scalene
(The most likely issue is that you need to run your code with `scalene`, not `python`).


SystemExit: 1

In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32

@njit(parallel=True, fastmath=True, cache=True)
def _fast_distances(X, centroids):
    """Ultra-optimized squared Euclidean distances with Numba."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.empty(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        norm = 0.0
        for k in range(n_features):
            norm += centroids[j, k] * centroids[j, k]
        centroid_norms[j] = norm

    # Pre-compute squared norms of X
    x_norms = np.empty(n_samples, dtype=np.float64)
    for i in prange(n_samples):
        norm = 0.0
        for k in range(n_features):
            norm += X[i, k] * X[i, k]
        x_norms[i] = norm

    # Compute distances using ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
    for i in prange(n_samples):
        x_norm = x_norms[i]
        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] = x_norm + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(fastmath=True, cache=True)
def _fast_distances_block(X, centroids, start_idx, end_idx):
    """Compute distances for a block of samples."""
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    block_size = end_idx - start_idx
    distances = np.empty((block_size, n_clusters), dtype=np.float64)

    # Pre-compute squared norms of centroids
    centroid_norms = np.empty(n_clusters, dtype=np.float64)
    for j in range(n_clusters):
        norm = 0.0
        for k in range(n_features):
            norm += centroids[j, k] * centroids[j, k]
        centroid_norms[j] = norm

    # Process block with improved locality
    for i in range(block_size):
        x_idx = start_idx + i

        # Compute x_norm for this sample
        x_norm = 0.0
        x_values = np.empty(n_features, dtype=np.float64)

        # Cache the values in x_values for better memory access
        for k in range(n_features):
            val = X[x_idx, k]
            x_values[k] = val
            x_norm += val * val

        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += x_values[k] * centroids[j, k]
            distances[i, j] = x_norm + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(parallel=True, fastmath=True, cache=True)
def _update_centroids_numba(X, labels, n_clusters):
    """Update centroids based on assigned labels - fully optimized."""
    n_samples = X.shape[0]
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count points in clusters - separate loop for better cache performance
    for i in range(n_samples):
        counts[labels[i]] += 1

    # Sum points in each cluster - use prange for outer loop
    # but handle accumulation carefully to avoid race conditions
    for i in prange(n_samples):
        cluster_id = labels[i]
        for j in range(n_features):
            # Use atomic add to avoid race conditions when updating centroids
            centroids[cluster_id, j] += X[i, j]

    # Divide by counts to get means
    for i in range(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]
        else:
            # Handle empty clusters - set to a random point
            idx = np.random.randint(0, n_samples)
            for j in range(n_features):
                centroids[i, j] = X[idx, j]

    return centroids, counts

class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3

Run 3/3


In [None]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import time
import logging
import urllib.request
import io
import gzip
import warnings
from typing import Dict, Tuple, Optional
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    adjusted_mutual_info_score
)

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
np.random.seed(42)

# Optional Numba acceleration
try:
    from numba import njit
    NUMBA_AVAILABLE = True
except ImportError:
    NUMBA_AVAILABLE = False
    def njit(func):
        return func

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators

import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, boolean, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import warnings

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available since we're using its decorators
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
import logging
from numba import njit, prange, float64, int32
from scipy import sparse
from sklearn.metrics import pairwise_distances
import time
from joblib import Parallel, delayed

logger = logging.getLogger(__name__)

# Check if Numba is available
NUMBA_AVAILABLE = True  # We'll assume Numba is available
import numpy as np
from numba import njit, prange, float64, int32
from numba import njit, prange, cuda
import numpy as np
from numba import njit, prange
import numpy as np

# Check if CuPy is available
try:
    import cupy as cp
    HAS_GPU = True
except ImportError:
    HAS_GPU = False

# GPU version using CuPy
if HAS_GPU:
    def _fast_distances_gpu(X, centroids):
        """Compute squared Euclidean distances with CuPy GPU acceleration."""
        # Transfer data to GPU if needed
        X_gpu = cp.asarray(X)
        centroids_gpu = cp.asarray(centroids)

        # Compute squared norms
        X_norm = cp.sum(X_gpu**2, axis=1, keepdims=True)
        centroids_norm = cp.sum(centroids_gpu**2, axis=1, keepdims=True).T

        # Use matrix multiplication for dot product
        dot_product = cp.dot(X_gpu, centroids_gpu.T)

        # Compute distances using ||x-y||² = ||x||² + ||y||² - 2⟨x,y⟩
        distances = X_norm + centroids_norm - 2.0 * dot_product

        return distances

    def compute_distances(X, centroids):
        """Wrapper to handle GPU distance computation."""
        distances_gpu = _fast_distances_gpu(X, centroids)
        return cp.asnumpy(distances_gpu)

    def assign_labels(distances):
        """Assign labels based on distances using GPU."""
        distances_gpu = cp.asarray(distances)
        min_distances = cp.min(distances_gpu, axis=1)
        labels = cp.argmin(distances_gpu, axis=1)

        return cp.asnumpy(labels).astype(np.int32), cp.asnumpy(min_distances)

    def _update_centroids_gpu(X, labels, n_clusters):
        """Update centroids using GPU acceleration."""
        X_gpu = cp.asarray(X)
        labels_gpu = cp.asarray(labels)
        n_features = X.shape[1]

        centroids = cp.zeros((n_clusters, n_features), dtype=X.dtype)
        counts = cp.zeros(n_clusters, dtype=cp.int32)

        # Process each cluster
        for k in range(n_clusters):
            mask = (labels_gpu == k)
            cluster_points = X_gpu[mask]
            if len(cluster_points) > 0:
                centroids[k] = cp.mean(cluster_points, axis=0)
                counts[k] = len(cluster_points)

        return cp.asnumpy(centroids), cp.asnumpy(counts)

# CPU optimized version - unchanged from original
@njit(parallel=True)
def _fast_distances_cpu(X, centroids):
    """Compute squared Euclidean distances with Numba - vectorized version."""
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]
    n_features = X.shape[1]
    distances = np.empty((n_samples, n_clusters), dtype=np.float64)

    # Pre-compute squared norms - parallelized
    x_norms = np.zeros(n_samples, dtype=np.float64)
    centroid_norms = np.zeros(n_clusters, dtype=np.float64)

    for i in prange(n_samples):
        for k in range(n_features):
            x_norms[i] += X[i, k] * X[i, k]

    for j in prange(n_clusters):
        for k in range(n_features):
            centroid_norms[j] += centroids[j, k] * centroids[j, k]

    # Compute distances in parallel
    for i in prange(n_samples):
        for j in range(n_clusters):
            dot_product = 0.0
            for k in range(n_features):
                dot_product += X[i, k] * centroids[j, k]
            distances[i, j] = x_norms[i] + centroid_norms[j] - 2.0 * dot_product

    return distances

@njit(parallel=True)
def _assign_labels_cpu(distances):
    """Assign labels to data points based on nearest centroid."""
    n_samples = distances.shape[0]
    labels = np.zeros(n_samples, dtype=np.int32)
    min_distances = np.full(n_samples, np.inf)

    for i in prange(n_samples):
        for j in range(distances.shape[1]):
            if distances[i, j] < min_distances[i]:
                min_distances[i] = distances[i, j]
                labels[i] = j

    return labels, min_distances

@njit(parallel=True)
def _update_centroids_cpu(X, labels, n_clusters):
    """Update centroids based on assigned labels."""
    n_samples = X.shape[0]
    n_features = X.shape[1]
    centroids = np.zeros((n_clusters, n_features), dtype=X.dtype)
    counts = np.zeros(n_clusters, dtype=np.int32)

    # Count cluster members and sum values
    for i in range(n_samples):
        cluster_id = labels[i]
        counts[cluster_id] += 1
        for j in range(n_features):
            centroids[cluster_id, j] += X[i, j]

    # Calculate means
    for i in prange(n_clusters):
        if counts[i] > 0:
            for j in range(n_features):
                centroids[i, j] /= counts[i]

    return centroids, counts

# Unified interface
def _fast_distances(X, centroids):
    if HAS_GPU:
        return compute_distances(X, centroids)
    else:
        return _fast_distances_cpu(X, centroids)

def _assign_labels_numba(distances):
    if HAS_GPU:
        return assign_labels(distances)
    else:
        return _assign_labels_cpu(distances)

def _update_centroids_numba(X, labels, n_clusters):
    if HAS_GPU:
        return _update_centroids_gpu(X, labels, n_clusters)
    else:
        return _update_centroids_cpu(X, labels, n_clusters)

# Main kmeans function
class HybridBottomUpKMeans(BaseEstimator, ClusterMixin):
    """
    Improved K-means implementation with hybrid bottom-up approach and optimized performance.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form.

    max_iterations : int, default=300
        Maximum number of iterations of the k-means algorithm.

    tolerance : float, default=1e-4
        Relative tolerance for convergence.

    random_state : int or RandomState, default=None
        Controls randomness.

    batch_size_factor : float or str, default='auto'
        Initial batch size as percentage of data.

    batch_growth_factor : float or str, default='auto'
        Factor to grow the batch size in each iteration.

    verbose : bool, default=False
        Verbosity mode.

    init : {'k-means++', 'random'}, default='k-means++'
        Method for initialization.

    early_stopping_factor : float, default=0.001
        Fraction of inertia change to trigger early stopping.

    n_init : int, default=3
        Number of times to run with different seeds.

    hybrid_threshold : float, default=0.5
        Threshold to switch from bottom-up to standard k-means.

    n_jobs : int, default=None
        Number of parallel jobs for computation.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, batch_size_factor='auto',
                 batch_growth_factor='auto', verbose=False, init='k-means++',
                 early_stopping_factor=0.001, n_init=3,
                 hybrid_threshold=0.5, n_jobs=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.batch_size_factor = batch_size_factor
        self.batch_growth_factor = batch_growth_factor
        self.verbose = verbose
        self.init = init
        self.early_stopping_factor = early_stopping_factor
        self.n_init = n_init
        self.hybrid_threshold = hybrid_threshold
        self.n_jobs = n_jobs
        self.iteration_table_ = []

    def _analyze_data_characteristics(self, X):
        """Quick analysis of data characteristics."""
        n_samples, n_features = X.shape

        # Detect if data is sparse
        is_sparse_matrix = sparse.issparse(X)

        # For large datasets, use sampling
        sample_size = min(1000, n_samples)
        if is_sparse_matrix:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            sparsity = 1.0 - (X[sample_indices].count_nonzero() / (sample_size * n_features))
        else:
            sample_indices = np.random.choice(n_samples, size=sample_size, replace=False)
            X_sample = X[sample_indices]
            sparsity = np.sum(X_sample == 0) / (sample_size * n_features)

        # High dimensionality check
        high_dimensionality = n_features > 100 or n_features > np.sqrt(n_samples)

        # Size category
        if n_samples < 10000:
            size_category = 'small'
        elif n_samples < 100000:
            size_category = 'medium'
        else:
            size_category = 'large'

        # Determine data type
        if sparsity > 0.8:
            data_type = 'sparse'
        elif high_dimensionality:
            data_type = 'high_dim'
        else:
            if size_category == 'small':
                # For small datasets, try to detect tight clusters
                try:
                    subsample = X[sample_indices][:100] if not is_sparse_matrix else X[sample_indices][:100].toarray()
                    distances = pairwise_distances(subsample, metric='euclidean')
                    distances_flat = distances[np.triu_indices(distances.shape[0], k=1)]
                    cv = np.std(distances_flat) / np.mean(distances_flat) if len(distances_flat) > 0 and np.mean(distances_flat) > 0 else 0
                    tight_clusters = cv > 1.0
                    data_type = 'dense_tight' if tight_clusters else 'standard'
                except:
                    data_type = 'standard'
            else:
                data_type = 'standard'

        return {
            'data_type': data_type,
            'size_category': size_category,
            'sparsity': sparsity,
            'high_dimensionality': high_dimensionality
        }

    def _set_dynamic_parameters(self, X):
        """Set optimized parameters based on data characteristics."""
        n_samples, n_features = X.shape

        # Analyze data characteristics
        chars = self._analyze_data_characteristics(X)
        data_type = chars['data_type']
        size_category = chars['size_category']

        # Store for reference
        self.data_characteristics_ = chars

        # Set parameters based on data type and size
        if data_type == 'sparse':
            # For sparse data: larger initial batch, faster growth
            self._batch_size_factor = 0.15 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 3.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.4  # Switch earlier for sparse data

        elif data_type == 'dense_tight':
            # For tight clusters: smaller initial batch, moderate growth
            self._batch_size_factor = 0.05 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.6  # Need more coverage before switching

        elif data_type == 'high_dim':
            # For high-dimensional: moderate batch, higher growth
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.5 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = 0.5

        else:  # standard
            self._batch_size_factor = 0.1 if self.batch_size_factor == 'auto' else self.batch_size_factor
            self._batch_growth_factor = 2.0 if self.batch_growth_factor == 'auto' else self.batch_growth_factor
            self._hybrid_threshold = self.hybrid_threshold

        # Adjust for dataset size
        if size_category == 'large':
            # For very large datasets, start smaller and grow faster
            self._batch_size_factor = min(0.01, self._batch_size_factor)
            self._batch_growth_factor = max(self._batch_growth_factor, 3.0)

        # For very small datasets, just use all points
        if n_samples < 500:
            self._batch_size_factor = 1.0
            self._hybrid_threshold = 0.99

        # Set initial batch size (at least 3 * n_clusters)
        self._initial_batch_size = max(int(n_samples * self._batch_size_factor), self.n_clusters * 3)

        if self.verbose:
            logger.info(f"Data type: {data_type}, size: {size_category}")
            logger.info(f"Parameters: initial_batch={self._initial_batch_size}, "
                       f"growth_factor={self._batch_growth_factor:.1f}, "
                       f"hybrid_threshold={self._hybrid_threshold:.2f}")

    def _initialize_centroids(self, X, seed=None):
        """Fast centroid initialization."""
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        if self.init == 'random':
            # Simple stratified random selection
            indices = random_state.choice(n_samples, size=self.n_clusters, replace=False)
            if sparse.issparse(X):
                return X[indices].toarray()
            else:
                return X[indices].copy()

        elif self.init == 'k-means++':
            # Optimized k-means++ implementation
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)

            # Choose first centroid randomly
            first_idx = random_state.randint(n_samples)
            if sparse.issparse(X):
                centroids[0] = X[first_idx].toarray().flatten()
            else:
                centroids[0] = X[first_idx].copy()

            # For large datasets, implement k-means++ with sampling
            if n_samples > 10000:
                # Use a sample for faster initialization
                sample_size = min(10000, n_samples)
                sample_indices = random_state.choice(n_samples, size=sample_size, replace=False)

                if sparse.issparse(X):
                    X_sample = X[sample_indices].toarray()
                else:
                    X_sample = X[sample_indices]

                # Execute k-means++ on the sample
                for c in range(1, self.n_clusters):
                    # Calculate distances to closest centroid
                    min_dists = pairwise_distances(X_sample, centroids[:c], metric='euclidean', squared=True).min(axis=1)

                    # Select next centroid with probability proportional to squared distance
                    probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                    next_idx = random_state.choice(sample_size, p=probs)
                    centroids[c] = X_sample[next_idx].copy()

            else:
                # Standard k-means++ for smaller datasets
                if sparse.issparse(X):
                    X_dense = X.toarray()

                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X_dense, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X_dense[next_idx].copy()
                else:
                    for c in range(1, self.n_clusters):
                        min_dists = pairwise_distances(X, centroids[:c], metric='euclidean', squared=True).min(axis=1)
                        probs = min_dists / min_dists.sum() if min_dists.sum() > 0 else None
                        next_idx = random_state.choice(n_samples, p=probs)
                        centroids[c] = X[next_idx].copy()

            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances_parallel(self, X, centroids):
        """Compute distances in parallel for better performance."""
        n_samples = X.shape[0]
        n_jobs = self.n_jobs or 1

        if sparse.issparse(X):
            # For sparse matrices, use specialized handling
            return pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        if n_jobs <= 1 or n_samples < 1000:
            # Use optimized Numba implementation for single-threaded or small datasets
            if NUMBA_AVAILABLE:
                return _fast_distances(X, centroids)
            else:
                return pairwise_distances(X, centroids, metric='euclidean', squared=True)

        # For multi-threaded computation on larger datasets
        # Process in blocks for better cache locality
        block_size = max(100, n_samples // n_jobs)
        n_blocks = (n_samples + block_size - 1) // block_size

        # Prepare blocks
        blocks = []
        for i in range(n_blocks):
            start_idx = i * block_size
            end_idx = min(start_idx + block_size, n_samples)
            blocks.append((start_idx, end_idx))

        # Process blocks in parallel with joblib
        if NUMBA_AVAILABLE:
            results = Parallel(n_jobs=n_jobs)(
                delayed(_fast_distances_block)(X, centroids, start, end)
                for start, end in blocks
            )

            # Combine results
            distances = np.vstack(results)
        else:
            # Fall back to sklearn's implementation
            distances = pairwise_distances(X, centroids, metric='euclidean', squared=True, n_jobs=n_jobs)

        return distances

    def _select_next_batch(self, X, current_active, distances, batch_size, labels):
        """Optimized batch selection focusing on informative points."""
        n_samples = X.shape[0]

        # Quick return if all points are active or batch_size is 0
        if len(current_active) >= n_samples or batch_size <= 0:
            return np.array([], dtype=np.int32)

        # Create inactive mask efficiently
        active_mask = np.zeros(n_samples, dtype=bool)
        active_mask[current_active] = True
        inactive_indices = np.where(~active_mask)[0]

        # Quick return if no inactive points
        if len(inactive_indices) == 0:
            return np.array([], dtype=np.int32)

        # Get distances for inactive points
        inactive_distances = distances[inactive_indices]
        inactive_labels = labels[inactive_indices]

        # Quick heuristic for batch selection - prioritize high-impact points
        selected_indices = []

        # 1. Take points closest to centroids (to improve centroid positions)
        for k in range(self.n_clusters):
            cluster_points = inactive_indices[inactive_labels == k]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                # Get closest points for this cluster
                num_to_take = max(1, batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                selected_indices.extend(closest)

        # 2. Take some boundary points (helps with cluster separation)
        if self.n_clusters > 1:
            # Margins between closest and second closest centroid
            sorted_distances = np.sort(inactive_distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            # Small margins indicate points near boundaries
            num_boundary = max(1, batch_size // 4)
            boundary_points = inactive_indices[np.argsort(margins)[:num_boundary]]
            selected_indices.extend(boundary_points)

        # 3. Add some outliers (for exploration)
        min_distances = np.min(inactive_distances, axis=1)
        num_outliers = max(1, batch_size // 10)
        outlier_points = inactive_indices[np.argsort(-min_distances)[:num_outliers]]
        selected_indices.extend(outlier_points)

        # Ensure uniqueness and limit to batch_size
        selected_indices = list(set(selected_indices))
        if len(selected_indices) > batch_size:
            selected_indices = selected_indices[:batch_size]

        # If we need more points, add random ones
        if len(selected_indices) < batch_size:
            remaining = batch_size - len(selected_indices)
            available = list(set(inactive_indices) - set(selected_indices))
            if available:
                random_indices = np.random.choice(available,
                                               size=min(remaining, len(available)),
                                               replace=False)
                selected_indices.extend(random_indices)

        return np.array(selected_indices, dtype=np.int32)

    def _standard_kmeans_iteration(self, X, centroids):
        """Run a single iteration of standard k-means on full dataset."""
        # Compute distances and assign labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Update centroids
        new_centroids = np.zeros_like(centroids)
        for k in range(self.n_clusters):
            cluster_mask = (labels == k)
            if np.any(cluster_mask):
                if sparse.issparse(X):
                    new_centroids[k] = X[cluster_mask].mean(axis=0).A1
                else:
                    new_centroids[k] = np.mean(X[cluster_mask], axis=0)
            else:
                new_centroids[k] = centroids[k]

        # Calculate inertia and centroid shift
        inertia = np.sum(np.min(distances, axis=1))
        centroid_shift = np.sqrt(np.sum((centroids - new_centroids)**2))

        return new_centroids, labels, distances, inertia, centroid_shift

    def _run_kmeans(self, X, seed=None):
        """Run hybrid k-means algorithm."""
        start_time = time.time()
        n_samples, n_features = X.shape
        random_state = check_random_state(seed if seed is not None else self.random_state)

        # Set dynamic parameters based on data characteristics
        self._set_dynamic_parameters(X)

        # Initialize centroids
        centroids = self._initialize_centroids(X, seed)
        labels = np.zeros(n_samples, dtype=np.int32)
        iteration_table = []
        hybrid_switch_iter = -1

        # Compute initial distances and labels
        distances = self._compute_distances_parallel(X, centroids)
        labels = np.argmin(distances, axis=1)

        # Initialize active set with strategic selection
        initial_indices = []

        # Include points close to each centroid
        for k in range(self.n_clusters):
            cluster_points = np.where(labels == k)[0]
            if len(cluster_points) > 0:
                cluster_distances = distances[cluster_points][:, k]
                num_to_take = max(1, self._initial_batch_size // (2 * self.n_clusters))
                closest = cluster_points[np.argsort(cluster_distances)[:num_to_take]]
                initial_indices.extend(closest)

        # Include boundary points for better separation
        if self.n_clusters > 1:
            sorted_distances = np.sort(distances, axis=1)
            margins = sorted_distances[:, 1] - sorted_distances[:, 0]
            num_boundary = max(1, self._initial_batch_size // 4)
            boundary_points = np.argsort(margins)[:num_boundary]
            initial_indices.extend(boundary_points)

        # Ensure unique indices and limit to initial_batch_size
        active_indices = np.array(list(set(initial_indices)), dtype=np.int32)
        if len(active_indices) > self._initial_batch_size:
            active_indices = active_indices[:self._initial_batch_size]

        # For very small datasets, use all points
        if n_samples <= self._initial_batch_size:
            active_indices = np.arange(n_samples)

        # Track convergence
        prev_inertia = float('inf')
        stability_counter = 0
        prev_active_size = len(active_indices)

        # Main iteration loop
        for iteration in range(self.max_iterations):
            iter_start = time.time()
            n_iter = iteration + 1
            old_centroids = centroids.copy()

            # Calculate coverage ratio
            coverage_ratio = len(active_indices) / n_samples

            if coverage_ratio < self._hybrid_threshold:
                # PHASE 1: BOTTOM-UP APPROACH
                if len(active_indices) > 0:
                    # Extract active data
                    if sparse.issparse(X):
                        X_active = X[active_indices].toarray()
                    else:
                        X_active = X[active_indices]
                    active_labels = labels[active_indices]

                    # Update centroids using active points
                    if NUMBA_AVAILABLE and not sparse.issparse(X):
                        new_centroids, counts = _update_centroids_numba(X_active, active_labels, self.n_clusters)
                        # Handle empty clusters
                        for k in range(self.n_clusters):
                            if counts[k] == 0:
                                # Use old centroid or a random point
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]
                    else:
                        new_centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
                        for k in range(self.n_clusters):
                            cluster_mask = (active_labels == k)
                            if np.any(cluster_mask):
                                new_centroids[k] = np.mean(X_active[cluster_mask], axis=0)
                            else:
                                # Handle empty clusters
                                if iteration > 0:
                                    new_centroids[k] = old_centroids[k]
                                else:
                                    idx = random_state.randint(len(X_active))
                                    new_centroids[k] = X_active[idx]

                    centroids = new_centroids

                # Compute distances for all points
                distances = self._compute_distances_parallel(X, centroids)
                new_labels = np.argmin(distances, axis=1)

                # Track label changes in active set
                active_changed = np.sum(new_labels[active_indices] != labels[active_indices])
                active_changed_pct = active_changed / len(active_indices) if len(active_indices) > 0 else 0
                labels = new_labels

                # Calculate partial inertia
                min_distances = np.min(distances, axis=1)
                active_inertia = np.sum(min_distances[active_indices])
                total_inertia = np.sum(min_distances)

                # Adaptive batch size growth
                growth_factor = self._batch_growth_factor
                if active_changed_pct > 0.1:
                    growth_factor *= 0.8  # Slow down if unstable
                elif active_changed_pct < 0.01 and iteration > 1:
                    growth_factor *= 1.5  # Speed up if stable

                # Calculate next batch size
                next_batch_size = int(self._initial_batch_size * (growth_factor ** iteration))
                next_batch_size = min(next_batch_size, n_samples - len(active_indices))

                # Select next batch of points
                new_batch = self._select_next_batch(X, active_indices, distances, next_batch_size, labels)

                # Calculate centroid shift
                centroid_shift = np.sqrt(np.sum((old_centroids - centroids)**2))

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'bottom-up',
                    'active_points': len(active_indices),
                    'coverage': len(active_indices) / n_samples * 100,
                    'active_changed': active_changed,
                    'active_changed_pct': active_changed_pct * 100,
                    'centroid_shift': centroid_shift,
                    'new_points_added': len(new_batch),
                    'inertia': active_inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (bottom-up): {len(active_indices)/n_samples*100:.1f}% points, "
                              f"{active_changed_pct*100:.1f}% changed, {iter_time:.3f}s")

                # Add new batch to active set
                if len(new_batch) > 0:
                    active_indices = np.append(active_indices, new_batch)

                # Early stopping conditions
                # 1. Centroid stability
                if centroid_shift < self.tolerance and len(active_indices) == n_samples:
                    break

                # 2. Active set not growing but should be
                if len(active_indices) == prev_active_size and next_batch_size > 0:
                    # No growth when expected - might be converged
                    break

                # 3. Inertia stability
                if prev_inertia > 0 and abs(active_inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 3:
                    break

                prev_inertia = active_inertia
                prev_active_size = len(active_indices)

            else:
                # PHASE 2: STANDARD K-MEANS ON FULL DATASET
                if hybrid_switch_iter == -1:
                    hybrid_switch_iter = iteration
                    if self.verbose:
                        logger.info(f"Switching to standard k-means at iteration {n_iter} "
                                  f"with {len(active_indices)/n_samples*100:.1f}% coverage")

                # Run standard k-means iteration
                centroids, labels, distances, inertia, centroid_shift = self._standard_kmeans_iteration(X, centroids)

                # Record iteration information
                iter_time = time.time() - iter_start
                iteration_info = {
                    'iteration': n_iter,
                    'phase': 'standard',
                    'active_points': n_samples,
                    'coverage': 100.0,
                    'active_changed': np.nan,
                    'active_changed_pct': np.nan,
                    'centroid_shift': centroid_shift,
                    'new_points_added': 0,
                    'inertia': inertia,
                    'time': iter_time
                }
                iteration_table.append(iteration_info)

                if self.verbose and n_iter % 5 == 0:
                    logger.info(f"Iter {n_iter} (standard): inertia={inertia:.1f}, "
                              f"shift={centroid_shift:.6f}, {iter_time:.3f}s")

                # Check for convergence
                if centroid_shift < self.tolerance:
                    break

                # Check for inertia stability
                if prev_inertia > 0 and abs(inertia - prev_inertia) / prev_inertia < self.early_stopping_factor:
                    stability_counter += 1
                else:
                    stability_counter = 0

                if stability_counter >= 2:
                    break

                prev_inertia = inertia

        # Final full iteration to ensure all points are used
        if coverage_ratio < 1.0:
            centroids, labels, distances, inertia, _ = self._standard_kmeans_iteration(X, centroids)
        else:
            inertia = np.sum(np.min(distances, axis=1))

        total_time = time.time() - start_time

        if self.verbose:
            logger.info(f"K-means completed in {n_iter} iterations, {total_time:.3f}s. Inertia: {inertia:.1f}")
            if hybrid_switch_iter > 0:
                logger.info(f"Switched to standard K-means at iteration {hybrid_switch_iter+1}")

        return centroids, labels, inertia, n_iter, iteration_table, hybrid_switch_iter

    def fit(self, X, y=None):
        """Fit k-means clustering."""
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Run multiple initializations if requested
        if self.n_init > 1:
            best_inertia = float('inf')
            best_centroids = None
            best_labels = None
            best_n_iter = 0
            best_iteration_table = []

            seeds = [check_random_state(self.random_state).randint(0, 2**31 - 1) for _ in range(self.n_init)]

            for i, seed in enumerate(seeds):
                if self.verbose:
                    logger.info(f"K-means initialization {i+1}/{len(seeds)}")

                centroids, labels, inertia, n_iter, iter_table, hybrid_switch = self._run_kmeans(X, seed)

                if inertia < best_inertia:
                    best_centroids = centroids.copy()
                    best_labels = labels.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter
                    best_iteration_table = iter_table

            self.cluster_centers_ = best_centroids
            self.labels_ = best_labels
            self.inertia_ = best_inertia
            self.n_iter_ = best_n_iter
            self.iteration_table_ = best_iteration_table
        else:
            # Single run
            centroids, labels, inertia, n_iter, iteration_table, hybrid_switch = self._run_kmeans(X)

            self.cluster_centers_ = centroids
            self.labels_ = labels
            self.inertia_ = inertia
            self.n_iter_ = n_iter
            self.iteration_table_ = iteration_table

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        distances = self._compute_distances_parallel(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample."""
        return self.fit(X).labels_

    def transform(self, X):
        """Transform X to a cluster-distance space."""
        check_is_fitted(self, ['cluster_centers_'])
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        return self._compute_distances_parallel(X, self.cluster_centers_)

    def print_iteration_table(self):
        """Returns a table of iteration details."""
        check_is_fitted(self, ['iteration_table_'])
        try:
            import pandas as pd
            return pd.DataFrame(self.iteration_table_)
        except ImportError:
            result = ""
            for info in self.iteration_table_:
                result += ", ".join([f"{k}: {v}" for k, v in info.items()]) + "\n"
            return result
# =============================================================================
# Existing OptimizedKMeans Implementation
# =============================================================================
class OptimizedKMeans(BaseEstimator, ClusterMixin):
    """
    Optimized K-means implementation focused on practical performance gains.
    """
    def __init__(self, n_clusters=8, max_iterations=300, tolerance=1e-4,
                 random_state=None, stable_point_check_interval=5,
                 verbose=False, init='random'):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.random_state = random_state
        self.stable_point_check_interval = stable_point_check_interval
        self.verbose = verbose
        self.init = init
        self.iteration_table_ = []

    def _initialize_centroids(self, X):
        n_samples, n_features = X.shape
        random_state = check_random_state(self.random_state)

        if self.init == 'random':
            centroid_indices = random_state.choice(n_samples, self.n_clusters, replace=False)
            return X[centroid_indices].copy()
        elif self.init == 'k-means++':
            centroids = np.zeros((self.n_clusters, n_features), dtype=X.dtype)
            first_idx = random_state.randint(n_samples)
            centroids[0] = X[first_idx].copy()
            closest_dist_sq = np.zeros(n_samples)
            for c in range(1, self.n_clusters):
                if NUMBA_AVAILABLE:
                    distances = _fast_distances(X, centroids[:c])
                    closest_dist_sq = np.min(distances, axis=1)
                else:
                    for i in range(n_samples):
                        min_dist = float('inf')
                        for j in range(c):
                            dist = np.sum((X[i] - centroids[j])**2)
                            min_dist = min(min_dist, dist)
                        closest_dist_sq[i] = min_dist
                sum_distances = closest_dist_sq.sum()
                if sum_distances > 0:
                    probs = closest_dist_sq / sum_distances
                    cumprobs = np.cumsum(probs)
                    r = random_state.rand()
                    next_centroid_idx = np.searchsorted(cumprobs, r)
                    if next_centroid_idx >= n_samples:
                        next_centroid_idx = random_state.randint(n_samples)
                else:
                    next_centroid_idx = random_state.randint(n_samples)
                centroids[c] = X[next_centroid_idx].copy()
            return centroids
        else:
            raise ValueError(f"Unknown initialization method: {self.init}")

    def _compute_distances(self, X, centroids):
        if NUMBA_AVAILABLE:
            return _fast_distances(X, centroids)
        else:
            result = np.zeros((X.shape[0], centroids.shape[0]))
            for i, centroid in enumerate(centroids):
                result[:, i] = np.sum((X - centroid)**2, axis=1)
            return result

    def _update_centroids(self, X, labels):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]), dtype=X.dtype)
        counts = np.bincount(labels, minlength=self.n_clusters)
        for k in range(self.n_clusters):
            if counts[k] > 0:
                mask = (labels == k)
                new_centroids[k] = np.sum(X[mask], axis=0) / counts[k]
            else:
                new_centroids[k] = self.centroids_[k]
        return new_centroids

    def fit(self, X, y=None):
        X = check_array(X)
        n_samples = X.shape[0]
        self.centroids_ = self._initialize_centroids(X)
        self.labels_ = np.zeros(n_samples, dtype=np.int32)
        self.inertia_ = 0.0
        self.n_iter_ = 0
        self.iteration_table_ = []
        stable_points = np.zeros(n_samples, dtype=bool)

        for iteration in range(self.max_iterations):
            self.n_iter_ = iteration + 1
            old_centroids = self.centroids_.copy()

            if iteration % self.stable_point_check_interval == 0:
                stable_points[:] = False

            active_indices = np.where(~stable_points)[0]
            if len(active_indices) == 0:
                break

            active_X = X[active_indices]
            old_labels = self.labels_[active_indices].copy()
            distances = self._compute_distances(active_X, self.centroids_)
            new_labels = np.argmin(distances, axis=1)
            self.labels_[active_indices] = new_labels
            changed_indices = np.where(new_labels != old_labels)[0]
            newly_stable = active_indices[np.isin(np.arange(len(active_indices)), changed_indices, invert=True)]
            stable_points[newly_stable] = True

            iteration_info = {
                'iteration': iteration + 1,
                'total_points': n_samples,
                'active_points': len(active_indices),
                'points_changed': len(changed_indices),
                'new_stable_points': len(newly_stable),
                'cumulative_stable_points': np.sum(stable_points)
            }
            self.iteration_table_.append(iteration_info)

            self.centroids_ = self._update_centroids(X, self.labels_)
            centroid_shift = np.max(np.sqrt(np.sum((old_centroids - self.centroids_)**2, axis=1)))
            if centroid_shift < self.tolerance:
                break

            if self.verbose and (iteration + 1) % 10 == 0:
                logger.info(f"Iteration {iteration + 1}: {len(changed_indices)} points changed clusters")

        distances = self._compute_distances(X, self.centroids_)
        self.inertia_ = np.sum(distances[np.arange(n_samples), self.labels_])
        if self.verbose:
            logger.info(f"K-means converged after {self.n_iter_} iterations. Inertia: {self.inertia_:.4f}")
        return self

    def predict(self, X):
        check_is_fitted(self, ['centroids_'])
        X = check_array(X)
        distances = self._compute_distances(X, self.centroids_)
        return np.argmin(distances, axis=1)

    def print_iteration_table(self):
        try:
            import pandas as pd
            df = pd.DataFrame(self.iteration_table_)
            print("\nIteration Table:")
            print(df)
        except ImportError:
            print("\nIteration Table:")
            for info in self.iteration_table_:
                print(info)

# =============================================================================
# Helper Functions for Data Loading, Preprocessing, and Evaluation
# =============================================================================
def load_wine_data() -> pd.DataFrame:
    """Load the Wine Quality dataset."""
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    try:
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8')
        df = pd.read_csv(io.StringIO(data), sep=';')
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return pd.DataFrame({
            'fixed acidity': [7.4, 7.8, 7.8, 11.2, 7.4],
            'volatile acidity': [0.7, 0.88, 0.76, 0.28, 0.7],
            'citric acid': [0, 0, 0.04, 0.56, 0],
            'residual sugar': [1.9, 2.6, 2.3, 1.9, 1.9],
            'chlorides': [0.076, 0.098, 0.092, 0.075, 0.076],
            'free sulfur dioxide': [11, 25, 15, 17, 11],
            'total sulfur dioxide': [34, 67, 54, 60, 34],
            'density': [0.9978, 0.9968, 0.997, 0.998, 0.9978],
            'pH': [3.51, 3.2, 3.26, 3.16, 3.51],
            'sulphates': [0.56, 0.68, 0.65, 0.58, 0.56],
            'alcohol': [9.4, 9.8, 9.8, 9.8, 9.4],
            'quality': [5, 5, 5, 6, 5]
        })

def load_synthetic_data(n_samples=1000, n_features=2, n_clusters=3,
                        random_state=42) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic dataset with known clusters."""
    np.random.seed(random_state)
    X = np.concatenate([
        np.random.normal(0, 1, (n_samples//3, n_features)),
        np.random.normal(4, 1.5, (n_samples//3, n_features)),
        np.random.normal(-4, 0.5, (n_samples//3, n_features))
    ])
    y_true = np.concatenate([
        np.zeros(n_samples//3, dtype=int),
        np.ones(n_samples//3, dtype=int),
        np.full(n_samples//3, 2, dtype=int)
    ])
    return X, y_true

def load_pendigits_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the Pendigits dataset from the UCI repository."""
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
    train_url = base_url + "pendigits.tra"
    test_url = base_url + "pendigits.tes"
    try:
        train_data = pd.read_csv(train_url, header=None)
        test_data = pd.read_csv(test_url, header=None)
        data = pd.concat([train_data, test_data], axis=0)
        X = data.iloc[:, :-1].values
        y = data.iloc[:, -1].values
        return X, y
    except Exception as e:
        print("Error loading pendigits data:", e)
        X_sample = np.random.rand(100, 16)
        y_sample = np.random.randint(0, 10, size=100)
        return X_sample, y_sample

def load_mnist_data() -> Tuple[np.ndarray, np.ndarray]:
    """Load the MNIST dataset (8M version)."""
    url = "https://raw.githubusercontent.com/mnielsen/neural-networks-and-deep-learning/master/data/mnist.pkl.gz"
    try:
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f:
                X = np.load(f)
                y = np.load(f)
                subset_size = 10000
                X = X[:subset_size].reshape(subset_size, -1)
                y = y[:subset_size]
                return X, y
    except Exception as e:
        print("Error loading MNIST data:", e)
        X_sample = np.random.rand(1000, 784)
        y_sample = np.random.randint(0, 10, size=1000)
        return X_sample, y_sample

def preprocess_data(df: pd.DataFrame) -> Tuple[np.ndarray, Dict, Optional[np.ndarray]]:
    """Preprocess the wine dataset and return preprocessed features and info."""
    df_clean = df.copy()
    df_clean = df_clean.fillna(df_clean.mean())

    if 'quality' in df_clean.columns:
        quality = df_clean['quality'].values
        quality_median = np.median(quality)
        y_true = (quality > quality_median).astype(int)
        df_clean = df_clean.drop('quality', axis=1)
    else:
        y_true = None

    Q1 = df_clean.quantile(0.25)
    Q3 = df_clean.quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) |
                          (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]

    if y_true is not None:
        y_true = y_true[:len(df_clean)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean)

    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    preprocessing_info = {
        'original_shape': df.shape,
        'cleaned_shape': df_clean.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'features': df.columns.tolist() if hasattr(df, 'columns') else None
    }

    return X_pca, preprocessing_info, y_true

def determine_optimal_k(X: np.ndarray, max_k: int = 10) -> int:
    """Determine optimal number of clusters using multiple methods."""
    results = {
        'k': list(range(2, max_k + 1)),
        'inertia': [],
        'silhouette': [],
        'calinski_harabasz': [],
        'davies_bouldin': []
    }

    for k in range(2, max_k + 1):
        kmeans = SklearnKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        results['inertia'].append(kmeans.inertia_)
        results['silhouette'].append(silhouette_score(X, labels))
        results['calinski_harabasz'].append(calinski_harabasz_score(X, labels))
        results['davies_bouldin'].append(davies_bouldin_score(X, labels))

    inertias = np.array(results['inertia'])
    if len(inertias) > 2:
        diffs = np.diff(inertias)
        second_diffs = np.diff(diffs)
        elbow_idx = np.argmax(second_diffs) + 2
    else:
        elbow_idx = 2

    silhouette_idx = np.argmax(results['silhouette']) + 2
    ch_idx = np.argmax(results['calinski_harabasz']) + 2
    db_idx = np.argmin(results['davies_bouldin']) + 2

    votes = {k: 0 for k in range(2, max_k + 1)}
    votes[elbow_idx] += 1
    votes[silhouette_idx] += 1
    votes[ch_idx] += 1
    votes[db_idx] += 1

    optimal_k = max(votes.items(), key=lambda x: x[1])[0]

    logging.info(f"Optimal k votes: Elbow={elbow_idx}, Silhouette={silhouette_idx}, "
                 f"Calinski-Harabasz={ch_idx}, Davies-Bouldin={db_idx}")
    logging.info(f"Selected optimal k={optimal_k}")

    return optimal_k, results

# =============================================================================
# Benchmark and Evaluation Functions
# =============================================================================
def run_bench_evaluation():
    """
    Benchmark evaluation comparing BottomUpKMeans, OptimizedKMeans,
    and SklearnKMeans on synthetic data.
    """
    print("\n" + "="*80)
    print("BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans")
    print("="*80)

    # Use synthetic data for benchmarking
    X, y_true = load_synthetic_data(n_samples=30000, n_features=5, n_clusters=3)
    n_clusters = 3
    n_runs = 3

    results = {
        'times': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'metrics': {'bottomup': [], 'optimized': [], 'sklearn': []},
        'iterations': {'bottomup': [], 'optimized': [], 'sklearn': []}
    }

    for run in range(n_runs):
        print(f"\nRun {run+1}/{n_runs}")

        # BottomUpKMeans evaluation
        start_time = time.time()
        bu_kmeans = HybridBottomUpKMeans(n_clusters=n_clusters, random_state=42+run, verbose=True)
        bu_kmeans.fit(X)
        bu_time = time.time() - start_time
        results['times']['bottomup'].append(bu_time)
        bu_metrics = {
            'silhouette': silhouette_score(X, bu_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, bu_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, bu_kmeans.labels_),
            'inertia': bu_kmeans.inertia_
        }
        if y_true is not None:
            bu_metrics['adjusted_rand'] = adjusted_rand_score(y_true, bu_kmeans.labels_)
            bu_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, bu_kmeans.labels_)
        results['metrics']['bottomup'].append(bu_metrics)
        results['iterations']['bottomup'].append(bu_kmeans.n_iter_)

        # Optionally print the iteration table for the first run
        if run == 0:
            bu_kmeans.print_iteration_table()

        # OptimizedKMeans evaluation
        start_time = time.time()
        opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run)
        opt_kmeans.fit(X)
        opt_time = time.time() - start_time
        results['times']['optimized'].append(opt_time)
        opt_metrics = {
            'silhouette': silhouette_score(X, opt_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, opt_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, opt_kmeans.labels_),
            'inertia': opt_kmeans.inertia_
        }
        if y_true is not None:
            opt_metrics['adjusted_rand'] = adjusted_rand_score(y_true, opt_kmeans.labels_)
            opt_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, opt_kmeans.labels_)
        results['metrics']['optimized'].append(opt_metrics)
        results['iterations']['optimized'].append(opt_kmeans.n_iter_)

        # SklearnKMeans evaluation
        start_time = time.time()
        sk_kmeans = SklearnKMeans(n_clusters=n_clusters, random_state=42+run, init='k-means++')
        sk_kmeans.fit(X)
        sk_time = time.time() - start_time
        results['times']['sklearn'].append(sk_time)
        sk_metrics = {
            'silhouette': silhouette_score(X, sk_kmeans.labels_),
            'calinski_harabasz': calinski_harabasz_score(X, sk_kmeans.labels_),
            'davies_bouldin': davies_bouldin_score(X, sk_kmeans.labels_),
            'inertia': sk_kmeans.inertia_
        }
        if y_true is not None:
            sk_metrics['adjusted_rand'] = adjusted_rand_score(y_true, sk_kmeans.labels_)
            sk_metrics['adjusted_mutual_info'] = adjusted_mutual_info_score(y_true, sk_kmeans.labels_)
        results['metrics']['sklearn'].append(sk_metrics)
        results['iterations']['sklearn'].append(sk_kmeans.n_iter_)

    # Print aggregated benchmark results
    print("\nBenchmark Results:")
    print("==================")
    print(f"Average execution time (BottomUpKMeans): {np.mean(results['times']['bottomup']):.3f} seconds")
    print(f"Average execution time (OptimizedKMeans): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (SklearnKMeans): {np.mean(results['times']['sklearn']):.3f} seconds")

    print("\nClustering Quality Metrics (averages):")
    for method in ['bottomup', 'optimized', 'sklearn']:
        m = results['metrics'][method]
        avg_silhouette = np.mean([mm['silhouette'] for mm in m])
        avg_ch = np.mean([mm['calinski_harabasz'] for mm in m])
        avg_db = np.mean([mm['davies_bouldin'] for mm in m])
        avg_inertia = np.mean([mm['inertia'] for mm in m])
        print(f"\n{method.upper()}:")
        print(f"  Silhouette Score: {avg_silhouette:.3f}")
        print(f"  Calinski-Harabasz: {avg_ch:.3f}")
        print(f"  Davies-Bouldin: {avg_db:.3f}")
        print(f"  Inertia: {avg_inertia:.3f}")
        if y_true is not None:
            avg_adj_rand = np.mean([mm['adjusted_rand'] for mm in m])
            avg_adj_mutual = np.mean([mm['adjusted_mutual_info'] for mm in m])
            print(f"  Adjusted Rand: {avg_adj_rand:.3f}")
            print(f"  Adjusted Mutual Info: {avg_adj_mutual:.3f}")

    print("\nAverage Iterations:")
    for method in ['bottomup', 'optimized', 'sklearn']:
        avg_iter = np.mean(results['iterations'][method])
        print(f"  {method.upper()}: {avg_iter:.1f} iterations")

def print_results(results: Dict, y_true_available: bool = False) -> None:
    """Print formatted benchmark results."""
    print("\nBenchmark Results:")
    print("=================")
    print(f"Average execution time (Optimized): {np.mean(results['times']['optimized']):.3f} seconds")
    print(f"Average execution time (Sklearn): {np.mean(results['times']['sklearn']):.3f} seconds")
    speedup = np.mean(results['times']['sklearn']) / np.mean(results['times']['optimized'])
    print(f"Speedup factor: {speedup:.2f}x")

    print("\nInternal Clustering Quality Metrics:")
    print(f"Average Silhouette Score (Optimized): {np.mean([m['silhouette'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Silhouette Score (Sklearn): {np.mean([m['silhouette'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Calinski-Harabasz (Optimized): {np.mean([m['calinski_harabasz'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Calinski-Harabasz (Sklearn): {np.mean([m['calinski_harabasz'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Davies-Bouldin (Optimized): {np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Davies-Bouldin (Sklearn): {np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']]):.3f}")

    print(f"Average Inertia (Optimized): {np.mean([m['inertia'] for m in results['metrics']['optimized']]):.3f}")
    print(f"Average Inertia (Sklearn): {np.mean([m['inertia'] for m in results['metrics']['sklearn']]):.3f}")

    if y_true_available:
        print("\nExternal Clustering Quality Metrics (against ground truth):")
        print(f"Average Adjusted Rand (Optimized): {np.mean([m['adjusted_rand'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Rand (Sklearn): {np.mean([m['adjusted_rand'] for m in results['metrics']['sklearn']]):.3f}")

        print(f"Average Adjusted Mutual Info (Optimized): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['optimized']]):.3f}")
        print(f"Average Adjusted Mutual Info (Sklearn): {np.mean([m['adjusted_mutual_info'] for m in results['metrics']['sklearn']]):.3f}")

    print("\nConvergence:")
    print(f"Average iterations (Optimized): {np.mean(results['iterations']['optimized']):.1f}")
    print(f"Average iterations (Sklearn): {np.mean(results['iterations']['sklearn']):.1f}")

def visualize_results(X: np.ndarray, results: Dict, optim_k_results: Dict = None,
                      preprocessing_info: Dict = None, y_true: Optional[np.ndarray] = None) -> None:
    """Create visualizations for clustering results."""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.style.use('seaborn-v0_8-darkgrid')
        fig = plt.figure(figsize=(20, 15))

        # 1. Execution Time
        plt.subplot(3, 2, 1)
        times_data = {
            'Optimized': np.mean(results['times']['optimized']),
            'Sklearn': np.mean(results['times']['sklearn'])
        }
        ax = sns.barplot(x=list(times_data.keys()), y=list(times_data.values()))
        for i, v in enumerate(list(times_data.values())):
            ax.text(i, v * 1.01, f"{v:.3f}s", ha='center')
        plt.title('Average Execution Time (seconds)', fontsize=14)
        plt.ylabel('Time (seconds)')

        # 2. Iterations
        plt.subplot(3, 2, 2)
        iterations_data = {
            'Optimized': np.mean(results['iterations']['optimized']),
            'Sklearn': np.mean(results['iterations']['sklearn'])
        }
        ax = sns.barplot(x=list(iterations_data.keys()), y=list(iterations_data.values()))
        for i, v in enumerate(list(iterations_data.values())):
            ax.text(i, v * 1.01, f"{v:.1f}", ha='center')
        plt.title('Average Number of Iterations', fontsize=14)
        plt.ylabel('Iterations')

        # 3. Silhouette Score
        plt.subplot(3, 2, 3)
        metrics_opt = np.mean([m['silhouette'] for m in results['metrics']['optimized']])
        metrics_sk = np.mean([m['silhouette'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[metrics_opt, metrics_sk])
        for i, v in enumerate([metrics_opt, metrics_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Silhouette Score (higher is better)', fontsize=14)
        plt.ylabel('Score')

        # 4. Davies-Bouldin Score
        plt.subplot(3, 2, 4)
        db_opt = np.mean([m['davies_bouldin'] for m in results['metrics']['optimized']])
        db_sk = np.mean([m['davies_bouldin'] for m in results['metrics']['sklearn']])
        ax = sns.barplot(x=['Optimized', 'Sklearn'], y=[db_opt, db_sk])
        for i, v in enumerate([db_opt, db_sk]):
            ax.text(i, v * 1.01, f"{v:.3f}", ha='center')
        plt.title('Average Davies-Bouldin Score (lower is better)', fontsize=14)
        plt.ylabel('Score')

        # 5. Elbow Method (if provided)
        if optim_k_results:
            plt.subplot(3, 2, 5)
            k_values = optim_k_results['k']
            plt.plot(k_values, optim_k_results['inertia'], 'o-', label='Inertia')
            plt.title('Elbow Method for Optimal k', fontsize=14)
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.xticks(k_values)
            plt.grid(True)
            ax2 = plt.twinx()
            sil_scores = np.array(optim_k_results['silhouette'])
            norm_sil = (sil_scores - sil_scores.min()) / (sil_scores.max() - sil_scores.min())
            ax2.plot(k_values, norm_sil, 'x-', color='red', label='Silhouette (normalized)')
            ax2.set_ylabel('Normalized Silhouette Score')
            lines1, labels1 = plt.gca().get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

        # 6. PCA Visualization of Clusters
        if X.shape[1] >= 2:
            plt.subplot(3, 2, 6)
            opt_kmeans = OptimizedKMeans(n_clusters=len(np.unique(y_true)) if y_true is not None else 3,
                                         random_state=42)
            opt_labels = opt_kmeans.fit_predict(X)
            scatter = plt.scatter(X[:, 0], X[:, 1], c=opt_labels, cmap='viridis', alpha=0.6)
            plt.scatter(opt_kmeans.centroids_[:, 0], opt_kmeans.centroids_[:, 1],
                        marker='X', s=200, c='red', label='Centroids')
            plt.colorbar(scatter)
            plt.title('Cluster Visualization (First 2 PCA Components)', fontsize=14)
            plt.xlabel('PCA 1')
            plt.ylabel('PCA 2')
            plt.legend()

        plt.tight_layout()
        plt.savefig('kmeans_benchmark_results.png', dpi=300)
        plt.show()

    except ImportError:
        print("Matplotlib or seaborn not available for visualization. Install with: pip install matplotlib seaborn")

def compare_initialization_methods(X: np.ndarray, n_clusters: int, n_runs: int = 5) -> Dict:
    """Compare random vs k-means++ initialization."""
    init_methods = ['random', 'k-means++']
    results = {method: {'time': [], 'inertia': [], 'iterations': []} for method in init_methods}

    for method in init_methods:
        for run in range(n_runs):
            start_time = time.time()
            opt_kmeans = OptimizedKMeans(n_clusters=n_clusters, random_state=42+run, init=method)
            opt_kmeans.fit(X)
            end_time = time.time()
            results[method]['time'].append(end_time - start_time)
            results[method]['inertia'].append(opt_kmeans.inertia_)
            results[method]['iterations'].append(opt_kmeans.n_iter_)

    print("\nInitialization Method Comparison:")
    print("================================")
    for method in init_methods:
        print(f"\n{method.upper()}:")
        print(f"Average Time: {np.mean(results[method]['time']):.3f} seconds")
        print(f"Average Inertia: {np.mean(results[method]['inertia']):.3f}")
        print(f"Average Iterations: {np.mean(results[method]['iterations']):.1f}")

    return results

def run_pendigits_evaluation():
    """Evaluate clustering performance on the Pendigits dataset."""
    print("\n" + "="*80)
    print("PENDIGITS DATA EVALUATION")
    print("="*80)

    X_pend, y_pend = load_pendigits_data()
    print(f"Pendigits data shape: {X_pend.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_pend))}")

    scaler = StandardScaler()
    X_pend_scaled = scaler.fit_transform(X_pend)
    pca = PCA(n_components=0.95)
    X_pend_pca = pca.fit_transform(X_pend_scaled)
    print(f"Preprocessed Pendigits data shape: {X_pend_pca.shape}")

    preprocessing_info_pend = {
        'original_shape': X_pend.shape,
        'cleaned_shape': X_pend.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_pend))
    print(f"Running benchmark on Pendigits data with {n_clusters} clusters...")
    pendigits_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

def run_mnist_evaluation():
    """Evaluate clustering performance on the MNIST dataset."""
    print("\n" + "="*80)
    print("MNIST DATA EVALUATION")
    print("="*80)

    X_mnist, y_mnist = load_mnist_data()
    print(f"MNIST data shape: {X_mnist.shape}")
    print(f"Unique labels (digits): {len(np.unique(y_mnist))}")

    scaler = StandardScaler()
    X_mnist_scaled = scaler.fit_transform(X_mnist)
    pca = PCA(n_components=0.95)
    X_mnist_pca = pca.fit_transform(X_mnist_scaled)
    print(f"Preprocessed MNIST data shape: {X_mnist_pca.shape}")

    preprocessing_info_mnist = {
        'original_shape': X_mnist.shape,
        'cleaned_shape': X_mnist.shape,
        'pca_components': pca.n_components_,
        'explained_variance_ratio': pca.explained_variance_ratio_
    }

    n_clusters = len(np.unique(y_mnist))
    print(f"Running benchmark on MNIST data with {n_clusters} clusters...")
    mnist_results = run_bench_evaluation()  # Or call run_benchmark-like function if desired

    print_results(mnist_results, y_true_available=True)
    visualize_results(X_mnist_pca, mnist_results, optim_k_results=None,
                     preprocessing_info=preprocessing_info_mnist, y_true=y_mnist)

def run_full_evaluation():
    """Run complete evaluation on synthetic, wine, and pendigits datasets."""
    # 1. Synthetic Data Test
    print("\n" + "="*80)
    print("SYNTHETIC DATA EVALUATION")
    print("="*80)

    X_synth, y_synth = load_synthetic_data(n_samples=100000, n_features=10, n_clusters=20)
    print(f"Synthetic data shape: {X_synth.shape}")
    print(f"Known clusters: 3")

    preprocessing_info_synth = {"original_shape": X_synth.shape, "cleaned_shape": X_synth.shape}

    print("\nRunning benchmark on synthetic data...")
    synth_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(synth_results, y_true_available=True)
    visualize_results(X_synth, synth_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_synth, y_true=y_synth)

    # 2. Wine Data Test
    print("\n" + "="*80)
    print("WINE DATA EVALUATION")
    print("="*80)

    wine_df = load_wine_data()
    print(f"Wine data shape: {wine_df.shape}")

    X_wine, preprocessing_info_wine, y_wine = preprocess_data(wine_df)
    print(f"Preprocessed wine data shape: {X_wine.shape}")

    print("\nRunning benchmark on wine data...")
    wine_results = run_bench_evaluation()  # Or call run_benchmark if preferred
    print_results(wine_results, y_true_available=True)
    visualize_results(X_wine, wine_results, optim_k_results=None,
                      preprocessing_info=preprocessing_info_wine, y_true=y_wine)

    # 3. Compare Initialization Methods
    print("\n" + "="*80)
    print("COMPARING INITIALIZATION METHODS")
    print("="*80)

    print("\nComparing initialization methods on synthetic data...")
    init_comparison_synth = compare_initialization_methods(X_synth, n_clusters=3, n_runs=5)

    print("\nComparing initialization methods on wine data...")
    init_comparison_wine = compare_initialization_methods(X_wine, n_clusters=2, n_runs=5)

    # 4. Pendigits and MNIST Data Test
    run_pendigits_evaluation()
    run_mnist_evaluation()
    print("\nComplete evaluation finished.")

# =============================================================================
# Main Entry Point
# =============================================================================
if __name__ == "__main__":
    # To run the benchmark evaluation that includes BottomUpKMeans,
    # simply call run_bench_evaluation(). You can also run the full evaluation.
    run_bench_evaluation()
    # Alternatively, uncomment the following line to run all evaluations:
    #run_full_evaluation()
    #scalene_profiler.stop()




BENCHMARK EVALUATION: BottomUpKMeans vs OptimizedKMeans vs SklearnKMeans

Run 1/3

Run 2/3
