In [None]:
import pandas as pd
import numpy as np
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import os
import argparse
from pathlib import Path
import yaml
import logging
from typing import Dict, List, Set, Tuple, Optional, Any

# Logging configuration: each record shows timestamp, logger name,
# severity and message; INFO and above are emitted.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger shared by the functions below
logger = logging.getLogger('hdbscan_clustering')

def get_container_aisles(container_id: str, container_data: pd.DataFrame,
                         sku_aisle_mapping: Dict[str, List[int]]) -> Set[int]:
    """
    Choose the aisles to visit for one container with a greedy heuristic.

    SKUs stocked in exactly one aisle fix that aisle. Every remaining
    (multi-location) SKU is then handled in order of first appearance in
    `container_data`: it reuses an aisle that is already selected when it can,
    otherwise it takes the candidate aisle whose number is closest to any
    aisle selected so far (ties go to the first listed candidate; when nothing
    is selected yet the first listed candidate is used). Because the choice is
    greedy, the result is not guaranteed to be the smallest possible aisle set.

    Parameters:
    -----------
    container_id : str
        Value matched against the 'container_id' column
    container_data : pd.DataFrame
        Rows with 'container_id' and 'item_number' columns
    sku_aisle_mapping : Dict[str, List[int]]
        Candidate aisles per SKU. SKUs that are missing from the mapping, or
        mapped to an empty list, are ignored.

    Returns:
    --------
    Set[int]
        Selected aisles (empty when no SKU of the container has aisle data)
    """
    # Distinct SKUs of this container, in order of first appearance
    in_container = container_data['container_id'] == container_id
    container_skus = container_data.loc[in_container, 'item_number'].unique()

    selected_aisles: Set[int] = set()
    multi_location_skus = []

    # Pass 1: single-location SKUs leave no choice, so their aisle is fixed
    for sku in container_skus:
        candidates = sku_aisle_mapping.get(sku)
        if candidates is None or len(candidates) == 0:
            # No location known for this SKU; indexing an empty list would raise
            continue
        if len(candidates) == 1:
            selected_aisles.add(candidates[0])
        else:
            multi_location_skus.append(sku)

    # Pass 2: place multi-location SKUs while adding as few new aisles as the
    # greedy rule allows
    for sku in multi_location_skus:
        candidates = sku_aisle_mapping[sku]

        if any(aisle in selected_aisles for aisle in candidates):
            # One of its aisles is already on the route; nothing to add
            continue

        if not selected_aisles:
            best_aisle = candidates[0]
        else:
            # min() keeps the first candidate on ties, matching a strict '<' scan
            best_aisle = min(
                candidates,
                key=lambda aisle: min(abs(aisle - chosen) for chosen in selected_aisles),
            )

        selected_aisles.add(best_aisle)

    return selected_aisles

def compute_container_features(container_id: str, container_data: pd.DataFrame, 
                              sku_aisle_mapping: Dict[str, List[int]]) -> Tuple[float, float, int]:
    """
    Summarise one container as (aisle_centroid, aisle_span, distinct_aisles).

    The aisle set comes from get_container_aisles. The centroid is the mean
    of those aisle numbers, the span is highest minus lowest aisle (0 for a
    single aisle) and distinct_aisles is the size of the set. A container
    without any aisle yields (0, 0, 0).
    """
    visited = get_container_aisles(container_id, container_data, sku_aisle_mapping)

    aisle_count = len(visited)
    if aisle_count == 0:
        # No aisle information for this container
        return 0, 0, 0

    mean_aisle = sum(visited) / aisle_count
    spread = max(visited) - min(visited) if aisle_count > 1 else 0

    return mean_aisle, spread, aisle_count

def preprocess_data(container_data: pd.DataFrame, slotbook_data: pd.DataFrame,
                   use_distinct_aisles: bool = True,
                   centroid_weight: float = 0.5,
                   secondary_weight: float = 0.5) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """
    Preprocess container data to create feature matrix for clustering.

    Parameters:
    -----------
    container_data : pd.DataFrame
        DataFrame containing container data with container_id and item_number
    slotbook_data : pd.DataFrame
        DataFrame containing slotbook data with item_number and aisle_sequence
    use_distinct_aisles : bool
        Whether to use distinct aisles count (True) or aisle span (False) as second feature
    centroid_weight : float
        Weight for the centroid feature
    secondary_weight : float
        Weight for the secondary feature (distinct aisles or span)

    Returns:
    --------
    valid_containers : List[str]
        List of container IDs that were successfully processed
    feature_matrix : np.ndarray
        Matrix of weighted, then standardized features, one row per valid container
    raw_feature_matrix : np.ndarray
        Matrix of raw (unweighted, unnormalized) features for each container

    When no container has aisle data the result is an empty list and two
    arrays of shape (0, 2), so callers can always unpack three values.
    """
    logger.info("Preprocessing container data for clustering")

    # Build SKU-to-aisle mapping (each SKU -> ascending list of its aisles)
    sku_aisle_mapping: Dict[str, List[int]] = {}
    for sku, aisle in zip(slotbook_data['item_number'], slotbook_data['aisle_sequence']):
        sku_aisle_mapping.setdefault(sku, []).append(aisle)
    for sku_aisles in sku_aisle_mapping.values():
        sku_aisles.sort()

    # Compute the raw (unweighted) feature pair for each container
    raw_rows = []
    valid_containers = []

    for container_id in container_data['container_id'].unique().tolist():
        centroid, span, distinct_aisles = compute_container_features(
            container_id, container_data, sku_aisle_mapping
        )

        # Skip containers with no aisle data. The aisle count is the reliable
        # signal: a centroid/span of 0 is also what a container located only
        # in aisle 0 would produce.
        if distinct_aisles == 0:
            continue

        # Choose secondary feature based on flag
        secondary_feature = distinct_aisles if use_distinct_aisles else span

        raw_rows.append([centroid, secondary_feature])
        valid_containers.append(container_id)

    if not valid_containers:
        logger.warning("No valid containers found with aisle data")
        return [], np.empty((0, 2), dtype=float), np.empty((0, 2), dtype=float)

    # Keep the raw values as computed instead of reconstructing them by
    # dividing the weighted values, which breaks for a weight of 0
    raw_feature_matrix = np.array(raw_rows, dtype=float)
    weighted_feature_matrix = raw_feature_matrix * np.array(
        [centroid_weight, secondary_weight], dtype=float
    )

    # Log feature statistics
    logger.info(f"Processed {len(valid_containers)} valid containers")
    logger.info(f"Feature 1 (Centroid) - Mean: {np.mean(raw_feature_matrix[:, 0]):.2f}, "
               f"Min: {np.min(raw_feature_matrix[:, 0]):.2f}, "
               f"Max: {np.max(raw_feature_matrix[:, 0]):.2f}")
    logger.info(f"Feature 2 ({('Distinct Aisles' if use_distinct_aisles else 'Aisle Span')}) - "
               f"Mean: {np.mean(raw_feature_matrix[:, 1]):.2f}, "
               f"Min: {np.min(raw_feature_matrix[:, 1]):.2f}, "
               f"Max: {np.max(raw_feature_matrix[:, 1]):.2f}")

    # Normalize weighted features.
    # NOTE(review): StandardScaler rescales every column to zero mean and unit
    # variance, so a positive per-column weight applied beforehand cancels out
    # and does not change the normalized values. If the weights are meant to
    # influence the clustering they would have to be applied after scaling;
    # the order is left unchanged here to keep existing results stable.
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(weighted_feature_matrix)

    return valid_containers, normalized_features, raw_feature_matrix

def run_hdbscan_clustering(feature_matrix: np.ndarray,
                          min_cluster_size: int = 5,
                          min_samples: Optional[int] = None,
                          cluster_selection_epsilon: float = 0.0,
                          alpha: float = 1.0,
                          cluster_selection_method: str = 'eom') -> Tuple[np.ndarray, "hdbscan.HDBSCAN"]:
    """
    Run HDBSCAN clustering on the feature matrix.

    Parameters:
    -----------
    feature_matrix : np.ndarray
        Matrix of normalized features for each container
    min_cluster_size : int
        Minimum number of containers in a cluster
    min_samples : int, optional
        Number of samples in a neighborhood for a point to be a core point.
        When None, min_cluster_size is used.
    cluster_selection_epsilon : float
        Epsilon to use when determining cluster membership
    alpha : float
        Alpha to use in the RobustSingleLinkage
    cluster_selection_method : str
        Method used to select clusters ('eom' or 'leaf')

    Returns:
    --------
    cluster_labels : np.ndarray
        Cluster labels for each container (-1 for noise)
    clusterer : hdbscan.HDBSCAN
        The fitted clusterer (built with gen_min_span_tree=True)
    """
    # Set min_samples to match min_cluster_size if not provided
    if min_samples is None:
        min_samples = min_cluster_size

    logger.info(f"Running HDBSCAN with min_cluster_size={min_cluster_size}, "
               f"min_samples={min_samples}, "
               f"cluster_selection_epsilon={cluster_selection_epsilon}, "
               f"alpha={alpha}, cluster_selection_method={cluster_selection_method}")

    # Initialize and run HDBSCAN
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        alpha=alpha,
        cluster_selection_method=cluster_selection_method,
        gen_min_span_tree=True
    )

    # Fit the model
    cluster_labels = clusterer.fit_predict(feature_matrix)

    # Count clusters; the noise label (-1) is not a cluster
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    noise_points = np.sum(cluster_labels == -1)

    logger.info(f"HDBSCAN found {num_clusters} clusters and {noise_points} noise points")

    return cluster_labels, clusterer

def _scatter_clusters(points: np.ndarray, cluster_labels: np.ndarray,
                      num_clusters: int, cmap) -> None:
    """Scatter noise points (gray crosses) and every cluster of `points` on the current figure."""
    # Plot noise points first (gray)
    noise_mask = (cluster_labels == -1)
    plt.scatter(
        points[noise_mask, 0],
        points[noise_mask, 1],
        c='gray', marker='x', alpha=0.5, label='Noise'
    )

    # Then plot each cluster with a different color
    for i in range(num_clusters):
        mask = (cluster_labels == i)
        plt.scatter(
            points[mask, 0],
            points[mask, 1],
            c=[cmap(i)], marker='o', alpha=0.7, label=f'Cluster {i}'
        )


def generate_visualizations(feature_matrix: np.ndarray, 
                           cluster_labels: np.ndarray,
                           valid_containers: List[str],
                           clusterer: hdbscan.HDBSCAN,
                           output_dir: str,
                           use_distinct_aisles: bool = True,
                           raw_feature_matrix: np.ndarray = None) -> None:
    """
    Generate visualizations for the clustering results.

    Writes PNG files into `output_dir` (created if missing):
    hdbscan_clusters_normalized.png, hdbscan_clusters_raw.png (only when
    raw_feature_matrix is given), cluster_sizes.png (only when at least one
    cluster exists), hdbscan_tree.png and hdbscan_mst.png (only when the
    clusterer exposes minimum_spanning_tree_).

    Parameters:
    -----------
    feature_matrix : np.ndarray
        Matrix of normalized features for each container
    cluster_labels : np.ndarray
        Cluster labels for each container
    valid_containers : List[str]
        List of container IDs that were successfully processed
        (currently not used by this function)
    clusterer : hdbscan.HDBSCAN
        HDBSCAN clusterer object
    output_dir : str
        Directory to save visualizations
    use_distinct_aisles : bool
        Whether distinct aisles (True) or aisle span (False) was used as second feature
    raw_feature_matrix : np.ndarray, optional
        Matrix of raw (unweighted, unnormalized) features for visualization
    """
    os.makedirs(output_dir, exist_ok=True)

    # Create a colormap for the clusters.
    # plt.get_cmap replaces the deprecated plt.cm.get_cmap accessor.
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    cmap = plt.get_cmap('tab20', num_clusters + 1)  # +1 for noise points

    # 1. Scatter plot of clusters with normalized features
    plt.figure(figsize=(12, 8))
    _scatter_clusters(feature_matrix, cluster_labels, num_clusters, cmap)
    plt.title('HDBSCAN Clustering Results (Normalized Features)')
    plt.xlabel('Aisle Centroid (normalized)')
    plt.ylabel('Distinct Aisles (normalized)' if use_distinct_aisles else 'Aisle Span (normalized)')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'hdbscan_clusters_normalized.png'), dpi=300)
    plt.close()

    # 2. Scatter plot with raw (unweighted, unnormalized) features
    if raw_feature_matrix is not None:
        plt.figure(figsize=(12, 8))
        _scatter_clusters(raw_feature_matrix, cluster_labels, num_clusters, cmap)
        plt.title('HDBSCAN Clustering Results (Raw Features)')
        plt.xlabel('Aisle Centroid (raw)')
        plt.ylabel('Distinct Aisles (raw)' if use_distinct_aisles else 'Aisle Span (raw)')
        # No legend is drawn on the raw plot
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'hdbscan_clusters_raw.png'), dpi=300)
        plt.close()

    # 3. Cluster size distribution
    cluster_sizes = pd.Series(cluster_labels).value_counts().sort_index()
    if -1 in cluster_sizes.index:
        cluster_sizes = cluster_sizes.drop(-1)  # Remove noise points

    if cluster_sizes.empty:
        # Every point is noise: there is nothing to draw and no mean to show
        logger.warning("No clusters found; skipping cluster size distribution plot")
    else:
        plt.figure(figsize=(10, 6))
        cluster_sizes.plot(kind='bar', color='skyblue')
        plt.axhline(y=np.mean(cluster_sizes), color='r', linestyle='--', label='Average Size')
        plt.xlabel('Cluster ID')
        plt.ylabel('Number of Containers')
        plt.title('Cluster Size Distribution')
        plt.grid(True, axis='y', alpha=0.3)
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'cluster_sizes.png'), dpi=300)
        plt.close()

    # 4. HDBSCAN condensed tree
    plt.figure(figsize=(14, 10))
    clusterer.condensed_tree_.plot(select_clusters=True)
    plt.title('HDBSCAN Condensed Tree')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'hdbscan_tree.png'), dpi=300)
    plt.close()

    # 5. HDBSCAN minimum spanning tree (only when the clusterer provides one)
    if hasattr(clusterer, 'minimum_spanning_tree_'):
        plt.figure(figsize=(14, 10))
        clusterer.minimum_spanning_tree_.plot()
        plt.title('HDBSCAN Minimum Spanning Tree')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'hdbscan_mst.png'), dpi=300)
        plt.close()

    logger.info(f"Generated visualizations in {output_dir}")

def create_output_dataframe(valid_containers: List[str], 
                           cluster_labels: np.ndarray,
                           container_data: pd.DataFrame,
                           output_dir: str) -> pd.DataFrame:
    """
    Create a dataframe with container IDs and their cluster assignments.

    The dataframe is also written to 'container_clusters.csv' inside
    `output_dir`, which is created when it does not exist yet.

    Parameters:
    -----------
    valid_containers : List[str]
        List of container IDs that were successfully processed
    cluster_labels : np.ndarray
        Cluster labels for each container
    container_data : pd.DataFrame
        Original container data (currently not used by this function)
    output_dir : str
        Directory to save output file

    Returns:
    --------
    cluster_df : pd.DataFrame
        DataFrame with columns 'container_id', 'cluster' and 'cluster_name'
    """
    # Create dataframe with container IDs and cluster assignments
    cluster_df = pd.DataFrame({
        'container_id': valid_containers,
        'cluster': cluster_labels
    })

    # Map -1 to "noise", every other label to "cluster_<label>"
    cluster_df['cluster_name'] = cluster_df['cluster'].apply(
        lambda x: f"cluster_{x}" if x >= 0 else "noise"
    )

    # Save to CSV; make sure the target directory exists first so the write
    # does not depend on another step having created it
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'container_clusters.csv')
    cluster_df.to_csv(output_path, index=False)
    logger.info(f"Saved container cluster assignments to {output_path}")

    # Log cluster statistics
    cluster_counts = cluster_df['cluster_name'].value_counts()
    logger.info("Cluster statistics:")
    for cluster, count in cluster_counts.items():
        logger.info(f"  {cluster}: {count} containers")

    return cluster_df


In [6]:
def load_config(config_path: str) -> Dict[str, Any]:
    """
    Load the YAML configuration file at `config_path`.

    Parameters:
    -----------
    config_path : str
        Path to the YAML file

    Returns:
    --------
    config : Dict[str, Any]
        Parsed configuration; an empty dict when the file contains no document
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding
    with open(config_path, 'r', encoding='utf-8') as config_file:
        config = yaml.safe_load(config_file)

    # safe_load yields None for an empty file; keep the declared return type
    return config if config is not None else {}


In [7]:
# Load and validate configuration
config = load_config('root/config/hdbscan_config.yaml')

# Print configuration
logger.info("Configuration loaded ")

# Create output directory
output_dir = config['output']['dir']
os.makedirs(output_dir, exist_ok=True)

# Load data.
# Paths are built with pathlib: the previous backslash literals contained
# invalid escape sequences (\i, \c, \s) and only resolved on Windows.
input_dir = Path('root') / 'input'

container_data = pd.read_csv(input_dir / 'container_data.csv')
# Currently selects every container, so the filter below keeps all rows
sampled_containers = container_data['container_id'].unique()
container_data = container_data[container_data['container_id'].isin(sampled_containers)]

slotbook_data = pd.read_csv(input_dir / 'slotbook_data.csv')

# Basic data validation
required_container_cols = ['container_id', 'item_number']
required_slotbook_cols = ['item_number', 'aisle_sequence']

for col in required_container_cols:
    if col not in container_data.columns:
        raise ValueError(f"Container data missing required column: {col}")

for col in required_slotbook_cols:
    if col not in slotbook_data.columns:
        raise ValueError(f"Slotbook data missing required column: {col}")

# Preprocess data
use_distinct_aisles = config['features']['use_distinct_aisles']
centroid_weight = config['features']['centroid_weight']
secondary_weight = config['features']['secondary_weight']

valid_containers, feature_matrix, raw_feature_matrix = preprocess_data(
    container_data, 
    slotbook_data,
    use_distinct_aisles=use_distinct_aisles,
    centroid_weight=centroid_weight,
    secondary_weight=secondary_weight
)

if len(valid_containers) == 0:
    # Nothing to cluster: stop here instead of running HDBSCAN on empty input
    logger.error("No valid containers found. Exiting.")
else:
    # Run HDBSCAN clustering
    hdbscan_config = config['hdbscan']
    cluster_labels, clusterer = run_hdbscan_clustering(
        feature_matrix,
        min_cluster_size=hdbscan_config['min_cluster_size'],
        min_samples=hdbscan_config['min_samples'],
        cluster_selection_epsilon=hdbscan_config['cluster_selection_epsilon'],
        alpha=hdbscan_config['alpha'],
        cluster_selection_method=hdbscan_config['cluster_selection_method']
    )

    # Generate visualizations
    if config['visualizations']['enabled']:
        generate_visualizations(
            feature_matrix,
            cluster_labels,
            valid_containers,
            clusterer,
            output_dir,
            use_distinct_aisles=use_distinct_aisles,
            raw_feature_matrix=raw_feature_matrix
        )

    # Create output dataframe
    cluster_df = create_output_dataframe(
        valid_containers,
        cluster_labels,
        container_data,
        output_dir
    )

    logger.info("HDBSCAN clustering completed successfully")


  container_data = pd.read_csv('root\input\container_data.csv')
  slotbook_data = pd.read_csv('root\input\slotbook_data.csv')
2025-03-10 09:45:47,905 - hdbscan_clustering - INFO - Configuration loaded 
2025-03-10 09:45:48,000 - hdbscan_clustering - INFO - Preprocessing container data for clustering
2025-03-10 09:45:51,036 - hdbscan_clustering - INFO - Processed 13530 valid containers
2025-03-10 09:45:51,037 - hdbscan_clustering - INFO - Feature 1 (Centroid) - Mean: 27.78, Min: 1.00, Max: 72.00
2025-03-10 09:45:51,038 - hdbscan_clustering - INFO - Feature 2 (Aisle Span) - Mean: 11.71, Min: 0.00, Max: 70.00
2025-03-10 09:45:51,042 - hdbscan_clustering - INFO - Running HDBSCAN with min_cluster_size=80, min_samples=2, cluster_selection_epsilon=0.01, alpha=1.0, cluster_selection_method=eom
2025-03-10 09:45:51,196 - hdbscan_clustering - INFO - HDBSCAN found 57 clusters and 1805 noise points
  cmap = plt.cm.get_cmap('tab20', num_clusters + 1)  # +1 for noise points
  line_width = edge_linewid