# Icecat Taxonomy Clustering Analysis

This notebook orchestrates the clustering analysis using the modular `src` package.
It performs:
1.  Data Loading (from JSON)
2.  Feature Engineering (Text Embeddings)
3.  Clustering (KMeans, HDBSCAN, DBSCAN, Agglomerative)
4.  Evaluation (Metrics)
5.  Visualization (Interactive 2D Plots)

In [None]:
# 1. Setup & Imports
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from src import config, data_loader, features, clustering, evaluation, visualization
Developer: Reload Window
print(f"Using Data Path: {config.DATA_PATH}")

In [None]:
# 2. Load Data
# A missing dataset is reported instead of raised; the cells below check for
# `df` before doing any work, so they are skipped when loading fails.
try:
    df = data_loader.load_icecat_data()
except FileNotFoundError as missing:
    print(missing)
    print("Please ensure the dataset path in `src/config.py` is correct.")
else:
    # Only reached when the loader returned without raising.
    print(f"Loaded {len(df)} records.")

In [None]:
# 3. Feature Engineering
# Guarded on `df` existing so this cell is a no-op when the load cell failed.
if 'df' in locals():
    # `df` is rebound to whatever create_text_features returns, so re-running
    # this cell feeds the already-processed frame back in.
    # NOTE(review): confirm create_text_features is safe to apply twice.
    df = features.create_text_features(df)
    # `embeddings` is what the clustering and visualization cells consume.
    embeddings = features.generate_embeddings(df)
    print(f"Embeddings shape: {embeddings.shape}")

In [None]:
# 4. Clustering Experiments
# Each algorithm is run once on `embeddings`, scored with
# evaluation.compute_metrics, and registered under its name in `results`
# (metrics) and `labels_dict` (cluster labels) for the cells that follow.

# Tunable parameters, kept together so experiments change one place only.
N_CLUSTERS = 50                  # cluster count for KMeans and Agglomerative
DBSCAN_EPS = 0.5                 # `eps` passed to clustering.run_dbscan
DBSCAN_MIN_SAMPLES = 5           # `min_samples` passed to clustering.run_dbscan
AGGLOMERATIVE_MAX_ROWS = 20000   # Agglomerative is skipped above this row count

# Reset on every run so stale entries from a previous execution never linger.
results = {}
labels_dict = {}


def _record_run(name, labels, X, y_true=None):
    """Score one clustering run and register it under `name`.

    Args:
        name: Key used in `results` / `labels_dict` and in the printed line.
        labels: Cluster labels produced by one of the `clustering.run_*` calls.
        X: The embeddings the labels were computed on.
        y_true: Ground-truth labels, or None when the label column is absent.

    Returns:
        Whatever `evaluation.compute_metrics` returned for this run.
    """
    metrics = evaluation.compute_metrics(X, labels, y_true)
    results[name] = metrics
    labels_dict[name] = labels
    print(f"{name} Metrics: {metrics}")
    return metrics


if 'embeddings' in locals():
    # Prepare Ground Truth if available
    y_true = df[config.LABEL_COL] if config.LABEL_COL in df.columns else None

    # --- K-Means ---
    labels_km = clustering.run_kmeans(embeddings, n_clusters=N_CLUSTERS)
    metrics_km = _record_run('KMeans', labels_km, embeddings, y_true)

    # --- HDBSCAN ---
    labels_hdb = clustering.run_hdbscan(embeddings)
    metrics_hdb = _record_run('HDBSCAN', labels_hdb, embeddings, y_true)

    # --- DBSCAN ---
    labels_db = clustering.run_dbscan(
        embeddings, eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES
    )
    metrics_db = _record_run('DBSCAN', labels_db, embeddings, y_true)

    # --- Agglomerative ---
    # (Can be slow for >20k points)
    if len(df) <= AGGLOMERATIVE_MAX_ROWS:
        labels_agg = clustering.run_agglomerative(embeddings, n_clusters=N_CLUSTERS)
        metrics_agg = _record_run('Agglomerative', labels_agg, embeddings, y_true)
    else:
        print("Skipping Agglomerative Clustering (dataset too large for O(N^2) memory).")

In [None]:
# 5. Comparison Table
# `results` is keyed by algorithm name, so after transposing each algorithm is
# a row (assuming compute_metrics returns a dict of metric name -> value).
if results:
    df_results = pd.DataFrame(results).transpose()
    # NOTE(review): highlight_max shades the largest value in every column;
    # confirm that "higher is better" holds for each metric being reported.
    styled_results = df_results.style.highlight_max(axis=0, color='lightgreen')
    display(styled_results)

In [None]:
# 6. Visual Analysis (t-SNE / UMAP)
# Projects the embeddings to 2D and plots them coloured by the labels of one
# clustering model. Large datasets are subsampled before the projection.

# Choose model to visualize (e.g., HDBSCAN or KMeans)
MODEL_TO_PLOT = 'HDBSCAN'
MAX_VIZ_POINTS = 10000  # cap on the number of points sent to the 2D projection
VIZ_SEED = 42           # fixes the subsample so re-runs plot the same points

if 'embeddings' in locals() and MODEL_TO_PLOT in labels_dict:
    # Reduce to 2D (Sample if huge)
    if len(embeddings) > MAX_VIZ_POINTS:
        # A dedicated seeded generator keeps the sample reproducible without
        # touching NumPy's global random state.
        rng = np.random.default_rng(VIZ_SEED)
        idx = rng.choice(len(embeddings), MAX_VIZ_POINTS, replace=False)
        # The same positions are taken from embeddings, predicted labels and
        # ground truth so the three stay aligned row-for-row.
        emb_viz = embeddings[idx]
        lab_viz = labels_dict[MODEL_TO_PLOT][idx]
        true_viz = y_true.iloc[idx] if y_true is not None else None
    else:
        emb_viz = embeddings
        lab_viz = labels_dict[MODEL_TO_PLOT]
        true_viz = y_true

    # 1. Compute 2D projection
    embeddings_2d = visualization.reduce_dimensions(emb_viz, method='umap') # Try 'umap' or 'tsne' or 'pca'

    # 2. Plot
    visualization.plot_clusters_2d(
        embeddings_2d,
        lab_viz,
        title=f"Cluster Visualization ({MODEL_TO_PLOT})",
        true_labels=true_viz,
        interactive=True
    )