# Phase 3: Clustering Analysis

This notebook analyzes the HDBSCAN clustering results.

**What to look for:**
- Experiment comparison (if multiple experiments were run)
- Cluster size distribution
- Noise ratio and outliers
- Division purity (do clusters align with divisions?)
- Visual cluster separation

In [None]:
# Setup
import json
import sys
from collections import Counter
from pathlib import Path

import ipywidgets as widgets
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
from plotly.subplots import make_subplots

# Add project root to path (must run before the project-local imports below)
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from config import get_output_path
from src.archetypes.extraction import RequirementExtractor
from src.archetypes.feature_engineering import FeatureOutput
from src.archetypes.clustering import ClusteringResult

In [None]:
# Read the Phase 1 extraction output; later cells use it for metadata and skills.
extraction_path = get_output_path(
    "archetypes",
    "phase_1_extraction",
    "extracted_requirements.json",
)
extractions = RequirementExtractor.load_results(str(extraction_path))

# jd_id -> extraction object, for quick lookups in the drill-down cells
extraction_lookup = {e.jd_id: e for e in extractions}
print(f"‚úÖ Loaded {len(extractions)} extractions")

# One metadata row per successfully extracted JD
metadata_records = []
for e in extractions:
    if not e.extraction_success:
        continue
    # jd_id goes first; keys in e.metadata are layered on top of it
    record = {'jd_id': e.jd_id, **e.metadata}
    leading_skills = e.get_all_skills_flat()[:5]
    record['top_skills'] = ', '.join(leading_skills)
    metadata_records.append(record)

metadata_df = pd.DataFrame(metadata_records)
print(f"‚úÖ Built metadata for {len(metadata_df)} JDs")

In [None]:
# Load clustering results
cluster_dir = get_output_path("archetypes", "phase_3_clustering")

clustering_results = {}
# Defined up front so the name exists even when cluster_dir is missing
experiment_data = None

if cluster_dir.exists():
    # Check for experiment results
    exp_results_file = cluster_dir / "experiment_results.json"
    if exp_results_file.exists():
        with open(exp_results_file, encoding="utf-8") as f:
            experiment_data = json.load(f)
        print(f"‚úÖ Loaded {len(experiment_data)} experiment results")

    # Load individual clustering results.
    # iterdir() yields entries in arbitrary order; sorting keeps the dict order
    # (and therefore the "first" result picked by later cells) stable.
    for subdir in sorted(cluster_dir.iterdir()):
        if subdir.is_dir() and (subdir / "cluster_labels.npy").exists():
            loaded_result = ClusteringResult.load(str(subdir))
            clustering_results[subdir.name] = loaded_result
            print(f"‚úÖ Loaded clustering: {subdir.name} - {loaded_result.n_clusters} clusters")

if not clustering_results:
    print("‚ùå No clustering results found. Run: python run_archetype_pipeline.py --cluster")

In [None]:
# Load feature outputs (for UMAP visualization)
feature_dir = get_output_path("archetypes", "phase_2_features")
feature_outputs = {}

if feature_dir.exists():
    # iterdir() order is arbitrary; sort so the first feature set (used as the
    # default selection in the UMAP dropdown) is the same on every run.
    for subdir in sorted(feature_dir.iterdir()):
        if subdir.is_dir() and (subdir / "features.npy").exists():
            feature_outputs[subdir.name] = FeatureOutput.load(str(subdir))

print(f"‚úÖ Loaded {len(feature_outputs)} feature outputs")

## 1. Experiment Comparison

In [None]:
# Load experiment comparison if available
comparison_file = cluster_dir / "experiment_comparison.csv"

if comparison_file.exists():
    comparison_df = pd.read_csv(comparison_file)

    # Shade only the metric columns that the CSV actually contains, so a
    # missing column does not abort the cell with a KeyError.
    comparison_style = comparison_df.style
    for metric, cmap in (('silhouette_score', 'Greens'), ('noise_ratio', 'Reds_r')):
        if metric in comparison_df.columns:
            comparison_style = comparison_style.background_gradient(subset=[metric], cmap=cmap)
    display(comparison_style)
else:
    # Build comparison from loaded results
    comparison_data = []
    for name, cr in clustering_results.items():
        comparison_data.append({
            'experiment': name,
            'n_clusters': cr.n_clusters,
            'n_noise': cr.n_noise,
            'noise_ratio': cr.noise_ratio,
            'silhouette_score': cr.silhouette_score,
        })

    # Always define comparison_df (possibly empty, but with its columns) so the
    # following cell can test its length without raising NameError.
    comparison_df = pd.DataFrame(
        comparison_data,
        columns=['experiment', 'n_clusters', 'n_noise', 'noise_ratio', 'silhouette_score'],
    )
    if not comparison_df.empty:
        display(comparison_df)

In [None]:
# Silhouette vs Noise tradeoff
# Fall back to an empty frame when the comparison cell produced nothing, so
# this cell degrades to a no-op instead of raising NameError.
tradeoff_df = globals().get('comparison_df', pd.DataFrame())

# Rows lacking any plotted value cannot be placed or sized; drop them first.
plotted_columns = [
    col for col in ('noise_ratio', 'silhouette_score', 'n_clusters')
    if col in tradeoff_df.columns
]
if plotted_columns:
    tradeoff_df = tradeoff_df.dropna(subset=plotted_columns)

if len(tradeoff_df) > 1:
    # Prefer 'experiment', then 'experiment_id'; leave points uncolored if
    # neither column exists rather than passing a non-existent column name.
    color_column = next(
        (col for col in ('experiment', 'experiment_id') if col in tradeoff_df.columns),
        None,
    )

    fig = px.scatter(
        tradeoff_df,
        x='noise_ratio',
        y='silhouette_score',
        size='n_clusters',
        color=color_column,
        hover_data=tradeoff_df.columns.tolist(),
        title='Silhouette Score vs Noise Ratio (size = # clusters)'
    )
    fig.update_layout(height=500, width=700)
    fig.show()

    print("\nüí° Ideal: High silhouette (top) + Low noise (left)")

## 2. Cluster Size Distribution

In [None]:
# Select clustering result to analyze
if clustering_results:
    # Use 'default' if available, else first one
    default_name = 'default' if 'default' in clustering_results else next(iter(clustering_results))

    cluster_dropdown = widgets.Dropdown(
        options=list(clustering_results.keys()),
        value=default_name,
        description='Clustering:',
        style={'description_width': 'initial'}
    )

    # Cell-specific name on purpose: other cells bind a global called `output`,
    # and the callback resolves the name when it fires, so sharing that name
    # would send this chart into a different cell's output area.
    sizes_output = widgets.Output()

    def show_cluster_sizes(change):
        """Render the size bar chart and summary stats for one clustering result.

        Args:
            change: ipywidgets change dict; only change['new'] (a key of
                clustering_results) is read.
        """
        with sizes_output:
            sizes_output.clear_output(wait=True)

            cr = clustering_results[change['new']]

            # Cluster sizes (excluding noise, which is stored under key -1)
            sizes = {k: v for k, v in cr.cluster_sizes.items() if k != -1}
            noise_size = cr.cluster_sizes.get(-1, 0)

            size_records = [
                {'Cluster': f'Cluster {k}', 'Size': v}
                for k, v in sorted(sizes.items())
            ]

            # Add noise as separate bar
            if noise_size > 0:
                size_records.append({'Cluster': 'Noise (-1)', 'Size': noise_size})

            if not size_records:
                print("No clusters or noise points to display")
                return

            # Explicit columns keep the frame well-formed for px.bar
            sizes_df = pd.DataFrame(size_records, columns=['Cluster', 'Size'])

            fig = px.bar(
                sizes_df, x='Cluster', y='Size',
                title=f'Cluster Sizes ({len(sizes)} clusters + noise)',
                color='Size',
                color_continuous_scale='Blues'
            )
            fig.update_layout(height=400, width=800)
            fig.show()

            # Summary stats
            size_values = list(sizes.values())
            print(f"\nüìä Cluster Size Statistics (excluding noise):")
            if not size_values:
                # min()/max() raise on an empty sequence (all points are noise)
                print("   No clusters found - all points are noise")
                return
            print(f"   Min: {min(size_values)}")
            print(f"   Max: {max(size_values)}")
            print(f"   Mean: {np.mean(size_values):.1f}")
            print(f"   Median: {np.median(size_values):.1f}")
            print(f"   Std: {np.std(size_values):.1f}")

    cluster_dropdown.observe(show_cluster_sizes, names='value')
    display(cluster_dropdown)
    display(sizes_output)
    show_cluster_sizes({'new': default_name})

## 3. UMAP with Cluster Colors

In [None]:
# Compute UMAP and color by cluster
try:
    import umap
    UMAP_AVAILABLE = True
except ImportError:
    print("‚ö†Ô∏è umap-learn not installed")
    UMAP_AVAILABLE = False

if UMAP_AVAILABLE and clustering_results and feature_outputs:
    # Widget names are specific to this cell: other cells bind globals named
    # `cluster_dropdown` / `output`, and the callback looks names up when it
    # fires, so shared names would make it read or write the wrong widget.
    umap_cluster_dropdown = widgets.Dropdown(
        options=list(clustering_results.keys()),
        value=list(clustering_results.keys())[0],
        description='Clustering:',
        style={'description_width': 'initial'}
    )

    # Dropdown for feature set (for UMAP)
    umap_feature_dropdown = widgets.Dropdown(
        options=list(feature_outputs.keys()),
        value=list(feature_outputs.keys())[0],
        description='Features:',
        style={'description_width': 'initial'}
    )

    umap_output = widgets.Output()

    # Cache UMAP results, keyed by feature-set name
    umap_cache = {}

    def _cluster_label(cluster):
        """Turn a (possibly missing) cluster id into a discrete legend label."""
        if pd.isna(cluster):
            # JD exists in the feature set but not in the selected clustering
            return 'Unclustered'
        cluster = int(cluster)  # avoids 'Cluster 3.0' when the column is float
        return 'Noise' if cluster == -1 else f'Cluster {cluster}'

    def show_cluster_umap(change):
        """Redraw the 2-D UMAP scatter for the current dropdown selections.

        Args:
            change: ipywidgets change dict (or None for the initial draw); it is
                not read, the current widget values are used instead.
        """
        with umap_output:
            umap_output.clear_output(wait=True)

            cluster_name = umap_cluster_dropdown.value
            feature_name = umap_feature_dropdown.value

            cr = clustering_results[cluster_name]
            fo = feature_outputs[feature_name]

            # Compute UMAP (cached)
            if feature_name not in umap_cache:
                print(f"Computing UMAP for {feature_name}...")
                reducer = umap.UMAP(n_components=2, random_state=42)
                umap_cache[feature_name] = reducer.fit_transform(fo.features)

            embedding_2d = umap_cache[feature_name]

            # Build DataFrame
            id_to_cluster = dict(zip(cr.ids, cr.labels))

            plot_df = pd.DataFrame({
                'jd_id': fo.ids,
                'x': embedding_2d[:, 0],
                'y': embedding_2d[:, 1],
            })
            plot_df['cluster'] = plot_df['jd_id'].map(id_to_cluster)

            # Skip the merge when metadata is empty (no 'jd_id' column), and
            # de-duplicate so one JD never becomes several scatter points.
            if 'jd_id' in metadata_df.columns:
                plot_df = plot_df.merge(
                    metadata_df.drop_duplicates(subset='jd_id'),
                    on='jd_id',
                    how='left',
                )

            # Convert cluster to string for discrete colors
            plot_df['cluster_label'] = plot_df['cluster'].apply(_cluster_label)

            # Hover data
            hover_cols = ['jd_id', 'cluster']
            for optional_col in ('title', 'top_skills', 'division'):
                if optional_col in plot_df.columns:
                    hover_cols.append(optional_col)

            fig = px.scatter(
                plot_df, x='x', y='y',
                color='cluster_label',
                hover_data=hover_cols,
                title=f'UMAP: {feature_name} (colored by {cluster_name})'
            )

            fig.update_layout(height=600, width=900)
            fig.update_traces(marker=dict(size=6, opacity=0.7))
            fig.show()

    umap_cluster_dropdown.observe(show_cluster_umap, names='value')
    umap_feature_dropdown.observe(show_cluster_umap, names='value')

    display(widgets.HBox([umap_cluster_dropdown, umap_feature_dropdown]))
    display(umap_output)
    show_cluster_umap(None)

## 4. Division Purity Analysis

In [None]:
# Compute division purity for each cluster
if 'division' in metadata_df.columns and clustering_results:
    cr = clustering_results[list(clustering_results.keys())[0]]

    # Build cluster -> division mapping
    id_to_cluster = dict(zip(cr.ids, cr.labels))

    # One chained expression (no assignment onto a filtered frame):
    # attach labels, drop JDs missing from the clustering, then exclude noise (-1).
    analysis_df = (
        metadata_df
        .assign(cluster=metadata_df['jd_id'].map(id_to_cluster))
        .dropna(subset=['cluster'])
        .astype({'cluster': int})
        .loc[lambda frame: frame['cluster'] != -1]
    )

    # Compute purity: share of the most common division within each cluster
    purity_data = []
    for cluster_id, cluster_data in analysis_df.groupby('cluster'):
        division_counts = cluster_data['division'].value_counts()
        if division_counts.empty:
            # Every division in this cluster is missing; there is no dominant
            # division to report, so the cluster is left out of the scores.
            continue

        total = len(cluster_data)
        purity_data.append({
            'cluster': cluster_id,
            'size': total,
            'dominant_division': division_counts.index[0],
            'purity': division_counts.iloc[0] / total,
            'n_divisions': len(division_counts),
        })

    # Explicit columns keep purity_df well-formed even with zero rows
    purity_df = pd.DataFrame(
        purity_data,
        columns=['cluster', 'size', 'dominant_division', 'purity', 'n_divisions'],
    )

    # Overall purity: size-weighted mean of the per-cluster purities.
    # NaN (not an exception) when no cluster could be scored, so the summary
    # cell can still format the value.
    scored_size = sum(p['size'] for p in purity_data)
    if scored_size:
        overall_purity = sum(p['purity'] * p['size'] for p in purity_data) / scored_size
    else:
        overall_purity = float('nan')

    print(f"\nüìä Division Purity Analysis")
    if purity_data:
        print(f"   Overall Purity: {overall_purity:.1%}")
        print(f"   Mean Cluster Purity: {purity_df['purity'].mean():.1%}")

        display(purity_df.sort_values('purity', ascending=False).style
                .background_gradient(subset=['purity'], cmap='Greens'))

        # Purity histogram
        fig = px.histogram(purity_df, x='purity', nbins=20,
                           title='Distribution of Cluster Purity Scores')
        fig.add_vline(x=overall_purity, line_dash='dash', line_color='red',
                      annotation_text=f'Overall: {overall_purity:.1%}')
        fig.update_layout(height=400, width=600)
        fig.show()
    else:
        print("   No non-noise clusters with a known division - nothing to score")
else:
    print("‚ö†Ô∏è Division field not available for purity analysis")

## 5. Cluster x Division Heatmap

In [None]:
# Cluster x division heatmap, each row normalized to proportions.
# Relies on analysis_df built by the division-purity cell.
if 'division' in metadata_df.columns and clustering_results:
    # Rows: clusters; columns: divisions; values: share of the cluster's JDs
    crosstab = pd.crosstab(
        index=analysis_df['cluster'],
        columns=analysis_df['division'],
        normalize='index',
    )

    axis_labels = {'x': 'Division', 'y': 'Cluster', 'color': 'Proportion'}
    fig = px.imshow(
        crosstab,
        labels=axis_labels,
        title='Cluster vs Division (row-normalized)',
        color_continuous_scale='Blues',
    )
    fig.update_layout(height=500, width=700)
    fig.show()

    for line in (
        "\nüí° Interpretation:",
        "   - Bright rows = cluster dominated by one division (high purity)",
        "   - Even rows = cluster spans multiple divisions (low purity)",
    ):
        print(line)

## 6. Noise Analysis

In [None]:
# Inspect the JDs that the first clustering result marked as noise
if clustering_results:
    cr = clustering_results[next(iter(clustering_results))]

    noise_ids = cr.get_noise_ids()

    if not noise_ids:
        print("‚úÖ No noise points - all JDs assigned to clusters")
    else:
        print(f"\nüîç Noise Analysis: {len(noise_ids)} JDs marked as noise ({cr.noise_ratio:.1%})")

        # Metadata rows belonging to the noise JDs
        is_noise = metadata_df['jd_id'].isin(noise_ids)
        noise_df = metadata_df[is_noise]

        # Which divisions do the noise JDs come from?
        if 'division' in noise_df.columns:
            print("\nüìä Division distribution in noise:")
            display(noise_df['division'].value_counts())

        # Print title and leading skills for up to five noise JDs
        print("\nüìã Sample noise JDs:")
        for jd_id in noise_ids[:5]:
            ext = extraction_lookup.get(jd_id)
            if not ext:
                continue
            title = ext.metadata.get('title', 'Unknown')
            skills = ext.get_all_skills_flat()[:5]
            print(f"   - {jd_id}: {title}")
            print(f"     Skills: {', '.join(skills)}")

## 7. Cluster Drill-Down

In [None]:
# Interactive cluster explorer
if clustering_results:
    cr = clustering_results[list(clustering_results.keys())[0]]

    # Real clusters only; -1 is the noise bucket
    cluster_ids = sorted(k for k in cr.cluster_sizes.keys() if k != -1)

    if not cluster_ids:
        # Nothing to select; indexing cluster_ids[0] below would raise
        print("No clusters to explore - every JD was marked as noise")
    else:
        cluster_options = [(f"Cluster {c} ({cr.cluster_sizes[c]} JDs)", c) for c in cluster_ids]

        cluster_select = widgets.Dropdown(
            options=cluster_options,
            description='Select Cluster:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='300px')
        )

        # Cell-specific names: other cells rebind the globals `output` and `cr`,
        # and the callback resolves names when it fires.
        details_output = widgets.Output()
        details_cr = cr

        def show_cluster_details(change):
            """Print divisions, common titles and top skills for one cluster.

            Args:
                change: ipywidgets change dict; change['new'] is the cluster id.
            """
            with details_output:
                details_output.clear_output(wait=True)

                cluster_id = change['new']
                cluster_jd_ids = details_cr.get_cluster_ids(cluster_id)

                print(f"\n{'='*60}")
                print(f"CLUSTER {cluster_id} DETAILS")
                print(f"{'='*60}")
                print(f"Size: {len(cluster_jd_ids)} JDs")

                # Get cluster metadata
                cluster_df = metadata_df[metadata_df['jd_id'].isin(cluster_jd_ids)]

                # Division distribution
                if 'division' in cluster_df.columns:
                    print(f"\nüìä Division Distribution:")
                    for div, count in cluster_df['division'].value_counts().items():
                        print(f"   {div}: {count} ({count/len(cluster_df):.1%})")

                # Common titles
                if 'title' in cluster_df.columns:
                    print(f"\nüìã Common Titles:")
                    for title, count in cluster_df['title'].value_counts().head(5).items():
                        print(f"   {title}: {count}")

                # Aggregate skills. Each skill counts once per JD so the
                # percentage below is "share of JDs" and cannot exceed 100%.
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # which keeps tie ordering in most_common() deterministic.
                skill_counter = Counter()
                for jd_id in cluster_jd_ids:
                    ext = extraction_lookup.get(jd_id)
                    if ext:
                        skill_counter.update(list(dict.fromkeys(ext.get_all_skills_flat())))

                skill_counts = skill_counter.most_common(15)

                print(f"\nüõ†Ô∏è Top Skills:")
                for skill, count in skill_counts:
                    freq = count / len(cluster_jd_ids)
                    print(f"   {skill}: {count} ({freq:.0%})")

        cluster_select.observe(show_cluster_details, names='value')
        display(cluster_select)
        display(details_output)

        # Initial display
        show_cluster_details({'new': cluster_ids[0]})

## 8. Summary

In [None]:
# Recap of the first clustering result plus the questions to answer before Phase 4
print("\n" + "="*60)
print("PHASE 3 SUMMARY")
print("="*60)

if clustering_results:
    cr = clustering_results[list(clustering_results.keys())[0]]

    print(f"\nüìä Clustering Results:")
    print(f"   Total JDs: {len(cr.ids)}")
    print(f"   Clusters: {cr.n_clusters}")
    print(f"   Noise: {cr.n_noise} ({cr.noise_ratio:.1%})")
    # Compare against None explicitly: a score of exactly 0.0 is a real value
    # and must not be reported as "N/A".
    if cr.silhouette_score is not None:
        print(f"   Silhouette Score: {cr.silhouette_score:.4f}")
    else:
        print("   Silhouette: N/A")

    # overall_purity is computed by the division-purity cell under this same condition
    if 'division' in metadata_df.columns:
        print(f"   Division Purity: {overall_purity:.1%}")

print("\nüí° Key Questions:")
print("   1. Is the noise ratio acceptable? (typically <20%)")
print("   2. Are cluster sizes balanced enough?")
print("   3. Do clusters align with business divisions?")
print("   4. Do the top skills per cluster make sense?")

print("\n‚û°Ô∏è Next: Run Phase 4 (Aggregation) to create job archetypes")