# Phase 4: Aggregation Analysis

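This notebook analyzes the aggregated job archetypes produced by Phase 4 of the archetype pipeline. If the Phase 4 output file is missing, it falls back to the Phase 5 (named) archetypes file.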

**What to look for:**
- Archetype overview and distribution
- Skill frequency heatmap across archetypes
- Level and division distributions
- Individual archetype deep dive and side-by-side archetype comparison

In [None]:
# Setup
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path
# (once only), so modules located there can be imported by this notebook.
project_root = Path.cwd().parent
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

import json
import pandas as pd
import numpy as np
from collections import Counter

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display, HTML

from config import get_output_path
from src.archetypes.aggregation import ArchetypeAggregator, JobArchetype

In [None]:
# Load archetypes
# The Phase 4 aggregation output is preferred. When it does not exist, the
# Phase 5 file is tried instead. `archetypes` is always defined afterwards
# (an empty list when nothing could be loaded) so later cells can guard on it.
archetype_path = get_output_path("archetypes", "phase_4_aggregation", "archetypes.json")

if not archetype_path.exists():
    # Try phase 5 (named archetypes)
    archetype_path = get_output_path("archetypes", "phase_5_naming", "final_archetypes.json")

if archetype_path.exists():
    archetypes = ArchetypeAggregator.load_archetypes(str(archetype_path))
    # Report the path too: because of the fallback above, the reader otherwise
    # cannot tell which of the two files the analysis is based on.
    print(f"✅ Loaded {len(archetypes)} archetypes from {archetype_path}")
else:
    print("❌ No archetypes found. Run: python run_archetype_pipeline.py --aggregate")
    archetypes = []

## 1. Archetype Overview

In [None]:
# Overview table: one row per archetype with its size, dominant division,
# first two representative titles and three most frequent skills.
if archetypes:
    overview_data = []

    for a in archetypes:
        # Merge technical, domain and soft skills into one {skill: frequency}
        # map. A skill can appear as both required and preferred; keep the
        # higher of the two frequencies rather than letting whichever dict is
        # merged last overwrite the other.
        all_skills = {}
        for skill_type in (a.skills.technical, a.skills.domain, a.skills.soft):
            for freqs in (skill_type.required, skill_type.preferred):
                for skill, freq in freqs.items():
                    all_skills[skill] = max(all_skills.get(skill, 0), freq)

        top_skills = sorted(all_skills.items(), key=lambda x: x[1], reverse=True)[:3]
        top_skills_str = ', '.join(skill for skill, _ in top_skills)

        # Top division = the division with the largest share. Taking the first
        # key would only be right if the mapping were already sorted by share;
        # max() returns the first maximal key, so a pre-sorted mapping still
        # yields the same answer as before.
        if a.division_distribution:
            top_div = max(a.division_distribution, key=a.division_distribution.get)
        else:
            top_div = ''

        overview_data.append({
            'Cluster': a.cluster_id,
            'Name': a.label or f'Cluster {a.cluster_id}',
            'Members': a.member_count,
            'Top Division': top_div,
            'Top Titles': '; '.join(a.representative_titles[:2]),
            'Top Skills': top_skills_str,
        })

    # Largest archetypes first; shade the Members column by size.
    overview_df = pd.DataFrame(overview_data).sort_values('Members', ascending=False)
    display(overview_df.style.background_gradient(subset=['Members'], cmap='Blues'))

In [None]:
# Archetype size distribution: horizontal bar chart of member counts,
# with bars supplied in ascending order of size.
if archetypes:
    sizes_ascending = overview_df.sort_values('Members')
    # Figure height grows with the number of archetypes (25 per archetype),
    # but never drops below 400.
    chart_height = max(400, len(archetypes) * 25)

    fig = px.bar(
        sizes_ascending,
        x='Members',
        y='Name',
        orientation='h',
        color='Members',
        color_continuous_scale='Blues',
        title='Archetype Size Distribution',
    )
    fig.update_layout(height=chart_height, width=700)
    fig.show()

## 2. Skill Frequency Heatmap

In [None]:
# Build skill frequency matrix (archetypes x top skills) and draw it as a heatmap.
# Only technical and domain skills are considered in this cell.
if archetypes:
    TOP_N_SKILLS = 30  # number of skill columns shown in the heatmap

    # Rank skills by frequency weighted by archetype size, summed over the
    # required and preferred entries of every archetype.
    skill_totals = Counter()
    for a in archetypes:
        for skill_type in (a.skills.technical, a.skills.domain):
            for freqs in (skill_type.required, skill_type.preferred):
                for skill, freq in freqs.items():
                    skill_totals[skill] += freq * a.member_count

    top_skills = [skill for skill, _ in skill_totals.most_common(TOP_N_SKILLS)]

    if not top_skills:
        # Without any skill there would be no columns to plot.
        print("⚠️ No technical or domain skills found; skipping heatmap")
    else:
        # Build matrix: one row per archetype, one column per top skill.
        matrix_data = []
        for a in archetypes:
            # Per-archetype frequency of each skill. When a skill occurs in
            # several places (required/preferred, technical/domain) the
            # highest frequency is used.
            skill_freq = {}
            for skill_type in (a.skills.technical, a.skills.domain):
                for freqs in (skill_type.required, skill_type.preferred):
                    for skill, freq in freqs.items():
                        skill_freq[skill] = max(skill_freq.get(skill, 0), freq)

            row = {'archetype': a.label or f'Cluster {a.cluster_id}'}
            for skill in top_skills:
                # Skills the archetype does not list are shown as 0.
                row[skill] = skill_freq.get(skill, 0)
            matrix_data.append(row)

        matrix_df = pd.DataFrame(matrix_data).set_index('archetype')

        # Create heatmap
        fig = px.imshow(
            matrix_df,
            labels=dict(x='Skill', y='Archetype', color='Frequency'),
            title='Skill Frequency by Archetype',
            color_continuous_scale='Blues',
            aspect='auto'
        )
        fig.update_layout(height=max(500, len(archetypes) * 30), width=1000)
        fig.update_xaxes(tickangle=45)
        fig.show()

## 3. Level Distribution by Archetype

In [None]:
# Level distribution stacked bar: one bar per archetype, one segment per level.
if archetypes and any(a.level_distribution for a in archetypes):
    # Union of the levels seen in any archetype, sorted so the order is
    # deterministic. `or {}` tolerates archetypes whose distribution is
    # missing/falsy, which the guard above already allows for.
    all_levels = sorted({
        level
        for a in archetypes
        for level in (a.level_distribution or {})
    })

    # Long-format rows, one per (archetype, level). Levels an archetype does
    # not have are recorded with frequency 0.
    level_data = [
        {
            'Archetype': a.label or f'Cluster {a.cluster_id}',
            'Level': level,
            'Frequency': (a.level_distribution or {}).get(level, 0),
        }
        for a in archetypes
        for level in all_levels
    ]

    level_df = pd.DataFrame(level_data)

    fig = px.bar(
        level_df,
        x='Archetype',
        y='Frequency',
        color='Level',
        title='Level Distribution by Archetype',
        barmode='stack'
    )
    fig.update_layout(height=500, width=900)
    fig.update_xaxes(tickangle=45)
    fig.show()
else:
    print("⚠️ No level distribution data available")

## 4. Division Distribution by Archetype

In [None]:
# Division distribution stacked bar: one bar per archetype, one segment per division.
if archetypes and any(a.division_distribution for a in archetypes):
    # Union of the divisions seen in any archetype, sorted so the order is
    # deterministic. `or {}` tolerates archetypes whose distribution is
    # missing/falsy, which the guard above already allows for.
    all_divisions = sorted({
        div
        for a in archetypes
        for div in (a.division_distribution or {})
    })

    # Long-format rows, one per (archetype, division). Divisions an archetype
    # does not have are recorded with frequency 0.
    div_data = [
        {
            'Archetype': a.label or f'Cluster {a.cluster_id}',
            'Division': div,
            'Frequency': (a.division_distribution or {}).get(div, 0),
        }
        for a in archetypes
        for div in all_divisions
    ]

    div_df = pd.DataFrame(div_data)

    fig = px.bar(
        div_df,
        x='Archetype',
        y='Frequency',
        color='Division',
        title='Division Distribution by Archetype',
        barmode='stack'
    )
    fig.update_layout(height=500, width=900)
    fig.update_xaxes(tickangle=45)
    fig.show()
else:
    print("⚠️ No division distribution data available")

## 5. Archetype Deep Dive (Interactive)

In [None]:
# Interactive archetype explorer: a dropdown selects an archetype and its full
# profile is printed into an Output widget below the dropdown.
if archetypes:
    # Option value is the index into `archetypes`; the text shows name and size.
    archetype_options = [
        (f"{a.label or f'Cluster {a.cluster_id}'} ({a.member_count} JDs)", i)
        for i, a in enumerate(archetypes)
    ]

    archetype_dropdown = widgets.Dropdown(
        options=archetype_options,
        description='Archetype:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    # Dedicated name: the callback looks this global up every time it runs, so
    # a later cell that creates its own generically named output widget cannot
    # redirect this explorer's prints.
    detail_output = widgets.Output()

    def _ranked(freqs, limit=None):
        """Return (name, frequency) pairs by descending frequency, cut to `limit` if given."""
        pairs = sorted(freqs.items(), key=lambda item: -item[1])
        return pairs if limit is None else pairs[:limit]

    def _print_ranked(freqs, indent, limit=None):
        """Print one '<indent>name: pct' line per entry; prints nothing for an empty mapping."""
        if not freqs:
            return
        for name, freq in _ranked(freqs, limit):
            print(f"{indent}{name}: {freq:.0%}")

    def _print_tagged(tag, freqs, limit=None):
        """Print one '   tag: name (pct)' line per entry; prints nothing for an empty mapping."""
        if not freqs:
            return
        for name, freq in _ranked(freqs, limit):
            print(f"   {tag}: {name} ({freq:.0%})")

    def _print_skill_group(group):
        """Print the top 10 required and top 10 preferred skills of one skill group."""
        for heading, freqs in (("Required", group.required), ("Preferred", group.preferred)):
            if freqs:
                print(f"   {heading}:")
                _print_ranked(freqs, "      ", limit=10)

    def show_archetype_details(change):
        """Print the profile of the selected archetype into the explorer's output widget.

        `change` is a dict whose 'new' entry is the selected option value,
        i.e. an index into `archetypes`. No other key is read.
        """
        with detail_output:
            detail_output.clear_output()

            a = archetypes[change['new']]

            print(f"\n{'='*70}")
            print(f"ARCHETYPE: {a.label or f'Cluster {a.cluster_id}'}")
            print(f"{'='*70}")

            print("\n📊 OVERVIEW")
            print(f"   Cluster ID: {a.cluster_id}")
            print(f"   Archetype ID: {a.archetype_id or 'Not assigned'}")
            print(f"   Member Count: {a.member_count}")

            # Representative titles
            print("\n📋 REPRESENTATIVE TITLES")
            for title in a.representative_titles[:5]:
                print(f"   • {title}")

            # Division distribution (five largest shares)
            if a.division_distribution:
                print("\n🏢 DIVISION DISTRIBUTION")
                _print_ranked(a.division_distribution, "   ", limit=5)

            # Level distribution (all levels)
            if a.level_distribution:
                print("\n📈 LEVEL DISTRIBUTION")
                _print_ranked(a.level_distribution, "   ")

            # Technical Skills
            print("\n🛠️ TECHNICAL SKILLS")
            _print_skill_group(a.skills.technical)

            # Domain Skills
            print("\n📊 DOMAIN SKILLS")
            _print_skill_group(a.skills.domain)

            # Licenses & Certifications
            if a.licenses.required or a.licenses.preferred:
                print("\n📜 LICENSES")
                _print_tagged("Required", a.licenses.required)
                _print_tagged("Preferred", a.licenses.preferred)

            if a.certifications.required or a.certifications.preferred:
                print("\n🏆 CERTIFICATIONS")
                _print_tagged("Required", a.certifications.required)
                _print_tagged("Preferred", a.certifications.preferred)

            # Experience
            # Compare against None rather than relying on truthiness: a median
            # of 0 years is a real value and must still be shown.
            # NOTE(review): assumes a missing median is represented as None - confirm.
            print("\n⏱️ EXPERIENCE")
            if a.experience.years_min_median is not None:
                print(f"   Years (median min): {a.experience.years_min_median:.0f}")
            if a.experience.years_preferred_median is not None:
                print(f"   Years (median preferred): {a.experience.years_preferred_median:.0f}")
            if a.experience.years_range:
                print(f"   Range: {a.experience.years_range[0]}-{a.experience.years_range[1]} years")

            # Education
            if a.education_levels.required or a.education_fields:
                print("\n🎓 EDUCATION")
                _print_tagged("Required", a.education_levels.required)
                if a.education_fields:
                    print("   Fields:")
                    _print_ranked(a.education_fields, "      ", limit=5)

            # Tools (five most frequent of each kind)
            if a.tools.required or a.tools.preferred:
                print("\n🔧 TOOLS")
                _print_tagged("Required", a.tools.required, limit=5)
                _print_tagged("Preferred", a.tools.preferred, limit=5)

    archetype_dropdown.observe(show_archetype_details, names='value')
    display(archetype_dropdown)
    display(detail_output)

    # Initial display: render whatever the dropdown currently selects
    # (the first archetype on a fresh run).
    show_archetype_details({'new': archetype_dropdown.value})

## 6. Archetype Comparison

In [None]:
# Compare two archetypes side by side: grouped bar chart of their most
# frequent technical/domain skills, plus the skills only one of them has.
if len(archetypes) >= 2:
    # Option value is the index into `archetypes`.
    archetype_options = [
        (f"{a.label or f'Cluster {a.cluster_id}'}", i)
        for i, a in enumerate(archetypes)
    ]

    dropdown1 = widgets.Dropdown(
        options=archetype_options,
        value=0,
        description='Archetype 1:',
        style={'description_width': 'initial'}
    )

    dropdown2 = widgets.Dropdown(
        options=archetype_options,
        value=1,  # a second archetype always exists because of the guard above
        description='Archetype 2:',
        style={'description_width': 'initial'}
    )

    # Dedicated name: the callback looks this global up every time it runs, so
    # it must not share a generically named output widget with another cell.
    comparison_output = widgets.Output()

    def compare_archetypes(change):
        """Redraw the comparison for the archetypes currently selected in both dropdowns.

        `change` is ignored (the selection is read from the dropdowns), so the
        function can be used as an observer and also called directly with None.
        """

        def get_all_skills(a):
            """Return {skill: frequency} over technical + domain skills, keeping the highest frequency."""
            skills = {}
            for skill_type in (a.skills.technical, a.skills.domain):
                for freqs in (skill_type.required, skill_type.preferred):
                    for skill, freq in freqs.items():
                        skills[skill] = max(skills.get(skill, 0), freq)
            return skills

        with comparison_output:
            comparison_output.clear_output()

            a1 = archetypes[dropdown1.value]
            a2 = archetypes[dropdown2.value]
            name1 = a1.label or f'Cluster {a1.cluster_id}'
            name2 = a2.label or f'Cluster {a2.cluster_id}'

            skills1 = get_all_skills(a1)
            skills2 = get_all_skills(a2)

            all_skills = set(skills1) | set(skills2)
            if not all_skills:
                # Nothing to plot; avoids building an empty chart.
                print(f"No technical or domain skills recorded for {name1} or {name2}")
                return

            # 15 skills with the highest combined frequency. The skill name is
            # the tie-breaker so the order does not depend on set iteration.
            top_skills = sorted(
                all_skills,
                key=lambda s: (-(skills1.get(s, 0) + skills2.get(s, 0)), s),
            )[:15]

            # The two series are kept as separate lists instead of being keyed
            # by display name: both names are equal when the same archetype is
            # picked twice or two archetypes share a label, and name-keyed
            # columns would then overwrite each other.
            freqs1 = [skills1.get(skill, 0) for skill in top_skills]
            freqs2 = [skills2.get(skill, 0) for skill in top_skills]

            # Create grouped bar chart
            fig = go.Figure()
            fig.add_trace(go.Bar(name=name1, x=top_skills, y=freqs1))
            fig.add_trace(go.Bar(name=name2, x=top_skills, y=freqs2))

            fig.update_layout(
                barmode='group',
                title=f'Skill Comparison: {name1} vs {name2}',
                yaxis_title='Frequency',
                height=500, width=900
            )
            fig.update_xaxes(tickangle=45)
            fig.show()

            # Show unique skills (at most 10 per side, most frequent first)
            unique_to_1 = set(skills1) - set(skills2)
            unique_to_2 = set(skills2) - set(skills1)

            print(f"\n🔹 Skills unique to {name1} ({len(unique_to_1)}):")
            for skill in sorted(unique_to_1, key=lambda s: (-skills1[s], s))[:10]:
                print(f"   {skill}: {skills1[skill]:.0%}")

            print(f"\n🔸 Skills unique to {name2} ({len(unique_to_2)}):")
            for skill in sorted(unique_to_2, key=lambda s: (-skills2[s], s))[:10]:
                print(f"   {skill}: {skills2[skill]:.0%}")

    dropdown1.observe(compare_archetypes, names='value')
    dropdown2.observe(compare_archetypes, names='value')

    display(widgets.HBox([dropdown1, dropdown2]))
    display(comparison_output)

    # Initial comparison
    compare_archetypes(None)

## 7. Summary

In [None]:
# Closing summary: size statistics, naming status and review questions.
print("\n" + "="*60)
print("PHASE 4 SUMMARY")
print("="*60)

# An archetype counts as named when it has a non-empty label.
named_count = sum(1 for a in archetypes if a.label)

if archetypes:
    member_counts = [a.member_count for a in archetypes]
    total_members = sum(member_counts)
    avg_size = total_members / len(archetypes)

    print("\n📊 Archetype Statistics:")
    print(f"   Total Archetypes: {len(archetypes)}")
    print(f"   Total JDs Covered: {total_members}")
    print(f"   Average Size: {avg_size:.1f} JDs")
    print(f"   Largest: {max(member_counts)} JDs")
    print(f"   Smallest: {min(member_counts)} JDs")

    # Check naming status
    print("\n📝 Naming Status:")
    print(f"   Named: {named_count}/{len(archetypes)}")

print("\n💡 Key Questions:")
print("   1. Do archetypes represent meaningful job families?")
print("   2. Are skill distributions within archetypes coherent?")
print("   3. Do level distributions make sense for each archetype?")
print("   4. Are there archetypes that should be merged or split?")

# Point to the naming phase only while something is still unnamed; when every
# loaded archetype already has a label the hint would be misleading.
if not archetypes or named_count < len(archetypes):
    print("\n➡️ Next: Run Phase 5 (Naming) to generate archetype names")