# Phase 5: Naming & Final Catalog

This notebook displays the final named archetypes and skill standardization.

**What to look for:**
- Final archetype names and IDs
- Skill standardization mappings
- Complete archetype catalog
- Export options

In [None]:
# -----------------------------------------------------------------------------
# PLOTLY SETUP - execute this cell before any other cell
# -----------------------------------------------------------------------------
# Interactive figures are routed to the system browser, because VS Code
# notebooks do not render Plotly output well natively.

import plotly.io as pio

# Renderer names to try instead if the browser route does not work:
#   "png"       -> static images (always works)
#   "notebook"  -> try this for Jupyter
#   "vscode"    -> try this for VS Code
pio.renderers.default = "browser"

# Report the renderer that is now active, followed by a usage hint.
print(
    f"Plotly configured to use: {pio.renderers.default}",
    "Plots will open in your default browser",
    sep="\n",
)

In [None]:
# Setup
import sys
from pathlib import Path

# Make project-local packages importable: the project root is taken to be the
# parent of the current working directory, and it is put at the front of
# sys.path unless it is already listed there.
project_root = Path.cwd().parent
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

import html
import json
from collections import Counter

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, HTML, Markdown

from config import get_output_path
from src.archetypes.aggregation import ArchetypeAggregator, JobArchetype

In [None]:
# Load final archetypes.
# The Phase 5 (naming) output is preferred; the Phase 4 (aggregation) path is
# only resolved when the Phase 5 file does not exist.
_phase_5_path = get_output_path("archetypes", "phase_5_naming", "final_archetypes.json")
archetype_path = (
    _phase_5_path
    if _phase_5_path.exists()
    else get_output_path("archetypes", "phase_4_aggregation", "archetypes.json")
)

if not archetype_path.exists():
    # Neither file is present: keep the notebook runnable with an empty list.
    archetypes = []
    print("[ERROR] No archetypes found. Run: python run_archetype_pipeline.py --name")
else:
    archetypes = ArchetypeAggregator.load_archetypes(str(archetype_path))
    print(f"[OK] Loaded {len(archetypes)} archetypes from {archetype_path.name}")

In [None]:
# Load skill mappings (original skill name -> standardized skill name).
mappings_path = get_output_path("archetypes", "phase_5_naming", "skill_mappings.json")

# Default to an empty mapping so later cells can test `if skill_mappings:`.
skill_mappings = {}
if mappings_path.exists():
    # Read explicitly as UTF-8: without it, open() uses the locale's default
    # encoding, which can fail or mis-decode non-ASCII skill names.
    with open(mappings_path, encoding="utf-8") as f:
        skill_mappings = json.load(f)
    print(f"[OK] Loaded {len(skill_mappings)} skill mappings")
else:
    print("[WARN] No skill mappings found")

## 1. Archetype Catalog

In [None]:
# Final catalog table: one row per archetype, sorted by member count.
if archetypes:
    catalog_data = []

    for a in archetypes:
        # Merge required and preferred skills of every category into a single
        # {skill: score} dict. A skill that occurs more than once keeps the
        # value from the last update. The values are used as sort keys below.
        all_skills = {}
        for skill_type in (a.skills.technical, a.skills.domain, a.skills.soft):
            all_skills.update(skill_type.required)
            all_skills.update(skill_type.preferred)

        # Five highest-scoring skills, rendered as a comma-separated string.
        top_skills = sorted(all_skills.items(), key=lambda x: x[1], reverse=True)[:5]
        top_skills_str = ', '.join(name for name, _ in top_skills)

        # Top division: pick the key with the largest share explicitly instead
        # of relying on the mapping being pre-sorted. When the mapping is
        # already ordered by descending share this selects the same key as
        # taking the first one (max() returns the first maximal element).
        # Shares are presumably fractions in [0, 1] (formatted with '%' below)
        # -- TODO confirm.
        if a.division_distribution:
            top_div = max(a.division_distribution, key=a.division_distribution.get)
            div_pct = a.division_distribution[top_div]
        else:
            top_div, div_pct = '', 0

        # Experience.
        # NOTE(review): a median of 0 is falsy and is therefore shown as '-',
        # the same as a missing value -- confirm this is intended.
        exp_str = f"{a.experience.years_min_median:.0f}+ yrs" if a.experience.years_min_median else '-'

        catalog_data.append({
            'Archetype ID': a.archetype_id or f'cluster-{a.cluster_id}',
            'Name': a.label or f'Cluster {a.cluster_id}',
            'Members': a.member_count,
            'Division': f"{top_div} ({div_pct:.0%})" if top_div else '-',
            'Experience': exp_str,
            'Top Skills': top_skills_str,
        })

    # catalog_df is reused by the export cell for the summary CSV.
    catalog_df = pd.DataFrame(catalog_data).sort_values('Members', ascending=False)

    # Display with styling
    display(HTML("<h3>Job Archetype Catalog</h3>"))
    display(catalog_df.style
            .background_gradient(subset=['Members'], cmap='Blues')
            .set_properties(**{'text-align': 'left'}))

## 2. Name Word Cloud

In [None]:
# Word cloud built from the whitespace-separated words of archetype labels.
_labels = [a.label for a in archetypes if a.label]

if not _labels:
    print("[WARN] No archetype names available for word cloud")
else:
    # Flatten every label into its words and count occurrences.
    all_words = [word for label in _labels for word in label.split()]
    word_freq = Counter(all_words)

    try:
        cloud_options = dict(
            width=800,
            height=400,
            background_color='white',
            colormap='Blues',
        )
        wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_freq)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Words in Archetype Names')
        plt.show()
    except Exception as e:
        # Best-effort fallback: if rendering fails, list the most frequent
        # words as plain text instead of aborting the notebook.
        print(f"[WARN] Could not generate word cloud: {e}")
        print("\nTop words in archetype names:")
        for word, count in word_freq.most_common(20):
            print(f"   {word}: {count}")

## 3. Skill Standardization Analysis

In [None]:
# Skill standardization overview: how many names changed, a sample of the
# changed mappings, and which originals collapse onto the same standard name.
if not skill_mappings:
    print("[WARN] No skill mappings to display")
else:
    # Split the mapping into entries whose name changed and entries that
    # map to themselves.
    changed = {orig: std for orig, std in skill_mappings.items() if orig != std}
    unchanged = {orig: std for orig, std in skill_mappings.items() if orig == std}

    print("Skill Standardization Summary")
    print(f"   Total skills: {len(skill_mappings)}")
    print(f"   Changed: {len(changed)}")
    print(f"   Unchanged: {len(unchanged)}")

    if changed:
        print("\nSkill Mappings (showing first 30):")
        sample_pairs = list(changed.items())[:30]
        changed_df = pd.DataFrame(sample_pairs, columns=['Original', 'Standardized'])
        display(changed_df)

        # Invert the mapping: standardized name -> list of original names.
        print("\nStandardization Groups (skills mapped to same name):")
        from collections import defaultdict
        groups = defaultdict(list)
        for original_name, standard_name in skill_mappings.items():
            groups[standard_name].append(original_name)

        # Only groups that merge more than one original name are shown,
        # limited to the first ten.
        multi_groups = {
            standard_name: members
            for standard_name, members in groups.items()
            if len(members) > 1
        }
        for std_name, originals in list(multi_groups.items())[:10]:
            print(f"\n   '{std_name}' <- {originals}")

## 4. Interactive Archetype Gallery

In [None]:
# Interactive gallery view: one HTML card per archetype, largest first.
if archetypes:
    def create_archetype_card(a):
        """Create the HTML card for one archetype.

        Every piece of text taken from the archetype (label, ID, division,
        skill names, titles) is HTML-escaped, so characters such as ``&``,
        ``<`` and ``>`` are displayed literally instead of being parsed as
        markup.

        Args:
            a: Archetype object. The attributes read are ``skills``,
                ``division_distribution``, ``label``, ``cluster_id``,
                ``archetype_id``, ``member_count`` and
                ``representative_titles``.

        Returns:
            str: HTML fragment for a single card.
        """
        def _esc(value):
            """Return ``value`` as text that is safe to embed in HTML."""
            return html.escape(str(value))

        # Top skills: required technical and domain skills only, ranked by
        # their mapped value (highest first).
        all_skills = {}
        for skill_type in (a.skills.technical, a.skills.domain):
            all_skills.update(skill_type.required)
        top_skills = sorted(all_skills.items(), key=lambda x: x[1], reverse=True)[:5]

        # Top division: the key with the largest share, chosen explicitly
        # rather than assuming the mapping is pre-sorted. For a mapping that
        # is already in descending order this is the first key.
        if a.division_distribution:
            top_div = max(a.division_distribution, key=a.division_distribution.get)
        else:
            top_div = '-'

        # Build skills HTML (one pill-style <span> per skill).
        skills_html = ''.join([
            f'<span style="background:#e8f4f8;padding:2px 8px;margin:2px;border-radius:3px;font-size:12px">{_esc(s[0])}</span>'
            for s in top_skills
        ])

        card_title = _esc(a.label or f'Cluster {a.cluster_id}')
        card_id = _esc(a.archetype_id or '-')
        titles_text = _esc(', '.join(a.representative_titles[:2]) or '-')

        return f"""
        <div style="border:1px solid #ddd;border-radius:8px;padding:15px;margin:10px;width:350px;display:inline-block;vertical-align:top">
            <h4 style="margin:0 0 10px 0;color:#2c3e50">{card_title}</h4>
            <p style="margin:5px 0;font-size:13px;color:#666">
                <strong>ID:</strong> {card_id}<br>
                <strong>Members:</strong> {_esc(a.member_count)}<br>
                <strong>Division:</strong> {_esc(top_div)}
            </p>
            <p style="margin:5px 0;font-size:12px">
                <strong>Top Skills:</strong><br>
                {skills_html}
            </p>
            <p style="margin:5px 0;font-size:12px">
                <strong>Common Titles:</strong><br>
                {titles_text}
            </p>
        </div>
        """

    # Create gallery HTML
    cards_html = ''.join([create_archetype_card(a) for a in sorted(archetypes, key=lambda x: -x.member_count)])
    gallery_html = f'<div style="display:flex;flex-wrap:wrap">{cards_html}</div>'

    display(HTML("<h3>Archetype Gallery</h3>"))
    display(HTML(gallery_html))

## 5. Export Options

In [None]:
# Export the archetypes to JSON, CSV and Markdown files in the exports folder.
if archetypes:
    output_base = get_output_path("archetypes", "exports")
    output_base.mkdir(parents=True, exist_ok=True)

    # 1. Full JSON export
    # default=str stringifies any value json cannot serialize natively.
    full_export = [a.to_dict() for a in archetypes]
    json_path = output_base / "archetypes_full.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(full_export, f, indent=2, default=str)
    print(f"[OK] Full JSON: {json_path}")

    # 2. Summary CSV
    # catalog_df is produced by the catalog cell; skip with a warning instead
    # of raising NameError when that cell has not been run in this kernel.
    if 'catalog_df' in globals():
        summary_path = output_base / "archetypes_summary.csv"
        catalog_df.to_csv(summary_path, index=False)
        print(f"[OK] Summary CSV: {summary_path}")
    else:
        print("[WARN] Summary CSV skipped: run the catalog cell first")

    # 3. Skills matrix CSV
    # matrix_df is not created by this cell; it is exported only when it
    # already exists in the kernel namespace.
    if 'matrix_df' in globals():
        matrix_path = output_base / "archetype_skills_matrix.csv"
        matrix_df.to_csv(matrix_path)
        print(f"[OK] Skills Matrix: {matrix_path}")

    # 4. Markdown documentation
    # Fragments are collected in a list and joined once at the end.
    md_parts = [
        "# Job Archetype Catalog\n\n",
        f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d')}\n\n",
        f"Total Archetypes: {len(archetypes)}\n\n",
        "---\n\n",
    ]

    for a in sorted(archetypes, key=lambda x: -x.member_count):
        md_parts.append(f"## {a.label or f'Cluster {a.cluster_id}'}\n\n")
        md_parts.append(f"**ID:** `{a.archetype_id or '-'}`\n\n")
        md_parts.append(f"**Members:** {a.member_count}\n\n")

        if a.representative_titles:
            md_parts.append(f"**Common Titles:** {', '.join(a.representative_titles[:3])}\n\n")

        # Top skills: required technical and domain skills, highest value first.
        all_skills = {}
        for skill_type in (a.skills.technical, a.skills.domain):
            all_skills.update(skill_type.required)
        top_skills = sorted(all_skills.items(), key=lambda x: x[1], reverse=True)[:5]

        if top_skills:
            md_parts.append("**Top Skills:**\n")
            md_parts.extend(f"- {skill} ({freq:.0%})\n" for skill, freq in top_skills)
            md_parts.append("\n")

        md_parts.append("---\n\n")

    md_content = ''.join(md_parts)

    # Write as UTF-8 so non-ASCII labels and titles do not raise
    # UnicodeEncodeError when the locale's default encoding is narrower.
    md_path = output_base / "archetype_catalog.md"
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_content)
    print(f"[OK] Markdown: {md_path}")

## 6. Pipeline Summary

In [None]:
# Final pipeline summary: banner, per-phase status lines and headline numbers.
banner = "=" * 70

print("\n" + banner)
print("ARCHETYPE PIPELINE COMPLETE")
print(banner)

if archetypes:
    archetype_count = len(archetypes)
    total_jds = sum(a.member_count for a in archetypes)
    named_count = len([a for a in archetypes if a.label])

    # One status line per pipeline phase.
    phase_lines = [
        "\n[OK] Phase 1 (Extraction): Extracted requirements from JDs",
        "[OK] Phase 2 (Features): Built embedding features",
        "[OK] Phase 3 (Clustering): HDBSCAN clustering",
        f"[OK] Phase 4 (Aggregation): Created {archetype_count} archetypes",
        f"[OK] Phase 5 (Naming): Named {named_count}/{archetype_count} archetypes",
    ]
    print("\n".join(phase_lines))

    # Headline statistics; the average is safe because archetypes is non-empty.
    stats_lines = [
        "\nFinal Statistics:",
        f"   Total Archetypes: {archetype_count}",
        f"   Total JDs Covered: {total_jds}",
        f"   Avg Archetype Size: {total_jds/archetype_count:.1f}",
        f"   Skills Standardized: {len(skill_mappings)}",
    ]
    print("\n".join(stats_lines))

    print("\nOutput Location:")
    print(f"   {get_output_path('archetypes')}")

print("\n" + banner)