# Phase 1: Extraction Analysis

This notebook analyzes the LLM-extracted requirements from job descriptions.

**What to look for:**
- Extraction success rate
- Distribution of extracted items per JD
- Most common skills, licenses, certifications
- Required vs preferred ratios
- Empty/sparse extractions

In [None]:
# Setup
import sys
from pathlib import Path

# Add project root to path.
# The project root is taken to be the parent of the current working directory;
# it is prepended to sys.path (only when not already present) ahead of the
# `config` / `src.archetypes` imports further down in this cell.
# NOTE(review): presumably the kernel's working directory is a direct child of
# the repository root (e.g. a notebooks folder) — confirm, otherwise the
# project imports below will not resolve.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import json
import pandas as pd
import numpy as np
from collections import Counter

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display, HTML

from config import get_output_path
from src.archetypes.extraction import ExtractionResult, RequirementExtractor

In [None]:
# Load extraction results
extraction_path = get_output_path("archetypes", "phase_1_extraction", "extracted_requirements.json")

# Fail fast: every later cell reads `extractions`, so merely printing a warning
# here would only resurface as a confusing NameError further down the notebook.
if not extraction_path.exists():
    raise FileNotFoundError(
        f"❌ Extraction file not found: {extraction_path}\n"
        "Run: python run_archetype_pipeline.py --extract"
    )

extractions = RequirementExtractor.load_results(str(extraction_path))
print(f"✅ Loaded {len(extractions)} extractions")

## 1. Extraction Success Rate

In [None]:
# Success vs failure
total_count = len(extractions)
success_count = sum(1 for e in extractions if e.extraction_success)
failure_count = total_count - success_count

# Guard the ratio: an empty result set would otherwise raise ZeroDivisionError.
success_rate = success_count / total_count if total_count else 0.0

fig = go.Figure(data=[go.Pie(
    labels=['Success', 'Failed'],
    values=[success_count, failure_count],
    marker_colors=['#2ecc71', '#e74c3c'],
    hole=0.4
)])

fig.update_layout(
    title=f"Extraction Success Rate: {success_rate:.1%}",
    width=500, height=400
)
fig.show()

# Show failed extractions if any
if failure_count > 0:
    print(f"\n❌ Failed extractions ({failure_count}):")
    for e in extractions:
        if not e.extraction_success:
            # A failed record may carry no error text; slicing None would raise.
            error_text = str(e.extraction_error or "(no error message)")
            # Only mark the message as truncated when it actually was.
            suffix = "..." if len(error_text) > 100 else ""
            print(f"  - {e.jd_id}: {error_text[:100]}{suffix}")

## 2. Items per JD Distribution

In [None]:
# Per-JD item counts, one entry per extracted category
def count_items(ext: ExtractionResult) -> dict:
    """Count the extracted items of a single extraction result.

    Args:
        ext: Extraction result exposing ``jd_id`` plus ``licenses``,
            ``certifications``, ``skills.technical``, ``skills.domain``,
            ``skills.soft``, ``tools`` and ``languages`` groups (each with
            ``required`` and ``preferred`` sequences) and ``education.fields``.

    Returns:
        A dict holding ``jd_id`` and, per category, the number of required
        plus preferred items (``education_fields`` counts ``education.fields``).
    """
    def _required_plus_preferred(group) -> int:
        # Size of a requirement group across both of its buckets.
        return len(group.required) + len(group.preferred)

    row = {'jd_id': ext.jd_id}
    row['licenses'] = _required_plus_preferred(ext.licenses)
    row['certifications'] = _required_plus_preferred(ext.certifications)
    row['technical_skills'] = _required_plus_preferred(ext.skills.technical)
    row['domain_skills'] = _required_plus_preferred(ext.skills.domain)
    row['soft_skills'] = _required_plus_preferred(ext.skills.soft)
    row['tools'] = _required_plus_preferred(ext.tools)
    row['languages'] = _required_plus_preferred(ext.languages)
    row['education_fields'] = len(ext.education.fields)
    return row

# One row per successful extraction, one column per category count.
counts_df = pd.DataFrame([count_items(e) for e in extractions if e.extraction_success])
counts_df['total_skills'] = counts_df['technical_skills'] + counts_df['domain_skills'] + counts_df['soft_skills']
# `total_skills` is itself a sum of three columns; leave it out of the grand
# total so technical/domain/soft skills are not counted twice.
counts_df['total_items'] = counts_df.drop(columns=['jd_id', 'total_skills']).sum(axis=1)

print("Items per JD - Summary Statistics:")
display(counts_df.describe().round(2))

In [None]:
# Distribution histograms: one panel per count column on a 2x3 grid
cols = ['total_skills', 'technical_skills', 'domain_skills', 'licenses', 'certifications', 'tools']
positions = [(1,1), (1,2), (1,3), (2,1), (2,2), (2,3)]
panel_titles = ['Total Skills', 'Technical Skills', 'Domain Skills',
                'Licenses', 'Certifications', 'Tools']

fig = make_subplots(rows=2, cols=3, subplot_titles=panel_titles)

# Pair each column with its (row, column) slot in the grid.
for idx, col in enumerate(cols):
    row, c = positions[idx]
    histogram = go.Histogram(x=counts_df[col], name=col, nbinsx=20)
    fig.add_trace(histogram, row=row, col=c)

fig.update_layout(
    height=500,
    width=900,
    title="Distribution of Extracted Items per JD",
    showlegend=False,
)
fig.show()

## 3. Most Common Extracted Items

In [None]:
# Pool every extracted item (required followed by preferred) per category
all_technical, all_domain, all_soft = [], [], []
all_licenses, all_certs, all_tools = [], [], []

for e in extractions:
    # Failed extractions contribute nothing.
    if not e.extraction_success:
        continue
    for pool, group in (
        (all_technical, e.skills.technical),
        (all_domain, e.skills.domain),
        (all_soft, e.skills.soft),
        (all_licenses, e.licenses),
        (all_certs, e.certifications),
        (all_tools, e.tools),
    ):
        pool.extend(group.required + group.preferred)

def top_items_chart(items, title, n=15):
    """Build a horizontal bar chart of the most frequent items.

    Args:
        items: Iterable of hashable items to tally.
        title: Category label used in the chart title and the empty message.
        n: Maximum number of distinct items to include.

    Returns:
        The plotly figure, or None (after printing a notice) when ``items``
        yields nothing to count.
    """
    ranked = Counter(items).most_common(n)
    if len(ranked) == 0:
        print(f"No {title} found")
        return None

    frame = pd.DataFrame(ranked, columns=['item', 'count'])
    chart = px.bar(
        frame,
        x='count',
        y='item',
        orientation='h',
        title=f"Top {n} {title}",
    )
    # Order bars by total so the most frequent item ends up at the top.
    chart.update_layout(height=400, width=600, yaxis={'categoryorder': 'total ascending'})
    return chart

In [None]:
# Technical Skills: chart the most frequent entries (skipped when there are none)
fig = top_items_chart(all_technical, "Technical Skills")
if fig is not None:
    fig.show()

In [None]:
# Domain Skills: chart the most frequent entries (skipped when there are none)
fig = top_items_chart(all_domain, "Domain Skills")
if fig is not None:
    fig.show()

In [None]:
# Licenses: chart the most frequent entries (skipped when there are none)
fig = top_items_chart(all_licenses, "Licenses")
if fig is not None:
    fig.show()

In [None]:
# Certifications: chart the most frequent entries (skipped when there are none)
fig = top_items_chart(all_certs, "Certifications")
if fig is not None:
    fig.show()

In [None]:
# Tools: chart the most frequent entries (skipped when there are none)
fig = top_items_chart(all_tools, "Tools")
if fig is not None:
    fig.show()

## 4. Required vs Preferred Ratio

In [None]:
# Total required vs preferred items per category, over successful extractions
req_pref_data = {'Category': [], 'Required': [], 'Preferred': []}

# Each getter maps an extraction to its (required, preferred) pair of lists.
categories = [
    ('Licenses', lambda e: (e.licenses.required, e.licenses.preferred)),
    ('Certifications', lambda e: (e.certifications.required, e.certifications.preferred)),
    ('Technical Skills', lambda e: (e.skills.technical.required, e.skills.technical.preferred)),
    ('Domain Skills', lambda e: (e.skills.domain.required, e.skills.domain.preferred)),
    ('Soft Skills', lambda e: (e.skills.soft.required, e.skills.soft.preferred)),
    ('Tools', lambda e: (e.tools.required, e.tools.preferred)),
]

for cat_name, getter in categories:
    pairs = [getter(e) for e in extractions if e.extraction_success]
    req_pref_data['Category'].append(cat_name)
    req_pref_data['Required'].append(sum(len(req) for req, _ in pairs))
    req_pref_data['Preferred'].append(sum(len(pref) for _, pref in pairs))

req_pref_df = pd.DataFrame(req_pref_data)

# One stacked bar series per bucket, required first.
fig = go.Figure()
for series, colour in (('Required', '#3498db'), ('Preferred', '#95a5a6')):
    fig.add_trace(go.Bar(
        name=series,
        x=req_pref_df['Category'],
        y=req_pref_df[series],
        marker_color=colour,
    ))

fig.update_layout(
    barmode='stack',
    title='Required vs Preferred Items by Category',
    height=400, width=700
)
fig.show()

## 5. Empty/Sparse Extractions Analysis

In [None]:
# Calculate empty rates per category (share of successful extractions that
# produced nothing for that category).
empty_rates = {}

for e in extractions:
    if not e.extraction_success:
        continue

    checks = {
        'licenses': len(e.licenses.required) + len(e.licenses.preferred) == 0,
        'certifications': len(e.certifications.required) + len(e.certifications.preferred) == 0,
        'technical_skills': len(e.skills.technical.required) + len(e.skills.technical.preferred) == 0,
        'domain_skills': len(e.skills.domain.required) + len(e.skills.domain.preferred) == 0,
        'soft_skills': len(e.skills.soft.required) + len(e.skills.soft.preferred) == 0,
        'tools': len(e.tools.required) + len(e.tools.preferred) == 0,
        # Count preferred levels too, matching the required+preferred rule used
        # for every other category; a JD that only states a preferred education
        # level is not an empty extraction.
        'education': (
            len(e.education.level.required)
            + len(e.education.level.preferred)
            + len(e.education.fields)
        ) == 0,
        'experience_years': e.experience.years_min is None,
    }

    for cat, is_empty in checks.items():
        tally = empty_rates.setdefault(cat, {'empty': 0, 'total': 0})
        tally['total'] += 1
        if is_empty:
            tally['empty'] += 1

# Explicit columns keep the frame sortable even when no extraction succeeded
# (an empty record list would otherwise yield a frame with no columns).
empty_df = pd.DataFrame(
    [
        {'Category': k, 'Empty Rate': v['empty'] / v['total'] * 100}
        for k, v in empty_rates.items()
    ],
    columns=['Category', 'Empty Rate'],
).sort_values('Empty Rate', ascending=True)

if empty_df.empty:
    print("No successful extractions - nothing to plot.")
else:
    fig = px.bar(empty_df, x='Empty Rate', y='Category', orientation='h',
                 title='Empty Extraction Rate by Category (%)',
                 color='Empty Rate', color_continuous_scale='Reds')
    fig.update_layout(height=400, width=600)
    fig.show()

    print("\n📊 Categories with >50% empty rate may indicate:")
    print("   - Category not commonly mentioned in JDs")
    print("   - Extraction prompt needs tuning for this category")

## 6. Interactive Extraction Viewer

In [None]:
# Create dropdown to view individual extractions
successful_extractions = [e for e in extractions if e.extraction_success]

# Build options with metadata if available
options = []
for e in successful_extractions:
    # Metadata may be absent and the title may be missing or None; fall back to
    # 'Unknown' rather than failing on `.get` / slicing.
    title = (e.metadata or {}).get('title') or 'Unknown'
    label = f"{e.jd_id} - {str(title)[:50]}"
    options.append((label, e.jd_id))

# jd_id -> extraction, used by the viewer callback to resolve the selection.
extraction_lookup = {e.jd_id: e for e in successful_extractions}

dropdown = widgets.Dropdown(
    options=options,
    description='Select JD:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='500px')
)

output = widgets.Output()

def _join_or_none(items) -> str:
    """Comma-join the items, or return '(none)' when there are none."""
    return ', '.join(items) or '(none)'


def _print_required_preferred(heading, group):
    """Print a heading followed by the group's required and preferred items."""
    print(f"\n{heading}:")
    print(f"   Required: {_join_or_none(group.required)}")
    print(f"   Preferred: {_join_or_none(group.preferred)}")


def _years_text(value) -> str:
    """Render a years value; only None counts as unspecified (0 is a real value)."""
    return '(not specified)' if value is None else f"{value}"


def display_extraction(change):
    """Render the extraction selected in the dropdown into the output widget.

    Args:
        change: Mapping whose 'new' entry is the jd_id to display (the shape
            of the change dict delivered by the dropdown's value observer).

    Clears the output widget first. If the jd_id is not in
    ``extraction_lookup`` (e.g. None when the dropdown has no options), a
    short notice is printed instead of raising KeyError.
    """
    with output:
        output.clear_output()
        jd_id = change['new']
        ext = extraction_lookup.get(jd_id)
        if ext is None:
            print("No extraction selected.")
            return

        print(f"\n{'='*60}")
        print(f"JD ID: {ext.jd_id}")
        print(f"{'='*60}")

        # Metadata
        if ext.metadata:
            print("\n📋 METADATA:")
            for k, v in ext.metadata.items():
                print(f"   {k}: {v}")

        # Skills
        _print_required_preferred("🛠️ TECHNICAL SKILLS", ext.skills.technical)
        _print_required_preferred("📊 DOMAIN SKILLS", ext.skills.domain)
        _print_required_preferred("🤝 SOFT SKILLS", ext.skills.soft)

        # Licenses & Certs
        _print_required_preferred("📜 LICENSES", ext.licenses)
        _print_required_preferred("🏆 CERTIFICATIONS", ext.certifications)

        # Tools
        _print_required_preferred("🔧 TOOLS", ext.tools)

        # Education
        print("\n🎓 EDUCATION:")
        print(f"   Level Required: {_join_or_none(ext.education.level.required)}")
        print(f"   Level Preferred: {_join_or_none(ext.education.level.preferred)}")
        print(f"   Fields: {_join_or_none(ext.education.fields)}")

        # Experience
        print("\n⏱️ EXPERIENCE:")
        print(f"   Years Min: {_years_text(ext.experience.years_min)}")
        print(f"   Years Preferred: {_years_text(ext.experience.years_preferred)}")
        print(f"   Specific Required: {_join_or_none(ext.experience.specific.required)}")
        print(f"   Specific Preferred: {_join_or_none(ext.experience.specific.preferred)}")

# Re-render whenever the selected JD changes.
dropdown.observe(display_extraction, names='value')

# Show the selector above the area the callback prints into.
viewer = widgets.VBox(children=[dropdown, output])
display(viewer)

# Trigger initial display for the dropdown's current selection.
initial_change = {'new': dropdown.value}
display_extraction(initial_change)

## 7. Summary Statistics

In [None]:
# Summary table
total_jds = len(extractions)
successful = sum(1 for e in extractions if e.extraction_success)

# An empty result set has no meaningful rate; avoid ZeroDivisionError.
success_rate_text = f"{successful/total_jds:.1%}" if total_jds else "n/a"


def _avg_per_jd(column):
    """Format the mean of a counts_df column, or 'n/a' when there is nothing to average."""
    if counts_df.empty or column not in counts_df.columns:
        return "n/a"
    return f"{counts_df[column].mean():.1f}"


summary = {
    'Metric': [
        'Total JDs',
        'Successful Extractions',
        'Success Rate',
        'Avg Technical Skills/JD',
        'Avg Domain Skills/JD',
        'Avg Licenses/JD',
        'Avg Certifications/JD',
        'Unique Technical Skills',
        'Unique Domain Skills',
        'Unique Licenses',
        'Unique Certifications',
    ],
    'Value': [
        total_jds,
        successful,
        success_rate_text,
        _avg_per_jd('technical_skills'),
        _avg_per_jd('domain_skills'),
        _avg_per_jd('licenses'),
        _avg_per_jd('certifications'),
        len(set(all_technical)),
        len(set(all_domain)),
        len(set(all_licenses)),
        len(set(all_certs)),
    ]
}

summary_df = pd.DataFrame(summary)
display(summary_df.style.set_properties(**{'text-align': 'left'}))