# JD Corpus Analysis Notebook

This notebook runs Phase 1 analysis using your configuration from `config.py`.

## Configuration is Centralized

Edit `config.py` to change:
- **Which text field to analyze** (e.g., `jd_expertise` instead of `jd_text`)
- **Stratification dimensions** (e.g., `rank` instead of `org_unit`)
- **Cluster metadata fields** (e.g., add `hiring_manager`)
- **Output directory** (via `OUTPUT_CONFIG`)

You don't need to change any code in this notebook - just edit `config.py` and re-run.

## Prerequisites
1. JSON files in `jd_data/`
2. `config.py` configured with your field mappings
3. Packages installed: `pip install -r requirements.txt`

In [None]:
# =============================================================================
# PLOTLY SETUP - Run this first!
# =============================================================================
# Configures Plotly to open interactive plots in your browser
# (VS Code notebooks don't render Plotly well natively).
#
# Plotly is treated as optional, matching the HAS_PLOTLY guard used by the
# setup cell: a missing package produces a warning here instead of an
# ImportError that would stop the notebook on its very first cell.

try:
    import plotly.io as pio
except ImportError:
    pio = None  # Lets later code test `pio is None` instead of hitting a NameError
    print("[WARN] Plotly not installed. Install with: pip install plotly")
else:
    # Open plots in browser for full interactivity
    pio.renderers.default = "browser"

    # Alternative options if browser doesn't work:
    # pio.renderers.default = "png"        # Static images (always works)
    # pio.renderers.default = "notebook"   # Try this for Jupyter
    # pio.renderers.default = "vscode"     # Try this for VS Code

    print(f"Plotly configured to use: {pio.renderers.default}")
    print("Plots will open in your default browser")

In [None]:
# === SETUP ===
# Imports the analysis modules and the user's configuration, then probes for
# the optional plotting libraries and records the result in HAS_MATPLOTLIB /
# HAS_PLOTLY so later cells can skip plots instead of failing.
import sys
from pathlib import Path

# Add parent directory to path
# Must run before the `src` / `config` imports below so they can be resolved.
# NOTE(review): presumably the notebook is launched from a directory one level
# below the project root - confirm, since this depends on the current working
# directory rather than on the notebook's own location.
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import json
from IPython.display import display, Markdown

# Import analysis modules
from src.data_loaders import JSONFileLoader, FieldMapping, SchemaDiscovery
from src.analysis import (
    DataCleaner,
    QualityEvaluator,
    StratifiedSampler,
    StructureParser,
    StructureAnalyzer,
    EmbeddingGenerator,
    ContentClusterer,
    ClusterAnalyzer,
    create_visualization_data,
)

# Import YOUR configuration (including output paths)
from config import (
    JSON_CONFIG, 
    JD_FIELD_MAPPING, 
    ANALYSIS_CONFIG,
    OUTPUT_CONFIG,
    EMBEDDING_CONFIG,
    get_output_path,
    get_phase_output_path,
    get_embedding_generator,
)

# Visualization
# Both plotting stacks are optional: a failed import only flips the matching
# flag to False, and plotting cells check that flag before drawing.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_style("whitegrid")
    HAS_MATPLOTLIB = True
except ImportError:
    # Either matplotlib or seaborn missing disables the static plots.
    HAS_MATPLOTLIB = False

try:
    import plotly.express as px
    HAS_PLOTLY = True
except ImportError:
    HAS_PLOTLY = False

# Report what was detected and where outputs will be written.
print("Setup complete")
print(f"  Matplotlib: {HAS_MATPLOTLIB}")
print(f"  Plotly: {HAS_PLOTLY}")
print(f"  Output root: {get_output_path().absolute()}")

---
## 1. Review Your Configuration

Let's see what's configured in `config.py`.

In [None]:
# Echo the active configuration so it can be checked before any analysis runs.
banner = "=" * 60
divider = "-" * 40

print(banner)
print("YOUR CONFIGURATION (from config.py)")
print(banner)

# Field Mapping: which JSON path feeds each standard field name
print("\nFIELD MAPPING")
print("   Your JSON fields -> Standard names")
print(divider)
mapping = JD_FIELD_MAPPING.to_dict()
if not mapping:
    print("    No mappings configured!")
    print("   Run schema discovery below, then edit config.py")
else:
    for std_name, json_path in mapping.items():
        print(f"   {std_name:20} <- {json_path}")

# Analysis Config: one (label, value) row per setting, printed in order
print("\nANALYSIS CONFIGURATION")
print("   What fields to use for each analysis")
print(divider)
analysis_rows = [
    ("   Text to analyze:     ", ANALYSIS_CONFIG.primary_text_field),
    ("   Additional texts:    ", ANALYSIS_CONFIG.additional_text_fields or 'None'),
    ("   ID field:            ", ANALYSIS_CONFIG.id_field),
    (
        "   Stratify by:         ",
        f"{ANALYSIS_CONFIG.stratify_by_primary} x {ANALYSIS_CONFIG.stratify_by_secondary}",
    ),
    ("   Cluster metadata:    ", ANALYSIS_CONFIG.cluster_metadata_fields),
    ("   Purity field:        ", ANALYSIS_CONFIG.cluster_purity_field),
]
for label, value in analysis_rows:
    print(f"{label}{value}")

# Output Config: configured root plus its resolved absolute location
print("\nOUTPUT CONFIGURATION")
print(divider)
print(f"   Root directory:      {OUTPUT_CONFIG['root_dir']}")
print(f"   Absolute path:       {get_output_path().absolute()}")

---
## 2. Schema Discovery (if needed)

Run this to see your JSON structure and get field path suggestions.

In [None]:
# Build a loader without a field mapping so the JSON is seen exactly as stored.
raw_data_path = JSON_CONFIG["data_path"]
raw_content_key = JSON_CONFIG["content_key"]
loader_raw = JSONFileLoader(data_path=raw_data_path, content_key=raw_content_key)

# Quick sanity figures about the source before schema discovery.
print(f"Data source: {raw_data_path}")
print(f"File stats: {loader_raw.get_file_stats()}")
print(f"Total records: {loader_raw.count_records()}")

In [None]:
# Infer the JSON schema from a sample of 100 records, then print its tree.
schema = loader_raw.discover_schema(sample_size=100)
schema_tree = schema.print_schema_tree()
print(schema_tree)

In [None]:
# Print mapping suggestions in a form that can be pasted into config.py.
print("SUGGESTED FIELD MAPPINGS")
print("   Copy these to config.py JD_FIELD_MAPPING:")
print("-" * 40)
suggested_mapping = schema.suggest_field_mapping()
for std_name, json_path in suggested_mapping.items():
    print(f'   {std_name}="{json_path}",')

# Show the first five candidate text fields with their average length.
print("\nLIKELY TEXT FIELDS (by average length):")
top_text_fields = schema.get_likely_text_fields()[:5]
for json_path, field_stats in top_text_fields:
    print(f"   {json_path}: avg_length={field_stats['avg_length']:.0f}")

---
## 3. Load and Clean Data

Loads data using your field mapping, validates config, then cleans.

In [None]:
# Same source as the raw loader, but with the configured field mapping applied.
loader_kwargs = {
    "data_path": JSON_CONFIG["data_path"],
    "content_key": JSON_CONFIG["content_key"],
    "field_mapping": JD_FIELD_MAPPING,
}
loader = JSONFileLoader(**loader_kwargs)

# Materialise every record into a DataFrame and report its size and columns.
df_raw = loader.load_as_dataframe()
raw_columns = list(df_raw.columns)
print(f"Loaded {len(df_raw)} records")
print(f"Columns: {raw_columns}")

In [None]:
# Check the configured field names against the columns that were actually loaded.
validation = ANALYSIS_CONFIG.validate(list(df_raw.columns))

# Errors are fatal: list them and stop so nothing runs on a broken config.
config_errors = validation["errors"]
if config_errors:
    print("CONFIGURATION ERRORS (fix in config.py):")
    for message in config_errors:
        print(f"   - {message}")
    raise ValueError("Fix configuration errors before proceeding")
print("[OK] No configuration errors")

# Warnings are informational: the analysis continues with fewer features.
config_warnings = validation["warnings"]
if not config_warnings:
    print("[OK] No configuration warnings")
else:
    print("\nWARNINGS (analysis will continue with reduced functionality):")
    for message in config_warnings:
        print(f"   - {message}")

In [None]:
# Clean the raw frame on the configured text field; texts shorter than
# min_text_length (50) are handled by DataCleaner.
primary_text = ANALYSIS_CONFIG.primary_text_field
print(f"Cleaning text field: {primary_text}")

cleaner = DataCleaner(
    text_field=primary_text,
    id_field=ANALYSIS_CONFIG.id_field,
    min_text_length=50,
)

# `df` is the cleaned frame used by every later phase.
cleaned = cleaner.clean(df_raw)
df, cleaning_stats = cleaned
print(cleaning_stats.print_report())

In [None]:
# Visualize field fill rates
# Horizontal bar per field, coloured by how complete the field is, with a
# dashed reference line at the 80% threshold. Skipped when matplotlib is absent.
if HAS_MATPLOTLIB:
    fill_rates = cleaning_stats.field_fill_rates

    # Grow the figure with the number of fields, but never below 4 inches tall.
    fig, ax = plt.subplots(figsize=(10, max(4, len(fill_rates) * 0.3)))
    fields = list(fill_rates.keys())
    rates = list(fill_rates.values())
    # Traffic-light colouring: >80% green, >50% orange, otherwise red.
    colors = ['green' if r > 0.8 else 'orange' if r > 0.5 else 'red' for r in rates]

    ax.barh(fields, rates, color=colors)
    ax.set_xlabel('Fill Rate')
    ax.set_title('Field Fill Rates')
    ax.axvline(x=0.8, color='green', linestyle='--', alpha=0.5, label='80%')
    # The threshold line is labelled, but a label only appears once a legend
    # is drawn - without this call the '80%' label was never shown.
    ax.legend(loc='lower right')
    ax.set_xlim(0, 1.1)
    plt.tight_layout()
    plt.show()

---
## 4. Phase 1.1: Quality Baseline

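Creates a stratified sample using `stratify_by_primary` x `stratify_by_secondary`. If only the primary field is available in the data, sampling falls back to 1D stratification; if neither field is available, a random sample is drawn instead.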

In [None]:
# Resolve which of the configured stratification fields exist in the cleaned data.
available_strata = ANALYSIS_CONFIG.get_available_stratify_fields(list(df.columns))
primary_strat, secondary_strat = available_strata

print(f"Configured:  {ANALYSIS_CONFIG.stratify_by_primary} x {ANALYSIS_CONFIG.stratify_by_secondary}")
print(f"Available:   {primary_strat or '(none)'} x {secondary_strat or '(none)'}")

# Pick the message for the sampling mode implied by the available fields.
if not primary_strat:
    sampling_note = "\n[WARN] Will use random sampling (no stratification fields)"
elif not secondary_strat:
    sampling_note = "\n[WARN] Will use 1D stratified sampling"
else:
    sampling_note = "\n[OK] Will use 2D stratified sampling"
print(sampling_note)

In [None]:
# Show the strata distribution; only done when both dimensions are available.
has_both_dimensions = bool(primary_strat and secondary_strat)
if has_both_dimensions:
    sampler_kwargs = {
        "org_unit_field": primary_strat,
        "level_field": secondary_strat,
        "id_field": ANALYSIS_CONFIG.id_field,
    }
    sampler = StratifiedSampler(df, **sampler_kwargs)
    dist = sampler.get_strata_distribution()
    print(f"Total strata: {len(dist)}")
    # Display only the first 20 rows to keep the output readable.
    display(dist.head(20))

In [None]:
# Visualize stratification dimensions
# Draws one bar chart per available stratification dimension (top 15 values).
# The secondary panel is created only when there is a distinct secondary
# field, so the figure no longer carries an empty right-hand subplot.
if HAS_MATPLOTLIB and primary_strat:
    show_secondary = bool(secondary_strat) and secondary_strat != primary_strat
    n_panels = 2 if show_secondary else 1

    # squeeze=False keeps `axes` two-dimensional for any panel count; take row 0.
    fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 5), squeeze=False)
    axes = axes[0]

    # Primary dimension
    counts = df[primary_strat].value_counts().head(15)
    axes[0].barh(counts.index.astype(str), counts.values)
    axes[0].set_title(f"JDs by {primary_strat} (Top 15)")
    axes[0].set_xlabel("Count")

    # Secondary dimension
    if show_secondary:
        counts2 = df[secondary_strat].value_counts().head(15)
        axes[1].barh(counts2.index.astype(str), counts2.values)
        axes[1].set_title(f"JDs by {secondary_strat}")
        axes[1].set_xlabel("Count")

    plt.tight_layout()
    plt.show()

In [None]:
# Create evaluation sample
SAMPLE_SIZE = 100

# Arguments shared by every QualityEvaluator construction below.
evaluator_base = {
    "df": df,
    "text_field": ANALYSIS_CONFIG.primary_text_field,
    "id_field": ANALYSIS_CONFIG.id_field,
}

if primary_strat:
    # Stratified path. With no secondary field the primary one is used for
    # both dimensions, which is the 1D case.
    evaluator = QualityEvaluator(
        org_unit_field=primary_strat,
        level_field=secondary_strat or primary_strat,
        **evaluator_base,
    )
    sample = evaluator.create_evaluation_sample(n=SAMPLE_SIZE)
else:
    # No stratification fields: draw a reproducible random sample and attach
    # it to an evaluator that uses the ID field as a placeholder for both
    # stratification dimensions.
    sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    evaluator = QualityEvaluator(
        org_unit_field=ANALYSIS_CONFIG.id_field,
        level_field=ANALYSIS_CONFIG.id_field,
        **evaluator_base,
    )
    evaluator.sample_df = sample

print(f"Created sample of {len(sample)} JDs")

In [None]:
# Export for human evaluation - using config output path
OUTPUT_DIR = get_phase_output_path("phase_1_1_quality")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Include configured export fields
# The ID and text fields always come first, followed by any configured extras
# that exist in the data. dict.fromkeys() drops repeats while keeping order,
# so listing the ID or text field again in quality_export_fields no longer
# produces a duplicated entry in the export.
candidate_fields = [ANALYSIS_CONFIG.id_field, ANALYSIS_CONFIG.primary_text_field]
candidate_fields.extend(
    f for f in ANALYSIS_CONFIG.quality_export_fields if f in df.columns
)
export_fields = list(dict.fromkeys(candidate_fields))
print(f"Including fields: {export_fields}")

# Write both the JSON export and the CSV used for manual scoring.
evaluator.export_for_evaluation(
    str(OUTPUT_DIR / "jd_quality_evaluation.json"),
    include_fields=export_fields,
)
evaluator.export_for_evaluation_csv(str(OUTPUT_DIR / "jd_quality_evaluation.csv"))

print(f"\n[OK] Exported to: {OUTPUT_DIR}/")
print("\nNEXT STEPS:")
print("   1. Open jd_quality_evaluation.csv")
print("   2. Score each eval_* column 1-5")
print("   3. Mark gold standards as TRUE")
print("   4. Run the import cell below")

In [None]:
# === RUN AFTER FILLING IN CSV ===
# Reads the scored CSV back in, prints the quality report and saves the
# analysis as JSON next to it.
EVAL_FILE = OUTPUT_DIR / "jd_quality_evaluation.csv"

if not EVAL_FILE.exists():
    print(f"File not found: {EVAL_FILE}")
else:
    count = evaluator.import_evaluations(str(EVAL_FILE))
    if not (count > 0):
        print("No evaluations found. Fill in the eval_* columns first.")
    else:
        evaluator.print_quality_report()

        # Save results
        results = evaluator.analyze_quality()
        results_path = OUTPUT_DIR / "quality_analysis_results.json"
        with open(results_path, "w") as out_file:
            json.dump(results, out_file, indent=2, default=str)
        print(f"\nResults saved to {OUTPUT_DIR}/quality_analysis_results.json")

---
## 5. Phase 1.2: Structural Consistency

Parses `primary_text_field` to extract sections (and `additional_text_fields` if configured).

In [None]:
# Parse the configured primary text field of every JD into sections.
primary_text = ANALYSIS_CONFIG.primary_text_field
print(f"Parsing: {primary_text}")

structure_analyzer = StructureAnalyzer()
parse_kwargs = {"text_field": primary_text, "id_field": ANALYSIS_CONFIG.id_field}
parsed_jds = structure_analyzer.parse_corpus(df, **parse_kwargs)

In [None]:
# Print structure report
# Uses the analyzer populated by parse_corpus() in the previous cell, so that
# cell must have been run first.
structure_analyzer.print_structure_report()

In [None]:
# Visualize section coverage: one horizontal bar per section, coloured by
# coverage, with a dashed reference line at 70%.
if HAS_MATPLOTLIB:
    coverage = structure_analyzer.get_section_coverage()
    sections = list(coverage.keys())
    rates = list(coverage.values())

    # >70% green, >40% orange, otherwise red.
    colors = []
    for rate in rates:
        if rate > 0.7:
            colors.append('green')
        elif rate > 0.4:
            colors.append('orange')
        else:
            colors.append('red')

    # Height scales with the number of sections, with a 4-inch minimum.
    fig_height = max(4, len(coverage) * 0.4)
    fig, ax = plt.subplots(figsize=(10, fig_height))
    ax.barh(sections, rates, color=colors)
    ax.set_xlabel('Coverage')
    ax.set_title(f'Section Coverage in {ANALYSIS_CONFIG.primary_text_field}')
    ax.axvline(x=0.7, color='green', linestyle='--', alpha=0.5)
    ax.set_xlim(0, 1.1)
    plt.tight_layout()
    plt.show()

In [None]:
# Show the first parsed JD as an example (nothing is printed if parsing
# produced no results).
if parsed_jds:
    example = parsed_jds[0]
    print(f"Example JD: {example.jd_id}")
    print(f"Sections found: {len(example.sections)}")
    print(f"Section types: {example.section_names()}")

    # Preview at most three sections, truncating header and content.
    print("\nSection previews:")
    preview_sections = example.sections[:3]
    for section in preview_sections:
        header_preview = section['header'][:40]
        content_preview = section['content'][:80]
        print(f"  [{section['section_type']}] {header_preview}...")
        print(f"     {content_preview}...")

In [None]:
# Run the same structural analysis on each configured additional text field.
# Maps field name -> its own StructureAnalyzer, reused when saving results.
additional_results = {}
section_rule = '=' * 50

for text_field in ANALYSIS_CONFIG.additional_text_fields:
    # Configured but absent from the data: warn and move on.
    if text_field not in df.columns:
        print(f"[WARN] {text_field} not in data")
        continue

    print(f"\n{section_rule}")
    print(f"ADDITIONAL: {text_field}")
    print(f"{section_rule}")

    add_analyzer = StructureAnalyzer()
    add_analyzer.parse_corpus(
        df,
        text_field=text_field,
        id_field=ANALYSIS_CONFIG.id_field,
        show_progress=False,
    )

    field_consistency = add_analyzer.measure_consistency()
    mean_sections = field_consistency['num_sections_stats']['mean']
    top_sections = list(field_consistency['section_coverage'].keys())[:5]
    print(f"Sections/JD: {mean_sections:.1f}")
    print(f"Top sections: {top_sections}")

    additional_results[text_field] = add_analyzer

In [None]:
# Save structure results - using config output path
OUTPUT_DIR_STRUCT = get_phase_output_path("phase_1_2_structure")
OUTPUT_DIR_STRUCT.mkdir(parents=True, exist_ok=True)


def _write_consistency(file_name, payload):
    """Write *payload* as indented JSON into the structure output directory."""
    with open(OUTPUT_DIR_STRUCT / file_name, "w") as out_file:
        json.dump(payload, out_file, indent=2, default=str)


# Parsed structures for the primary text field
structure_analyzer.export_parsed_jds(str(OUTPUT_DIR_STRUCT / "parsed_jd_structures.json"))

# Consistency metrics for the primary text field
consistency = structure_analyzer.measure_consistency()
_write_consistency("structure_consistency.json", consistency)

# Save additional text field results, one file per field
for text_field, analyzer in additional_results.items():
    add_consistency = analyzer.measure_consistency()
    _write_consistency(f"structure_consistency_{text_field}.json", add_consistency)

print(f"Results saved to {OUTPUT_DIR_STRUCT}/")

---
## 6. Phase 1.3: Content Clustering

Embeds `primary_text_field`, clusters, and analyzes against `cluster_metadata_fields`.

In [None]:
# Embed the primary text field with the generator built from config.
primary_text = ANALYSIS_CONFIG.primary_text_field
print(f"Embedding: {primary_text}")
print(f"Azure OpenAI deployment: {EMBEDDING_CONFIG['deployment_name']}")

embedder = get_embedding_generator()
embed_kwargs = {"text_field": primary_text, "id_field": ANALYSIS_CONFIG.id_field}
# `ids` is returned together with `embeddings`; both are passed to the
# clusterer and saved later.
embeddings, ids = embedder.embed_dataframe(df, **embed_kwargs)

print(f"\nShape: {embeddings.shape}")

In [None]:
# Find optimal k
# Runs k-means for each candidate k and keeps the silhouette score, then picks
# the k with the highest score.
clusterer = ContentClusterer(embeddings, ids)

K_VALUES = [5, 10, 15, 20, 30, 50]
k_scores = {}

# Clustering runs on the embeddings, so the limit on k is the number of
# embedded samples, not the number of DataFrame rows.
n_samples = len(embeddings)

print("Testing k values...")
for k in K_VALUES:
    if k >= n_samples:  # Can't have more clusters than samples
        print(f"  k={k:3d}: skipped (only {n_samples} embedded samples)")
        continue
    result = clusterer.kmeans(n_clusters=k)
    k_scores[k] = result.silhouette_score
    print(f"  k={k:3d}: silhouette={result.silhouette_score:.4f}")

# With too little data every k is skipped; fail with an explanation instead of
# the opaque "max() arg is an empty sequence" error.
if not k_scores:
    raise ValueError(
        f"Not enough embedded samples ({n_samples}) to cluster: "
        f"the smallest k tested is {min(K_VALUES)} and k must be below the sample count"
    )

best_k = max(k_scores, key=k_scores.get)
print(f"\nBest k={best_k} (silhouette={k_scores[best_k]:.4f})")

In [None]:
# Plot silhouette score against k and mark the selected k.
if HAS_MATPLOTLIB:
    tested_ks = list(k_scores.keys())
    silhouettes = list(k_scores.values())

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(tested_ks, silhouettes, 'bo-', linewidth=2, markersize=8)
    # Vertical marker at the best-scoring k; its label feeds the legend.
    ax.axvline(x=best_k, color='red', linestyle='--', label=f'Best k={best_k}')
    ax.set_title('Cluster Quality by k')
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel('Silhouette Score')
    ax.grid(True, alpha=0.3)
    ax.legend()
    plt.show()

In [None]:
# Analyze the chosen clustering against the configured metadata fields.
SELECTED_K = best_k  # Or override

# Results are stored under "kmeans_<k>" by the clusterer.
result_key = f"kmeans_{SELECTED_K}"
best_result = clusterer.results[result_key]
cluster_df = clusterer.get_cluster_assignments(result_key)

# Restrict the configured metadata fields to those present in the data.
metadata_fields = ANALYSIS_CONFIG.get_available_cluster_fields(list(df.columns))
print(f"Analyzing clusters against: {metadata_fields}")

cluster_analyzer = ClusterAnalyzer(df, cluster_df, id_field=ANALYSIS_CONFIG.id_field)
cluster_analyzer.print_cluster_report(metadata_fields=metadata_fields)

In [None]:
# Cluster purity against the configured purity field.
# `purity` stays None when the field is missing so the summary can skip it.
purity_field = ANALYSIS_CONFIG.cluster_purity_field
purity = None

if purity_field not in df.columns:
    print(f"[WARN] Purity field '{purity_field}' not in data")
else:
    purity = cluster_analyzer.compute_cluster_purity(purity_field)
    print(f"Cluster purity (vs {purity_field}): {purity['overall_purity']:.3f}")

    # Per-cluster breakdown, ordered by cluster id.
    print("\nPurity by cluster:")
    per_cluster = sorted(purity["cluster_purities"].items())
    for cluster_id, details in per_cluster:
        print(f"  Cluster {cluster_id}: {details['purity']:.2f} (dominant: {details['dominant_label']})")

In [None]:
# Dimensionality reduction for visualization
# Projects the embeddings to 2D and joins them with cluster labels and
# metadata. `viz_df` stays None when the projection step is unavailable, so
# the plotting cells can skip cleanly.
viz_df = None

# Only the UMAP step is guarded: an ImportError raised by any later step would
# otherwise be misreported as "UMAP not installed".
try:
    print("Reducing dimensions with UMAP...")
    reduced = clusterer.reduce_dimensions(method="umap", n_components=2)
except ImportError:
    print("[ERROR] UMAP not installed. Install with: pip install umap-learn")
else:
    viz_df = create_visualization_data(
        reduced,
        best_result.labels,
        ids,
        df,
        id_field=ANALYSIS_CONFIG.id_field,
    )
    print(f"[OK] Visualization data ready: {len(viz_df)} points")
    print(f"Columns: {list(viz_df.columns)}")
    print(f"x range: {viz_df['x'].min():.2f} to {viz_df['x'].max():.2f}")
    print(f"y range: {viz_df['y'].min():.2f} to {viz_df['y'].max():.2f}")
    print("\nFirst few rows:")
    display(viz_df.head())

In [None]:
# Interactive plot - colored by cluster
# Each reason for not plotting gets its own message. Previously a missing
# Plotly install with valid data printed "viz_df has N rows - nothing to plot".
if viz_df is None:
    print("[WARN] No visualization data available")
elif len(viz_df) == 0:
    print(f"[WARN] viz_df has {len(viz_df)} rows - nothing to plot")
elif not HAS_PLOTLY:
    print("[WARN] Plotly not installed. Install with: pip install plotly")
else:
    # Convert cluster to string for better color handling
    viz_df['cluster_str'] = viz_df['cluster'].astype(str)

    # Hover shows the ID, the cluster, and up to three metadata fields that
    # are present in the visualization frame.
    hover_cols = [ANALYSIS_CONFIG.id_field, "cluster"]
    hover_cols.extend([f for f in metadata_fields[:3] if f in viz_df.columns])

    fig = px.scatter(
        viz_df,
        x="x", y="y",
        color="cluster_str",
        hover_data=hover_cols,
        title=f"Content Clusters (k={SELECTED_K}, text={ANALYSIS_CONFIG.primary_text_field})",
    )
    fig.update_traces(marker=dict(size=8, opacity=0.7))
    fig.update_layout(width=900, height=700)
    fig.show()

In [None]:
# Interactive plot - colored by metadata field
COLOR_BY = ANALYSIS_CONFIG.cluster_purity_field  # Change to any available field

# Report the actual reason when nothing is plotted. Previously missing Plotly
# or an empty frame was reported as a missing column, and missing
# visualization data produced no message at all.
if viz_df is None:
    print("[WARN] No visualization data available")
elif len(viz_df) == 0:
    print(f"[WARN] viz_df has {len(viz_df)} rows - nothing to plot")
elif not HAS_PLOTLY:
    print("[WARN] Plotly not installed. Install with: pip install plotly")
elif COLOR_BY not in viz_df.columns:
    print(f"[WARN] {COLOR_BY} not available for coloring. Available columns: {list(viz_df.columns)}")
else:
    fig = px.scatter(
        viz_df,
        x="x", y="y",
        color=COLOR_BY,
        hover_data=[ANALYSIS_CONFIG.id_field, "cluster"],
        title=f"Embeddings colored by {COLOR_BY}",
    )
    fig.update_traces(marker=dict(size=8, opacity=0.7))
    fig.update_layout(width=900, height=700)
    fig.show()

In [None]:
# Cluster archetypes
# Use the "title" column for titles when present; otherwise fall back to the
# ID field.
title_field = ANALYSIS_CONFIG.id_field
if "title" in df.columns:
    title_field = "title"

archetype_kwargs = {
    "text_field": ANALYSIS_CONFIG.primary_text_field,
    "title_field": title_field,
    "n_examples": 2,
}
archetypes = cluster_analyzer.find_cluster_archetypes(**archetype_kwargs)

print("CLUSTER ARCHETYPES")
print("=" * 60)
# Only the first five clusters are printed; the full dict is saved later.
first_archetypes = list(archetypes.items())[:5]
for cluster_id, archetype in first_archetypes:
    top_titles = list(archetype['common_titles'].keys())[:3]
    print(f"\nCluster {cluster_id} ({archetype['size']} JDs)")
    print(f"  Common titles: {top_titles}")
    examples = archetype['examples']
    if examples:
        print(f"  Example: {examples[0]['title']}")

In [None]:
# Embed and cluster each configured additional text field with the same k.
# Maps field name -> {"clusterer", "result"}, reused when saving results.
additional_cluster_results = {}
section_rule = '=' * 50

for text_field in ANALYSIS_CONFIG.additional_text_fields:
    # Fields missing from the data are skipped without a message.
    if text_field not in df.columns:
        continue

    print(f"\n{section_rule}")
    print(f"CLUSTERING: {text_field}")
    print(f"{section_rule}")

    add_embeddings, add_ids = embedder.embed_dataframe(
        df,
        text_field=text_field,
        id_field=ANALYSIS_CONFIG.id_field,
    )

    add_clusterer = ContentClusterer(add_embeddings, add_ids)
    add_result = add_clusterer.kmeans(n_clusters=SELECTED_K)

    print(f"Silhouette: {add_result.silhouette_score:.4f}")
    print(f"Cluster sizes: {add_result.cluster_sizes}")

    additional_cluster_results[text_field] = dict(
        clusterer=add_clusterer,
        result=add_result,
    )

In [None]:
# Save all Phase 1.3 results - using config output path
OUTPUT_DIR_CLUSTER = get_phase_output_path("phase_1_3_clustering")
OUTPUT_DIR_CLUSTER.mkdir(parents=True, exist_ok=True)


def _json_default(obj):
    """Convert NumPy scalars/arrays to plain Python values; stringify the rest."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)


# Save embeddings
np.save(OUTPUT_DIR_CLUSTER / "embeddings.npy", embeddings)
# IDs taken from a DataFrame can be NumPy scalars, which json cannot encode on
# its own. The handler keeps their native type instead of raising TypeError.
# NOTE(review): the element type of `ids` comes from embed_dataframe - confirm.
with open(OUTPUT_DIR_CLUSTER / "embedding_ids.json", "w") as f:
    json.dump(ids, f, default=_json_default)

# Save cluster results
cluster_df.to_csv(OUTPUT_DIR_CLUSTER / "cluster_assignments.csv", index=False)

# Silhouette scores may be NumPy floats (float32 is not JSON-serializable);
# the handler only kicks in for values json cannot encode by itself.
with open(OUTPUT_DIR_CLUSTER / "k_scores.json", "w") as f:
    json.dump(k_scores, f, indent=2, default=_json_default)

composition = cluster_analyzer.analyze_cluster_composition(metadata_fields)
with open(OUTPUT_DIR_CLUSTER / "cluster_composition.json", "w") as f:
    json.dump(composition, f, indent=2, default=str)

with open(OUTPUT_DIR_CLUSTER / "cluster_archetypes.json", "w") as f:
    json.dump(archetypes, f, indent=2, default=str)

# Only written when dimensionality reduction succeeded.
if viz_df is not None:
    viz_df.to_csv(OUTPUT_DIR_CLUSTER / "visualization_data.csv", index=False)

# Save additional text field results
for text_field, data in additional_cluster_results.items():
    add_cluster_df = data["clusterer"].get_cluster_assignments(f"kmeans_{SELECTED_K}")
    add_cluster_df.to_csv(
        OUTPUT_DIR_CLUSTER / f"cluster_assignments_{text_field}.csv",
        index=False,
    )

print(f"Results saved to {OUTPUT_DIR_CLUSTER}/")

---
## 7. Load Previous Results (for visualization only)

Use this section if you already ran `run_analysis.py` and just want to visualize.

In [None]:
# Load visualization data written by a previous run (this notebook or
# run_analysis.py).
VIZ_FILE = get_phase_output_path("phase_1_3_clustering") / "visualization_data.csv"

print(f"Looking for: {VIZ_FILE}")
print(f"Absolute path: {VIZ_FILE.absolute()}")
print(f"Exists: {VIZ_FILE.exists()}")

if not VIZ_FILE.exists():
    print(f"\n[ERROR] File not found: {VIZ_FILE}")
    print("\nRun 'python run_analysis.py --phase 1.3' first, or run the cells above.")

    # List what files exist in the output directory
    cluster_dir = get_phase_output_path("phase_1_3_clustering")
    if not cluster_dir.exists():
        print(f"\nOutput directory does not exist: {cluster_dir}")
    else:
        print(f"\nFiles in {cluster_dir}:")
        for entry in cluster_dir.iterdir():
            print(f"  - {entry.name}")
else:
    loaded_viz_df = pd.read_csv(VIZ_FILE)
    print(f"\nLoaded {len(loaded_viz_df)} points")
    print(f"Columns: {list(loaded_viz_df.columns)}")
    print("\nData types:")
    print(loaded_viz_df.dtypes)
    print("\nFirst few rows:")
    display(loaded_viz_df.head())

In [None]:
# Plot the visualization data loaded in the previous cell.
# `loaded_viz_df` exists only if that cell found the file, hence the dir() check.
has_loaded_data = (
    'loaded_viz_df' in dir()
    and loaded_viz_df is not None
    and len(loaded_viz_df) > 0
)

if not has_loaded_data:
    print("[WARN] No data loaded. Run the cell above first.")
elif not HAS_PLOTLY:
    print("[WARN] Plotly not installed. Install with: pip install plotly")
elif 'x' not in loaded_viz_df.columns or 'y' not in loaded_viz_df.columns:
    # Check for required columns
    print(f"[ERROR] Missing x/y columns. Available: {list(loaded_viz_df.columns)}")
else:
    print(f"Plotting {len(loaded_viz_df)} points...")
    print(f"x range: {loaded_viz_df['x'].min():.2f} to {loaded_viz_df['x'].max():.2f}")
    print(f"y range: {loaded_viz_df['y'].min():.2f} to {loaded_viz_df['y'].max():.2f}")

    # Convert cluster to string for categorical coloring; without a cluster
    # column the points are drawn uncoloured.
    color_col = None
    if 'cluster' in loaded_viz_df.columns:
        loaded_viz_df['cluster_str'] = loaded_viz_df['cluster'].astype(str)
        color_col = 'cluster_str'

    fig = px.scatter(
        loaded_viz_df,
        x="x", y="y",
        color=color_col,
        title="Content Clusters (from saved data)",
    )
    fig.update_traces(marker=dict(size=8, opacity=0.7))
    fig.update_layout(width=900, height=700)
    fig.show()

---
## 8. Summary

In [None]:
# Recap of all three Phase 1 sub-phases. Relies on variables created by the
# earlier cells (df, sample, evaluator, structure_analyzer, k_scores, ...).
banner = "=" * 60
print(banner)
print("PHASE 1 ANALYSIS SUMMARY")
print(banner)

# --- Data ---
print("\nDATA")
print(f"   Total JDs: {len(df)}")
print(f"   Text analyzed: {ANALYSIS_CONFIG.primary_text_field}")
extra_texts = ANALYSIS_CONFIG.additional_text_fields
if extra_texts:
    print(f"   Additional texts: {extra_texts}")

# --- Phase 1.1 ---
print("\nPHASE 1.1: QUALITY BASELINE")
print(f"   Stratification: {primary_strat or 'random'} x {secondary_strat or 'N/A'}")
print(f"   Sample size: {len(sample)}")
if not evaluator.evaluations:
    # Scores have not been imported yet.
    print("   Status: Awaiting evaluation")
else:
    results = evaluator.analyze_quality()
    mean_score = results['overall_stats']['mean_average_score']
    print(f"   Evaluated: {results['total_evaluated']}")
    print(f"   Avg score: {mean_score:.2f}")

# --- Phase 1.2 ---
print("\nPHASE 1.2: STRUCTURAL CONSISTENCY")
consistency = structure_analyzer.measure_consistency()
print(f"   Unique structures: {consistency['unique_structures']}")
print(f"   Top pattern coverage: {consistency['top_structure_coverage']:.1%}")
print(f"   Avg sections/JD: {consistency['num_sections_stats']['mean']:.1f}")

# --- Phase 1.3 ---
print("\nPHASE 1.3: CONTENT CLUSTERING")
print(f"   Best k: {best_k} (silhouette={k_scores[best_k]:.4f})")
print(f"   Metadata analyzed: {metadata_fields}")
if purity is not None:
    print(f"   Purity ({purity_field}): {purity['overall_purity']:.3f}")

# --- Output locations, one per phase ---
print("\nOUTPUTS")
for phase_name in ("phase_1_1_quality", "phase_1_2_structure", "phase_1_3_clustering"):
    print(f"   {get_phase_output_path(phase_name)}/")

---
## Configuration Reference

To change analysis behavior, edit `config.py`:

```python
# Change output directory
OUTPUT_CONFIG = {
    "root_dir": "/path/to/your/output",
    ...
}

# Analyze different text field
ANALYSIS_CONFIG = AnalysisConfig(
    primary_text_field="jd_expertise",  # Instead of jd_text
)

# Stratify by different dimensions
ANALYSIS_CONFIG = AnalysisConfig(
    stratify_by_primary="rank",      # Instead of org_unit
    stratify_by_secondary="title",   # Instead of level
)

# Analyze clusters against different fields
ANALYSIS_CONFIG = AnalysisConfig(
    cluster_metadata_fields=["rank", "title", "hiring_manager"],
    cluster_purity_field="rank",
)

# Analyze multiple text fields
ANALYSIS_CONFIG = AnalysisConfig(
    primary_text_field="jd_text",
    additional_text_fields=["jd_expertise", "jd_requirements"],
)
```

Then re-run this notebook - no code changes needed!