# Enhanced Job Architecture System with SOC Titles

This notebook builds a comprehensive job architecture system using 18,000+ job titles from the Standard Occupational Classification (SOC) system:
1. Process and normalize 18K+ SOC job titles
2. Build graph database with hierarchical relationships
3. Map skills to job titles
4. Generate industry/company-specific architectures
5. Web services for normalization, career paths, and skills lookup
6. Integration with skill extraction service

## Setup and Imports

In [None]:
!pip install -q networkx pandas numpy scikit-learn sentence-transformers flask flask-cors requests python-dotenv rapidfuzz tqdm

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set
from dataclasses import dataclass, asdict
from collections import defaultdict, Counter
import re

# ML and embeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz, process
from tqdm.auto import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling: seaborn's white-grid theme.
sns.set_style('whitegrid')
# DataFrame display: show every column and up to 100 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

## Load and Analyze SOC Titles Dataset

In [None]:
# Load SOC titles
soc_df = pd.read_csv('/mnt/user-data/uploads/SOC_titles.csv')

# Drop the empty trailing column; errors='ignore' keeps the cell working when
# the CSV is re-exported without that stray column.
soc_df = soc_df.drop(columns='Unnamed: 4', errors='ignore')

# Keep only rows with a usable normalized title: missing values would later
# crash the classifiers on `.lower()`, and 'not available' is a placeholder.
soc_df = soc_df.dropna(subset=['normalized'])
soc_df = soc_df[soc_df['normalized'] != 'not available'].copy()

print(f"Total job titles: {len(soc_df):,}")
print(f"Unique SOC5 categories: {soc_df['soc5_title'].nunique()}")
print(f"Unique normalized titles: {soc_df['normalized'].nunique():,}")
print(f"\nSample data:")
soc_df.head(10)

In [None]:
# Largest SOC5 categories, ranked by how many titles each one holds
print("Top 20 SOC5 Categories by Job Title Count:")
category_counts = soc_df['soc5_title'].value_counts()
print(category_counts.head(20))

# Peek at a handful of categories of interest
print("\nSample categories:")
sample_categories = [
    'Software Developers',
    'Data Scientists',
    'Product Managers',
    'Chief Executives',
    'Financial Managers'
]

for cat in sample_categories:
    # Case-insensitive containment on the SOC5 label; missing labels never match
    in_category = soc_df['soc5_title'].str.contains(cat, case=False, na=False)
    titles = soc_df[in_category]
    if titles.empty:
        continue
    print(f"\n{cat}:")
    preview = titles[['title_name', 'normalized']].head(5)
    print(preview.to_string(index=False))

## Job Level Classification

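Classify every SOC title into an organizational level from 0 (intern / entry) to 9 (C-suite) by matching regular-expression patterns against the title. Titles that match no pattern default to level 2.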

In [None]:
class JobLevelClassifier:
    """Classify job titles into organizational levels (0 = intern ... 9 = C-suite).

    Each ``level_N_patterns`` attribute is a list of regular expressions that
    are searched against the lower-cased, whitespace-normalized title.
    ``classify`` walks the levels in a fixed priority order and returns the
    first level with a matching pattern.
    """

    def __init__(self):
        # Level 9: C-Suite / Executive
        self.level_9_patterns = [
            # One to three words between "chief" and "officer"
            # (e.g. "chief information security officer").
            r'\bchief\s+(?:\w+\s+){1,3}officer\b',
            r'\bceo\b', r'\bcfo\b', r'\bcto\b', r'\bcoo\b', r'\bcmo\b', r'\bcpo\b', r'\bchro\b',
            # The look-behind keeps "vice president" / "vice-president" out of
            # level 9; without it levels 7 and 8 could never be reached by the
            # spelled-out titles.
            r'(?<!vice[\s\-])\bpresident\b(?!.*\bassociate\b)',
            r'\bexecutive\s+director\b',
        ]

        # Level 8: Senior VP
        self.level_8_patterns = [
            r'\b(?:senior|sr\.?|executive)\s+(?:vice[\s\-]president|vp)\b',
            r'\bsvp\b',
            r'\bevp\b',
        ]

        # Level 7: VP
        self.level_7_patterns = [
            r'\bvice[\s\-]president\b(?!.*senior)',
            r'\bvp\b(?!.*senior)',
        ]

        # Level 6: Director
        self.level_6_patterns = [
            # "assistant" is rejected anywhere in the title (it normally comes
            # BEFORE "director", where a look-ahead cannot see it).
            # NOTE(review): "associate"/"deputy" are still only rejected when
            # they follow "director", as in the original patterns - confirm
            # where "associate director" / "deputy director" should land.
            r'^(?!.*assistant).*\bdirector\b(?!.*associate|.*deputy)',
            r'\bhead\s+of\b',
        ]

        # Level 5: Senior Manager
        self.level_5_patterns = [
            r'\bsenior\s+manager\b',
            r'\bgroup\s+manager\b',
            r'^(?!.*assistant).*\bprogram\s+manager\b',
            # Principal ICs (listed at level 4) are excluded here; "analyst" is
            # included so "principal analyst" reaches its level-4 pattern.
            r'\bprincipal\b(?!.*engineer|.*scientist|.*designer|.*analyst)',
        ]

        # Level 4: Manager / Principal IC
        self.level_4_patterns = [
            r'^(?!.*assistant).*\bmanager\b(?!.*senior|.*program)',
            # Up to two qualifier words may sit between the seniority marker
            # and the role noun ("principal software engineer").
            r'\bstaff\s+(?:\w+\s+){0,2}(?:engineer|scientist|designer|analyst)\b',
            r'\bprincipal\s+(?:\w+\s+){0,2}(?:engineer|scientist|designer|analyst)\b',
            r'\blead\s+(?:\w+\s+){0,2}(?:engineer|scientist|designer|developer|analyst)\b',
            r'\bsupervisor\b',
        ]

        # Level 3: Senior IC
        self.level_3_patterns = [
            r'\bsenior\s+(?:\w+\s+){0,2}(?:engineer|scientist|designer|developer|analyst|consultant|specialist)\b',
            r'\bsr\.?\s+(?:\w+\s+){0,2}(?:engineer|scientist|designer|developer|analyst)\b',
        ]

        # Level 2: Mid-level IC
        self.level_2_patterns = [
            r'\b(engineer|scientist|designer|developer|analyst|consultant|specialist)\b(?!.*senior|.*jr|.*junior|.*assistant)',
            r'\btechnician\b',
            r'\bcoordinator\b',
        ]

        # Level 1: Junior IC
        self.level_1_patterns = [
            r'\bjunior\b',
            r'\bjr\.?\b',
            r'\bassociate\s+(engineer|scientist|designer|developer|analyst|consultant)\b',
            r'\bassistant\b',
        ]

        # Level 0: Entry / Intern
        self.level_0_patterns = [
            r'\bintern\b',
            r'\btrainee\b',
            r'\bentry\s+level\b',
        ]

    def classify(self, title: str) -> int:
        """Classify a job title into a level (0-9).

        Args:
            title: Raw or normalized job title. Non-string values (e.g. NaN
                from pandas) are treated as unclassifiable.

        Returns:
            The first matching level in priority order, or 2 (mid-level IC)
            when nothing matches or ``title`` is not a string.
        """
        if not isinstance(title, str):
            return 2

        # Collapse whitespace so the fixed-width look-behind on "vice " is
        # not defeated by double spaces or tabs.
        title_lower = ' '.join(title.lower().split())

        # Highest levels first. Levels 0 and 1 are checked before level 2 so
        # that "junior engineer" is not swallowed by the generic IC pattern.
        for level, patterns in [
            (9, self.level_9_patterns),
            (8, self.level_8_patterns),
            (7, self.level_7_patterns),
            (6, self.level_6_patterns),
            (5, self.level_5_patterns),
            (4, self.level_4_patterns),
            (3, self.level_3_patterns),
            (0, self.level_0_patterns),
            (1, self.level_1_patterns),
            (2, self.level_2_patterns),
        ]:
            for pattern in patterns:
                if re.search(pattern, title_lower):
                    return level

        # Default to level 2 (mid-level IC)
        return 2

In [None]:
# Classify all titles
classifier = JobLevelClassifier()

# `progress_apply` only exists once tqdm has registered itself with pandas;
# without this call a fresh kernel raises AttributeError on the next line.
tqdm.pandas(desc="Classifying levels")

print("Classifying job levels...")
soc_df['level'] = soc_df['normalized'].progress_apply(classifier.classify)

print(f"\nLevel distribution:")
level_dist = soc_df['level'].value_counts().sort_index()
print(level_dist)

# Show sample titles per level
print("\nSample titles per level:")
for level in sorted(soc_df['level'].unique()):
    samples = soc_df[soc_df['level'] == level]['normalized'].drop_duplicates().head(5).tolist()
    print(f"\nLevel {level}:")
    for s in samples:
        print(f"  - {s}")

## Job Family Classification

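Group titles into job families by counting family keyword hits in the normalized title combined with its SOC category. The family with the most hits wins, and titles with no hits default to Operations.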

In [None]:
class JobFamilyClassifier:
    """Classify job titles into families by keyword scoring.

    ``family_keywords`` maps each family name to the keywords that vote for
    it. A title's family is the one with the most keyword hits in the
    lower-cased title plus SOC category text.
    """

    # Keywords this short are acronyms ('ai', 'hr', 'cto', ...). As raw
    # substrings they fire inside ordinary words ("hair", "director", "cook"),
    # so they must match as whole words.
    _WHOLE_WORD_MAX_LEN = 3

    def __init__(self):
        self.family_keywords = {
            'Engineering': [
                'software', 'engineer', 'developer', 'programmer', 'devops', 'sre',
                'site reliability', 'systems', 'infrastructure', 'backend', 'frontend',
                'full stack', 'mobile', 'web', 'application', 'platform', 'technical'
            ],
            'Data': [
                'data', 'analytics', 'scientist', 'machine learning', 'ml engineer',
                'artificial intelligence', 'ai', 'deep learning', 'nlp', 'computer vision',
                'data engineer', 'data analyst', 'business intelligence', 'bi'
            ],
            'Product': [
                'product manager', 'product owner', 'product lead', 'product director',
                'product management', 'program manager'
            ],
            'Design': [
                'designer', 'ux', 'ui', 'user experience', 'user interface',
                'visual design', 'interaction design', 'product design', 'graphic design'
            ],
            'Sales': [
                'sales', 'account executive', 'account manager', 'business development',
                'sales engineer', 'sales director', 'revenue', 'commercial'
            ],
            'Marketing': [
                'marketing', 'brand', 'advertising', 'content', 'social media',
                'digital marketing', 'growth', 'demand generation', 'communications'
            ],
            'HR': [
                'human resources', 'hr', 'people', 'talent', 'recruiting', 'recruiter',
                'people operations', 'compensation', 'benefits', 'training'
            ],
            'Finance': [
                'finance', 'accounting', 'financial', 'controller', 'treasury',
                'fp&a', 'financial planning', 'cfo', 'accountant', 'audit'
            ],
            'Operations': [
                'operations', 'supply chain', 'logistics', 'procurement',
                'facilities', 'office manager', 'project manager', 'operations manager'
            ],
            'Customer Success': [
                'customer success', 'customer support', 'customer service',
                'account management', 'client services', 'support engineer'
            ],
            'Legal': [
                'legal', 'counsel', 'attorney', 'lawyer', 'compliance', 'regulatory'
            ],
            'Executive': [
                'chief executive', 'ceo', 'president', 'chief operating', 'coo',
                'chief financial', 'cfo', 'chief technology', 'cto',
                'chief product', 'cpo', 'chief marketing', 'cmo', 'executive director'
            ],
        }

    def _keyword_matches(self, keyword: str, text: str) -> bool:
        """Return True when ``keyword`` occurs in ``text``.

        Short acronyms must appear as whole words; longer keywords keep plain
        substring matching so plurals and derived forms ("engineers",
        "engineering") still count.
        """
        if len(keyword) <= self._WHOLE_WORD_MAX_LEN:
            return re.search(rf'\b{re.escape(keyword)}\b', text) is not None
        return keyword in text

    def classify(self, title: str, soc_category: str = "") -> str:
        """Classify a job title into a family.

        Args:
            title: Job title to classify.
            soc_category: Optional SOC category label scored together with
                the title. Non-string values (e.g. NaN) are ignored.

        Returns:
            The family with the most keyword hits; ties go to the family
            listed first in ``family_keywords``. 'Operations' when nothing
            matches.
        """
        title_lower = title.lower() if isinstance(title, str) else ""
        soc_lower = soc_category.lower() if isinstance(soc_category, str) else ""
        combined = f"{title_lower} {soc_lower}"

        # Score each family by its number of matching keywords
        scores = {}
        for family, keywords in self.family_keywords.items():
            score = sum(1 for keyword in keywords if self._keyword_matches(keyword, combined))
            if score > 0:
                scores[family] = score

        # Return family with highest score
        if scores:
            return max(scores.items(), key=lambda x: x[1])[0]

        # Default to Operations
        return 'Operations'

In [None]:
# Classify families
family_classifier = JobFamilyClassifier()

# Register tqdm with pandas so DataFrame.progress_apply exists; the call is
# idempotent, so this cell also works when run on its own.
tqdm.pandas(desc="Classifying families")

print("Classifying job families...")
soc_df['family'] = soc_df.progress_apply(
    lambda row: family_classifier.classify(row['normalized'], row['soc5_title']), 
    axis=1
)

print(f"\nFamily distribution:")
family_dist = soc_df['family'].value_counts()
print(family_dist)

# Visualize
plt.figure(figsize=(12, 6))
family_dist.plot(kind='bar')
plt.title('Job Titles by Family')
plt.xlabel('Job Family')
plt.ylabel('Number of Titles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Create Enhanced Job Title Structure

In [None]:
@dataclass
class JobTitle:
    """One normalized job title with its classification metadata."""

    id: str
    title: str
    level: int
    family: str
    soc_category: str
    # None is a sentinel: __post_init__ swaps it for a fresh list so that
    # instances never share one mutable default.
    alternate_titles: List[str] = None
    industry: str = "General"
    company_size: str = "All"

    def __post_init__(self):
        """Give the instance its own empty alternates list when none was passed."""
        if self.alternate_titles is not None:
            return
        self.alternate_titles = []

In [None]:
# Create job title objects from SOC data
# Group by normalized title to collect variations
print("Creating job title objects...")

job_titles = []
grouped = soc_df.groupby('normalized')

for idx, (normalized_title, group) in enumerate(tqdm(grouped, desc="Processing titles")):
    # Most common level and family within the group (mode computed once each);
    # fall back to the first row if mode() comes back empty.
    level_mode = group['level'].mode()
    family_mode = group['family'].mode()
    level = level_mode.iloc[0] if not level_mode.empty else group['level'].iloc[0]
    family = family_mode.iloc[0] if not family_mode.empty else group['family'].iloc[0]
    soc_category = group['soc5_title'].iloc[0]

    # Collect alternate titles. Missing raw titles are dropped because NaN
    # entries would break later `.lower()` / `', '.join(...)` calls, and the
    # canonical title itself is not an alternate.
    alternate_titles = [
        alt for alt in group['title_name'].dropna().unique().tolist()
        if alt != normalized_title
    ]

    job = JobTitle(
        id=f"job_{idx:05d}",
        title=normalized_title,
        level=int(level),
        family=family,
        soc_category=soc_category,
        alternate_titles=alternate_titles[:10]  # Limit to 10 alternates
    )

    job_titles.append(job)

print(f"\nCreated {len(job_titles):,} unique job titles")
if job_titles:  # avoid ZeroDivisionError when the filtered dataset is empty
    print(f"Average alternates per title: {sum(len(j.alternate_titles) for j in job_titles) / len(job_titles):.1f}")

In [None]:
# Print a small window of the generated job titles for a visual sanity check
print("Sample job titles:")
for job in job_titles[100:110]:
    report_lines = [
        f"\n{job.title}",
        f"  Level: {job.level}, Family: {job.family}",
        f"  SOC: {job.soc_category}",
    ]
    if job.alternate_titles:
        # Only the first three alternates are listed
        report_lines.append(f"  Alternates: {', '.join(job.alternate_titles[:3])}...")
    print("\n".join(report_lines))

## Build Job Architecture Graph

In [None]:
class JobArchitectureGraph:
    """Graph database for job titles and their relationships.

    Attributes:
        graph: Directed graph; nodes are job ids carrying the job's fields as
            attributes, edges point from a job to the job it reports to.
        job_lookup: Maps job id -> JobTitle.
        title_to_id: Maps lower-cased title (canonical or alternate) -> job id.
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        self.job_lookup = {}  # id -> JobTitle
        self.title_to_id = {}  # title -> id (lowercase)

    def add_job(self, job: "JobTitle"):
        """Add a job title to the graph and index its titles for lookup.

        A canonical title always owns its lookup entry. An alternate title is
        only registered when that text is not mapped yet, so it can never
        redirect another job's canonical title (or an earlier alternate).
        """
        self.graph.add_node(job.id, **asdict(job))
        self.job_lookup[job.id] = job
        self.title_to_id[job.title.lower()] = job.id

        # Add alternate titles (first writer wins; non-strings such as NaN are skipped)
        for alt_title in job.alternate_titles:
            if not isinstance(alt_title, str):
                continue
            self.title_to_id.setdefault(alt_title.lower(), job.id)

    def add_reporting_relationship(self, reports_to_id: str, reports_from_id: str,
                                   relationship_type: str = "reports_to"):
        """Add a directed edge ``reports_from_id -> reports_to_id``.

        The relationship type is stored on the edge as the ``relationship``
        attribute.
        """
        self.graph.add_edge(reports_from_id, reports_to_id, relationship=relationship_type)

    def build_hierarchy(self, max_edges_per_node: int = 10):
        """Build reporting relationships based on levels and families.

        Within each family, jobs at one populated level are linked to jobs at
        the next populated level above it.

        Args:
            max_edges_per_node: Cap applied to BOTH sides of each level pair:
                only the first N jobs of a level are linked, each to the first
                N jobs of the next level.
        """
        print("Building hierarchy...")

        # Group jobs by family and level
        family_jobs = defaultdict(lambda: defaultdict(list))

        for job_id, job in self.job_lookup.items():
            family_jobs[job.family][job.level].append(job_id)

        edges_added = 0

        # Create reporting relationships within each family
        for family, levels in tqdm(family_jobs.items(), desc="Building family hierarchies"):
            sorted_levels = sorted(levels.keys())

            for i in range(len(sorted_levels) - 1):
                current_level = sorted_levels[i]
                next_level = sorted_levels[i + 1]

                current_jobs = levels[current_level]
                manager_jobs = levels[next_level]

                # Limit edges to avoid graph explosion.
                # NOTE(review): jobs beyond the first `max_edges_per_node` of a
                # level receive no edges at all - confirm this is intended.
                for job_id in current_jobs[:max_edges_per_node]:
                    for manager_id in manager_jobs[:max_edges_per_node]:
                        self.add_reporting_relationship(manager_id, job_id)
                        edges_added += 1

        print(f"Added {edges_added:,} reporting relationships")

    def get_career_path(self, job_id: str, direction: str = "up", limit: int = 20) -> "List[JobTitle]":
        """Get career path candidates from a job.

        Args:
            job_id: Id of the starting job.
            direction: "up" (higher levels, same family), "down" (lower
                levels, same family) or "lateral" (same level, other family).
            limit: Maximum number of jobs returned.

        Returns:
            Up to ``limit`` jobs. When there are more candidates than
            ``limit``, the ones closest in level to the starting job are
            kept. The list is ordered by level: descending for "up",
            ascending otherwise. Empty for an unknown job id or direction.
        """
        job = self.job_lookup.get(job_id)
        if not job:
            return []

        if direction == "up":
            # Higher levels in same family
            results = [j for j in self.job_lookup.values()
                       if j.family == job.family and j.level > job.level]
        elif direction == "down":
            # Lower levels in same family
            results = [j for j in self.job_lookup.values()
                       if j.family == job.family and j.level < job.level]
        elif direction == "lateral":
            # Same level, different family
            results = [j for j in self.job_lookup.values()
                       if j.level == job.level and j.family != job.family]
        else:
            return []

        # Choose which jobs survive truncation by closeness to the starting
        # level, so the next career steps are not crowded out by the most
        # distant levels. Both sorts are stable, so order within a level
        # still follows insertion order.
        results.sort(key=lambda x: abs(x.level - job.level))
        selected = results[:limit]

        # Present in the original order convention
        selected.sort(key=lambda x: x.level, reverse=(direction == "up"))
        return selected

    def save(self, filepath: str):
        """Save jobs, title mappings and reporting edges to a JSON file."""
        print(f"Saving graph to {filepath}...")
        data = {
            'job_lookup': {k: asdict(v) for k, v in self.job_lookup.items()},
            'title_to_id': self.title_to_id,
            # Persist edges too; otherwise a reloaded graph has no hierarchy.
            'edges': [
                [source, target, attrs.get('relationship', 'reports_to')]
                for source, target, attrs in self.graph.edges(data=True)
            ],
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        print(f"Saved {len(self.job_lookup):,} jobs and {len(self.title_to_id):,} title mappings")

    @classmethod
    def load(cls, filepath: str):
        """Load a graph written by ``save``.

        Files written before edges were persisted have no 'edges' key; they
        load with nodes only.
        """
        print(f"Loading graph from {filepath}...")
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        graph_obj = cls()
        graph_obj.job_lookup = {k: JobTitle(**v) for k, v in data['job_lookup'].items()}
        graph_obj.title_to_id = data['title_to_id']

        # Rebuild graph structure
        for job_id, job in graph_obj.job_lookup.items():
            graph_obj.graph.add_node(job_id, **asdict(job))
        for source, target, relationship in data.get('edges', []):
            graph_obj.graph.add_edge(source, target, relationship=relationship)

        print(f"Loaded {len(graph_obj.job_lookup):,} jobs")
        return graph_obj

In [None]:
# Populate a fresh graph with every job title
job_graph = JobArchitectureGraph()

print("Adding jobs to graph...")
for job in tqdm(job_titles, desc="Adding jobs"):
    job_graph.add_job(job)

node_count = len(job_graph.graph.nodes)
lookup_count = len(job_graph.title_to_id)
print(f"\nGraph contains {node_count:,} nodes")
print(f"Title lookup contains {lookup_count:,} entries")

# Link levels inside each family (this can take a while for large graphs)
job_graph.build_hierarchy(max_edges_per_node=5)

final_nodes = len(job_graph.graph.nodes)
final_edges = len(job_graph.graph.edges)
print(f"\nFinal graph: {final_nodes:,} nodes, {final_edges:,} edges")

## Job Title Normalizer with Embeddings

In [None]:
class JobTitleNormalizer:
    """Normalize job titles using hybrid matching (exact, fuzzy, semantic).

    Attributes:
        job_graph: Graph providing ``job_lookup`` and ``title_to_id``.
        model: Sentence-transformer used for semantic similarity.
        all_titles: Unique searchable titles (canonical first, then alternates).
        title_to_job: Maps each searchable title to its job.
        title_embeddings: Embeddings aligned index-for-index with ``all_titles``.
    """

    def __init__(self, job_graph: "JobArchitectureGraph", model_name: str = "all-MiniLM-L6-v2"):
        self.job_graph = job_graph
        self.model = SentenceTransformer(model_name)

        print("Preparing title data...")
        # Prepare all titles for matching
        self.all_titles = []
        self.title_to_job = {}

        jobs = list(job_graph.job_lookup.values())

        # Pass 1: canonical titles, so an alternate can never shadow them.
        for job in tqdm(jobs, desc="Collecting titles"):
            self._register_title(job.title, job)

        # Pass 2: alternates, skipping text that is already searchable so the
        # same string is not embedded and scored more than once.
        for job in jobs:
            for alt in job.alternate_titles:
                self._register_title(alt, job)

        print(f"Total searchable titles: {len(self.all_titles):,}")

        # Pre-compute embeddings
        print("Computing embeddings (this may take a few minutes)...")
        self.title_embeddings = self.model.encode(
            self.all_titles,
            show_progress_bar=True,
            batch_size=256
        )
        print(f"Embeddings shape: {self.title_embeddings.shape}")

    def _register_title(self, title, job) -> None:
        """Make ``title`` searchable for ``job`` unless it is invalid or already taken."""
        if not isinstance(title, str) or title in self.title_to_job:
            return
        self.all_titles.append(title)
        self.title_to_job[title] = job

    @staticmethod
    def _to_result(job, score: float, match_type: str) -> Dict:
        """Build the result record returned by ``normalize`` for one job."""
        return {
            "title": job.title,
            "job_id": job.id,
            "level": job.level,
            "family": job.family,
            "soc_category": job.soc_category,
            "similarity_score": score,
            "match_type": match_type
        }

    def normalize(self, input_title: str, top_k: int = 5, fuzzy_threshold: int = 80) -> List[Dict]:
        """Normalize a job title and return similar matches.

        Args:
            input_title: Free-text job title.
            top_k: Maximum number of fuzzy/semantic results.
            fuzzy_threshold: Minimum token-sort ratio (0-100) for a fuzzy hit.

        Returns:
            A single "exact" result when the title is known to the graph
            (case-insensitive, surrounding whitespace ignored). Otherwise up
            to ``top_k`` fuzzy and semantic results, one per job, sorted by
            ``similarity_score`` descending. Empty for blank or non-string
            input.
        """
        query = input_title.strip() if isinstance(input_title, str) else ""
        if not query:
            return []

        # 1. Exact match
        job_id = self.job_graph.title_to_id.get(query.lower())
        if job_id is not None:
            return [self._to_result(self.job_graph.job_lookup[job_id], 1.0, "exact")]

        if top_k <= 0 or not self.all_titles:
            return []

        # 2. Fuzzy matching. Lower-casing both sides keeps this stage
        # case-insensitive like the exact stage (rapidfuzz >= 3 applies no
        # preprocessing by default); score_cutoff filters inside the library.
        fuzzy_matches = process.extract(
            query,
            self.all_titles,
            scorer=fuzz.token_sort_ratio,
            processor=str.lower,
            limit=top_k * 2,
            score_cutoff=fuzzy_threshold
        )
        fuzzy_results = [
            self._to_result(self.title_to_job[match_title], score / 100.0, "fuzzy")
            for match_title, score, _ in fuzzy_matches
        ]

        # 3. Semantic similarity
        input_embedding = self.model.encode([query])
        similarities = cosine_similarity(input_embedding, self.title_embeddings)[0]

        # Indices of the 2*top_k most similar titles, best first
        top_indices = np.argsort(similarities)[-top_k * 2:][::-1]
        semantic_results = [
            self._to_result(self.title_to_job[self.all_titles[idx]], float(similarities[idx]), "semantic")
            for idx in top_indices
        ]

        # Combine and deduplicate: the first result seen for a job wins
        # (fuzzy before semantic).
        seen_ids = set()
        combined_results = []

        for result_list in [fuzzy_results, semantic_results]:
            for result in result_list:
                if result["job_id"] not in seen_ids:
                    seen_ids.add(result["job_id"])
                    combined_results.append(result)

        # Sort by similarity
        combined_results.sort(key=lambda x: x["similarity_score"], reverse=True)

        return combined_results[:top_k]

    def save(self, filepath: str):
        """Pickle the searchable titles, their jobs (as dicts) and the embeddings.

        NOTE: the output is a pickle file; only load it back from a trusted
        location, since unpickling untrusted data can execute arbitrary code.
        """
        print(f"Saving normalizer to {filepath}...")
        with open(filepath, 'wb') as f:
            pickle.dump({
                'all_titles': self.all_titles,
                'title_to_job': {k: asdict(v) for k, v in self.title_to_job.items()},
                'title_embeddings': self.title_embeddings,
            }, f)
        print("Normalizer saved")

In [None]:
# Build normalizer. Construction loads the sentence-transformer model and
# pre-computes embeddings for every searchable title, so this cell is slow.
normalizer = JobTitleNormalizer(job_graph)

In [None]:
# Smoke-test the normalizer on a few free-text titles
test_titles = [
    "Software Developer",
    "ML Engineer",
    "Product Lead",
    "VP of Engineering",
    "Data Analyst",
    "UX Designer",
    "Sales Rep",
]

banner = '=' * 60

for test_title in test_titles:
    print(f"\n{banner}")
    print(f"Input: '{test_title}'")
    print(banner)
    results = normalizer.normalize(test_title, top_k=5)
    for i, result in enumerate(results, 1):
        # First line: rank and matched canonical title; second line: match details
        details = (
            f"   Score: {result['similarity_score']:.3f} | Level: {result['level']} | "
            f"Family: {result['family']} | Type: {result['match_type']}"
        )
        print(f"{i}. {result['title']}")
        print(details)

## Save All Data

In [None]:
# Create output directory (parents=True so a missing parent directory does not
# abort the save step with FileNotFoundError)
output_dir = Path("/home/claude/job_architecture_data")
output_dir.mkdir(parents=True, exist_ok=True)

print("Saving all data...")

# Save job graph
job_graph.save(str(output_dir / "job_graph.json"))

# Save normalizer
normalizer.save(str(output_dir / "normalizer_data.pkl"))

# Save statistics
stats = {
    "total_jobs": len(job_titles),
    "total_searchable_titles": len(normalizer.all_titles),
    "families": soc_df['family'].value_counts().to_dict(),
    "levels": soc_df['level'].value_counts().to_dict(),
    "soc_categories": int(soc_df['soc5_title'].nunique())
}

with open(output_dir / "statistics.json", 'w') as f:
    json.dump(stats, f, indent=2)

print(f"\nAll data saved to {output_dir}")
print(f"\nStatistics:")
for key, value in stats.items():
    # Dict-valued stats are summarized by their number of distinct keys
    if isinstance(value, dict):
        print(f"  {key}: {len(value)} unique values")
    else:
        print(f"  {key}: {value:,}")

## Export Sample Data

In [None]:
# Export sample of processed data.
# A fixed random_state makes the exported sample identical on every
# Restart & Run All instead of changing with each execution.
sample_df = soc_df.sample(min(1000, len(soc_df)), random_state=42)
sample_df.to_csv(output_dir / "sample_processed_titles.csv", index=False)

print(f"Saved sample of {len(sample_df)} titles to sample_processed_titles.csv")

# Create level/family summary: one row per family, one column per level,
# zero where a family has no titles at that level
summary = soc_df.groupby(['family', 'level']).size().reset_index(name='count')
summary = summary.pivot(index='family', columns='level', values='count').fillna(0).astype(int)
summary.to_csv(output_dir / "family_level_summary.csv")

print("\nFamily x Level Distribution:")
print(summary)

## Summary

In [None]:
# Final recap: assemble the report lines first, then emit them in one print
rule = "=" * 60
graph_nodes = len(job_graph.graph.nodes)
graph_edges = len(job_graph.graph.edges)
family_names = sorted(stats['families'].keys())

summary_lines = [
    rule,
    "JOB ARCHITECTURE SYSTEM - SUMMARY",
    rule,
    f"\n✅ Processed {len(soc_df):,} job titles from SOC dataset",
    f"✅ Created {len(job_titles):,} unique normalized job titles",
    f"✅ Built graph with {graph_nodes:,} nodes and {graph_edges:,} edges",
    f"✅ Generated embeddings for {len(normalizer.all_titles):,} searchable titles",
    f"✅ Classified into {len(stats['families'])} job families",
    f"✅ Organized into {len(stats['levels'])} organizational levels",
    f"\nJob Families: {', '.join(family_names)}",
    f"\nOrganizational Levels: 0 (Intern) → 9 (C-Suite)",
    f"\nData saved to: {output_dir}",
    f"\nNext step: Run the web service creation cells",
    rule,
]
print("\n".join(summary_lines))