In [16]:
!pip install pandas numpy fuzzywuzzy python-Levenshtein



In [17]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

# Load the Dataset

In [18]:
#Load CSV file into pandas

try:
    df = pd.read_csv('dataset.csv')
    print("Dataset loaded successfully!")
    print(f"Initial shape: {df.shape}")
except FileNotFoundError:
    print("Please provide the correct path to your CSV file")

Dataset loaded successfully!
Initial shape: (27940, 26)


# Standardization of Job Titles using Fuzzy Matching

In [19]:
"""
Convert to lowercase and remove extra whitespace.
Remove special characters except basic punctuation
"""
def clean_text(text):
    if pd.isna(text):
        return text
    text = str(text).lower().strip()
    text = re.sub(r'[^\w\s.,-]', '', text)
    return text

In [20]:
"""
Standardize job titles using fuzzy matching
threshold: similarity score threshold (0-100)
"""
def standardize_job_titles(df, column_name, threshold=50):
    # Clean the job titles first
    df[column_name] = df[column_name].apply(clean_text)
    print(f"Starting standardization of '{column_name}' column...")
    
    # Get unique job titles
    unique_titles = df[column_name].dropna().unique()
    print(f"Found {len(unique_titles)} unique job titles to process")
    
    # Create clusters of similar titles
    standardized_titles = {}
    remaining_titles = list(unique_titles)
    total_clusters = 0
    
    while remaining_titles:
        base_title = remaining_titles.pop(0)
        print(f"\nProcessing base title: '{base_title}'")
        matches = process.extract(base_title, remaining_titles, 
                               scorer=fuzz.token_sort_ratio)
        
        # Find similar titles above threshold
        similar_titles = [m[0] for m in matches if m[1] >= threshold]
        
        if similar_titles:
            total_clusters += 1
            print(f"Found {len(similar_titles)} similar titles (threshold={threshold}):")
            for match, score in [(m[0], m[1]) for m in matches if m[1] >= threshold]:
                print(f"  - '{match}' (similarity score: {score}) -> standardized to '{base_title}'")
            standardized_titles[base_title] = similar_titles
            # Remove matched titles from remaining list
            remaining_titles = [t for t in remaining_titles if t not in similar_titles]
            print(f"Remaining titles to process: {len(remaining_titles)}")
        else:
            print("  No matches found above threshold")
    
    # Create mapping dictionary
    title_mapping = {}
    for standard_title, similar_list in standardized_titles.items():
        title_mapping[standard_title] = standard_title
        for similar in similar_list:
            title_mapping[similar] = standard_title
    
    # Apply mapping to dataframe
    print(f"\nApplying {len(title_mapping)} mappings to the dataset...")
    df[f'standardized_{column_name}'] = df[column_name].map(
        lambda x: title_mapping.get(x, x) if pd.notna(x) else x
    )
    
    # Summary
    print(f"Standardization complete!")
    print(f"Total clusters formed: {total_clusters}")
    print(f"Unique titles reduced from {len(unique_titles)} to {df[f'standardized_{column_name}'].nunique()}")
    
    return df, title_mapping

In [21]:
"""
Apply Standardization
"""
def apply_standardization(df):
    job_column = 'Job title'
    
    # Initial data exploration
    print("Initial unique job titles:", df[job_column].nunique())
    print("Missing values:", df[job_column].isna().sum())
    
    # Apply fuzzy matching for remaining titles
    df, title_mapping = standardize_job_titles(df, f'{job_column}')
    
    return df

try:
    # Process the data
    processed_df = apply_standardization(df)
    
    # Display sample of results
    print("\nSample of processed data:")
    print(processed_df[[
        'Job title',
        'standardized_Job title'
    ]].head(10))

except Exception as e:
    print(f"An error occurred: {str(e)}")

Initial unique job titles: 14288
Missing values: 0
Starting standardization of 'Job title' column...
Found 12062 unique job titles to process

Processing base title: 'research and instruction librarian'
Found 5 similar titles (threshold=50):
  - 'research  instruction librarian' (similarity score: 94) -> standardized to 'research and instruction librarian'
  - 'reference and instruction librarian' (similarity score: 90) -> standardized to 'research and instruction librarian'
  - 'head of research and instruction' (similarity score: 76) -> standardized to 'research and instruction librarian'
  - 'instruction librarian' (similarity score: 76) -> standardized to 'research and instruction librarian'
  - 'professor  instruction and assessment librarian' (similarity score: 75) -> standardized to 'research and instruction librarian'
Remaining titles to process: 12056

Processing base title: 'change  internal communications manager'
Found 5 similar titles (threshold=50):
  - 'internal communic

Found 5 similar titles (threshold=50):
  - 'senior it manager' (similarity score: 90) -> standardized to 'senior manager'
  - 'senior manager, it' (similarity score: 90) -> standardized to 'senior manager'
  - 'senior manager, hr' (similarity score: 90) -> standardized to 'senior manager'
  - 'senior hr manager' (similarity score: 90) -> standardized to 'senior manager'
  - 'hr senior manager' (similarity score: 90) -> standardized to 'senior manager'
Remaining titles to process: 11948

Processing base title: 'assistant director of academic advising'
Found 5 similar titles (threshold=50):
  - 'assistant director, academic  advising' (similarity score: 96) -> standardized to 'assistant director of academic advising'
  - 'director of academic advising' (similarity score: 85) -> standardized to 'assistant director of academic advising'
  - 'assistant director of admission' (similarity score: 83) -> standardized to 'assistant director of academic advising'
  - 'assistant director of admiss

Found 5 similar titles (threshold=50):
  - 'engineering lead' (similarity score: 90) -> standardized to 'lead engineer'
  - 'lead qa engineer' (similarity score: 90) -> standardized to 'lead engineer'
  - 'lead web engineer' (similarity score: 87) -> standardized to 'lead engineer'
  - 'lead ios engineer' (similarity score: 87) -> standardized to 'lead engineer'
  - 'lead data engineer' (similarity score: 84) -> standardized to 'lead engineer'
Remaining titles to process: 11828

Processing base title: 'senior copywriter'
Found 5 similar titles (threshold=50):
  - 'junior copywriter' (similarity score: 88) -> standardized to 'senior copywriter'
  - 'editor, copywriter' (similarity score: 88) -> standardized to 'senior copywriter'
  - 'sr copywriter' (similarity score: 87) -> standardized to 'senior copywriter'
  - 'senior copyeditor' (similarity score: 82) -> standardized to 'senior copywriter'
  - 'senior recruiter' (similarity score: 79) -> standardized to 'senior copywriter'
Remainin

Found 5 similar titles (threshold=50):
  - 'hr coordinator' (similarity score: 88) -> standardized to 'coordinator'
  - 'coordinator ii' (similarity score: 88) -> standardized to 'coordinator'
  - 'qa coordinator' (similarity score: 88) -> standardized to 'coordinator'
  - 'rn coordinator' (similarity score: 88) -> standardized to 'coordinator'
  - 'sr. coordinator' (similarity score: 88) -> standardized to 'coordinator'
Remaining titles to process: 11714

Processing base title: 'senior advisor'
Found 5 similar titles (threshold=50):
  - 'senior it advisor' (similarity score: 90) -> standardized to 'senior advisor'
  - 'senior hr advisor' (similarity score: 90) -> standardized to 'senior advisor'
  - 'senior aditor' (similarity score: 89) -> standardized to 'senior advisor'
  - 'senior auditor' (similarity score: 86) -> standardized to 'senior advisor'
  - 'senior donor advisor' (similarity score: 82) -> standardized to 'senior advisor'
Remaining titles to process: 11708

Processing ba

Found 5 similar titles (threshold=50):
  - 'technical writer' (similarity score: 81) -> standardized to 'tech writer'
  - 'tv writer' (similarity score: 80) -> standardized to 'tech writer'
  - 'hr writer' (similarity score: 80) -> standardized to 'tech writer'
  - 'speechwriter' (similarity score: 78) -> standardized to 'tech writer'
  - 'techical writer ii' (similarity score: 76) -> standardized to 'tech writer'
Remaining titles to process: 11630

Processing base title: 'supervisory archivist'
Found 5 similar titles (threshold=50):
  - 'archives supervisor' (similarity score: 90) -> standardized to 'supervisory archivist'
  - 'senior archivist' (similarity score: 81) -> standardized to 'supervisory archivist'
  - 'it supervisor' (similarity score: 76) -> standardized to 'supervisory archivist'
  - 'shift supervisor' (similarity score: 76) -> standardized to 'supervisory archivist'
  - 'supervisor hr specialist' (similarity score: 76) -> standardized to 'supervisory archivist'
Remaini

Found 5 similar titles (threshold=50):
  - 'director or development' (similarity score: 96) -> standardized to 'director of development'
  - 'development director' (similarity score: 93) -> standardized to 'director of development'
  - 'director, development' (similarity score: 93) -> standardized to 'director of development'
  - 'director of web development' (similarity score: 92) -> standardized to 'director of development'
  - 'director, fund development' (similarity score: 92) -> standardized to 'director of development'
Remaining titles to process: 11516

Processing base title: 'quality assurance lead'
Found 5 similar titles (threshold=50):
  - 'quality assurance' (similarity score: 87) -> standardized to 'quality assurance lead'
  - 'quality assurance auditor' (similarity score: 85) -> standardized to 'quality assurance lead'
  - 'quality assurance manager' (similarity score: 81) -> standardized to 'quality assurance lead'
  - 'quality assurance engineer' (similarity score: 79) -

Remaining titles to process: 11438

Processing base title: 'software engineer'
Found 5 similar titles (threshold=50):
  - 'software engineer 2' (similarity score: 94) -> standardized to 'software engineer'
  - 'software engineer 3' (similarity score: 94) -> standardized to 'software engineer'
  - 'software engineer i' (similarity score: 94) -> standardized to 'software engineer'
  - 'software engineer 1' (similarity score: 94) -> standardized to 'software engineer'
  - 'software engineer iv' (similarity score: 92) -> standardized to 'software engineer'
Remaining titles to process: 11432

Processing base title: 'medical education director'
Found 5 similar titles (threshold=50):
  - 'medical education coordinator' (similarity score: 84) -> standardized to 'medical education director'
  - 'director of special education' (similarity score: 84) -> standardized to 'medical education director'
  - 'education director' (similarity score: 82) -> standardized to 'medical education director'
  - 

Found 5 similar titles (threshold=50):
  - 'online news editor' (similarity score: 76) -> standardized to 'news editor'
  - 'online editor' (similarity score: 75) -> standardized to 'news editor'
  - 'lead editor' (similarity score: 73) -> standardized to 'news editor'
  - 'english editor' (similarity score: 72) -> standardized to 'news editor'
  - 'website editor' (similarity score: 72) -> standardized to 'news editor'
Remaining titles to process: 11354

Processing base title: 'product manager'
Found 5 similar titles (threshold=50):
  - 'product manager 2' (similarity score: 94) -> standardized to 'product manager'
  - 'produce manager' (similarity score: 93) -> standardized to 'product manager'
  - 'manager, production' (similarity score: 91) -> standardized to 'product manager'
  - 'sr. product manager' (similarity score: 91) -> standardized to 'product manager'
  - 'sr product manager' (similarity score: 91) -> standardized to 'product manager'
Remaining titles to process: 11348

P

Found 5 similar titles (threshold=50):
  - 'senior online sales manager' (similarity score: 87) -> standardized to 'online sales support manager'
  - 'sales support manager' (similarity score: 86) -> standardized to 'online sales support manager'
  - 'senior manager of sales' (similarity score: 75) -> standardized to 'online sales support manager'
  - 'sr. regional sales manager' (similarity score: 75) -> standardized to 'online sales support manager'
  - 'senior regional sales manager' (similarity score: 74) -> standardized to 'online sales support manager'
Remaining titles to process: 11270

Processing base title: 'phd candidate'
Found 5 similar titles (threshold=50):
  - 'md phd candidate' (similarity score: 90) -> standardized to 'phd candidate'
  - 'doctoral candidate' (similarity score: 71) -> standardized to 'phd candidate'
  - 'phd candidate graduate student' (similarity score: 60) -> standardized to 'phd candidate'
  - 'cad drafter' (similarity score: 58) -> standardized to 'p

Found 5 similar titles (threshold=50):
  - 'senior talent acquisition specialist' (similarity score: 89) -> standardized to 'talent acquisition specialist'
  - 'talent acquisition strategist' (similarity score: 83) -> standardized to 'talent acquisition specialist'
  - 'talent acquisition supervisor' (similarity score: 83) -> standardized to 'talent acquisition specialist'
  - 'talent acquisition partner' (similarity score: 80) -> standardized to 'talent acquisition specialist'
  - 'talent acquisition advisor' (similarity score: 80) -> standardized to 'talent acquisition specialist'
Remaining titles to process: 11150

Processing base title: 'content strategist and writer'
Found 5 similar titles (threshold=50):
  - 'content strategist' (similarity score: 77) -> standardized to 'content strategist and writer'
  - 'content and brand strategist' (similarity score: 77) -> standardized to 'content strategist and writer'
  - 'senior marketing strategist and writer' (similarity score: 75) -> s

Found 5 similar titles (threshold=50):
  - 'assistant director, student advancement' (similarity score: 81) -> standardized to 'assistant director, student affairs'
  - 'director of student affairs' (similarity score: 79) -> standardized to 'assistant director, student affairs'
  - 'assistant director of study abroad' (similarity score: 76) -> standardized to 'assistant director, student affairs'
  - 'senior assistant director' (similarity score: 75) -> standardized to 'assistant director, student affairs'
  - 'assistant to the director' (similarity score: 75) -> standardized to 'assistant director, student affairs'
Remaining titles to process: 11026

Processing base title: 'director of client services'
Found 5 similar titles (threshold=50):
  - 'client services director' (similarity score: 94) -> standardized to 'director of client services'
  - 'director client services' (similarity score: 94) -> standardized to 'director of client services'
  - 'senior director of client services' (

Found 5 similar titles (threshold=50):
  - 'supervisory librarian' (similarity score: 91) -> standardized to 'librarian, non-supervisory'
  - 'librarian supervisory' (similarity score: 91) -> standardized to 'librarian, non-supervisory'
  - 'library supervisor' (similarity score: 79) -> standardized to 'librarian, non-supervisory'
  - 'librarian non-supervisor, tech services' (similarity score: 76) -> standardized to 'librarian, non-supervisory'
  - 'interlibrary loan supervisor' (similarity score: 75) -> standardized to 'librarian, non-supervisory'
Remaining titles to process: 10942

Processing base title: 'programme administrator'
Found 5 similar titles (threshold=50):
  - 'program administrator' (similarity score: 95) -> standardized to 'programme administrator'
  - 'gme program administrator' (similarity score: 88) -> standardized to 'programme administrator'
  - 'health program administrator' (similarity score: 82) -> standardized to 'programme administrator'
  - 'grant administra

Found 5 similar titles (threshold=50):
  - 'development specialist' (similarity score: 83) -> standardized to 'business development specialist'
  - 'alumni development specialist' (similarity score: 83) -> standardized to 'business development specialist'
  - 'sales and business development' (similarity score: 82) -> standardized to 'business development specialist'
  - 'content development specialist' (similarity score: 82) -> standardized to 'business development specialist'
  - 'campaign development specialist' (similarity score: 81) -> standardized to 'business development specialist'
Remaining titles to process: 10858

Processing base title: 'supply chain manager'
Found 5 similar titles (threshold=50):
  - 'senior supply chain manager' (similarity score: 85) -> standardized to 'supply chain manager'
  - 'clinical supply manager' (similarity score: 84) -> standardized to 'supply chain manager'
  - 'supply chain project manager' (similarity score: 83) -> standardized to 'supply chai

Found 5 similar titles (threshold=50):
  - 'credit manager' (similarity score: 71) -> standardized to 'qc chemistry manager'
  - 'community manager' (similarity score: 70) -> standardized to 'qc chemistry manager'
  - 'it change manager' (similarity score: 70) -> standardized to 'qc chemistry manager'
  - 'sr. manager, community' (similarity score: 70) -> standardized to 'qc chemistry manager'
  - 'commodity manager' (similarity score: 70) -> standardized to 'qc chemistry manager'
Remaining titles to process: 10735

Processing base title: 'research administrator'
Found 5 similar titles (threshold=50):
  - 'research administration' (similarity score: 93) -> standardized to 'research administrator'
  - 'research grants administrator' (similarity score: 86) -> standardized to 'research administrator'
  - 'senior research administrator' (similarity score: 86) -> standardized to 'research administrator'
  - 'lease administrator' (similarity score: 83) -> standardized to 'research administra

Found 5 similar titles (threshold=50):
  - 'vp operations' (similarity score: 87) -> standardized to 'operations'
  - 'vp, operations' (similarity score: 87) -> standardized to 'operations'
  - 'it operations' (similarity score: 87) -> standardized to 'operations'
  - 'operations lead' (similarity score: 80) -> standardized to 'operations'
  - 'operations asst' (similarity score: 80) -> standardized to 'operations'
Remaining titles to process: 10651

Processing base title: 'senior project specialist'
Found 5 similar titles (threshold=50):
  - 'senior product specialist' (similarity score: 92) -> standardized to 'senior project specialist'
  - 'senior policy specialist' (similarity score: 86) -> standardized to 'senior project specialist'
  - 'senior procurement specialist' (similarity score: 85) -> standardized to 'senior project specialist'
  - 'senior content specialist' (similarity score: 84) -> standardized to 'senior project specialist'
  - 'project specialist' (similarity score: 

Found 5 similar titles (threshold=50):
  - 'executive communications manager' (similarity score: 86) -> standardized to 'internal communications executive'
  - 'communications executive' (similarity score: 84) -> standardized to 'internal communications executive'
  - 'internal communications director' (similarity score: 83) -> standardized to 'internal communications executive'
  - 'internal communications' (similarity score: 82) -> standardized to 'internal communications executive'
  - 'executive communications specialist' (similarity score: 82) -> standardized to 'internal communications executive'
Remaining titles to process: 10567

Processing base title: 'analyst'
Found 5 similar titles (threshold=50):
  - 'analysts' (similarity score: 93) -> standardized to 'analyst'
  - 'sr. analyst' (similarity score: 82) -> standardized to 'analyst'
  - 'sr analyst' (similarity score: 82) -> standardized to 'analyst'
  - 'bi analyst' (similarity score: 82) -> standardized to 'analyst'
  - 'hr

Found 5 similar titles (threshold=50):
  - 'associate director, annual giving' (similarity score: 96) -> standardized to 'associate director of annual giving'
  - 'assistant director, annual giving' (similarity score: 87) -> standardized to 'associate director of annual giving'
  - 'associate director, annual fund' (similarity score: 83) -> standardized to 'associate director of annual giving'
  - 'director of annual giving' (similarity score: 83) -> standardized to 'associate director of annual giving'
  - 'assistant director of annual fundraising' (similarity score: 80) -> standardized to 'associate director of annual giving'
Remaining titles to process: 10483

Processing base title: 'cash logistics manager'
Found 5 similar titles (threshold=50):
  - 'logistics manager' (similarity score: 87) -> standardized to 'cash logistics manager'
  - 'global logistics manager' (similarity score: 83) -> standardized to 'cash logistics manager'
  - 'sr logistics manager' (similarity score: 81) ->

Found 5 similar titles (threshold=50):
  - 'payroll tax manager' (similarity score: 88) -> standardized to 'payroll manager'
  - 'paralegal manager' (similarity score: 81) -> standardized to 'payroll manager'
  - 'park manager' (similarity score: 81) -> standardized to 'payroll manager'
  - 'problem manager' (similarity score: 80) -> standardized to 'payroll manager'
  - 'parts manager' (similarity score: 79) -> standardized to 'payroll manager'
Remaining titles to process: 10399

Processing base title: 'shop coordinator'
Found 5 similar titles (threshold=50):
  - 'seo coordinator' (similarity score: 90) -> standardized to 'shop coordinator'
  - 'workshop coordinator' (similarity score: 89) -> standardized to 'shop coordinator'
  - 'ehs coordinator' (similarity score: 84) -> standardized to 'shop coordinator'
  - 'shipping coordinator' (similarity score: 83) -> standardized to 'shop coordinator'
  - 'showroom coordinator' (similarity score: 83) -> standardized to 'shop coordinator'
Rem

Found 5 similar titles (threshold=50):
  - 'management consultant' (similarity score: 85) -> standardized to 'managing consultant'
  - 'training consultant' (similarity score: 84) -> standardized to 'managing consultant'
  - 'senior managing consultant' (similarity score: 84) -> standardized to 'managing consultant'
  - 'marketing consultant' (similarity score: 82) -> standardized to 'managing consultant'
  - 'leasing consultant' (similarity score: 81) -> standardized to 'managing consultant'
Remaining titles to process: 10315

Processing base title: 'assistant'
Found 5 similar titles (threshold=50):
  - 'hr assistant' (similarity score: 86) -> standardized to 'assistant'
  - 'lab assistant' (similarity score: 82) -> standardized to 'assistant'
  - 'assistant dean' (similarity score: 78) -> standardized to 'assistant'
  - 'assistant prof' (similarity score: 78) -> standardized to 'assistant'
  - 'exec assistant' (similarity score: 78) -> standardized to 'assistant'
Remaining titles to 

Found 5 similar titles (threshold=50):
  - 'pa' (similarity score: 80) -> standardized to 'cpa'
  - 'fpa' (similarity score: 67) -> standardized to 'cpa'
  - 'cpo' (similarity score: 67) -> standardized to 'cpa'
  - 'cna' (similarity score: 67) -> standardized to 'cpa'
  - 'chaplain' (similarity score: 55) -> standardized to 'cpa'
Remaining titles to process: 10231

Processing base title: 'director of distance education'
Found 5 similar titles (threshold=50):
  - 'director of distance learning' (similarity score: 85) -> standardized to 'director of distance education'
  - 'director of interpretation' (similarity score: 75) -> standardized to 'director of distance education'
  - 'director of sales  education' (similarity score: 74) -> standardized to 'director of distance education'
  - 'director of evaluation' (similarity score: 73) -> standardized to 'director of distance education'
  - 'director of innovation' (similarity score: 73) -> standardized to 'director of distance education'

Remaining titles to process: 10141

Processing base title: 'counsel'
Found 5 similar titles (threshold=50):
  - 'counselor' (similarity score: 88) -> standardized to 'counsel'
  - 'of counsel' (similarity score: 82) -> standardized to 'counsel'
  - 'counsellor' (similarity score: 82) -> standardized to 'counsel'
  - 'govt counsel' (similarity score: 74) -> standardized to 'counsel'
  - 'lead counsel' (similarity score: 74) -> standardized to 'counsel'
Remaining titles to process: 10135

Processing base title: 'technical services team lead'
Found 5 similar titles (threshold=50):
  - 'head of technical services' (similarity score: 81) -> standardized to 'technical services team lead'
  - 'technical services' (similarity score: 78) -> standardized to 'technical services team lead'
  - 'field services technician' (similarity score: 75) -> standardized to 'technical services team lead'
  - 'field service technician' (similarity score: 73) -> standardized to 'technical services team lead'
  

Found 5 similar titles (threshold=50):
  - 'engineering manager i' (similarity score: 95) -> standardized to 'engineering manager'
  - 'engineering mamager' (similarity score: 95) -> standardized to 'engineering manager'
  - 'sr engineering manager' (similarity score: 93) -> standardized to 'engineering manager'
  - 'engineering manager ii' (similarity score: 93) -> standardized to 'engineering manager'
  - 'engineer manager' (similarity score: 91) -> standardized to 'engineering manager'
Remaining titles to process: 10051

Processing base title: 'customer success manager'
Found 5 similar titles (threshold=50):
  - 'customer success manger' (similarity score: 98) -> standardized to 'customer success manager'
  - 'manager of customer success' (similarity score: 94) -> standardized to 'customer success manager'
  - 'sr. customer success manager' (similarity score: 94) -> standardized to 'customer success manager'
  - 'sr. manager, customer success' (similarity score: 94) -> standardized 

Found 5 similar titles (threshold=50):
  - 'database analyst' (similarity score: 91) -> standardized to 'sr. database analyst'
  - 'senior database analyst' (similarity score: 90) -> standardized to 'sr. database analyst'
  - 'hcis database analyst' (similarity score: 90) -> standardized to 'sr. database analyst'
  - 'master data analyst' (similarity score: 84) -> standardized to 'sr. database analyst'
  - 'sr fraud data analyst' (similarity score: 80) -> standardized to 'sr. database analyst'
Remaining titles to process: 9955

Processing base title: 'senior manufacturing engineer'
Found 5 similar titles (threshold=50):
  - 'manufacturing engineer' (similarity score: 86) -> standardized to 'senior manufacturing engineer'
  - 'supervisor, manufacturing engineering' (similarity score: 86) -> standardized to 'senior manufacturing engineer'
  - 'semiconductor manufacturing engineer' (similarity score: 86) -> standardized to 'senior manufacturing engineer'
  - 'senior planning engineer' (si

Found 5 similar titles (threshold=50):
  - 'engineer in training' (similarity score: 87) -> standardized to 'engineer in training civil'
  - 'civil engineer ii' (similarity score: 79) -> standardized to 'engineer in training civil'
  - 'civil engineer i' (similarity score: 76) -> standardized to 'engineer in training civil'
  - 'process engineer in training' (similarity score: 74) -> standardized to 'engineer in training civil'
  - 'resident civil engineer' (similarity score: 73) -> standardized to 'engineer in training civil'
Remaining titles to process: 9853

Processing base title: 'software developer'
Found 5 similar titles (threshold=50):
  - 'software-developer' (similarity score: 100) -> standardized to 'software developer'
  - 'software developer i' (similarity score: 95) -> standardized to 'software developer'
  - 'software developer 1' (similarity score: 95) -> standardized to 'software developer'
  - 'software developer 3' (similarity score: 95) -> standardized to 'software d

Found 5 similar titles (threshold=50):
  - 'snr data engineer' (similarity score: 87) -> standardized to 'data engineer'
  - 'dsp engineer' (similarity score: 80) -> standardized to 'data engineer'
  - 'senior data engineer' (similarity score: 79) -> standardized to 'data engineer'
  - 'broadcast engineer' (similarity score: 77) -> standardized to 'data engineer'
  - 'desktop engineer' (similarity score: 76) -> standardized to 'data engineer'
Remaining titles to process: 9757

Processing base title: 'flight test engineer'
Found 5 similar titles (threshold=50):
  - 'engineer in test' (similarity score: 83) -> standardized to 'flight test engineer'
  - 'test engineer ii' (similarity score: 83) -> standardized to 'flight test engineer'
  - 'test engineer iv' (similarity score: 83) -> standardized to 'flight test engineer'
  - 'test engineer' (similarity score: 79) -> standardized to 'flight test engineer'
  - 'lead test engineer' (similarity score: 79) -> standardized to 'flight test engi

Found 5 similar titles (threshold=50):
  - 'program specialist ii' (similarity score: 98) -> standardized to 'program specialist iii'
  - 'program specialist iv' (similarity score: 93) -> standardized to 'program specialist iii'
  - 'program specialist' (similarity score: 90) -> standardized to 'program specialist iii'
  - 'process specialist iii' (similarity score: 82) -> standardized to 'program specialist iii'
  - 'housing programs specialist' (similarity score: 82) -> standardized to 'program specialist iii'
Remaining titles to process: 9667

Processing base title: 'embedded software engineer'
Found 5 similar titles (threshold=50):
  - 'senior embedded software engineer' (similarity score: 88) -> standardized to 'embedded software engineer'
  - 'embedded firmware engineer' (similarity score: 88) -> standardized to 'embedded software engineer'
  - 'senior embedded engineer' (similarity score: 84) -> standardized to 'embedded software engineer'
  - 'embedded test engineer ii' (simila

Found 5 similar titles (threshold=50):
  - 'region vice president' (similarity score: 86) -> standardized to 'senior vice president'
  - 'vice president, senior counsel' (similarity score: 84) -> standardized to 'senior vice president'
  - 'vice president' (similarity score: 80) -> standardized to 'senior vice president'
  - 'vice president, strategy' (similarity score: 77) -> standardized to 'senior vice president'
  - 'vice president of sales' (similarity score: 77) -> standardized to 'senior vice president'
Remaining titles to process: 9577

Processing base title: 'ux developer  product designer junior'
Found 5 similar titles (threshold=50):
  - 'ux designer  developer' (similarity score: 74) -> standardized to 'ux developer  product designer junior'
  - 'director of product design  development' (similarity score: 73) -> standardized to 'ux developer  product designer junior'
  - 'ux product design lead' (similarity score: 69) -> standardized to 'ux developer  product designer junio

Found 5 similar titles (threshold=50):
  - 'data entry and records specialist' (similarity score: 81) -> standardized to 'records management specialist'
  - 'records management team supervisor' (similarity score: 79) -> standardized to 'records management specialist'
  - 'it service management specialist' (similarity score: 79) -> standardized to 'records management specialist'
  - 'records specialist' (similarity score: 77) -> standardized to 'records management specialist'
  - 'shareholder engagement specialist' (similarity score: 77) -> standardized to 'records management specialist'
Remaining titles to process: 9487

Processing base title: 'program manager staff'
Found 5 similar titles (threshold=50):
  - 'sr program manager' (similarity score: 87) -> standardized to 'program manager staff'
  - 'sales program manager' (similarity score: 86) -> standardized to 'program manager staff'
  - 'program manager  staff attorney' (similarity score: 82) -> standardized to 'program manager sta

Found 5 similar titles (threshold=50):
  - 'web development' (similarity score: 86) -> standardized to 'web developer'
  - 'etl developer' (similarity score: 85) -> standardized to 'web developer'
  - 'lead web developer' (similarity score: 84) -> standardized to 'web developer'
  - 'exhibit developer' (similarity score: 80) -> standardized to 'web developer'
  - 'senior web developer' (similarity score: 79) -> standardized to 'web developer'
Remaining titles to process: 9379

Processing base title: 'user research  design lead'
Found 5 similar titles (threshold=50):
  - 'design research lead' (similarity score: 89) -> standardized to 'user research  design lead'
  - 'design researcher' (similarity score: 81) -> standardized to 'user research  design lead'
  - 'senior design researcher' (similarity score: 78) -> standardized to 'user research  design lead'
  - 'principal design researcher' (similarity score: 73) -> standardized to 'user research  design lead'
  - 'senior researcher and 

Found 5 similar titles (threshold=50):
  - 'director of revenue management' (similarity score: 87) -> standardized to 'regional director of revenue management'
  - 'area director of revenue management' (similarity score: 81) -> standardized to 'regional director of revenue management'
  - 'director of risk management' (similarity score: 76) -> standardized to 'regional director of revenue management'
  - 'director of product management' (similarity score: 75) -> standardized to 'regional director of revenue management'
  - 'director of program management' (similarity score: 75) -> standardized to 'regional director of revenue management'
Remaining titles to process: 9277

Processing base title: 'digital content strategist'
Found 5 similar titles (threshold=50):
  - 'digital strategy consultant' (similarity score: 83) -> standardized to 'digital content strategist'
  - 'digital strategist' (similarity score: 82) -> standardized to 'digital content strategist'
  - 'digital content specia

Found 5 similar titles (threshold=50):
  - 'contract management specialist' (similarity score: 86) -> standardized to 'configuration management specialist'
  - 'grant management specialist' (similarity score: 84) -> standardized to 'configuration management specialist'
  - 'control management specialist' (similarity score: 84) -> standardized to 'configuration management specialist'
  - 'grants management specialist' (similarity score: 83) -> standardized to 'configuration management specialist'
  - 'category management specialist' (similarity score: 80) -> standardized to 'configuration management specialist'
Remaining titles to process: 9175

Processing base title: 'graduate software engineer'
Found 5 similar titles (threshold=50):
  - 'rd software engineer' (similarity score: 87) -> standardized to 'graduate software engineer'
  - 'lead software engineer' (similarity score: 83) -> standardized to 'graduate software engineer'
  - 'graduate software developer' (similarity score: 83) -

Found 5 similar titles (threshold=50):
  - 'box office supervisor' (similarity score: 86) -> standardized to 'front office supervisor'
  - 'office supervisor' (similarity score: 85) -> standardized to 'front office supervisor'
  - 'community supervision officer' (similarity score: 77) -> standardized to 'front office supervisor'
  - 'service supervisor' (similarity score: 73) -> standardized to 'front office supervisor'
  - 'supervisory asylum officer' (similarity score: 73) -> standardized to 'front office supervisor'
Remaining titles to process: 9079

Processing base title: 'safety and quality coordinator'
Found 5 similar titles (threshold=50):
  - 'sr health and safety coordinator' (similarity score: 84) -> standardized to 'safety and quality coordinator'
  - 'quality systems coordinator' (similarity score: 77) -> standardized to 'safety and quality coordinator'
  - 'quality assurance coordinator' (similarity score: 75) -> standardized to 'safety and quality coordinator'
  - 'gifts 

Found 5 similar titles (threshold=50):
  - 'payroll  benefits specialist' (similarity score: 92) -> standardized to 'lead payroll  benefits specialist'
  - 'senior payroll  benefits specialist' (similarity score: 82) -> standardized to 'lead payroll  benefits specialist'
  - 'benefits coordinator payroll specialist' (similarity score: 82) -> standardized to 'lead payroll  benefits specialist'
  - 'member benefits specialist' (similarity score: 76) -> standardized to 'lead payroll  benefits specialist'
  - 'senior benefits specialist' (similarity score: 76) -> standardized to 'lead payroll  benefits specialist'
Remaining titles to process: 8977

Processing base title: 'director, alumni relations'
Found 5 similar titles (threshold=50):
  - 'director of alumni relations' (similarity score: 94) -> standardized to 'director, alumni relations'
  - 'alumni relations coordinator' (similarity score: 83) -> standardized to 'director, alumni relations'
  - 'associate director, alumni relations' (

Found 5 similar titles (threshold=50):
  - 'director of technical services' (similarity score: 81) -> standardized to 'social services director'
  - 'director of support services' (similarity score: 77) -> standardized to 'social services director'
  - 'director of special events' (similarity score: 76) -> standardized to 'social services director'
  - 'director of social' (similarity score: 76) -> standardized to 'social services director'
  - 'director of social media' (similarity score: 75) -> standardized to 'social services director'
Remaining titles to process: 8887

Processing base title: 'ux specialist'
Found 5 similar titles (threshold=50):
  - 'tax specialist' (similarity score: 89) -> standardized to 'ux specialist'
  - 'specialist' (similarity score: 87) -> standardized to 'ux specialist'
  - 'sr. specialist' (similarity score: 85) -> standardized to 'ux specialist'
  - 'urban specialist' (similarity score: 83) -> standardized to 'ux specialist'
  - 'specialst' (similarity 

Found 5 similar titles (threshold=50):
  - 'chemical technician ii' (similarity score: 71) -> standardized to 'geospatial data technician ii'
  - 'digital imaging technician' (similarity score: 69) -> standardized to 'geospatial data technician ii'
  - 'art restoration technician' (similarity score: 69) -> standardized to 'geospatial data technician ii'
  - 'materials lab technician' (similarity score: 68) -> standardized to 'geospatial data technician ii'
  - 'ophthalmic technician' (similarity score: 68) -> standardized to 'geospatial data technician ii'
Remaining titles to process: 8785

Processing base title: 'information resources and services support specialist'
Found 5 similar titles (threshold=50):
  - 'merchant services operations specialist' (similarity score: 67) -> standardized to 'information resources and services support specialist'
  - 'operations support specialist' (similarity score: 66) -> standardized to 'information resources and services support specialist'
  - 's

Found 5 similar titles (threshold=50):
  - 'tax paraprofessional' (similarity score: 82) -> standardized to 'student paraprofessional'
  - 'student affairs professional ii' (similarity score: 80) -> standardized to 'student paraprofessional'
  - 'paraprofessional' (similarity score: 80) -> standardized to 'student paraprofessional'
  - 'title i paraprofessional' (similarity score: 79) -> standardized to 'student paraprofessional'
  - 'student services professional' (similarity score: 75) -> standardized to 'student paraprofessional'
Remaining titles to process: 8683

Processing base title: 'science manager'
Found 5 similar titles (threshold=50):
  - 'data science manager' (similarity score: 86) -> standardized to 'science manager'
  - 'scientific manager' (similarity score: 85) -> standardized to 'science manager'
  - 'scientist manager' (similarity score: 81) -> standardized to 'science manager'
  - 'site manager' (similarity score: 81) -> standardized to 'science manager'
  - 'succes

Found 5 similar titles (threshold=50):
  - 'principal data scientist' (similarity score: 88) -> standardized to 'principal scientist'
  - 'senior principal scientist' (similarity score: 84) -> standardized to 'principal scientist'
  - 'principal research scientist' (similarity score: 81) -> standardized to 'principal scientist'
  - 'physical scientist' (similarity score: 81) -> standardized to 'principal scientist'
  - 'clinical scientist' (similarity score: 81) -> standardized to 'principal scientist'
Remaining titles to process: 8587

Processing base title: 'associate director, information governance'
Found 5 similar titles (threshold=50):
  - 'associate director, transformation' (similarity score: 84) -> standardized to 'associate director, information governance'
  - 'associate director, finance operations' (similarity score: 82) -> standardized to 'associate director, information governance'
  - 'associate director - financial operations' (similarity score: 78) -> standardized to 

Found 5 similar titles (threshold=50):
  - 'school media specialist' (similarity score: 79) -> standardized to 'farm to school specialist'
  - 'school library media specialist' (similarity score: 75) -> standardized to 'farm to school specialist'
  - 'air traffic control specialist' (similarity score: 73) -> standardized to 'farm to school specialist'
  - 'school budget specialist' (similarity score: 73) -> standardized to 'farm to school specialist'
  - 'payroll specialist' (similarity score: 70) -> standardized to 'farm to school specialist'
Remaining titles to process: 8485

Processing base title: 'marketing automation developer'
Found 5 similar titles (threshold=50):
  - 'marketing automation manager' (similarity score: 79) -> standardized to 'marketing automation developer'
  - 'marketing automation coordinator' (similarity score: 77) -> standardized to 'marketing automation developer'
  - 'development and marketing coordinator' (similarity score: 72) -> standardized to 'marketing

Found 5 similar titles (threshold=50):
  - 'customer service quality manager' (similarity score: 77) -> standardized to 'north american service quality manager'
  - 'support manager, north america' (similarity score: 72) -> standardized to 'north american service quality manager'
  - 'assistant service manager nhs' (similarity score: 69) -> standardized to 'north american service quality manager'
  - 'sr. manager quality engineering' (similarity score: 68) -> standardized to 'north american service quality manager'
  - 'quality control regional manager americas' (similarity score: 68) -> standardized to 'north american service quality manager'
Remaining titles to process: 8389

Processing base title: 'sr. document control specialist'
Found 5 similar titles (threshold=50):
  - 'data  document specialist' (similarity score: 78) -> standardized to 'sr. document control specialist'
  - 'document specialist' (similarity score: 78) -> standardized to 'sr. document control specialist'
  - 'in

Found 5 similar titles (threshold=50):
  - 'manager assistant' (similarity score: 100) -> standardized to 'assistant manager'
  - 'qa assistant manager' (similarity score: 92) -> standardized to 'assistant manager'
  - 'assistant seo manager' (similarity score: 89) -> standardized to 'assistant manager'
  - 'assistant team manager' (similarity score: 87) -> standardized to 'assistant manager'
  - 'management assistant' (similarity score: 86) -> standardized to 'assistant manager'
Remaining titles to process: 8287

Processing base title: 'brand ambassadorsales associate'
Found 5 similar titles (threshold=50):
  - 'associate credit analyst' (similarity score: 62) -> standardized to 'brand ambassadorsales associate'
  - 'actuarial associate' (similarity score: 60) -> standardized to 'brand ambassadorsales associate'
  - 'associate data analyst' (similarity score: 60) -> standardized to 'brand ambassadorsales associate'
  - 'hris analyst associate' (similarity score: 60) -> standardized to

Found 5 similar titles (threshold=50):
  - 'contracts manager' (similarity score: 97) -> standardized to 'contract manager'
  - 'contracting manager' (similarity score: 91) -> standardized to 'contract manager'
  - 'sr contracts manager' (similarity score: 89) -> standardized to 'contract manager'
  - 'content manager' (similarity score: 84) -> standardized to 'contract manager'
  - 'vendor contract manager' (similarity score: 82) -> standardized to 'contract manager'
Remaining titles to process: 8191

Processing base title: 'claims administration manager'
Found 5 similar titles (threshold=50):
  - 'manager, educational administration' (similarity score: 83) -> standardized to 'claims administration manager'
  - 'business administration management' (similarity score: 76) -> standardized to 'claims administration manager'
  - 'pricing administration manager' (similarity score: 75) -> standardized to 'claims administration manager'
  - 'manager records administration' (similarity score: 

Found 5 similar titles (threshold=50):
  - 'human resources and operations manager' (similarity score: 71) -> standardized to 'shipping and warehouse operations manager'
  - 'wholesale operations manager' (similarity score: 70) -> standardized to 'shipping and warehouse operations manager'
  - 'data warehouse operations analyst' (similarity score: 70) -> standardized to 'shipping and warehouse operations manager'
  - 'manager ii, revenue operations' (similarity score: 69) -> standardized to 'shipping and warehouse operations manager'
  - 'samples and partner relationship manager' (similarity score: 69) -> standardized to 'shipping and warehouse operations manager'
Remaining titles to process: 8083

Processing base title: 'sales incentive analyst'
Found 5 similar titles (threshold=50):
  - 'sales marketing analyst' (similarity score: 74) -> standardized to 'sales incentive analyst'
  - 'sr. sales compensation analyst' (similarity score: 73) -> standardized to 'sales incentive analyst'
 

Found 5 similar titles (threshold=50):
  - 'exhibitions manager' (similarity score: 73) -> standardized to 'exhibitions manager and registrar'
  - 'communications and proposal manager' (similarity score: 65) -> standardized to 'exhibitions manager and registrar'
  - 'archivist and records manager' (similarity score: 65) -> standardized to 'exhibitions manager and registrar'
  - 'communications and outreach manager' (similarity score: 65) -> standardized to 'exhibitions manager and registrar'
  - 'communications and marketing manager' (similarity score: 64) -> standardized to 'exhibitions manager and registrar'
Remaining titles to process: 7981

Processing base title: 'product and inventory manager'
Found 5 similar titles (threshold=50):
  - 'product manager, director' (similarity score: 83) -> standardized to 'product and inventory manager'
  - 'web product and content manager' (similarity score: 80) -> standardized to 'product and inventory manager'
  - 'investment product manager' (s

Found 5 similar titles (threshold=50):
  - 'writer and researcher' (similarity score: 100) -> standardized to 'researcher and writer'
  - 'lead ux researcher' (similarity score: 72) -> standardized to 'researcher and writer'
  - 'games user researcher' (similarity score: 71) -> standardized to 'researcher and writer'
  - 'freelance writer' (similarity score: 70) -> standardized to 'researcher and writer'
  - 'security researcher' (similarity score: 70) -> standardized to 'researcher and writer'
Remaining titles to process: 7873

Processing base title: 'senior analystinvestment performance  risk'
Found 5 similar titles (threshold=50):
  - 'programmer analyst senior' (similarity score: 67) -> standardized to 'senior analystinvestment performance  risk'
  - 'senior analyst programmer' (similarity score: 67) -> standardized to 'senior analystinvestment performance  risk'
  - 'senior credit risk analyst' (similarity score: 66) -> standardized to 'senior analystinvestment performance  risk'


Found 5 similar titles (threshold=50):
  - 'lead designer' (similarity score: 79) -> standardized to 'floral designer'
  - 'designer  developer' (similarity score: 73) -> standardized to 'floral designer'
  - 'junior designer' (similarity score: 73) -> standardized to 'floral designer'
  - 'game designer' (similarity score: 71) -> standardized to 'floral designer'
  - 'jewelry designer' (similarity score: 71) -> standardized to 'floral designer'
Remaining titles to process: 7753

Processing base title: 'product manager - technical'
Found 5 similar titles (threshold=50):
  - 'technical product manager' (similarity score: 100) -> standardized to 'product manager - technical'
  - 'staff technical product manager' (similarity score: 89) -> standardized to 'product manager - technical'
  - 'technical product owner' (similarity score: 88) -> standardized to 'product manager - technical'
  - 'senior product manager - technical' (similarity score: 88) -> standardized to 'product manager - tech

Found 5 similar titles (threshold=50):
  - 'account managerclient relations' (similarity score: 66) -> standardized to 'agency cost allocation accountant'
  - 'construction project accountant' (similarity score: 66) -> standardized to 'agency cost allocation accountant'
  - 'operations and accounting lead' (similarity score: 63) -> standardized to 'agency cost allocation accountant'
  - 'national account manager' (similarity score: 63) -> standardized to 'agency cost allocation accountant'
  - 'international accountant' (similarity score: 63) -> standardized to 'agency cost allocation accountant'
Remaining titles to process: 7639

Processing base title: 'design engineer'
Found 5 similar titles (threshold=50):
  - 'design engineer 2' (similarity score: 94) -> standardized to 'design engineer'
  - 'rfic design engineer' (similarity score: 86) -> standardized to 'design engineer'
  - 'devops engineer' (similarity score: 80) -> standardized to 'design engineer'
  - 'digital design engineer

Found 5 similar titles (threshold=50):
  - 'vice president and general counsel' (similarity score: 59) -> standardized to 'museum educator and planetarium presenter'
  - 'vice president, learning and development' (similarity score: 57) -> standardized to 'museum educator and planetarium presenter'
  - 'senior director, brand marketing' (similarity score: 56) -> standardized to 'museum educator and planetarium presenter'
  - 'music director and organist' (similarity score: 56) -> standardized to 'museum educator and planetarium presenter'
  - 'vice president, grants and contracts' (similarity score: 55) -> standardized to 'museum educator and planetarium presenter'
Remaining titles to process: 7535

Processing base title: 'content writercopywriter'
Found 5 similar titles (threshold=50):
  - 'content manager  copywriter' (similarity score: 72) -> standardized to 'content writercopywriter'
  - 'copywriter  content creator' (similarity score: 72) -> standardized to 'content writercopywrite

Found 5 similar titles (threshold=50):
  - 'data admin' (similarity score: 77) -> standardized to 'admin  data entry'
  - 'data entry' (similarity score: 77) -> standardized to 'admin  data entry'
  - 'data entry associate' (similarity score: 72) -> standardized to 'admin  data entry'
  - 'data entry clerk' (similarity score: 69) -> standardized to 'admin  data entry'
  - 'sr admin asst' (similarity score: 69) -> standardized to 'admin  data entry'
Remaining titles to process: 7415

Processing base title: 'public health specialist'
Found 5 similar titles (threshold=50):
  - 'public affairs specialist' (similarity score: 78) -> standardized to 'public health specialist'
  - 'health insurance specialist' (similarity score: 78) -> standardized to 'public health specialist'
  - 'public health communication specialist' (similarity score: 77) -> standardized to 'public health specialist'
  - 'public information specialist' (similarity score: 75) -> standardized to 'public health specialist'


Found 5 similar titles (threshold=50):
  - 'senior compliance associate' (similarity score: 85) -> standardized to 'compliance associate'
  - 'compliance assistant' (similarity score: 85) -> standardized to 'compliance associate'
  - 'office associate' (similarity score: 78) -> standardized to 'compliance associate'
  - 'client associate' (similarity score: 78) -> standardized to 'compliance associate'
  - 'corporate finance associate' (similarity score: 77) -> standardized to 'compliance associate'
Remaining titles to process: 7295

Processing base title: 'principal product analyst'
Found 5 similar titles (threshold=50):
  - 'principal project manager' (similarity score: 76) -> standardized to 'principal product analyst'
  - 'principal product engineer' (similarity score: 75) -> standardized to 'principal product analyst'
  - 'principal carrier support analyst' (similarity score: 72) -> standardized to 'principal product analyst'
  - 'senior principal system analyst' (similarity score

Found 5 similar titles (threshold=50):
  - 'bar staff' (similarity score: 82) -> standardized to 'staff rn'
  - 'staff nurse' (similarity score: 74) -> standardized to 'staff rn'
  - 'assurance staff' (similarity score: 70) -> standardized to 'staff rn'
  - 'design staff' (similarity score: 70) -> standardized to 'staff rn'
  - 'staff editor' (similarity score: 70) -> standardized to 'staff rn'
Remaining titles to process: 7175

Processing base title: 'marketing executive'
Found 5 similar titles (threshold=50):
  - 'email marketing executive' (similarity score: 86) -> standardized to 'marketing executive'
  - 'senior marketing executive' (similarity score: 84) -> standardized to 'marketing executive'
  - 'casino marketing executive' (similarity score: 84) -> standardized to 'marketing executive'
  - 'digital marketing executive' (similarity score: 83) -> standardized to 'marketing executive'
  - 'executive manager' (similarity score: 78) -> standardized to 'marketing executive'
Remaini

Found 5 similar titles (threshold=50):
  - 'senior trial attorney' (similarity score: 71) -> standardized to 'title review attorney'
  - 'securities attorney' (similarity score: 70) -> standardized to 'title review attorney'
  - 'document review attorney' (similarity score: 67) -> standardized to 'title review attorney'
  - 'attorney editor' (similarity score: 67) -> standardized to 'title review attorney'
  - 'general attorney' (similarity score: 65) -> standardized to 'title review attorney'
Remaining titles to process: 7061

Processing base title: 'senior hris analyst'
Found 5 similar titles (threshold=50):
  - 'senior risk analyst' (similarity score: 95) -> standardized to 'senior hris analyst'
  - 'senior fiscal analyst' (similarity score: 85) -> standardized to 'senior hris analyst'
  - 'senior records analyst' (similarity score: 83) -> standardized to 'senior hris analyst'
  - 'senior gis data analyst' (similarity score: 81) -> standardized to 'senior hris analyst'
  - 'senior s

Found 5 similar titles (threshold=50):
  - 'trust officer' (similarity score: 88) -> standardized to 'test officer'
  - 'trustee officer' (similarity score: 81) -> standardized to 'test officer'
  - 'press officer' (similarity score: 80) -> standardized to 'test officer'
  - 'title officer' (similarity score: 80) -> standardized to 'test officer'
  - 'risk officer' (similarity score: 75) -> standardized to 'test officer'
Remaining titles to process: 6941

Processing base title: 'processing archivist'
Found 5 similar titles (threshold=50):
  - 'process chemist' (similarity score: 74) -> standardized to 'processing archivist'
  - 'data processing' (similarity score: 74) -> standardized to 'processing archivist'
  - 'processing archivist and records manager' (similarity score: 67) -> standardized to 'processing archivist'
  - 'corporate archivist' (similarity score: 67) -> standardized to 'processing archivist'
  - 'archivistrecords manager' (similarity score: 64) -> standardized to 'proc

Found 5 similar titles (threshold=50):
  - 'change management consultant' (similarity score: 70) -> standardized to 'director internal comms  change management'
  - 'director, initiative management' (similarity score: 68) -> standardized to 'director internal comms  change management'
  - 'director, community engagement' (similarity score: 66) -> standardized to 'director internal comms  change management'
  - 'community engagement director' (similarity score: 66) -> standardized to 'director internal comms  change management'
  - 'internal audit manager' (similarity score: 63) -> standardized to 'director internal comms  change management'
Remaining titles to process: 6821

Processing base title: 'quality management consultant'
Found 5 similar titles (threshold=50):
  - 'safety management consultant' (similarity score: 88) -> standardized to 'quality management consultant'
  - 'management product consultant' (similarity score: 83) -> standardized to 'quality management consultant'
  -

Found 5 similar titles (threshold=50):
  - 'marketing operations manager' (similarity score: 79) -> standardized to 'public relations and marketing manager'
  - 'public relations manager' (similarity score: 77) -> standardized to 'public relations and marketing manager'
  - 'sales and marketing manager' (similarity score: 77) -> standardized to 'public relations and marketing manager'
  - 'marketing and public relations specialist' (similarity score: 76) -> standardized to 'public relations and marketing manager'
  - 'marketingcommunications manager' (similarity score: 75) -> standardized to 'public relations and marketing manager'
Remaining titles to process: 6695

Processing base title: 'director of grants'
Found 5 similar titles (threshold=50):
  - 'grants director' (similarity score: 91) -> standardized to 'director of grants'
  - 'grant director' (similarity score: 88) -> standardized to 'director of grants'
  - 'director of grants admin' (similarity score: 86) -> standardized to 

Found 5 similar titles (threshold=50):
  - 'director of legislative affairs' (similarity score: 95) -> standardized to 'legislative affairs director'
  - 'legislative director' (similarity score: 83) -> standardized to 'legislative affairs director'
  - 'director of legislative and public affairs' (similarity score: 80) -> standardized to 'legislative affairs director'
  - 'director, federal affairs' (similarity score: 73) -> standardized to 'legislative affairs director'
  - 'director of medical affairs' (similarity score: 73) -> standardized to 'legislative affairs director'
Remaining titles to process: 6563

Processing base title: 'wetland biologist'
Found 5 similar titles (threshold=50):
  - 'marine biologist' (similarity score: 73) -> standardized to 'wetland biologist'
  - 'molecular biologist' (similarity score: 72) -> standardized to 'wetland biologist'
  - 'field ecologist' (similarity score: 69) -> standardized to 'wetland biologist'
  - 'computational biologist' (similarity 

Found 5 similar titles (threshold=50):
  - 'implementation specialist' (similarity score: 75) -> standardized to 'ops tech implementation'
  - 'technical implementation manager' (similarity score: 73) -> standardized to 'ops tech implementation'
  - 'software implementation analyst' (similarity score: 67) -> standardized to 'ops tech implementation'
  - 'event implementation specialist' (similarity score: 67) -> standardized to 'ops tech implementation'
  - 'se implementation consultant' (similarity score: 67) -> standardized to 'ops tech implementation'
Remaining titles to process: 6431

Processing base title: 'senior registered associate'
Found 5 similar titles (threshold=50):
  - 'senior associate, registered architect' (similarity score: 84) -> standardized to 'senior registered associate'
  - 'senior associate scientist' (similarity score: 79) -> standardized to 'senior registered associate'
  - 'audit senior associate' (similarity score: 78) -> standardized to 'senior registered 

Found 5 similar titles (threshold=50):
  - 'postdoctoral scholar' (similarity score: 78) -> standardized to 'post-doctoral scholar'
  - 'doctoral researcher' (similarity score: 65) -> standardized to 'post-doctoral scholar'
  - 'post doctoral fellow' (similarity score: 63) -> standardized to 'post-doctoral scholar'
  - 'post-doctoral fellow' (similarity score: 63) -> standardized to 'post-doctoral scholar'
  - 'hospital doctor' (similarity score: 61) -> standardized to 'post-doctoral scholar'
Remaining titles to process: 6293

Processing base title: 'associate head, library'
Found 5 similar titles (threshold=50):
  - 'library associate' (similarity score: 87) -> standardized to 'associate head, library'
  - 'library associate iii' (similarity score: 84) -> standardized to 'associate head, library'
  - 'associate library director' (similarity score: 79) -> standardized to 'associate head, library'
  - 'assoicate library director' (similarity score: 75) -> standardized to 'associate head

Found 5 similar titles (threshold=50):
  - 'sales support specialist' (similarity score: 84) -> standardized to 'mission support specialist'
  - 'it support specialist ii' (similarity score: 84) -> standardized to 'mission support specialist'
  - 'client support specialist' (similarity score: 82) -> standardized to 'mission support specialist'
  - 'marketing support specialist' (similarity score: 81) -> standardized to 'mission support specialist'
  - 'sales support senior specialist' (similarity score: 81) -> standardized to 'mission support specialist'
Remaining titles to process: 6161

Processing base title: 'head of adult services'
Found 5 similar titles (threshold=50):
  - 'head of childrens services' (similarity score: 75) -> standardized to 'head of adult services'
  - 'head of user services' (similarity score: 74) -> standardized to 'head of adult services'
  - 'head of teen services' (similarity score: 74) -> standardized to 'head of adult services'
  - 'director of home based

Remaining titles to process: 6023

Processing base title: 'manager learning  training'
Found 5 similar titles (threshold=50):
  - 'manager, learning  media' (similarity score: 77) -> standardized to 'manager learning  training'
  - 'training  capacity manager' (similarity score: 76) -> standardized to 'manager learning  training'
  - 'manager pv compliance  training' (similarity score: 73) -> standardized to 'manager learning  training'
  - 'manager - training and quality' (similarity score: 72) -> standardized to 'manager learning  training'
  - 'quality and training manager' (similarity score: 72) -> standardized to 'manager learning  training'
Remaining titles to process: 6017

Processing base title: 'credentialing specialist'
Found 5 similar titles (threshold=50):
  - 'medical credentialing specialist' (similarity score: 86) -> standardized to 'credentialing specialist'
  - 'hospital credentialing' (similarity score: 83) -> standardized to 'credentialing specialist'
  - 'scheduling

Found 5 similar titles (threshold=50):
  - 'police office assistant' (similarity score: 83) -> standardized to 'remote office assistant'
  - 'office assistant' (similarity score: 82) -> standardized to 'remote office assistant'
  - 'office support assistant' (similarity score: 81) -> standardized to 'remote office assistant'
  - 'office assistant 4' (similarity score: 78) -> standardized to 'remote office assistant'
  - 'assistant professor - fixed term' (similarity score: 72) -> standardized to 'remote office assistant'
Remaining titles to process: 5879

Processing base title: 'owner and ceo'
Found 5 similar titles (threshold=50):
  - 'owner and founder' (similarity score: 73) -> standardized to 'owner and ceo'
  - 'partnerowner' (similarity score: 64) -> standardized to 'owner and ceo'
  - 'advice worker' (similarity score: 62) -> standardized to 'owner and ceo'
  - 'senior brand writer' (similarity score: 62) -> standardized to 'owner and ceo'
  - 'lead ad writer' (similarity score:

Found 5 similar titles (threshold=50):
  - 'medical lab technician, ascp' (similarity score: 90) -> standardized to 'medical lab technician'
  - 'emergency medical technician' (similarity score: 76) -> standardized to 'medical lab technician'
  - 'biological technician' (similarity score: 74) -> standardized to 'medical lab technician'
  - 'electromechanical technician' (similarity score: 72) -> standardized to 'medical lab technician'
  - 'gate service technician' (similarity score: 71) -> standardized to 'medical lab technician'
Remaining titles to process: 5729

Processing base title: 'security officer'
Found 5 similar titles (threshold=50):
  - 'casino security officer' (similarity score: 82) -> standardized to 'security officer'
  - 'special security officer' (similarity score: 80) -> standardized to 'security officer'
  - 'facility security officer' (similarity score: 78) -> standardized to 'security officer'
  - 'security concierge' (similarity score: 76) -> standardized to 'sec

Found 5 similar titles (threshold=50):
  - 'advertising and events manager' (similarity score: 84) -> standardized to 'brand advertising manager'
  - 'digital advertising manager' (similarity score: 81) -> standardized to 'brand advertising manager'
  - 'brand asset manager' (similarity score: 77) -> standardized to 'brand advertising manager'
  - 'design manager' (similarity score: 72) -> standardized to 'brand advertising manager'
  - 'hr diversity manager' (similarity score: 71) -> standardized to 'brand advertising manager'
Remaining titles to process: 5585

Processing base title: 'success coachprogram coordinator'
Found 5 similar titles (threshold=50):
  - 'study abroad coordinator' (similarity score: 68) -> standardized to 'success coachprogram coordinator'
  - 'childrens services coordinator' (similarity score: 65) -> standardized to 'success coachprogram coordinator'
  - 'supply chain coordinator' (similarity score: 64) -> standardized to 'success coachprogram coordinator'
  - 

Found 5 similar titles (threshold=50):
  - 'mechanical engineer ii' (similarity score: 98) -> standardized to 'mechanical engineer iii'
  - 'mechanical engineer i' (similarity score: 95) -> standardized to 'mechanical engineer iii'
  - 'mechanical engineering' (similarity score: 89) -> standardized to 'mechanical engineer iii'
  - 'facilities mechanical engineer' (similarity score: 87) -> standardized to 'mechanical engineer iii'
  - 'mechanical engineer 2' (similarity score: 86) -> standardized to 'mechanical engineer iii'
Remaining titles to process: 5429

Processing base title: 'librarian  senior library specialist'
Found 5 similar titles (threshold=50):
  - 'senior research area specialist' (similarity score: 76) -> standardized to 'librarian  senior library specialist'
  - 'senior data  policy specialist' (similarity score: 75) -> standardized to 'librarian  senior library specialist'
  - 'senior data quality specialist' (similarity score: 74) -> standardized to 'librarian  senior

Found 5 similar titles (threshold=50):
  - 'accounts trainee' (similarity score: 76) -> standardized to 'audit trainee'
  - 'trainee solicitor' (similarity score: 67) -> standardized to 'audit trainee'
  - 'tax auditor' (similarity score: 67) -> standardized to 'audit trainee'
  - 'training admin' (similarity score: 67) -> standardized to 'audit trainee'
  - 'revenue auditor' (similarity score: 64) -> standardized to 'audit trainee'
Remaining titles to process: 5279

Processing base title: 'intermediate architectural designer'
Found 5 similar titles (threshold=50):
  - 'intermediate architect' (similarity score: 77) -> standardized to 'intermediate architectural designer'
  - 'architectural project managerdesigner' (similarity score: 72) -> standardized to 'intermediate architectural designer'
  - 'business analyst-intermediate' (similarity score: 66) -> standardized to 'intermediate architectural designer'
  - 'architectural drafting technician ii' (similarity score: 65) -> standardiz

Found 5 similar titles (threshold=50):
  - 'admin assistant iii' (similarity score: 83) -> standardized to 'intake admin assistant'
  - 'admin assistant freelance' (similarity score: 77) -> standardized to 'intake admin assistant'
  - 'senior admin assistant' (similarity score: 77) -> standardized to 'intake admin assistant'
  - 'admin. assistantaccounts payable' (similarity score: 72) -> standardized to 'intake admin assistant'
  - 'maintenance administrative assistant' (similarity score: 72) -> standardized to 'intake admin assistant'
Remaining titles to process: 5201

Processing base title: 'staff associate - transfer services'
Found 5 similar titles (threshold=50):
  - 'university services associate' (similarity score: 71) -> standardized to 'staff associate - transfer services'
  - 'associate professor of business' (similarity score: 66) -> standardized to 'staff associate - transfer services'
  - 'hrstaffing associate' (similarity score: 64) -> standardized to 'staff associate - 

Found 5 similar titles (threshold=50):
  - 'contract administrator' (similarity score: 98) -> standardized to 'contracts administrator'
  - 'subcontracts administrator ii' (similarity score: 88) -> standardized to 'contracts administrator'
  - 'government contracts administrator' (similarity score: 81) -> standardized to 'contracts administrator'
  - 'construction administration' (similarity score: 80) -> standardized to 'contracts administrator'
  - 'collections administrator' (similarity score: 79) -> standardized to 'contracts administrator'
Remaining titles to process: 5117

Processing base title: 'data governance'
Found 5 similar titles (threshold=50):
  - 'data governance analyst' (similarity score: 79) -> standardized to 'data governance'
  - 'data science' (similarity score: 67) -> standardized to 'data governance'
  - 'data mechanic' (similarity score: 64) -> standardized to 'data governance'
  - 'maintenance' (similarity score: 62) -> standardized to 'data governance'
  - 'da

Found 5 similar titles (threshold=50):
  - 'marketing and business development manager' (similarity score: 68) -> standardized to 'business manager and client service officer'
  - 'supervisor - financial and business services' (similarity score: 68) -> standardized to 'business manager and client service officer'
  - 'senior client services manager' (similarity score: 68) -> standardized to 'business manager and client service officer'
  - 'client services officer' (similarity score: 67) -> standardized to 'business manager and client service officer'
  - 'sr. business process manager' (similarity score: 66) -> standardized to 'business manager and client service officer'
Remaining titles to process: 5036

Processing base title: 'graphics supervisor'
Found 5 similar titles (threshold=50):
  - 'cash apps supervisor' (similarity score: 77) -> standardized to 'graphics supervisor'
  - 'logistics supervisor' (similarity score: 77) -> standardized to 'graphics supervisor'
  - 'programmatic 

Found 5 similar titles (threshold=50):
  - 'actuary 1' (similarity score: 88) -> standardized to 'actuary'
  - 'avp  actuary' (similarity score: 78) -> standardized to 'actuary'
  - 'senioractuary' (similarity score: 70) -> standardized to 'actuary'
  - 'actor' (similarity score: 67) -> standardized to 'actuary'
  - 'actuarial analyst' (similarity score: 58) -> standardized to 'actuary'
Remaining titles to process: 4952

Processing base title: 'appellate judicial law clerk'
Found 5 similar titles (threshold=50):
  - 'judicial law clerk' (similarity score: 78) -> standardized to 'appellate judicial law clerk'
  - 'judicial clerk' (similarity score: 67) -> standardized to 'appellate judicial law clerk'
  - 'career law clerk' (similarity score: 59) -> standardized to 'appellate judicial law clerk'
  - 'clinical lab associate' (similarity score: 56) -> standardized to 'appellate judicial law clerk'
  - 'electrician apprentice' (similarity score: 56) -> standardized to 'appellate judicial l

Found 5 similar titles (threshold=50):
  - 'registered nurse, clinic' (similarity score: 82) -> standardized to 'registered nurse'
  - 'registered nurse - cardiac cath' (similarity score: 71) -> standardized to 'registered nurse'
  - 'critical care registered nurse' (similarity score: 70) -> standardized to 'registered nurse'
  - 'nursery nurse' (similarity score: 69) -> standardized to 'registered nurse'
  - 'rn - nurse supervisor' (similarity score: 63) -> standardized to 'registered nurse'
Remaining titles to process: 4862

Processing base title: 'process development supervisor'
Found 5 similar titles (threshold=50):
  - 'loan processing supervisor' (similarity score: 79) -> standardized to 'process development supervisor'
  - 'senior process development engineer' (similarity score: 77) -> standardized to 'process development supervisor'
  - 'senior development officer' (similarity score: 75) -> standardized to 'process development supervisor'
  - 'senior director development' (simi

Found 5 similar titles (threshold=50):
  - 'government worker' (similarity score: 69) -> standardized to 'recovery worker'
  - 'grocery stocker' (similarity score: 67) -> standardized to 'recovery worker'
  - 'outreach worker' (similarity score: 67) -> standardized to 'recovery worker'
  - 'countryside worker' (similarity score: 67) -> standardized to 'recovery worker'
  - 'caseworker' (similarity score: 64) -> standardized to 'recovery worker'
Remaining titles to process: 4778

Processing base title: 'legal assistant - office of general counsel'
Found 5 similar titles (threshold=50):
  - 'assistant director of health  counseling services' (similarity score: 70) -> standardized to 'legal assistant - office of general counsel'
  - 'bilingual office assistant' (similarity score: 66) -> standardized to 'legal assistant - office of general counsel'
  - 'office managerfinancial assistant' (similarity score: 65) -> standardized to 'legal assistant - office of general counsel'
  - 'office  co

Found 5 similar titles (threshold=50):
  - 'senior administration officer' (similarity score: 81) -> standardized to 'senior fraud investigations officer'
  - 'senior communications officer' (similarity score: 75) -> standardized to 'senior fraud investigations officer'
  - 'senior administrative officer' (similarity score: 75) -> standardized to 'senior fraud investigations officer'
  - 'senior intelligence officer' (similarity score: 68) -> standardized to 'senior fraud investigations officer'
  - 'administrative support officer' (similarity score: 65) -> standardized to 'senior fraud investigations officer'
Remaining titles to process: 4688

Processing base title: 'policy officer'
Found 5 similar titles (threshold=50):
  - 'officer, policy' (similarity score: 100) -> standardized to 'policy officer'
  - 'police  officer' (similarity score: 93) -> standardized to 'policy officer'
  - 'privacy officer' (similarity score: 83) -> standardized to 'policy officer'
  - 'senior policy offic

Found 5 similar titles (threshold=50):
  - 'senior benefits manager' (similarity score: 82) -> standardized to 'benefits manager'
  - 'hr and benefits manager' (similarity score: 82) -> standardized to 'benefits manager'
  - 'benefits program manager' (similarity score: 80) -> standardized to 'benefits manager'
  - 'manager erp its' (similarity score: 77) -> standardized to 'benefits manager'
  - 'benefits and leave manager' (similarity score: 76) -> standardized to 'benefits manager'
Remaining titles to process: 4604

Processing base title: 'clinical auditor'
Found 5 similar titles (threshold=50):
  - 'compliance auditor' (similarity score: 76) -> standardized to 'clinical auditor'
  - 'technical author' (similarity score: 75) -> standardized to 'clinical auditor'
  - 'auditor iii' (similarity score: 74) -> standardized to 'clinical auditor'
  - 'night auditor' (similarity score: 69) -> standardized to 'clinical auditor'
  - 'sales auditor' (similarity score: 69) -> standardized to 'c

Found 5 similar titles (threshold=50):
  - 'service core director' (similarity score: 89) -> standardized to 'director, career services'
  - 'director, advancement services' (similarity score: 79) -> standardized to 'director, career services'
  - 'director business services' (similarity score: 76) -> standardized to 'director, career services'
  - 'director of desktop services' (similarity score: 73) -> standardized to 'director, career services'
  - 'direct care worker' (similarity score: 71) -> standardized to 'director, career services'
Remaining titles to process: 4520

Processing base title: 'digital channel manager'
Found 5 similar titles (threshold=50):
  - 'digital campaigns manager' (similarity score: 79) -> standardized to 'digital channel manager'
  - 'digital delivery manager' (similarity score: 77) -> standardized to 'digital channel manager'
  - 'digital  design manager' (similarity score: 76) -> standardized to 'digital channel manager'
  - 'digital media campaign manag

Found 5 similar titles (threshold=50):
  - 'camera hardware engineer' (similarity score: 75) -> standardized to 'electronics hardware engineer'
  - 'hardware engineer' (similarity score: 74) -> standardized to 'electronics hardware engineer'
  - 'hardware engineer i' (similarity score: 71) -> standardized to 'electronics hardware engineer'
  - 'electrical engineering manager' (similarity score: 71) -> standardized to 'electronics hardware engineer'
  - 'electrical systems engineer' (similarity score: 68) -> standardized to 'electronics hardware engineer'
Remaining titles to process: 4430

Processing base title: 'front of store attendant'
Found 5 similar titles (threshold=50):
  - 'food stand attendant' (similarity score: 73) -> standardized to 'front of store attendant'
  - 'home attendant hha' (similarity score: 62) -> standardized to 'front of store attendant'
  - 'on-site maintenance' (similarity score: 60) -> standardized to 'front of store attendant'
  - 'flight attendant' (simila

Found 5 similar titles (threshold=50):
  - 'technology consultant' (similarity score: 86) -> standardized to 'senior technology consultant'
  - 'senior technical sales consultant' (similarity score: 79) -> standardized to 'senior technology consultant'
  - 'sr tech consultant' (similarity score: 78) -> standardized to 'senior technology consultant'
  - 'technology consulting manager' (similarity score: 77) -> standardized to 'senior technology consultant'
  - 'it technical consultant' (similarity score: 75) -> standardized to 'senior technology consultant'
Remaining titles to process: 4334

Processing base title: 'grant offier'
Found 5 similar titles (threshold=50):
  - 'hr officer' (similarity score: 73) -> standardized to 'grant offier'
  - 'health officer' (similarity score: 69) -> standardized to 'grant offier'
  - 'clerical officer' (similarity score: 64) -> standardized to 'grant offier'
  - 'officer' (similarity score: 63) -> standardized to 'grant offier'
  - 'asylum officer' (

Found 5 similar titles (threshold=50):
  - 'marriage  family therapist' (similarity score: 78) -> standardized to 'family therapist'
  - 'senior therapist' (similarity score: 69) -> standardized to 'family therapist'
  - 'residential therapist' (similarity score: 65) -> standardized to 'family therapist'
  - 'group therapist' (similarity score: 65) -> standardized to 'family therapist'
  - 'speech therapist' (similarity score: 62) -> standardized to 'family therapist'
Remaining titles to process: 4226

Processing base title: 'brewery sales manager, beer brand manager'
Found 5 similar titles (threshold=50):
  - 'branch manager  childrens services manager' (similarity score: 67) -> standardized to 'brewery sales manager, beer brand manager'
  - 'general manager, inventory manager' (similarity score: 66) -> standardized to 'brewery sales manager, beer brand manager'
  - 'general manager  quality manager' (similarity score: 65) -> standardized to 'brewery sales manager, beer brand manager'

Found 5 similar titles (threshold=50):
  - 'director of ecommerce' (similarity score: 78) -> standardized to 'ecommerce marketing director'
  - 'director, ecommerce' (similarity score: 78) -> standardized to 'ecommerce marketing director'
  - 'director, growth marketing' (similarity score: 75) -> standardized to 'ecommerce marketing director'
  - 'director, loyalty marketing' (similarity score: 74) -> standardized to 'ecommerce marketing director'
  - 'director, meetings  events' (similarity score: 69) -> standardized to 'ecommerce marketing director'
Remaining titles to process: 4118

Processing base title: 'nurse'
Found 5 similar titles (threshold=50):
  - 'scrub nurse' (similarity score: 62) -> standardized to 'nurse'
  - 'charge nurse' (similarity score: 59) -> standardized to 'nurse'
  - 'school nurse' (similarity score: 59) -> standardized to 'nurse'
  - 'se' (similarity score: 57) -> standardized to 'nurse'
  - 'purchaser' (similarity score: 57) -> standardized to 'nurse'
Remain

Found 5 similar titles (threshold=50):
  - 'associate director of admission' (similarity score: 98) -> standardized to 'associate director of admissions'
  - 'associate director of advancement' (similarity score: 80) -> standardized to 'associate director of admissions'
  - 'associate director of analytics' (similarity score: 79) -> standardized to 'associate director of admissions'
  - 'associate director of accounting' (similarity score: 78) -> standardized to 'associate director of admissions'
  - 'assistant dean of admissions' (similarity score: 77) -> standardized to 'associate director of admissions'
Remaining titles to process: 4016

Processing base title: 'deputy district attorney'
Found 5 similar titles (threshold=50):
  - 'deputy district attorney iv' (similarity score: 94) -> standardized to 'deputy district attorney'
  - 'deputy prosecuting attorney' (similarity score: 75) -> standardized to 'deputy district attorney'
  - 'deputy attorney general' (similarity score: 72) -> 

Found 5 similar titles (threshold=50):
  - 'elearning designer' (similarity score: 75) -> standardized to 'learning designer project lead'
  - 'digital learning designer' (similarity score: 69) -> standardized to 'learning designer project lead'
  - 'project manager, global learning' (similarity score: 66) -> standardized to 'learning designer project lead'
  - 'lead software designer' (similarity score: 65) -> standardized to 'learning designer project lead'
  - 'project engineer team lead' (similarity score: 64) -> standardized to 'learning designer project lead'
Remaining titles to process: 3914

Processing base title: 'lobbyist'
Found 5 similar titles (threshold=50):
  - 'typist' (similarity score: 57) -> standardized to 'lobbyist'
  - 'hydrologist' (similarity score: 53) -> standardized to 'lobbyist'
  - 'biologist 1' (similarity score: 53) -> standardized to 'lobbyist'
  - 'botanist' (similarity score: 50) -> standardized to 'lobbyist'
  - 'geophysicist' (similarity score: 50) ->

Found 5 similar titles (threshold=50):
  - 'interior stylist' (similarity score: 74) -> standardized to 'senior hair stylist'
  - 'senior payroll specialist' (similarity score: 68) -> standardized to 'senior hair stylist'
  - 'studio stylist' (similarity score: 67) -> standardized to 'senior hair stylist'
  - 'senior supply chain specialist' (similarity score: 65) -> standardized to 'senior hair stylist'
  - 'senior study director' (similarity score: 65) -> standardized to 'senior hair stylist'
Remaining titles to process: 3812

Processing base title: 'auditor staff'
Found 5 similar titles (threshold=50):
  - 'staff editorwriter' (similarity score: 71) -> standardized to 'auditor staff'
  - 'public safety auditor' (similarity score: 65) -> standardized to 'auditor staff'
  - 'program auditor' (similarity score: 64) -> standardized to 'auditor staff'
  - 'assistant staff' (similarity score: 64) -> standardized to 'auditor staff'
  - 'ad traffic coordinator' (similarity score: 63) -> sta

Found 5 similar titles (threshold=50):
  - 'hospital claims analyst' (similarity score: 70) -> standardized to 'supply chain analyst'
  - 'campaign analyst' (similarity score: 67) -> standardized to 'supply chain analyst'
  - 'grants analyst' (similarity score: 65) -> standardized to 'supply chain analyst'
  - 'insurance analyst' (similarity score: 65) -> standardized to 'supply chain analyst'
  - 'executive pay analyst' (similarity score: 63) -> standardized to 'supply chain analyst'
Remaining titles to process: 3710

Processing base title: 'customer service support'
Found 5 similar titles (threshold=50):
  - 'customer servicetech support agent' (similarity score: 83) -> standardized to 'customer service support'
  - 'customer service co-worker' (similarity score: 76) -> standardized to 'customer service support'
  - 'customer support readiness manager' (similarity score: 72) -> standardized to 'customer service support'
  - 'customer service technician' (similarity score: 71) -> stan

Found 5 similar titles (threshold=50):
  - 'teaching and learning librarian  coordinator of reference services' (similarity score: 61) -> standardized to 'teen services and digital creativity librarian'
  - 'director of library and database services' (similarity score: 60) -> standardized to 'teen services and digital creativity librarian'
  - 'digital creative director' (similarity score: 59) -> standardized to 'teen services and digital creativity librarian'
  - 'director of international student and scholar services' (similarity score: 58) -> standardized to 'teen services and digital creativity librarian'
  - 'chief digital and information officer' (similarity score: 58) -> standardized to 'teen services and digital creativity librarian'
Remaining titles to process: 3602

Processing base title: 'child support supervisor'
Found 5 similar titles (threshold=50):
  - 'supply chain supervisor' (similarity score: 81) -> standardized to 'child support supervisor'
  - 'legislative support 

Found 5 similar titles (threshold=50):
  - 'it support assistant' (similarity score: 88) -> standardized to 'pupil support assistant'
  - 'business support assistant' (similarity score: 82) -> standardized to 'pupil support assistant'
  - 'learning support assistant' (similarity score: 78) -> standardized to 'pupil support assistant'
  - 'business support associate' (similarity score: 69) -> standardized to 'pupil support assistant'
  - 'assistant store leader' (similarity score: 67) -> standardized to 'pupil support assistant'
Remaining titles to process: 3488

Processing base title: 'mortgage servicing manager'
Found 5 similar titles (threshold=50):
  - 'mortgage operations manager' (similarity score: 79) -> standardized to 'mortgage servicing manager'
  - 'member service manager' (similarity score: 71) -> standardized to 'mortgage servicing manager'
  - 'offering manager' (similarity score: 67) -> standardized to 'mortgage servicing manager'
  - 'manager ii rehab services' (similari

Found 5 similar titles (threshold=50):
  - 'beamline scientist' (similarity score: 80) -> standardized to 'applied scientist'
  - 'advanced scientist' (similarity score: 74) -> standardized to 'applied scientist'
  - 'scientific aid' (similarity score: 71) -> standardized to 'applied scientist'
  - 'geoscientist' (similarity score: 69) -> standardized to 'applied scientist'
  - 'computer scientist' (similarity score: 69) -> standardized to 'applied scientist'
Remaining titles to process: 3356

Processing base title: 'director of equitable core mission'
Found 5 similar titles (threshold=50):
  - 'director of christian education' (similarity score: 65) -> standardized to 'director of equitable core mission'
  - 'associate director of evaluation' (similarity score: 64) -> standardized to 'director of equitable core mission'
  - 'director of membership' (similarity score: 61) -> standardized to 'director of equitable core mission'
  - 'director of marketing  compliance' (similarity score: 

Found 5 similar titles (threshold=50):
  - 'adult education coordinator' (similarity score: 71) -> standardized to 'catalogtransfer evaluation coordinator'
  - 'computer operations coordinator' (similarity score: 70) -> standardized to 'catalogtransfer evaluation coordinator'
  - 'attorney recruiting coordinator' (similarity score: 70) -> standardized to 'catalogtransfer evaluation coordinator'
  - 'database and evaluation coordinator' (similarity score: 68) -> standardized to 'catalogtransfer evaluation coordinator'
  - 'event  alumni relations coordinator' (similarity score: 67) -> standardized to 'catalogtransfer evaluation coordinator'
Remaining titles to process: 3235

Processing base title: 'strategic projects manager'
Found 5 similar titles (threshold=50):
  - 'projects team manager' (similarity score: 81) -> standardized to 'strategic projects manager'
  - 'manager, strategic messaging' (similarity score: 75) -> standardized to 'strategic projects manager'
  - 'marketing strate

Found 5 similar titles (threshold=50):
  - 'senior director, deal strategy' (similarity score: 88) -> standardized to 'senior director of digital strategy'
  - 'senior director, web and digital' (similarity score: 76) -> standardized to 'senior director of digital strategy'
  - 'director of research  strategy' (similarity score: 75) -> standardized to 'senior director of digital strategy'
  - 'executive director of digital strategies' (similarity score: 75) -> standardized to 'senior director of digital strategy'
  - 'senior director of seo' (similarity score: 74) -> standardized to 'senior director of digital strategy'
Remaining titles to process: 3085

Processing base title: 'tech writercompliance analyst'
Found 5 similar titles (threshold=50):
  - 'lead performance analyst' (similarity score: 68) -> standardized to 'tech writercompliance analyst'
  - 'performance analyst' (similarity score: 67) -> standardized to 'tech writercompliance analyst'
  - 'interface analyst' (similarity sc

Found 5 similar titles (threshold=50):
  - 'content analyst' (similarity score: 73) -> standardized to 'venture analyst'
  - 'investment analyst' (similarity score: 73) -> standardized to 'venture analyst'
  - 'warehouse analyst' (similarity score: 69) -> standardized to 'venture analyst'
  - 'game test analyst' (similarity score: 69) -> standardized to 'venture analyst'
  - 'cyber threat analyst' (similarity score: 69) -> standardized to 'venture analyst'
Remaining titles to process: 2941

Processing base title: 'bike mechanic'
Found 5 similar titles (threshold=50):
  - 'bicycle mechanic' (similarity score: 83) -> standardized to 'bike mechanic'
  - 'industrial mechanic' (similarity score: 62) -> standardized to 'bike mechanic'
  - 'setup mechanic' (similarity score: 59) -> standardized to 'bike mechanic'
  - 'associate merchant' (similarity score: 58) -> standardized to 'bike mechanic'
  - 'bicycle mechanicski technician' (similarity score: 56) -> standardized to 'bike mechanic'
Rema

Found 5 similar titles (threshold=50):
  - 'manager of simulation training' (similarity score: 70) -> standardized to 'technology training manager'
  - 'training and technical assistance manager' (similarity score: 68) -> standardized to 'technology training manager'
  - 'geotechnical engineer in training' (similarity score: 67) -> standardized to 'technology training manager'
  - 'technology transformation projectprogram manager' (similarity score: 67) -> standardized to 'technology training manager'
  - 'merchandise planning manager' (similarity score: 65) -> standardized to 'technology training manager'
Remaining titles to process: 2803

Processing base title: 'site reliability engineer'
Found 5 similar titles (threshold=50):
  - 'site reliability engineer sre' (similarity score: 93) -> standardized to 'site reliability engineer'
  - 'lead site reliability engineer' (similarity score: 91) -> standardized to 'site reliability engineer'
  - 'reliability engineer' (similarity score: 89

Found 5 similar titles (threshold=50):
  - 'technology team leader' (similarity score: 76) -> standardized to 'clinical technologist team leader'
  - 'licensed clinical psychologist' (similarity score: 70) -> standardized to 'clinical technologist team leader'
  - 'clinical neuropsychologist' (similarity score: 68) -> standardized to 'clinical technologist team leader'
  - 'creative technologist' (similarity score: 67) -> standardized to 'clinical technologist team leader'
  - 'clinical operations lead' (similarity score: 67) -> standardized to 'clinical technologist team leader'
Remaining titles to process: 2659

Processing base title: 'intellectual property manager'
Found 5 similar titles (threshold=50):
  - 'head of intellectual property' (similarity score: 76) -> standardized to 'intellectual property manager'
  - 'sr. intellectual property counsel' (similarity score: 69) -> standardized to 'intellectual property manager'
  - 'chief intellectual property counsel' (similarity score:

Found 5 similar titles (threshold=50):
  - 'director of digital marketing' (similarity score: 88) -> standardized to 'director of digital learning'
  - 'director, digital marketing' (similarity score: 81) -> standardized to 'director of digital learning'
  - 'senior director of digital marketing' (similarity score: 78) -> standardized to 'director of digital learning'
  - 'director of capital planning' (similarity score: 71) -> standardized to 'director of digital learning'
  - 'director of measurement and learning' (similarity score: 69) -> standardized to 'director of digital learning'
Remaining titles to process: 2509

Processing base title: 'biological science technician'
Found 5 similar titles (threshold=50):
  - 'marine science technician' (similarity score: 74) -> standardized to 'biological science technician'
  - 'industrial service technician' (similarity score: 69) -> standardized to 'biological science technician'
  - 'biomedical equipment technician senior' (similarity sco

Found 5 similar titles (threshold=50):
  - 'cybersecurity engineering' (similarity score: 94) -> standardized to 'cybersecurity engineer'
  - 'systems engineer - cybersecurity' (similarity score: 85) -> standardized to 'cybersecurity engineer'
  - 'director, cybersecurity' (similarity score: 77) -> standardized to 'cybersecurity engineer'
  - 'director of cybersecurity' (similarity score: 72) -> standardized to 'cybersecurity engineer'
  - 'senior cybersecurity analyst' (similarity score: 72) -> standardized to 'cybersecurity engineer'
Remaining titles to process: 2347

Processing base title: 'assistant professor of chemistry and biochemistry'
Found 5 similar titles (threshold=50):
  - 'manager of governance and assistant to the president' (similarity score: 59) -> standardized to 'assistant professor of chemistry and biochemistry'
  - 'assistant to chief of programs' (similarity score: 58) -> standardized to 'assistant professor of chemistry and biochemistry'
  - 'professor of humanit

Found 5 similar titles (threshold=50):
  - 'group credit manager' (similarity score: 78) -> standardized to 'group finance manager'
  - 'audience growth manager' (similarity score: 73) -> standardized to 'group finance manager'
  - 'group manager of analytics' (similarity score: 72) -> standardized to 'group finance manager'
  - 'group hr manager' (similarity score: 70) -> standardized to 'group finance manager'
  - 'assoc. hr manager' (similarity score: 65) -> standardized to 'group finance manager'
Remaining titles to process: 2169

Processing base title: 'outreach professional'
Found 5 similar titles (threshold=50):
  - 'it professional' (similarity score: 78) -> standardized to 'outreach professional'
  - 'professional nurse 1' (similarity score: 78) -> standardized to 'outreach professional'
  - 'extension outreach professional ii' (similarity score: 76) -> standardized to 'outreach professional'
  - 'professional administrator' (similarity score: 68) -> standardized to 'outreach 

Found 5 similar titles (threshold=50):
  - 'cyber security engineer 2' (similarity score: 96) -> standardized to 'cyber security engineer'
  - 'security engineer ii' (similarity score: 79) -> standardized to 'cyber security engineer'
  - 'cyber security lead' (similarity score: 76) -> standardized to 'cyber security engineer'
  - 'application security engineer' (similarity score: 73) -> standardized to 'cyber security engineer'
  - 'product security engineer' (similarity score: 71) -> standardized to 'cyber security engineer'
Remaining titles to process: 1978

Processing base title: 'supervisory physical scientist'
Found 5 similar titles (threshold=50):
  - 'senior physical scientist' (similarity score: 87) -> standardized to 'supervisory physical scientist'
  - 'fiscal services supervisor' (similarity score: 71) -> standardized to 'supervisory physical scientist'
  - 'critical facilities supervisor' (similarity score: 70) -> standardized to 'supervisory physical scientist'
  - 'clinic

Found 5 similar titles (threshold=50):
  - 'high ropes instructor' (similarity score: 68) -> standardized to 'workforce instructor'
  - 'sr analyst - workforce technology' (similarity score: 63) -> standardized to 'workforce instructor'
  - 'instructor, psychology' (similarity score: 63) -> standardized to 'workforce instructor'
  - 'math instructor tenured' (similarity score: 60) -> standardized to 'workforce instructor'
  - 'workforce optimisation analyst' (similarity score: 60) -> standardized to 'workforce instructor'
Remaining titles to process: 1779

Processing base title: 'family education'
Found 5 similar titles (threshold=50):
  - 'educational technology manager' (similarity score: 57) -> standardized to 'family education'
  - 'catering sales' (similarity score: 53) -> standardized to 'family education'
  - 'higher education assistant' (similarity score: 52) -> standardized to 'family education'
  - 'pipeline operations' (similarity score: 51) -> standardized to 'family educat

Found 5 similar titles (threshold=50):
  - 'telephonic practice area consultant' (similarity score: 57) -> standardized to 'apprentice mechanic for construction machinery'
  - 'technical documentation manager' (similarity score: 55) -> standardized to 'apprentice mechanic for construction machinery'
  - 'distribution partner account manager' (similarity score: 54) -> standardized to 'apprentice mechanic for construction machinery'
  - 'apprentice early years assistant' (similarity score: 54) -> standardized to 'apprentice mechanic for construction machinery'
  - 'teaching instructor of first year writing' (similarity score: 53) -> standardized to 'apprentice mechanic for construction machinery'
Remaining titles to process: 1557

Processing base title: 'officemarketing manager'
Found 5 similar titles (threshold=50):
  - 'trucking back office manager' (similarity score: 75) -> standardized to 'officemarketing manager'
  - 'group marketing manager' (similarity score: 74) -> standardized t

Found 5 similar titles (threshold=50):
  - 'hr recruiter' (similarity score: 53) -> standardized to 'minister united church'
  - 'it intern' (similarity score: 52) -> standardized to 'minister united church'
  - 'threat hunter' (similarity score: 51) -> standardized to 'minister united church'
  - 'commercial real estate underwriter' (similarity score: 50) -> standardized to 'minister united church'
  - 'hvac installer' (similarity score: 50) -> standardized to 'minister united church'
Remaining titles to process: 1301

Processing base title: 'ghostwriter'
Found 5 similar titles (threshold=50):
  - 'scriptwriter' (similarity score: 70) -> standardized to 'ghostwriter'
  - 'copywritier' (similarity score: 64) -> standardized to 'ghostwriter'
  - 'subtitler' (similarity score: 60) -> standardized to 'ghostwriter'
  - 'strategic writer' (similarity score: 59) -> standardized to 'ghostwriter'
  - 'scientific writer' (similarity score: 57) -> standardized to 'ghostwriter'
Remaining titles t

Found 5 similar titles (threshold=50):
  - 'general ledger accountant' (similarity score: 65) -> standardized to 'payroll accountant'
  - 'account lead' (similarity score: 60) -> standardized to 'payroll accountant'
  - 'financial reporting accountant' (similarity score: 58) -> standardized to 'payroll accountant'
  - 'payroll and benefit coordinator' (similarity score: 57) -> standardized to 'payroll accountant'
  - 'audit  accounts assistant' (similarity score: 57) -> standardized to 'payroll accountant'
Remaining titles to process: 1001

Processing base title: 'senior technical writer contract'
Found 5 similar titles (threshold=50):
  - 'senior technical product owner' (similarity score: 71) -> standardized to 'senior technical writer contract'
  - 'security technical consultant' (similarity score: 66) -> standardized to 'senior technical writer contract'
  - 'technical consultant data scienceai' (similarity score: 63) -> standardized to 'senior technical writer contract'
  - 'senio



Found 5 similar titles (threshold=50):
  - 'senior ios developerconsultant' (similarity score: 60) -> standardized to 'senior human rights advisor'
  - 'advisor, disease and vector control' (similarity score: 59) -> standardized to 'senior human rights advisor'
  - 'senior human resources executive' (similarity score: 58) -> standardized to 'senior human rights advisor'
  - 'senior draughtsperson' (similarity score: 58) -> standardized to 'senior human rights advisor'
  - 'senior inbound marketer' (similarity score: 56) -> standardized to 'senior human rights advisor'
Remaining titles to process: 553

Processing base title: 'event monitor'
Found 4 similar titles (threshold=50):
  - 'senior geochemist' (similarity score: 53) -> standardized to 'event monitor'
  - 'monitoring  evaluation manager' (similarity score: 52) -> standardized to 'event monitor'
  - 'development officer monthly giving' (similarity score: 51) -> standardized to 'event monitor'
  - 'telecommunicator iv' (similarity

# Analyze Standardization Quality

In [22]:
"""
Additional analysis of standardization quality
"""
def analyze_standardization_quality(df):
    # Count occurrences of each standardized title
    title_counts = df['standardized_Job title'].value_counts()
    print("\nTop 10 standardized job titles:")
    print(title_counts.head(10))
    
    # Check for remaining inconsistencies
    potential_issues = title_counts[title_counts < 5].index
    print(f"\nNumber of titles appearing less than 5 times: {len(potential_issues)}")
    
    return title_counts

# Run quality analysis
title_counts = analyze_standardization_quality(processed_df)


Top 10 standardized job titles:
software engineer           415
project manager             321
director                    280
senior software engineer    254
program manager             218
executive assistant         213
manager                     200
teacher                     186
librarian                   175
product manager             161
Name: standardized_Job title, dtype: int64

Number of titles appearing less than 5 times: 162


#  Feature Construction for Adjusted Salary

In [23]:
"""
Define approximate conversion rates to USD as of April 2025 (update these as needed)
"""
conversion_rates = {
    'USD': 1.0,
    'GBP': 1.32,    # British Pound
    'CAD': 0.74,    # Canadian Dollar
    'EUR': 1.10,    # Euro
    'AUD': 0.67,    # Australian Dollar
    'SGD': 0.76,    # Singapore Dollar
    'PLN': 0.25,    # Polish Zloty
    'INR': 0.012,   # Indian Rupee
    'TRY': 0.029,   # Turkish Lira
    'NGN': 0.00061, # Nigerian Naira
    'COP': 0.00024, # Colombian Peso
    'INR': 0.012,    # Indian Rupee
    'DKK': 0.15     # Danish Kronor
}

"""
Function to clean and convert salary/bonus to float
"""
def clean_numeric(value):
    if pd.isna(value) or value == '':
        return 0.0
    try:
        # Remove commas and convert to float
        return float(str(value).replace(',', ''))
    except ValueError:
        return 0.0
    
"""
Clean the salary and bonus columns
"""
processed_df['Annual salary'] = processed_df['Annual salary'].apply(clean_numeric)
processed_df['Bonuses'].fillna(0, inplace=True)
processed_df['Bonuses'] = processed_df['Bonuses'].apply(clean_numeric)


"""
Determine currency and convert to USD
"""
def convert_to_usd(row):
    currency = row['Currency']
    salary = row['Annual salary']
    bonus = row['Bonuses']
    
    # If currency is "Other", try to get it from the "Other Currency" field
    if currency == 'Other':
        other_currency = str(row['Other Currency']).strip()
        # Map common "Other" currencies to our conversion rates
        currency_map = {
            'Singapore Dollara': 'SGD',
            'Singaporw': 'SGD',
            'PLN': 'PLN',
            'INR': 'INR',
            'TRY': 'TRY',
            'NGN': 'NGN',
            'COP': 'COP',
            'Rupees': 'Rupees'
        }
        currency = currency_map.get(other_currency, 'USD')
    
    # Get conversion rate (default to 1.0 if currency not found)
    rate = conversion_rates.get(currency, 1.0)
    
    # Convert total compensation (salary + bonus) to USD
    total_usd = (salary + bonus) * rate
    return total_usd

In [24]:
#Create the new "Salary in USD" column
processed_df['Salary in USD'] = processed_df.apply(convert_to_usd, axis=1)


# Apply Purchasing Power Parity Adjustment
ppp_factors = {'USD': 1.0,
               'GBP': 0.73,
               'CAD': 0.75,
               'EUR': 0.8,
               'AUD/NZD': 0.78,
               'CHF': 1.1,
               'ZAR': 0.25,
               'SEK': 0.84,
               'HKD': 0.85,
               'JPY': 0.62}

processed_df['Salary in USD (PPP Adjusted)'] = processed_df.apply(lambda row: row['Salary in USD'] / ppp_factors.get(row['Currency'], 0.50), axis=1)
print(processed_df[['Salary in USD', 'What country do you work in?', 'Salary in USD (PPP Adjusted)']])

       Salary in USD What country do you work in?  \
0            55000.0                United States   
1            77352.0               United Kingdom   
2            34000.0                           US   
3            65000.0                          USA   
4            67000.0                           US   
...              ...                          ...   
27935        25000.0                     Colombia   
27936        55000.0               United States    
27937        94000.0               United States    
27938        64000.0                      denmark   
27939       150000.0                           US   

       Salary in USD (PPP Adjusted)  
0                      55000.000000  
1                     105961.643836  
2                      34000.000000  
3                      65000.000000  
4                      67000.000000  
...                             ...  
27935                  25000.000000  
27936                  55000.000000  
27937                

# Outlier Handling

In [25]:
"""
Apply an upper bound to the adjusted salary values at the 95th percentile
"""

Q1 = processed_df['Salary in USD (PPP Adjusted)'].quantile(0.05)
Q3 = processed_df['Salary in USD (PPP Adjusted)'].quantile(0.95)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
processed_df['Salary in USD (Capped)'] = processed_df['Salary in USD (PPP Adjusted)'].clip(upper=upper_bound)

max_salary_usd = processed_df['Salary in USD (PPP Adjusted)'].max()
max_salary_usd_capped = processed_df['Salary in USD (Capped)'].max()

print(f"Maximum Uncapped Salary in USD': {max_salary_usd}")
print(f"Maximum Capped Salary in USD': {max_salary_usd_capped}")

Maximum Uncapped Salary in USD': 1980000000.0
Maximum Capped Salary in USD': 517900.0


# Feature Aggregation for Location related Features

In [26]:
"""
Standardize country names
"""
def standardize_country(country):
    # Dictionary mapping variations of United States to 'USA'
    us_variations = {
        'USA': 'USA',
        'US': 'USA',
        'United States': 'USA',
        'United Status': 'USA',
        'America': 'USA',
        'U.S.': 'USA',
        'U.S.A.': 'USA',
        'Usa': 'USA',
        'us': 'USA',
        'united states': 'USA'
    }
    # Return 'USA' if the country matches a variation, otherwise return the original country
    return us_variations.get(str(country).strip(), country)

"""
Aggregates location data into a single feature.
"""
def combine_location(row, country_col, state_col, city_col, us_country_value='USA'):
    country = row[country_col]
    state = row[state_col]
    city = row[city_col]
    
    # For US locations, include state if available
    if country == us_country_value and pd.notna(state) and pd.notna(city):
        return f"{country}-{state}-{city}"
    # For non-US locations or missing state/city, omit state
    elif pd.notna(city):
        return f"{country}-{city}"
    # Fallback if city is missing
    return country

In [27]:
"""
Apply Feature Aggregation
"""

# Standardize the country names
processed_df['What country do you work in?'] = processed_df['What country do you work in?'].apply(standardize_country)

# Apply the function to create the new 'Location' column
processed_df['Location'] = processed_df.apply(
    lambda row: combine_location(
        row,
        country_col='What country do you work in?',
        state_col="If you're in the U.S., what state do you work in?",
        city_col='What city do you work in?',
        us_country_value='USA'
    ),
    axis=1
)

# Display the first few rows of the result
print("\nFirst 10 rows with the new 'Location' column:")
print(processed_df[[
    'What country do you work in?',
    "If you're in the U.S., what state do you work in?",
    'What city do you work in?',
    'Location'
]].head(10))


First 10 rows with the new 'Location' column:
  What country do you work in?  \
0                          USA   
1               United Kingdom   
2                          USA   
3                          USA   
4                          USA   
5                          USA   
6                          USA   
7                          USA   
8                          USA   
9                          USA   

  If you're in the U.S., what state do you work in? What city do you work in?  \
0                                     Massachusetts                    Boston   
1                                               NaN                 Cambridge   
2                                         Tennessee               Chattanooga   
3                                         Wisconsin                 Milwaukee   
4                                    South Carolina                Greenville   
5                                     New Hampshire                   Hanover   
6          

In [28]:
# Save the updated dataset to a new CSV file
processed_df.to_csv('processed_dataset.csv', index=False)