In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
import numpy as np
import os
import re
import spacy
from spacy.cli import download

# Load the small English spaCy pipeline into the notebook-global `nlp`.
# An OSError from spacy.load is treated as "model not available": the model
# is fetched with spacy.cli.download and the load is attempted once more.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

In [50]:
# Convert the Task Ratings workbook to a CSV with the same base name;
# the CSV copy is what gets read back with pd.read_csv further down.
df = pd.read_excel("../extra_data/Task Ratings 20.1.xlsx")
# index=False keeps the DataFrame's row index out of the written file.
df.to_csv("../extra_data/Task Ratings 20.1.csv", index=False)

In [51]:
def _normalize_text(col: pd.Series) -> pd.Series:
    """Lower-case `col`, replace every character outside [a-z0-9] with a space, strip the ends."""
    return col.str.lower().str.replace("[^a-z0-9]", " ", regex=True).str.strip()


def _pct_of(part: int, whole: int) -> float:
    """Return `part` as a percentage of `whole`; an empty `whole` gives 0.0 instead of ZeroDivisionError."""
    return 100 * part / max(1, whole)


# ---------------------------------------------------------------------------
# 1) Task statements: new O*NET file vs. the statements shipped with the
#    Claude data. Both are compared on the normalized task text.
# ---------------------------------------------------------------------------
task_statements_new_df = pd.read_csv("../extra_data/Task Statements 20.1.csv")
task_statements_claude_df = pd.read_csv("../original_data/onet_task_statements.csv")

task_statements_new_df["task_norm"] = _normalize_text(task_statements_new_df["Task"])
task_statements_claude_df["task_norm"] = _normalize_text(task_statements_claude_df["Task"])

new_tasks_set = set(task_statements_new_df["task_norm"].dropna().unique())
claude_tasks_set = set(task_statements_claude_df["task_norm"].dropna().unique())

# 1. Intersection (tasks in both)
tasks_in_both = new_tasks_set & claude_tasks_set
print(f"Tasks in both: {len(tasks_in_both)}")

# 2. Only in new
tasks_only_in_new = new_tasks_set - claude_tasks_set
print(f"Tasks only in new: {len(tasks_only_in_new)}")

# 3. Only in Claude (old)
tasks_only_in_claude = claude_tasks_set - new_tasks_set
print(f"Tasks only in Claude's set: {len(tasks_only_in_claude)}")


# ---------------------------------------------------------------------------
# 2) Claude task mappings (task_name + pct) vs. the new task statements.
#    `new_tasks_set` from step 1 is reused; it has not changed.
# ---------------------------------------------------------------------------
task_mappings_df = pd.read_csv("../original_data/onet_task_mappings.csv")
task_mappings_df["task_norm"] = _normalize_text(task_mappings_df["task_name"])

claude_pct_tasks_set = set(task_mappings_df["task_norm"].dropna().unique())

# Intersections and gaps
in_both = claude_pct_tasks_set & new_tasks_set
only_in_claude_pct = claude_pct_tasks_set - new_tasks_set

# Prints
print("Claude % tasks (unique):", len(claude_pct_tasks_set))
print("In both (mapped % task exists in NEW):", len(in_both))
print("Missing in NEW (mapped % task not found):", len(only_in_claude_pct))
print("Coverage (% of Claude % tasks found in NEW):",
      round(_pct_of(len(in_both), len(claude_pct_tasks_set)), 2), "%")

# Optional: list a few missing for spot-check
print("\nSample missing tasks:", list(sorted(only_in_claude_pct))[:20])

# How much of the total `pct` mass sits on mapped tasks that are absent from NEW.
missing_pct_sum = task_mappings_df[task_mappings_df["task_norm"].isin(only_in_claude_pct)]["pct"].sum()

# Total pct for all tasks (reused by the ratings check below)
total_pct_sum = task_mappings_df["pct"].sum()

# Percent composition of missing tasks
missing_pct_percent = 100 * missing_pct_sum / total_pct_sum

print(f"Sum of pct for missing tasks: {missing_pct_sum}")
print(f"Percent of total pct from missing tasks: {missing_pct_percent:.2f}%")


# ---------------------------------------------------------------------------
# 3) Ratings file: how many statements / mapped tasks have a ratings row.
# ---------------------------------------------------------------------------
# Load ratings file
task_ratings_df = pd.read_csv("../extra_data/Task Ratings 20.1.csv")

# Normalize 'Task' column in ratings
task_ratings_df["task_norm"] = _normalize_text(task_ratings_df["Task"])

ratings_tasks_set = set(task_ratings_df["task_norm"].dropna().unique())

print("\n--- Ratings File Checks ---")

# 1. Task Statements vs Ratings
statements_in_ratings = new_tasks_set & ratings_tasks_set
statements_missing_in_ratings = new_tasks_set - ratings_tasks_set

# Percentages go through _pct_of so an empty reference set prints 0.00% rather
# than raising, matching the guard already used for the coverage line above.
print(f"Task statements in ratings: {len(statements_in_ratings)} / {len(new_tasks_set)} "
      f"({_pct_of(len(statements_in_ratings), len(new_tasks_set)):.2f}%)")
print(f"Task statements missing in ratings: {len(statements_missing_in_ratings)}")
print("Sample missing from ratings (statements):", list(sorted(statements_missing_in_ratings))[:10])

# 2. Task Mappings vs Ratings
mappings_in_ratings = claude_pct_tasks_set & ratings_tasks_set
mappings_missing_in_ratings = claude_pct_tasks_set - ratings_tasks_set

print(f"\nTask mappings in ratings: {len(mappings_in_ratings)} / {len(claude_pct_tasks_set)} "
      f"({_pct_of(len(mappings_in_ratings), len(claude_pct_tasks_set)):.2f}%)")
print(f"Task mappings missing in ratings: {len(mappings_missing_in_ratings)}")
print("Sample missing from ratings (mappings):", list(sorted(mappings_missing_in_ratings))[:10])

# 3. (Optional) % sum of missing mappings; `total_pct_sum` is the value computed in step 2.
missing_mappings_pct_sum = task_mappings_df[task_mappings_df["task_norm"].isin(mappings_missing_in_ratings)]["pct"].sum()
missing_mappings_pct_percent = 100 * missing_mappings_pct_sum / total_pct_sum

print(f"\nSum of pct for missing mappings: {missing_mappings_pct_sum}")
print(f"Percent of total pct from missing mappings: {missing_mappings_pct_percent:.2f}%")




Tasks in both: 18427
Tasks only in new: 0
Tasks only in Claude's set: 0
Claude % tasks (unique): 3514
In both (mapped % task exists in NEW): 3513
Missing in NEW (mapped % task not found): 1
Coverage (% of Claude % tasks found in NEW): 99.97 %

Sample missing tasks: ['none']
Sum of pct for missing tasks: 0.4815572435538299
Percent of total pct from missing tasks: 0.48%

--- Ratings File Checks ---
Task statements in ratings: 17808 / 18427 (96.64%)
Task statements missing in ratings: 619
Sample missing from ratings (statements): ['adjust network sizes to meet volume or capacity demands', 'adjust temperature  pressure  vacuum  level  flow rate  or transfer of biofuels to maintain processes at required levels', 'administer tests to help determine children s developmental levels  needs  or potential', 'advise clients on aspects of capitalization  such as amounts  sources  or timing', 'advise farmers on upgrading global positioning system  gps  equipment to take advantage of newly installed 

In [42]:
# Sanity check: left-join the Claude task mappings onto the new task statements
# using the normalized task text, then eyeball which mappings found a statement.
task_statements_df = pd.read_csv("../extra_data/task_statements_new.csv")
task_mappings_df = pd.read_csv("../original_data/onet_task_mappings.csv")

# Normalization: lower-case, every char outside [a-z0-9] -> space, strip ends.
# NOTE(review): the statements-side key is spelled "task_Norm" (capital N) while
# the mappings-side key is "task_norm"; because the names differ, both key
# columns are kept in the merge result.
task_statements_df["task_Norm"] = task_statements_df["Task"].str.lower().str.replace("[^a-z0-9]", " ", regex=True).str.strip()
task_mappings_df["task_norm"] = task_mappings_df["task_name"].str.lower().str.replace("[^a-z0-9]", " ", regex=True).str.strip()
# how="left" keeps every mapping row; mappings with no matching statement get
# NaN in all statement columns, and a mapping matching several statements is
# repeated once per match.
merged_test = task_mappings_df.merge(
    task_statements_df,
    left_on="task_norm",
    right_on="task_Norm",
    how="left"
)
merged_test
#merged_test.loc[merged_test["pct"].isna(), "task_Norm"].nunique()


Unnamed: 0,task_name,pct,task_norm,O*NET-SOC Code,Title,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source,task_Norm
0,act as advisers to student organizations.,0.006775,act as advisers to student organizations,25-1011.00,"Business Teachers, Postsecondary",5682.0,Act as advisers to student organizations.,Supplemental,104.0,08/2019,Incumbent,act as advisers to student organizations
1,act as advisers to student organizations.,0.006775,act as advisers to student organizations,25-1021.00,"Computer Science Teachers, Postsecondary",5700.0,Act as advisers to student organizations.,Supplemental,90.0,08/2019,Incumbent,act as advisers to student organizations
2,act as advisers to student organizations.,0.006775,act as advisers to student organizations,25-1022.00,"Mathematical Science Teachers, Postsecondary",5726.0,Act as advisers to student organizations.,Supplemental,116.0,08/2019,Incumbent,act as advisers to student organizations
3,act as advisers to student organizations.,0.006775,act as advisers to student organizations,25-1031.00,"Architecture Teachers, Postsecondary",5751.0,Act as advisers to student organizations.,Core,69.0,08/2019,Incumbent,act as advisers to student organizations
4,act as advisers to student organizations.,0.006775,act as advisers to student organizations,25-1032.00,"Engineering Teachers, Postsecondary",5774.0,Act as advisers to student organizations.,Supplemental,54.0,08/2019,Incumbent,act as advisers to student organizations
...,...,...,...,...,...,...,...,...,...,...,...,...
4211,"write, design, or edit web page content, or di...",0.320908,write design or edit web page content or di...,,,,,,,,,
4212,"write, present, and publish reports that recor...",0.117393,write present and publish reports that recor...,,,,,,,,,
4213,"write, review, or execute plans for testing ne...",0.010944,write review or execute plans for testing ne...,15-1299.03,Document Management Specialists,16220.0,"Write, review, or execute plans for testing ne...",Core,20.0,08/2021,Occupational Expert,write review or execute plans for testing ne...
4214,"write, review, or maintain engineering documen...",0.076872,write review or maintain engineering documen...,17-2141.02,Automotive Engineers,16425.0,"Write, review, or maintain engineering documen...",Core,21.0,08/2022,Occupational Expert,write review or maintain engineering documen...


### Original Data

In [25]:
# Attach 2019 O*NET-SOC titles to the original task statements by joining the
# statement titles to the crosswalk's 2010 titles on normalized text.
task_statements_df = pd.read_csv("../original_data/onet_task_statements.csv")
crosswalk_df = pd.read_csv("../extra_data/2010_to_2019_soc_crosswalk.csv")

# Same normalization on every title column: lower-case, every character
# outside [a-z0-9] becomes a space, leading/trailing whitespace removed.
task_statements_df["title_norm"] = task_statements_df["Title"].str.lower().str.replace("[^a-z0-9]", " ", regex=True).str.strip()
crosswalk_df["2010_title_norm"] = crosswalk_df["O*NET-SOC 2010 Title"].str.lower().str.replace("[^a-z0-9]", " ", regex=True).str.strip()
crosswalk_df["2019_title_norm"] = crosswalk_df["O*NET-SOC 2019 Title"].str.lower().str.replace("[^a-z0-9]", " ", regex=True).str.strip()

# Left join: every task statement is kept. A statement whose title matches
# several crosswalk rows is repeated once per match.
merged = pd.merge(
    task_statements_df,
    crosswalk_df,
    left_on="title_norm",
    right_on="2010_title_norm",
    how="left"
)

# This count used to be a bare expression in the middle of the cell, so it was
# computed and then discarded; print it so the check is actually visible.
print("Rows without a 2019 crosswalk title:", merged["O*NET-SOC 2019 Title"].isna().sum())
merged.to_csv("test_3")


In [22]:
def norm(s: pd.Series) -> pd.Series:
    """
    Normalize a Series of strings for loose text comparison.

    Missing values become "", text is lower-cased, every character outside
    [a-z0-9] is turned into a space, runs of whitespace collapse to a single
    space, and leading/trailing whitespace is removed.

    Args:
        s (pd.Series): Series of strings (may contain missing values).

    Returns:
        pd.Series: Normalized strings, same index as `s`.
    """
    lowered = s.fillna("").str.lower()
    alnum_only = lowered.str.replace("[^a-z0-9]", " ", regex=True)
    single_spaced = alnum_only.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

# Normalized versions of the statement title and of the 2019 crosswalk title.
# norm() turns missing values into "", so a row with no 2019 title compares
# as "" against its (non-empty) statement title.
t0 = norm(merged["Title"])
t19 = norm(merged["O*NET-SOC 2019 Title"])

# Boolean mask of rows whose normalized titles differ (this includes rows
# where the 2019 title is missing, per the note above).
diff = t0 != t19

print("Rows with different titles (normalized):", diff.sum())

# Show each distinct (Title, 2019 Title) pair that differs, sorted for reading;
# max_rows=50 caps how much of the table is printed.
pairs = (merged.loc[diff, ["Title", "O*NET-SOC 2019 Title"]]
         .drop_duplicates()
         .sort_values(["Title", "O*NET-SOC 2019 Title"]))
print(pairs.to_string(index=False, max_rows=50))

Rows with different titles (normalized): 4768
                                                                    Title                                                                               O*NET-SOC 2019 Title
                                                              Accountants                                                                           Accountants and Auditors
                                         Administrative Services Managers                                                                                Facilities Managers
Adult Basic and Secondary Education and Literacy Teachers and Instructors     Adult Basic Education, Adult Secondary Education, and English as a Second Language Instructors
                         Aerospace Engineering and Operations Technicians                                 Aerospace Engineering and Operations Technologists and Technicians
                                                          Anthropologists                

In [14]:
# Load Initial ONET Data

def create_onet_soc_data(
    onet_path: str = "../original_data/onet_task_statements.csv",
    soc_path: str = "../original_data/SOC_Structure.csv",
) -> pd.DataFrame:
    """
    Description:
        Loads the O*NET task statements and attaches the SOC major group title to each
        task, joining on the first two characters of the O*NET-SOC code.
        It then renames columns for better data usage.
        It then normalizes the task names by making them all lowercase and stripping whitespace.
        It then adds, per normalized task, the number of distinct occupation titles
        (n_occurrences) and distinct SOC major group titles (n_occurrences_soc) it appears under.

    Args:
        onet_path (str): Path to the O*NET task statements CSV file.
        soc_path (str): Path to the SOC structure CSV file.

    Returns:
        pd.DataFrame: O*NET task rows with renamed columns plus soc_group_code, soc_title,
        task_normalized, n_occurrences and n_occurrences_soc.
    """

    # Read O*NET data; the first two characters of the code are the join key.
    onet_df = pd.read_csv(onet_path)
    onet_df["soc_group_code"] = onet_df["O*NET-SOC Code"].str[:2]

    # Read SOC data and keep only the rows that carry a 'Major Group' value.
    # NOTE(review): the join assumes one such row per two-character prefix;
    # repeated prefixes would repeat task rows in the merge below — confirm.
    soc_df = (
        pd.read_csv(soc_path)
        .dropna(subset=["Major Group"])
        .assign(soc_group_code=lambda frame: frame["Major Group"].str[:2])
    )

    # Left join keeps every task row, including tasks whose group has no SOC row.
    task_soc_df = onet_df.merge(
        soc_df[["soc_group_code", "SOC or O*NET-SOC 2019 Title"]],
        on="soc_group_code",
        how="left",
    )

    # Rename columns for better usability
    task_soc_df = task_soc_df.rename(columns={
        "O*NET-SOC Code": "occ_group_code",
        "Title": "title",
        "Task ID": "task_id",
        "Task": "task",
        "Task Type": "task_type",
        "Incumbents Responding": "n_responding",
        "Date": "date",
        "Domain Source": "domain_source",
        "SOC or O*NET-SOC 2019 Title": "soc_title",
    })

    task_soc_df["task_normalized"] = task_soc_df["task"].str.lower().str.strip()

    # Broadcast the per-task distinct counts back onto every row of that task.
    by_task = task_soc_df.groupby("task_normalized")
    task_soc_df["n_occurrences"] = by_task["title"].transform("nunique")
    task_soc_df["n_occurrences_soc"] = by_task["soc_title"].transform("nunique")

    return task_soc_df

# Build the task table with SOC major group titles; later cells start from this frame.
task_soc_df = create_onet_soc_data()
#display(task_soc_df.reset_index(drop=True))

In [15]:
# Add Claude data
def add_claude_pct(df, mappings_path: str = "../original_data/onet_task_mappings.csv") -> pd.DataFrame:
    """
    Description:
        Loads the Claude task mappings (task_name, pct) and left-joins the O*NET task rows
        onto them, matching task_name against task_normalized. A mapping that matches
        several task rows is repeated once per match; a mapping with no match is kept
        with NaN in the task columns.
        It then adds two shares that are each scaled to sum to 100 over the merged rows:
          - pct_occ_weighted: pct as-is, so a task listed under k rows counts k times.
          - pct_occ_norm: pct divided by n_occurrences first, so a task's pct is split
            across the occupations it appears in (NaN where n_occurrences is missing).
        Finally the rows are sorted by occ_group_code.

    Args:
        df (pd.DataFrame): O*NET task rows; must provide task_normalized, n_occurrences
            and occ_group_code.
        mappings_path (str): Path to the Claude task mappings CSV file.

    Returns:
        pd.DataFrame: Mapping rows joined with task data, plus pct_occ_weighted and
        pct_occ_norm, sorted by occ_group_code.
    """

    # Load task mappings from Claude data
    task_mappings_df = pd.read_csv(mappings_path)

    # Mappings are the left side so every Claude task survives the join.
    merged = task_mappings_df.merge(
        df,
        left_on="task_name",
        right_on="task_normalized",
        how="left",
    )

    # Share of total pct with repeated tasks counted once per row.
    merged["pct_occ_weighted"] = 100 * merged["pct"] / merged["pct"].sum()

    # Share of total pct after splitting each task's pct across its occurrences.
    pct_per_occurrence = merged["pct"] / merged["n_occurrences"]
    merged["pct_occ_norm"] = 100 * pct_per_occurrence / pct_per_occurrence.sum()

    # Sort by O*NET-SOC Code (rows without a code end up last).
    return merged.sort_values(by="occ_group_code", ascending=True)

# Join the Claude usage percentages onto the task table and show the result.
task_soc_pct_all = add_claude_pct(task_soc_df)
display(task_soc_pct_all.reset_index(drop=True))
# Scratch dump for manual inspection; written to the working directory as "test"
# (no extension) with the row index included.
task_soc_pct_all.to_csv("test")

Unnamed: 0,task_name,pct,occ_group_code,title,task_id,task,task_type,n_responding,date,domain_source,soc_group_code,soc_title,task_normalized,n_occurrences,n_occurrences_soc,pct_occ_weighted,pct_occ_norm
0,direct or conduct studies or research on issue...,0.004951,11-1011.00,Chief Executives,8848.0,Direct or conduct studies or research on issue...,Core,87.0,07/2014,Incumbent,11,Management Occupations,direct or conduct studies or research on issue...,1.0,1.0,0.004072,0.004975
1,"direct, plan, or implement policies, objective...",0.005212,11-1011.00,Chief Executives,8826.0,"Direct, plan, or implement policies, objective...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"direct, plan, or implement policies, objective...",1.0,1.0,0.004286,0.005237
2,"interpret and explain policies, rules, regulat...",0.049250,11-1011.00,Chief Executives,8843.0,"Interpret and explain policies, rules, regulat...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"interpret and explain policies, rules, regulat...",1.0,1.0,0.040504,0.049488
3,"deliver speeches, write articles, or present i...",0.008078,11-1011.00,Chief Executives,8839.0,"Deliver speeches, write articles, or present i...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"deliver speeches, write articles, or present i...",1.0,1.0,0.006644,0.008117
4,"serve as liaisons between organizations, share...",0.003778,11-1011.00,Chief Executives,8840.0,"Serve as liaisons between organizations, share...",Supplemental,87.0,07/2014,Incumbent,11,Management Occupations,"serve as liaisons between organizations, share...",1.0,1.0,0.003107,0.003797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4240,stop gathering arms when cars are full.,0.003388,53-7033.00,"Loading Machine Operators, Underground Mining",15190.0,Stop gathering arms when cars are full.,Supplemental,78.0,06/2008,Incumbent,53,Transportation and Material Moving Occupations,stop gathering arms when cars are full.,1.0,1.0,0.002786,0.003404
4241,collect and test samples of cleaning solutions...,0.002476,53-7061.00,Cleaners of Vehicles and Equipment,5010.0,Collect and test samples of cleaning solutions...,Supplemental,87.0,07/2013,Incumbent,53,Transportation and Material Moving Occupations,collect and test samples of cleaning solutions...,1.0,1.0,0.002036,0.002488
4242,stack cargo in locations such as transit sheds...,0.002866,53-7062.00,"Laborers and Freight, Stock, and Material Move...",10795.0,Stack cargo in locations such as transit sheds...,Supplemental,87.0,07/2013,Incumbent,53,Transportation and Material Moving Occupations,stack cargo in locations such as transit sheds...,1.0,1.0,0.002357,0.002880
4243,"test materials and solutions, using testing eq...",0.001954,53-7072.00,"Pump Operators, Except Wellhead Pumpers",14622.0,"Test materials and solutions, using testing eq...",Supplemental,105.0,06/2007,Incumbent,53,Transportation and Material Moving Occupations,"test materials and solutions, using testing eq...",1.0,1.0,0.001607,0.001964


### Extra Data

In [16]:

def add_emp_wage_data(df, emp_wage_path: str = "../extra_data/emp_wage_national.csv") -> pd.DataFrame:
    """
    Description:
        Loads the employment and wage data and merges the selected wage columns into the
        given dataframe on the occupation code (first 7 characters of occ_group_code).
        Rows that end up without a TOT_EMP value after the code merge are retried on the
        lower-cased, stripped occupation title.
        All column names in the resulting DataFrame are lowercase.

        The input DataFrame is not modified; the function works on a copy.

    Args:
        df (pd.DataFrame): Input the df with the ONET and Claude data merged. Must provide
            occ_group_code and title.
        emp_wage_path (str): Path to the national employment and wage CSV file.

    Returns:
        pd.DataFrame: Merged DataFrame with employment and wage data. Code-matched rows
        come first, followed by the rows that went through the title fallback; the
        returned occ_group_code holds the 7-character code.
    """
    emp_wage_df = pd.read_csv(emp_wage_path)

    # Work on a copy so the caller's frame keeps its full occ_group_code and
    # does not gain the helper column used for the title fallback.
    tasks = df.copy()

    # Standardize for merges
    tasks["occ_group_code"] = tasks["occ_group_code"].str[:7]
    tasks["title_normalized"] = tasks["title"].str.lower().str.strip()
    emp_wage_df["occ_title_normalized"] = emp_wage_df["OCC_TITLE"].str.lower().str.strip()

    wage_cols = [
        "OCC_CODE", "AREA_TITLE", "TOT_EMP", "EMP_PRSE", "JOBS_1000",
        "LOC_QUOTIENT", "PCT_TOTAL", "PCT_RPT", "H_MEAN", "A_MEAN",
        "MEAN_PRSE", "H_PCT10", "H_PCT25", "H_MEDIAN", "H_PCT75", "H_PCT90",
        "A_PCT10", "A_PCT25", "A_MEDIAN", "A_PCT75", "A_PCT90", "ANNUAL", "HOURLY", "occ_title_normalized"
    ]
    wage_df = emp_wage_df[wage_cols]

    # First pass: match on occupation code.
    merged_by_code = pd.merge(
        tasks,
        wage_df,
        left_on="occ_group_code",
        right_on="OCC_CODE",
        how="left",
    )

    # A row counts as matched only if it received a TOT_EMP value.
    has_wage = merged_by_code["TOT_EMP"].notna()
    merged_matched = merged_by_code[has_wage]

    # Second pass: strip the (empty) wage columns from the misses and retry on title.
    unmatched = merged_by_code[~has_wage].drop(columns=wage_cols, errors="ignore")
    merged_unmatched = pd.merge(
        unmatched,
        wage_df,
        left_on="title_normalized",
        right_on="occ_title_normalized",
        how="left",
    )

    final_merged = pd.concat([merged_matched, merged_unmatched], ignore_index=True)
    final_merged = final_merged.drop(columns=["title_normalized", "occ_title_normalized"], errors="ignore")

    # Convert all column names to lowercase
    final_merged.columns = [col.lower() for col in final_merged.columns]

    return final_merged

# Attach employment/wage columns, then report how many rows still lack
# tot_emp and which occupation titles those rows belong to.
task_emp_wage_df = add_emp_wage_data(task_soc_pct_all)
#display(task_emp_wage_df)
print("tot_emp missing:", task_emp_wage_df["tot_emp"].isna().sum())
print(task_emp_wage_df.loc[task_emp_wage_df["tot_emp"].isna(), "title"].unique())



tot_emp missing: 448
['Funeral Service Managers' 'Buyers and Purchasing Agents, Farm Products'
 'Wholesale and Retail Buyers, Except Farm Products'
 'Purchasing Agents, Except Wholesale, Retail, and Farm Products'
 'Assessors' 'Appraisers, Real Estate' 'Informatics Nurse Specialists'
 'Software Developers, Applications'
 'Software Developers, Systems Software'
 'Telecommunications Engineering Specialists'
 'Software Quality Assurance Engineers and Testers'
 'Computer Systems Engineers/Architects' 'Web Administrators'
 'Geospatial Information Scientists and Technologists'
 'Geographic Information Systems Technicians'
 'Data Warehousing Specialists' 'Business Intelligence Analysts'
 'Information Technology Project Managers' 'Search Marketing Strategists'
 'Video Game Designers' 'Document Management Specialists'
 'Mathematical Technicians' 'Clinical Psychologists'
 'Counseling Psychologists' 'Geophysical Data Technicians'
 'Geological Sample Test Technicians'
 'Substance Abuse and Behavio

In [17]:
#Task ratings processing

# Columns that identify one task within one occupation in the ratings file.
_TASK_KEY_COLS = ["O*NET-SOC Code", "Title", "Task ID", "Task"]


def _scale_ratings(task_ratings_df: pd.DataFrame, scale_id: str, prefix: str) -> pd.DataFrame:
    """Return the rows of one Scale ID with value/CI columns renamed to prefix, prefix_lower, prefix_upper."""
    value_cols = ["Data Value", "Lower CI Bound", "Upper CI Bound"]
    scale_rows = task_ratings_df.loc[task_ratings_df["Scale ID"] == scale_id, _TASK_KEY_COLS + value_cols]
    return scale_rows.rename(columns={
        "Data Value": prefix,
        "Lower CI Bound": f"{prefix}_lower",
        "Upper CI Bound": f"{prefix}_upper",
    })


def add_task_ratings(ratings_path: str = "../extra_data/task_ratings.csv"):
    """
    Description:
        Reads the task ratings CSV and builds one row per (occupation, task) with:
          - freq_mean / freq_lower / freq_upper: the frequency ("FT") category percentages
            and their CI bounds, each multiplied by an assumed occurrences-per-working-day
            weight for its category, divided by 100, and summed over categories;
          - importance (+ _lower/_upper) from the "IM" rows;
          - relevance (+ _lower/_upper) from the "RT" rows;
          - task_normalized: the task text lower-cased and stripped.
        Only tasks with at least one usable FT row appear in the result; importance and
        relevance are left-joined onto them.

    Args:
        ratings_path (str): Path to the task ratings CSV file.

    Returns:
        pd.DataFrame: Merged DataFrame with task ratings including frequency, importance, and relevance.
    """
    task_ratings_df = pd.read_csv(ratings_path)

    # Frequency weights, in occurrences per working day, assuming a 52 week year
    # with 5 working days per week (260 working days). Survey categories:
    #   1 Once per year or less      (assumed 1 time per year)
    #   2 More than once per year    (assumed 3 times per year)
    #   3 More than once per month   (assumed 48 times per year, i.e. 4 per month)
    #   4 More than once per week    (assumed 130 times per year, 2.5 per week)
    #   5 Daily
    #   6 Several times per day      (assumed 3 times per day)
    #   7 Hourly or more often       (assumed 12 times per day, 1.5 per hour)
    # NOTE(review): the original comment for category 3 said "3 times per month",
    # which would be 36 per year; the value used is 48 — confirm which is intended.
    frequency_weights = {
        1: 1 / 260,
        2: 3 / 260,
        3: 48 / 260,
        4: 130 / 260,
        5: 1,
        6: 3,
        7: 12
    }

    # Keep FT rows whose Category parses as a number; .copy() so the column
    # assignments below write to an independent frame, not a filtered view.
    category = pd.to_numeric(task_ratings_df["Category"], errors="coerce")
    is_usable_freq = (task_ratings_df["Scale ID"] == "FT") & category.notna()
    freq_df = task_ratings_df[is_usable_freq].copy()
    freq_df["Category"] = category[is_usable_freq].astype(int)

    # Apply weights (categories outside 1-7 map to NaN and drop out of the sums).
    weights = freq_df["Category"].map(frequency_weights)
    freq_df["freq_mean"] = freq_df["Data Value"] * weights / 100
    freq_df["freq_lower"] = freq_df["Lower CI Bound"] * weights / 100
    freq_df["freq_upper"] = freq_df["Upper CI Bound"] * weights / 100

    # Sum across categories to get per-task totals.
    # NOTE(review): sum() skips NaN, so a task whose CI bounds are all missing
    # gets freq_lower/freq_upper of 0 rather than NaN — confirm this is wanted.
    freq_agg = (
        freq_df.groupby(_TASK_KEY_COLS)[["freq_mean", "freq_lower", "freq_upper"]]
        .sum()
        .reset_index()
    )

    # Get importance and relevance ratings
    importance_df = _scale_ratings(task_ratings_df, "IM", "importance")
    relevance_df = _scale_ratings(task_ratings_df, "RT", "relevance")

    # Merge ratings onto the frequency aggregates.
    merged_ratings = freq_agg.merge(importance_df, on=_TASK_KEY_COLS, how="left")
    merged_ratings = merged_ratings.merge(relevance_df, on=_TASK_KEY_COLS, how="left")

    merged_ratings["task_normalized"] = merged_ratings["Task"].str.lower().str.strip()

    return merged_ratings

# Build the per-task frequency / importance / relevance table used in the final merge.
ratings_df = add_task_ratings()
#display(ratings_df.reset_index(drop=True))

In [18]:
#Merge all and final cleanup

def batch_lemmatize(texts):
    """
    Efficiently lemmatize a list of strings using spaCy's nlp.pipe().
    Skips punctuation, whitespace, and possessives.

    Missing values (None, NaN, pd.NA) and blank strings yield "" in the result
    instead of being lemmatized as the literal text "nan"/"None".

    Args:
        texts (list): Strings to lemmatize; may contain missing values.

    Returns:
        list[str]: One space-joined lemma string per input, in input order.
    """
    if not texts:
        return []

    # Handle empty/null strings. NaN is truthy and str(NaN) is "nan", so
    # missing values are detected explicitly with pd.isna before any
    # truthiness check (which would also raise on pd.NA).
    processed_texts = []
    for text in texts:
        stripped = "" if (pd.isna(text) or not text) else str(text).strip()
        # A single space stands in for "nothing to lemmatize" and produces no lemmas.
        processed_texts.append(stripped if stripped else " ")

    cleaned = []
    try:
        # ner and parser are disabled for this pass; only lemmas are read from the docs.
        for doc in nlp.pipe(processed_texts, batch_size=1000, disable=["ner", "parser"]):
            lemmas = [
                token.lemma_ for token in doc
                if not token.is_punct and not token.is_space and token.text != "'s"
            ]
            cleaned.append(" ".join(lemmas).strip())
    except Exception as e:
        # Surface the failure in the cell output, then let it propagate.
        print(f"Error in batch_lemmatize: {e}")
        raise

    return cleaned

def _split_by_task_repetition(frame):
    """Split frame into (rows whose task_normalized occurs once, rows whose task_normalized repeats)."""
    task_counts = frame["task_normalized"].value_counts()
    is_repeated = frame["task_normalized"].isin(task_counts[task_counts > 1].index)
    return frame[~is_repeated].copy(), frame[is_repeated].copy()


def merge_all_and_cleanup(df, ratings_df):
    """
    Description:
        This function merges the task data with the ratings data and performs final cleanup.

        Task text on both sides is lemmatized into task_normalized. Each side is then
        split by whether a task_normalized value occurs once or repeats on that side:
        once-only task rows are joined to once-only rating rows on task_normalized,
        repeated task rows are joined to repeated rating rows on task_normalized plus
        the lower-cased occupation title. Afterwards placeholder strings become missing
        values, all-empty columns and helper columns are dropped, and 'task' and
        'task_normalized' are moved to the front.

        Neither input DataFrame is modified; the function works on copies.

    Args:
        df (pd.DataFrame): DataFrame containing task data (needs 'task' and 'title').
        ratings_df (pd.DataFrame): DataFrame containing task ratings (needs 'Task', 'Title'
            and the freq/importance/relevance columns).

    Returns:
        pd.DataFrame: Final merged DataFrame with all necessary information.
    """
    # Copies keep the lemmatized keys and helper columns out of the caller's frames.
    df = df.copy()
    ratings_df = ratings_df.copy()

    # Normalize task names with batch lemmatization, and titles with lower/strip.
    df["task_normalized"] = batch_lemmatize(df["task"].tolist())
    ratings_df["task_normalized"] = batch_lemmatize(ratings_df["Task"].tolist())

    df["title_normalized"] = df["title"].str.lower().str.strip()
    ratings_df["title_normalized"] = ratings_df["Title"].str.lower().str.strip()

    # NOTE(review): repetition is judged separately on each side, so a task that
    # is unique in df but repeated in ratings_df (or vice versa) finds no partner
    # in either merge below — confirm this is intended.
    df_unique_tasks, df_duplicate_tasks = _split_by_task_repetition(df)
    df_unique_tasks_ratings, df_duplicate_tasks_ratings = _split_by_task_repetition(ratings_df)

    rating_cols = [
        "freq_mean", "freq_lower", "freq_upper",
        "importance", "importance_lower", "importance_upper",
        "relevance", "relevance_lower", "relevance_upper",
        "task_normalized", "title_normalized"
    ]

    # Unambiguous tasks: the task text alone identifies the rating row.
    merged_unique = df_unique_tasks.merge(
        df_unique_tasks_ratings[rating_cols],
        on=["task_normalized"],
        how="left",
    )

    # Repeated tasks: the occupation title is needed to pick the right rating row.
    merged_duplicate = df_duplicate_tasks.merge(
        df_duplicate_tasks_ratings[rating_cols],
        on=["task_normalized", "title_normalized"],
        how="left",
    )

    merged = pd.concat([merged_unique, merged_duplicate], ignore_index=True)

    # Replace placeholders with NaN
    placeholder_values = ["#", "*", "", "n/a", "na", "--"]
    merged = merged.replace(placeholder_values, pd.NA)

    # Drop fully empty columns
    merged = merged.dropna(axis=1, how="all")

    # Drop 'occ_code', 'task_name' and the title helper columns. The task-only
    # merge leaves title_normalized_x/_y, the task+title merge leaves title_normalized.
    merged = merged.drop(
        columns=["occ_code", "task_name", "title_normalized", "title_normalized_x", "title_normalized_y"],
        errors="ignore",
    )

    # Reorder columns: 'task' first, then 'task_normalized', then everything else
    # in its existing order.
    leading = [col for col in ("task", "task_normalized") if col in merged.columns]
    remaining = [col for col in merged.columns if col not in leading]
    merged = merged[leading + remaining]

    return merged

# Final step: join ratings onto the task/wage table and write the result to
# ../new_data/tasks_final.csv without the row index.
task_final = merge_all_and_cleanup(task_emp_wage_df, ratings_df)
task_final.to_csv("../new_data/tasks_final.csv", index=False)
#display(task_final.reset_index(drop=True))


KeyboardInterrupt: 