In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
import numpy as np
import os

## Original Data

In [76]:
# Load Initial ONET Data

def create_onet_soc_data() -> pd.DataFrame:
    """
    Description:
        This takes the onet task statements and merges them with the SOC structure in order to get the SOC major group titles mapped to the tasks.
        It then renames columns for better data usage.
        It then normalizes the task names by making them all lowercase and stripping whitespace. 
        It then creates a new column to count the number of occurrences of each task in an occupation, and in an SOC title.
        This originally did not rename columns or create a column for n_occurances_soc.
    
    Args:
        onet_path (str): Path to the O*NET task statements CSV file
        soc_path (str): Path to the SOC structure CSV file
    
    Returns:
        pd.DataFrame: Merged DataFrame containing O*NET data with SOC major group titles
    """

    # Read and process O*NET data
    onet_df = pd.read_csv("../original_data/onet_task_statements.csv")
    onet_df["soc_group_code"] = onet_df["O*NET-SOC Code"].str[:2]
    
    # Read and process SOC data
    soc_df = pd.read_csv("../original_data/SOC_Structure.csv")
    soc_df = soc_df.dropna(subset=['Major Group'])
    soc_df["soc_group_code"] = soc_df["Major Group"].str[:2]
    
    # Merge datasets
    task_soc_df = onet_df.merge(
        soc_df[['soc_group_code', 'SOC or O*NET-SOC 2019 Title']],
        on='soc_group_code',
        how='left'
    )

    # Rename columns for better usability
    task_soc_df.rename(columns={
    "O*NET-SOC Code": "occ_group_code",
    "Title": "title",
    "Task ID": "task_id",
    "Task": "task",
    "Task Type": "task_type",
    "Incumbents Responding": "n_responding",
    "Date": "date",
    "Domain Source": "domain_source",
    "SOC or O*NET-SOC 2019 Title": "soc_title",
    }, inplace=True)

    task_soc_df["task_normalized"] = task_soc_df["task"].str.lower().str.strip()
    task_soc_df["n_occurrences"] = task_soc_df.groupby("task_normalized")["title"].transform("nunique")
    task_soc_df["n_occurrences_soc"] = task_soc_df.groupby("task_normalized")["soc_title"].transform("nunique")

    return task_soc_df

task_soc_df = create_onet_soc_data()
display(task_soc_df.reset_index(drop=True))

Unnamed: 0,occ_group_code,title,task_id,task,task_type,n_responding,date,domain_source,soc_group_code,soc_title,task_normalized,n_occurrences,n_occurrences_soc
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,87.0,07/2014,Incumbent,11,Management Occupations,direct or coordinate an organization's financi...,1,1
1,11-1011.00,Chief Executives,8831,Appoint department heads or managers and assig...,Core,87.0,07/2014,Incumbent,11,Management Occupations,appoint department heads or managers and assig...,1,1
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,87.0,07/2014,Incumbent,11,Management Occupations,analyze operations to evaluate performance of ...,1,1
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"direct, plan, or implement policies, objective...",1,1
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"prepare budgets for approval, including those ...",1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19525,53-7121.00,"Tank Car, Truck, and Ship Loaders",12806,"Test vessels for leaks, damage, and defects, a...",Supplemental,66.0,12/2006,Incumbent,53,Transportation and Material Moving Occupations,"test vessels for leaks, damage, and defects, a...",1,1
19526,53-7121.00,"Tank Car, Truck, and Ship Loaders",12807,Unload cars containing liquids by connecting h...,Supplemental,66.0,12/2006,Incumbent,53,Transportation and Material Moving Occupations,unload cars containing liquids by connecting h...,1,1
19527,53-7121.00,"Tank Car, Truck, and Ship Loaders",12808,Copy and attach load specifications to loaded ...,Supplemental,64.0,12/2006,Incumbent,53,Transportation and Material Moving Occupations,copy and attach load specifications to loaded ...,1,1
19528,53-7121.00,"Tank Car, Truck, and Ship Loaders",12809,Start pumps and adjust valves or cables to reg...,Supplemental,67.0,12/2006,Incumbent,53,Transportation and Material Moving Occupations,start pumps and adjust valves or cables to reg...,1,1


In [83]:
# Add Claude data
def add_claude_pct(df) -> pd.DataFrame:
    """
    Description:
        This loads in the tasks and percentage of occurrences from the Claude data, and merges it with the tasks in our data set we already have. 
        It then normalizes the percentages of occurances of tasks and has one column for weighted percents based multiple occurrences, and one where that weight is normalized
        It then sorts it based on the O*NET-SOC Code.
        This originally did not create a column for the weighted percentage of occurrences.
    
    Args:
        task_soc_df (pd.DataFrame): DataFrame containing O*NET tasks and SOC titles.
    
    Returns:
        pd.DataFrame: Updated DataFrame with percentage of occurrences added.
    """
    
    # Load task mappings from Claude data
    task_mappings_df = pd.read_csv("../original_data/onet_task_mappings.csv")
    
    # Merge with existing task DataFrame
    merged = task_mappings_df.merge(
        df,
        left_on="task_name",
        right_on="task_normalized",
        how="left"
    )
    
    # Calculate weighted and normalized percentages
    merged["pct_occ_weighted"] = 100 * merged["pct"] / merged["pct"].sum()
    merged["pct_occ_norm"] = 100 * (merged["pct"] / merged["n_occurrences"]) / (merged["pct"] / merged["n_occurrences"]).sum()
    
    # Sort by O*NET-SOC Code
    merged.sort_values(by="occ_group_code", ascending=True, inplace=True)
    
    return merged

task_soc_pct_all = add_claude_pct(task_soc_df)
display(task_soc_pct_all.reset_index(drop=True))
task_soc_pct_all.reset_index(drop=True).to_csv("test_bug12.csv")

Unnamed: 0,task_name,pct,occ_group_code,title,task_id,task,task_type,n_responding,date,domain_source,soc_group_code,soc_title,task_normalized,n_occurrences,n_occurrences_soc,pct_occ_weighted,pct_occ_norm
0,direct or conduct studies or research on issue...,0.004951,11-1011.00,Chief Executives,8848.0,Direct or conduct studies or research on issue...,Core,87.0,07/2014,Incumbent,11,Management Occupations,direct or conduct studies or research on issue...,1.0,1.0,0.004072,0.004975
1,"direct, plan, or implement policies, objective...",0.005212,11-1011.00,Chief Executives,8826.0,"Direct, plan, or implement policies, objective...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"direct, plan, or implement policies, objective...",1.0,1.0,0.004286,0.005237
2,"interpret and explain policies, rules, regulat...",0.049250,11-1011.00,Chief Executives,8843.0,"Interpret and explain policies, rules, regulat...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"interpret and explain policies, rules, regulat...",1.0,1.0,0.040504,0.049488
3,"deliver speeches, write articles, or present i...",0.008078,11-1011.00,Chief Executives,8839.0,"Deliver speeches, write articles, or present i...",Core,87.0,07/2014,Incumbent,11,Management Occupations,"deliver speeches, write articles, or present i...",1.0,1.0,0.006644,0.008117
4,"serve as liaisons between organizations, share...",0.003778,11-1011.00,Chief Executives,8840.0,"Serve as liaisons between organizations, share...",Supplemental,87.0,07/2014,Incumbent,11,Management Occupations,"serve as liaisons between organizations, share...",1.0,1.0,0.003107,0.003797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4240,stop gathering arms when cars are full.,0.003388,53-7033.00,"Loading Machine Operators, Underground Mining",15190.0,Stop gathering arms when cars are full.,Supplemental,78.0,06/2008,Incumbent,53,Transportation and Material Moving Occupations,stop gathering arms when cars are full.,1.0,1.0,0.002786,0.003404
4241,collect and test samples of cleaning solutions...,0.002476,53-7061.00,Cleaners of Vehicles and Equipment,5010.0,Collect and test samples of cleaning solutions...,Supplemental,87.0,07/2013,Incumbent,53,Transportation and Material Moving Occupations,collect and test samples of cleaning solutions...,1.0,1.0,0.002036,0.002488
4242,stack cargo in locations such as transit sheds...,0.002866,53-7062.00,"Laborers and Freight, Stock, and Material Move...",10795.0,Stack cargo in locations such as transit sheds...,Supplemental,87.0,07/2013,Incumbent,53,Transportation and Material Moving Occupations,stack cargo in locations such as transit sheds...,1.0,1.0,0.002357,0.002880
4243,"test materials and solutions, using testing eq...",0.001954,53-7072.00,"Pump Operators, Except Wellhead Pumpers",14622.0,"Test materials and solutions, using testing eq...",Supplemental,105.0,06/2007,Incumbent,53,Transportation and Material Moving Occupations,"test materials and solutions, using testing eq...",1.0,1.0,0.001607,0.001964


## Extra Data

In [88]:
# Add employment and wage data for each occupation

def add_emp_wage_data(df) -> pd.DataFrame:
    """
    Description:
        This loads in the employment wage data from the CSV file and merges it into the given dataframe with the desired columns.
        All column names in the resulting DataFrame will be lowercase.

    Args:
        df (pd.DataFrame): Input the df with the ONET and Claude data merged.

    Returns:
        pd.DataFrame: Merged DataFrame with employment and wage data
    """
    
    # Load employment wage data
    emp_wage_df = pd.read_csv("../extra_data/emp_wage_national.csv")

    # Take off last 3 characters to standardize for merge
    df["occ_group_code"] = df["occ_group_code"].str[:7]

    # Perform merge
    merged_df = pd.merge(
        df,
        emp_wage_df[[
            "OCC_CODE", "AREA_TITLE", "TOT_EMP", "EMP_PRSE", "JOBS_1000",
            "LOC_QUOTIENT", "PCT_TOTAL", "PCT_RPT", "H_MEAN", "A_MEAN",
            "MEAN_PRSE", "H_PCT10", "H_PCT25", "H_MEDIAN", "H_PCT75", "H_PCT90",
            "A_PCT10", "A_PCT25", "A_MEDIAN", "A_PCT75", "A_PCT90", "ANNUAL", "HOURLY"
        ]],
        left_on="occ_group_code",
        right_on="OCC_CODE",
        how="left"
    )

    # Convert all column names to lowercase
    merged_df.columns = [col.lower() for col in merged_df.columns]

    return merged_df

task_emp_wage_df = add_emp_wage_data(task_soc_pct_all)


In [None]:
#Task ratings processing


def add_task_ratings(df):
    """
    Description:
        This function reads the task ratings from an Excel file, processes it to extract frequency, importance, and relevance ratings,
        and merges them into a single DataFrame with the desired structure.

    Args:
        df (pd.DataFrame): Input the df with the ONET, Claude, and emp and wage data merged.
    
    Returns:
        pd.DataFrame: Merged DataFrame with task ratings including frequency, importance, and relevance.
    """
    

    task_ratings_df = pd.read_csv("../extra_data/task_ratings.csv")


    #Frequency mapping
    frequency_weights = {
        1: 1 / 260,
        2: 2 / 260,
        3: 12 / 260,
        4: 52 / 260,
        5: 1,
        6: 3,
        7: 8
    }


    # Get freq rows, drop unusable ones, generate freq aggregates
    freq_df = task_ratings_df[task_ratings_df["Scale ID"] == "FT"].copy()

    # Drop rows without category or invalid categories
    freq_df = freq_df[pd.to_numeric(freq_df["Category"], errors='coerce').notnull()]
    freq_df["Category"] = freq_df["Category"].astype(int)

    # Apply weights
    freq_df["freq_mean"] = freq_df["Data Value"] * freq_df["Category"].map(frequency_weights) / 100
    freq_df["freq_lower"] = freq_df["Lower CI Bound"] * freq_df["Category"].map(frequency_weights) / 100
    freq_df["freq_upper"] = freq_df["Upper CI Bound"] * freq_df["Category"].map(frequency_weights) / 100

    # Sum across categories to get per-task total
    freq_agg = freq_df.groupby(["O*NET-SOC Code", "Title", "Task ID", "Task"]).agg({
        "freq_mean": "sum",
        "freq_lower": "sum",
        "freq_upper": "sum"
    }).reset_index()


    # Get importance and relevance ratings
    importance_df = task_ratings_df[task_ratings_df["Scale ID"] == "IM"].copy()
    importance_df = importance_df[["O*NET-SOC Code", "Title", "Task ID", "Task", 
                                "Data Value", "Lower CI Bound", "Upper CI Bound"]]
    importance_df = importance_df.rename(columns={
        "Data Value": "importance",
        "Lower CI Bound": "importance_lower",
        "Upper CI Bound": "importance_upper"
    })

    relevance_df = task_ratings_df[task_ratings_df["Scale ID"] == "RT"].copy()
    relevance_df = relevance_df[["O*NET-SOC Code", "Title", "Task ID", "Task", 
                                "Data Value", "Lower CI Bound", "Upper CI Bound"]]
    relevance_df = relevance_df.rename(columns={
        "Data Value": "relevance",
        "Lower CI Bound": "relevance_lower",
        "Upper CI Bound": "relevance_upper"
    })


    # Merge ratings
    merged_ratings = freq_agg.merge(importance_df, on=["O*NET-SOC Code", "Title", "Task ID", "Task"], how="left")
    merged_ratings = merged_ratings.merge(relevance_df, on=["O*NET-SOC Code", "Title", "Task ID", "Task"], how="left")


    merged_ratings["task_normalized"] = merged_ratings["Task"].str.lower().str.strip()

    merged_all = df.merge(
        merged_ratings,
        merged_ratings[[
            "freq_mean", "freq_lower", "freq_upper",
            "importance", "importance_lower", "importance_upper",
            "relevance", "relevance_lower", "relevance_upper"
        ]],
        left_on="task",
        right_on="task_normalized",
        how="left"
    )

    return merged_all

    
task_final = add_task_ratings(task_emp_wage_df)
task_final.to_csv("tasks_final.csv", index=False)


KeyError: 'task_normalized'

In [33]:
# It then creates separate DataFrames for core and supplemental tasks
# It then saves the data to csv files in the




# # Create DataFrame for Supplemental tasks
# task_soc_pct_suppl_df = task_soc_pct_all[task_soc_pct_all["Task Type"] == "Supplemental"].copy()

# # Create DataFrame for Core tasks
# task_soc_pct_core_df = task_soc_pct_all[task_soc_pct_all["Task Type"] == "Core"].copy()

# # Dictionary of dataframes and their names
# dfs = {
#     "grouped_with_occupations_all": task_soc_pct_all,
#     "gwo_core_df": task_soc_pct_core_df,
#     "gwo_suppl_df": task_soc_pct_suppl_df,
# }

# for name, df in dfs.items():
#     # Normalize weighted percentages
#     df["pct_occ_weighted"] = 100 * df["pct"] / df["pct"].sum()

#     # Normalize percentages
#     df["pct_occ_norm"] = 100 * (df["pct"] / df["n_occurrences"]) / (df["pct"] / df["n_occurrences"]).sum()

#     # Print check
#     print(f"{name} — Raw Sum: {df['pct_occ_weighted'].sum():.2f}, Spread Sum: {df['pct_occ_norm'].sum():.2f}")

#     # Save CSV
#     path = f"../new_generated_data/{name}.csv"
#     if os.path.exists(path):
#         try:
#             os.remove(path)
#         except PermissionError:
#             print(f"⚠️ Close {path} before saving.")
#     df.to_csv(path, index=False)