In [102]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
import numpy as np
import os
import re
import spacy
from spacy.cli import download

# Load the small English spaCy pipeline; `nlp` is the module-level object that
# batch_lemmatize() uses further down to lemmatize task text.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # An OSError from spacy.load is treated here as "model not installed":
    # download it once and retry the load.
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

## Original Data

In [74]:
# Load Initial ONET Data

def create_onet_soc_data(
    onet_path: str = "../original_data/onet_task_statements.csv",
    soc_path: str = "../original_data/SOC_Structure.csv",
) -> pd.DataFrame:
    """
    Description:
        Loads the O*NET task statements and left-merges them with the SOC structure so that
        every task row carries the title of its SOC major group.
        The merge key is the first two characters of the O*NET-SOC code / SOC "Major Group" code.
        Columns are then renamed to snake_case, the task text is normalized (lowercase, stripped),
        and two count columns are added:
            n_occurrences     - number of distinct occupation titles sharing the normalized task
            n_occurrences_soc - number of distinct SOC major group titles sharing the normalized task

    Args:
        onet_path (str): Path to the O*NET task statements CSV file.
        soc_path (str): Path to the SOC structure CSV file.

    Returns:
        pd.DataFrame: Merged DataFrame containing O*NET data with SOC major group titles.
    """

    # Read O*NET tasks and derive the 2-character major-group key
    onet_df = pd.read_csv(onet_path)
    onet_df["soc_group_code"] = onet_df["O*NET-SOC Code"].str[:2]

    # Read the SOC structure and keep only rows that have a "Major Group" value.
    # .assign() builds a new frame, so no column is written onto a filtered slice.
    soc_df = (
        pd.read_csv(soc_path)
        .dropna(subset=["Major Group"])
        .assign(soc_group_code=lambda frame: frame["Major Group"].str[:2])
    )

    # Left merge: every O*NET task row is kept even if no major group matches
    task_soc_df = onet_df.merge(
        soc_df[["soc_group_code", "SOC or O*NET-SOC 2019 Title"]],
        on="soc_group_code",
        how="left",
    )

    # Rename columns for better usability
    task_soc_df = task_soc_df.rename(columns={
        "O*NET-SOC Code": "occ_group_code",
        "Title": "title",
        "Task ID": "task_id",
        "Task": "task",
        "Task Type": "task_type",
        "Incumbents Responding": "n_responding",
        "Date": "date",
        "Domain Source": "domain_source",
        "SOC or O*NET-SOC 2019 Title": "soc_title",
    })

    # Normalized task text is the grouping key for both occurrence counts
    task_soc_df["task_normalized"] = task_soc_df["task"].str.lower().str.strip()
    by_task = task_soc_df.groupby("task_normalized")
    task_soc_df["n_occurrences"] = by_task["title"].transform("nunique")
    task_soc_df["n_occurrences_soc"] = by_task["soc_title"].transform("nunique")

    return task_soc_df

# Build the O*NET/SOC task table once; the next cell passes `task_soc_df` to add_claude_pct().
task_soc_df = create_onet_soc_data()
#display(task_soc_df.reset_index(drop=True))

In [61]:
# Add Claude data
def add_claude_pct(df, mappings_path: str = "../original_data/onet_task_mappings.csv") -> pd.DataFrame:
    """
    Description:
        Loads the Claude task mappings (columns `task_name` and `pct`) and left-merges the
        O*NET/SOC task table onto them, matching `task_name` against `task_normalized`.
        Mapping rows with no matching O*NET task are kept, with missing O*NET columns.
        Two percentage columns are then added, each scaled to sum to 100 over the merged rows:
            pct_occ_weighted - `pct` as a share of the total `pct`; a task shared by several
                               occupations contributes once per matched row
            pct_occ_norm     - `pct` divided by `n_occurrences`, as a share of the total of that
                               quantity, so a task's share is spread across its occupations
        The result is sorted by `occ_group_code`.

    Args:
        df (pd.DataFrame): DataFrame containing O*NET tasks and SOC titles; must provide the
            columns `task_normalized`, `n_occurrences` and `occ_group_code`.
        mappings_path (str): Path to the Claude task mappings CSV file.

    Returns:
        pd.DataFrame: New DataFrame with the percentage columns added. `df` is not modified.
    """

    # Load task mappings from Claude data
    task_mappings_df = pd.read_csv(mappings_path)

    # Left merge keeps every mapping row, matched or not
    merged = task_mappings_df.merge(
        df,
        left_on="task_name",
        right_on="task_normalized",
        how="left",
    )

    # Share of the raw percentage total
    merged["pct_occ_weighted"] = 100 * merged["pct"] / merged["pct"].sum()

    # Per-occupation share; unmatched rows have NaN n_occurrences and therefore stay NaN here
    pct_per_occurrence = merged["pct"] / merged["n_occurrences"]
    merged["pct_occ_norm"] = 100 * pct_per_occurrence / pct_per_occurrence.sum()

    # Stable sort so rows sharing an occupation code keep their merge order on every run
    return merged.sort_values(by="occ_group_code", ascending=True, kind="mergesort")

# Attach the Claude percentages to the O*NET/SOC task table built above.
task_soc_pct_all = add_claude_pct(task_soc_df)
#display(task_soc_pct_all.reset_index(drop=True))

## Extra Data

In [118]:

def add_emp_wage_data(df, emp_wage_path: str = "../extra_data/emp_wage_national.csv") -> pd.DataFrame:
    """
    Description:
        Loads the employment and wage data and merges the selected wage columns into the given
        DataFrame in two passes:
            1. left merge on the occupation code (first 7 characters of `occ_group_code`
               against `OCC_CODE`);
            2. rows that still have no `TOT_EMP` after pass 1 are retried with a left merge on
               the lowercased, stripped occupation title.
        The two parts are concatenated (code-matched rows first), the helper title columns are
        dropped, and all column names in the resulting DataFrame are lowercased.
        The input DataFrame is not modified; note that `occ_group_code` in the returned frame
        is the truncated 7-character code.

    Args:
        df (pd.DataFrame): The df with the ONET and Claude data merged; must provide the
            columns `occ_group_code` and `title`.
        emp_wage_path (str): Path to the employment and wage CSV file.

    Returns:
        pd.DataFrame: Merged DataFrame with employment and wage data.
    """
    emp_wage_df = pd.read_csv(emp_wage_path)

    # Work on a copy: the original wrote the truncated code and a helper column straight
    # into the caller's frame, silently changing it for every later cell.
    tasks = df.copy()

    # Standardize for merges
    tasks["occ_group_code"] = tasks["occ_group_code"].str[:7]
    tasks["title_normalized"] = tasks["title"].str.lower().str.strip()
    emp_wage_df["occ_title_normalized"] = emp_wage_df["OCC_TITLE"].str.lower().str.strip()

    wage_cols = [
        "OCC_CODE", "AREA_TITLE", "TOT_EMP", "EMP_PRSE", "JOBS_1000",
        "LOC_QUOTIENT", "PCT_TOTAL", "PCT_RPT", "H_MEAN", "A_MEAN",
        "MEAN_PRSE", "H_PCT10", "H_PCT25", "H_MEDIAN", "H_PCT75", "H_PCT90",
        "A_PCT10", "A_PCT25", "A_MEDIAN", "A_PCT75", "A_PCT90", "ANNUAL", "HOURLY",
        "occ_title_normalized",
    ]

    # Pass 1: match on occupation code
    merged_df = pd.merge(
        tasks,
        emp_wage_df[wage_cols],
        left_on="occ_group_code",
        right_on="OCC_CODE",
        how="left",
    )

    # A missing TOT_EMP is the signal that pass 1 found no usable wage row.
    # NOTE(review): a code match whose TOT_EMP is itself missing is also retried by title.
    has_wage_data = merged_df["TOT_EMP"].notna()
    merged_matched = merged_df[has_wage_data]

    # Strip the (empty) wage columns so pass 2 can add them again without name clashes
    unmatched = merged_df[~has_wage_data].drop(columns=wage_cols, errors="ignore")

    # Pass 2: fall back to the normalized occupation title
    merged_unmatched = pd.merge(
        unmatched,
        emp_wage_df[wage_cols],
        left_on="title_normalized",
        right_on="occ_title_normalized",
        how="left",
    )

    final_merged = pd.concat([merged_matched, merged_unmatched], ignore_index=True)
    final_merged = final_merged.drop(
        columns=["title_normalized", "occ_title_normalized"], errors="ignore"
    )

    # Convert all column names to lowercase
    final_merged.columns = [col.lower() for col in final_merged.columns]

    return final_merged

# Add the employment and wage columns to the task table that already carries the Claude percentages.
task_emp_wage_df = add_emp_wage_data(task_soc_pct_all)
#display(task_emp_wage_df)

In [119]:
#Task ratings processing

def add_task_ratings(ratings_path: str = "../extra_data/task_ratings.csv") -> pd.DataFrame:
    """
    Description:
        Reads the task ratings from a CSV file and builds one row per
        (O*NET-SOC Code, Title, Task ID, Task) with three groups of columns:
            freq_mean / freq_lower / freq_upper
                rows with Scale ID "FT": each category's value (and CI bounds) is multiplied by
                that category's weight, divided by 100, and summed across categories.
                A sum whose inputs are all missing stays missing instead of becoming 0.
            importance / importance_lower / importance_upper
                rows with Scale ID "IM" (value and CI bounds as given).
            relevance / relevance_lower / relevance_upper
                rows with Scale ID "RT" (value and CI bounds as given).
        Importance and relevance are left-merged onto the frequency aggregate, so only tasks
        with at least one usable frequency row appear in the result.
        A `task_normalized` column (lowercased, stripped task text) is added at the end.

    Args:
        ratings_path (str): Path to the task ratings CSV file.

    Returns:
        pd.DataFrame: Task ratings including frequency, importance, and relevance.
    """
    task_ratings_df = pd.read_csv(ratings_path)

    key_cols = ["O*NET-SOC Code", "Title", "Task ID", "Task"]
    value_cols = ["Data Value", "Lower CI Bound", "Upper CI Bound"]

    # Weight applied to each frequency category.
    # NOTE(review): presumably "occurrences per working day" with 260 working days per
    # year (category 1 = yearly ... category 7 = hourly) - confirm against the O*NET FT scale.
    frequency_weights = {
        1: 1 / 260,
        2: 2 / 260,
        3: 12 / 260,
        4: 52 / 260,
        5: 1,
        6: 3,
        7: 8,
    }

    def _scale_ratings(scale_id, prefix):
        """Return the rows of one single-valued scale with its value columns renamed to `prefix`."""
        rows = task_ratings_df.loc[task_ratings_df["Scale ID"] == scale_id, key_cols + value_cols]
        return rows.rename(columns={
            "Data Value": prefix,
            "Lower CI Bound": f"{prefix}_lower",
            "Upper CI Bound": f"{prefix}_upper",
        })

    # Frequency rows: keep only those whose category parses as a number
    freq_df = task_ratings_df[task_ratings_df["Scale ID"] == "FT"].copy()
    category = pd.to_numeric(freq_df["Category"], errors="coerce")
    freq_df = freq_df[category.notna()].copy()
    freq_df["Category"] = category[category.notna()].astype(int)

    # Apply weights; "Data Value" is divided by 100 (presumably a percentage - TODO confirm)
    weight = freq_df["Category"].map(frequency_weights)
    freq_df["freq_mean"] = freq_df["Data Value"] * weight / 100
    freq_df["freq_lower"] = freq_df["Lower CI Bound"] * weight / 100
    freq_df["freq_upper"] = freq_df["Upper CI Bound"] * weight / 100

    # Sum across categories to get the per-task total. min_count=1 keeps a total as NaN
    # when every contributing value is missing; a plain sum would report 0.0, which reads
    # as a real (and wrong) confidence bound.
    freq_agg = (
        freq_df.groupby(key_cols)[["freq_mean", "freq_lower", "freq_upper"]]
        .sum(min_count=1)
        .reset_index()
    )

    # Importance and relevance ratings
    importance_df = _scale_ratings("IM", "importance")
    relevance_df = _scale_ratings("RT", "relevance")

    # Merge ratings onto the frequency aggregate
    merged_ratings = freq_agg.merge(importance_df, on=key_cols, how="left")
    merged_ratings = merged_ratings.merge(relevance_df, on=key_cols, how="left")

    merged_ratings["task_normalized"] = merged_ratings["Task"].str.lower().str.strip()

    return merged_ratings

# Build the per-task ratings table; the final cell merges it into the task data.
ratings_df = add_task_ratings()
#display(ratings_df.reset_index(drop=True))

In [117]:
#Merge all and final cleanup

def batch_lemmatize(texts):
    """
    Efficiently lemmatize a sequence of strings using spaCy's nlp.pipe().
    Skips punctuation, whitespace, and possessives ("'s").

    Missing entries (None, NaN, pd.NA) and empty / whitespace-only strings produce "".
    Previously a float NaN was stringified and lemmatized as the literal word "nan",
    and pd.NA raised a TypeError.

    Args:
        texts: Iterable of task texts (any iterable, not only a list), or None.

    Returns:
        list[str]: One space-joined lemma string per input entry, in input order.

    Raises:
        Exception: Any error raised by the spaCy pipeline is printed and re-raised.
    """
    if texts is None:
        return []

    # Materialize first so emptiness can be tested on any iterable (Series, array, ...)
    texts = list(texts)
    if not texts:
        return []

    def _as_pipeline_input(value):
        """Return the stripped text, or a single space for missing/empty/falsy values."""
        # Missing scalars must be caught before any truthiness test:
        # NaN is truthy and pd.NA cannot be used in a boolean context at all.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return " "
        stripped = str(value).strip() if value else ""
        # spaCy receives a single space instead of an empty string
        return stripped or " "

    processed_texts = [_as_pipeline_input(text) for text in texts]

    cleaned = []
    try:
        for doc in nlp.pipe(processed_texts, batch_size=1000, disable=["ner", "parser"]):
            lemmas = [
                token.lemma_ for token in doc
                if not token.is_punct and not token.is_space and token.text != "'s"
            ]
            cleaned.append(" ".join(lemmas).strip())
    except Exception as e:
        print(f"Error in batch_lemmatize: {e}")
        raise

    return cleaned

def merge_all_and_cleanup(df, ratings_df):
    """
    Description:
        Merges the task data with the ratings data and performs final cleanup.

        Both frames get a lemmatized `task_normalized` key (via batch_lemmatize) and a
        lowercased, stripped `title_normalized` key. Ratings are then attached in two groups:
            - tasks whose normalized text occurs exactly once in BOTH frames are matched on
              the task text alone (the occupation title is not required to agree);
            - tasks that occur more than once in EITHER frame are matched on task text AND
              occupation title, because the text alone cannot identify the occupation.
        Classifying a task from both frames together fixes a loss in the earlier version,
        where each frame was split on its own and a task repeated in only one of them ended
        up with no ratings at all.

        Cleanup: placeholder strings are replaced with NA, fully empty columns are dropped,
        `occ_code` and `task_name` are dropped, and `task` / `task_normalized` are moved to
        the front. The output has a single `title_normalized` column (no `_x` / `_y`
        duplicates). Neither input frame is modified.

    Args:
        df (pd.DataFrame): DataFrame containing task data; must provide `task` and `title`.
        ratings_df (pd.DataFrame): DataFrame containing task ratings; must provide `Task`,
            `Title` and the frequency / importance / relevance columns.

    Returns:
        pd.DataFrame: Final merged DataFrame with all necessary information.
    """
    rating_cols = [
        "freq_mean", "freq_lower", "freq_upper",
        "importance", "importance_lower", "importance_upper",
        "relevance", "relevance_lower", "relevance_upper",
    ]

    # Work on copies so re-running the cell never sees keys written by a previous run
    tasks = df.copy()
    ratings = ratings_df.copy()

    # Normalize the merge keys the same way on both sides
    tasks["task_normalized"] = batch_lemmatize(tasks["task"].tolist())
    ratings["task_normalized"] = batch_lemmatize(ratings["Task"].tolist())
    tasks["title_normalized"] = tasks["title"].str.lower().str.strip()
    ratings["title_normalized"] = ratings["Title"].str.lower().str.strip()

    # A task is ambiguous when its normalized text repeats in either frame
    task_counts = tasks["task_normalized"].value_counts()
    rating_counts = ratings["task_normalized"].value_counts()
    ambiguous_tasks = (
        set(task_counts[task_counts > 1].index)
        | set(rating_counts[rating_counts > 1].index)
    )
    tasks_ambiguous = tasks["task_normalized"].isin(ambiguous_tasks)
    ratings_ambiguous = ratings["task_normalized"].isin(ambiguous_tasks)

    # Unambiguous tasks: match on task text only. The ratings title key is left out of
    # the selection so the merge cannot produce title_normalized_x / _y columns.
    merged_unique = tasks[~tasks_ambiguous].merge(
        ratings.loc[~ratings_ambiguous, rating_cols + ["task_normalized"]],
        on="task_normalized",
        how="left",
    )

    # Ambiguous tasks: the occupation title is needed to pick the right rating
    merged_duplicate = tasks[tasks_ambiguous].merge(
        ratings.loc[ratings_ambiguous, rating_cols + ["task_normalized", "title_normalized"]],
        on=["task_normalized", "title_normalized"],
        how="left",
    )

    merged = pd.concat([merged_unique, merged_duplicate], ignore_index=True)

    # Replace placeholders with NaN
    placeholder_values = ["#", "*", "", "n/a", "na", "--"]
    merged = merged.replace(placeholder_values, pd.NA)

    # Drop fully empty columns
    merged = merged.dropna(axis=1, how="all")

    # Drop 'occ_code' and 'task_name'
    merged = merged.drop(columns=["occ_code", "task_name"], errors="ignore")

    # Reorder columns: 'task' first, 'task_normalized' second, the rest in existing order
    leading = [col for col in ["task", "task_normalized"] if col in merged.columns]
    remaining = [col for col in merged.columns if col not in leading]
    return merged[leading + remaining]

# Merge the ratings into the task data and write the final table to disk.
task_final = merge_all_and_cleanup(task_emp_wage_df, ratings_df)

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on the very last step.
os.makedirs("../new_data", exist_ok=True)
task_final.to_csv("../new_data/tasks_final.csv", index=False)
#display(task_final.reset_index(drop=True))


## TBA

In [33]:
# Draft (currently disabled): split the tasks into separate DataFrames for core and supplemental tasks,
# then save each DataFrame to a CSV file in ../new_generated_data/.




# # Create DataFrame for Supplemental tasks
# task_soc_pct_suppl_df = task_soc_pct_all[task_soc_pct_all["Task Type"] == "Supplemental"].copy()

# # Create DataFrame for Core tasks
# task_soc_pct_core_df = task_soc_pct_all[task_soc_pct_all["Task Type"] == "Core"].copy()

# # Dictionary of dataframes and their names
# dfs = {
#     "grouped_with_occupations_all": task_soc_pct_all,
#     "gwo_core_df": task_soc_pct_core_df,
#     "gwo_suppl_df": task_soc_pct_suppl_df,
# }

# for name, df in dfs.items():
#     # Normalize weighted percentages
#     df["pct_occ_weighted"] = 100 * df["pct"] / df["pct"].sum()

#     # Normalize percentages
#     df["pct_occ_norm"] = 100 * (df["pct"] / df["n_occurrences"]) / (df["pct"] / df["n_occurrences"]).sum()

#     # Print check
#     print(f"{name} — Raw Sum: {df['pct_occ_weighted'].sum():.2f}, Spread Sum: {df['pct_occ_norm'].sum():.2f}")

#     # Save CSV
#     path = f"../new_generated_data/{name}.csv"
#     if os.path.exists(path):
#         try:
#             os.remove(path)
#         except PermissionError:
#             print(f"⚠️ Close {path} before saving.")
#     df.to_csv(path, index=False)