## Imports and Helpers

In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
import numpy as np
import os
import re
# import spacy
# from spacy.cli import download
# nlp = spacy.load("en_core_web_sm")

In [56]:
#Helper Functions

def normalize_text(text):
    """
    Description:
        Canonicalizes a string so task statements from different sources can be matched on equality:
        lowercases, removes punctuation, collapses whitespace runs to one space, and trims the ends.

    Args:
        text: Value to normalize. Non-string values (e.g. NaN) are returned unchanged.

    Returns:
        The normalized string, or the original value if it is not a string.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r"[^\w\s]", "", text.lower())    # lowercase + remove punctuation
    text = re.sub(r"\s+", " ", text)               # collapse multiple spaces
    # Trim last: removing punctuation can expose new leading/trailing whitespace
    # (e.g. "data ." -> "data "), which would otherwise break equality matching.
    return text.strip()


In [57]:
# Adjust parameters

# Frequency mapping: survey frequency category -> expected occurrences per working day.
# Assumes a 52 week year with 5 working days per week (260 working days).
# 1 Once per year or less (assumed 1 time per year)
# 2 More than once per year (assumed 3 times per year)
# 3 More than once per month (assumed 48 times per year)
# 4 More than once per week (assumed 130 times per year, 2.5 times per week)
# 5 Daily
# 6 Several times per day (assumed 3 times per day)
# 7 Hourly or more often (assumed 12 times per day, 1.5 times per hour)
# NOTE(review): 48 per year is 4 per month; this category has also been described as
# "3 times per month" (36 per year) -- confirm which assumption is intended.
_WORKING_DAYS_PER_YEAR = 52 * 5

frequency_weights = {
    1: 1 / _WORKING_DAYS_PER_YEAR,
    2: 3 / _WORKING_DAYS_PER_YEAR,
    3: 48 / _WORKING_DAYS_PER_YEAR,
    4: 130 / _WORKING_DAYS_PER_YEAR,
    5: 1,
    6: 3,
    7: 12,
}

## Step 1: Map Anthropic Task %s to O*NET v20.1 Task Statements

In [58]:
def pct_to_onet_tasks(pct_df, task_statements_df) -> pd.DataFrame:
    """
    Description:
        Left-joins the Anthropic task percentages onto the O*NET task statements using a
        normalized version of the task text, then derives two rescaled percentage columns:
        `pct_weighted` (each row's pct as a share of the summed pct over all joined rows) and
        `pct_normalized` (same, after dividing pct by the number of distinct titles sharing the task).

        Side effects: `task_statements_df` has its columns renamed in place and gains a
        `task_normalized` column; `pct_df` gains a `task_normalized_temp` column.

    Args:
        pct_df (pd.DataFrame): Anthropic data; must contain `task_name` and `pct`.
        task_statements_df (pd.DataFrame): O*NET task statements with the original O*NET column headers.

    Returns:
        pd.DataFrame: Joined table with `task` and `task_normalized` first, sorted by `soc_code_2010`.
    """
    onet_column_names = {
        "O*NET-SOC Code": "soc_code_2010",
        "Title": "title",
        "Task ID": "task_id",
        "Task": "task",
        "Task Type": "task_type",
        "Incumbents Responding": "n_responding",
        "Date": "date",
        "Domain Source": "domain_source",
    }
    task_statements_df.rename(columns=onet_column_names, inplace=True)

    # Build the join keys on both sides from the normalized task text
    pct_df["task_normalized_temp"] = pct_df["task_name"].apply(normalize_text)
    task_statements_df["task_normalized"] = task_statements_df["task"].apply(normalize_text)

    joined = pd.merge(
        pct_df,
        task_statements_df,
        how="left",
        left_on="task_normalized_temp",
        right_on="task_normalized",
    )

    # Number of distinct occupation titles that share each task statement
    joined["n_occurrences"] = joined.groupby("task_normalized")["title"].transform("nunique")

    raw_pct = joined["pct"]
    pct_per_title = raw_pct / joined["n_occurrences"]
    joined["pct_weighted"] = 100 * raw_pct / raw_pct.sum()
    joined["pct_normalized"] = 100 * pct_per_title / pct_per_title.sum()

    # Remove the helper columns that were only needed for the join
    joined = joined.drop(columns=["task_name", "task_normalized_temp", "pct"])

    # Put `task` and `task_normalized` first, keeping the other columns in their existing order
    leading = ["task", "task_normalized"]
    joined = joined[leading + [name for name in joined.columns if name not in leading]]

    joined = joined.sort_values(by="soc_code_2010", ascending=True)
    return joined.reset_index(drop=True)


# Load the O*NET task statements and the task-percentage mapping, then build the merged task table
task_statements_df = pd.read_csv("../extra_data/task_statements_v20.1.csv")
pct_df = pd.read_csv("../original_data/onet_task_mappings.csv")
pct_onet_tasks_df = pct_to_onet_tasks(pct_df, task_statements_df)


In [59]:
# Optional save to csv and show df for inspection

# pct_onet_tasks_df.to_csv("../merged_data_files/pct_onet_tasks.csv", index=False)
# pct_onet_tasks_df

## Step 2: Add SOC Major Occupational Category

In [60]:
def add_soc_structure(pct_onet_tasks_df, soc_structure_df) -> pd.DataFrame:
    """
    Description:
        Attaches a `major_occ_category` column to every task row by matching the first two
        characters of `soc_code_2010` against the first two characters of the SOC structure's
        `Major Group` code (rows without a `Major Group` value are ignored).

        Side effects: `soc_structure_df` has its title column renamed to `major_occ_category`
        in place, and `pct_onet_tasks_df` gains a `major_group_code` column.

    Args:
        pct_onet_tasks_df (pd.DataFrame): Output of the previous step; must contain `soc_code_2010`.
        soc_structure_df (pd.DataFrame): SOC structure with `Major Group` and
            `SOC or O*NET-SOC 2019 Title` columns.

    Returns:
        pd.DataFrame: Task rows with `major_occ_category` added (NaN where no major group matches).
    """
    soc_structure_df.rename(
        columns={"SOC or O*NET-SOC 2019 Title": "major_occ_category"},
        inplace=True,
    )

    join_key = "major_group_code"
    pct_onet_tasks_df[join_key] = pct_onet_tasks_df["soc_code_2010"].str[:2]

    # Keep only the major-group rows of the SOC structure and derive the same 2-character key
    major_groups = soc_structure_df[soc_structure_df["Major Group"].notna()].copy()
    major_groups[join_key] = major_groups["Major Group"].str[:2]
    category_lookup = major_groups[[join_key, "major_occ_category"]]

    with_category = pd.merge(pct_onet_tasks_df, category_lookup, on=join_key, how="left")

    # The join key is only a helper; remove it from the result
    return with_category.drop(columns=[join_key]).reset_index(drop=True)


# Load the SOC structure file and attach major occupational categories to the task table
soc_structure_df = pd.read_csv("../extra_data/soc_structure_2019.csv")
pct_tasks_soc_structure_df = add_soc_structure(pct_onet_tasks_df, soc_structure_df)


In [61]:
# Optional save to csv and show df for inspection

# pct_tasks_soc_structure_df.to_csv("../merged_data_files/pct_tasks_soc_structure.csv", index=False)
# pct_tasks_soc_structure_df

## Step 3: Add 2024 Wage and Employment Data

### 3.1: Add Updated (2019) SOC Codes

In [62]:
# Get df of updated SOC codes to merge with up to date wage and employment data

def add_updated_soc_code(pct_tasks_soc_structure_df, soc_crosswalk_df) -> pd.DataFrame:
    """
    Description:
        Builds a lookup of the unique occupation titles in the main DataFrame and the 2019 SOC
        code(s) the crosswalk assigns to each title. The 2019 code is the first 7 characters of
        the crosswalk's O*NET-SOC 2019 code. A title appears once per matching crosswalk row, so
        titles that map to several codes are repeated. Neither input is modified.

    Args:
        pct_tasks_soc_structure_df (pd.DataFrame): DataFrame from the previous step; must contain `title`.
        soc_crosswalk_df (pd.DataFrame): Crosswalk with `O*NET-SOC 2010 Title` and `O*NET-SOC 2019 Code`.

    Returns:
        pd.DataFrame: Two columns, `title` and `soc_code_2019` (NaN when a title is not in the crosswalk).
    """
    crosswalk = soc_crosswalk_df.rename(
        columns={
            "O*NET-SOC 2010 Title": "title",
            "O*NET-SOC 2019 Code": "onet_soc_code_2019",
        }
    ).assign(soc_code_2019=lambda frame: frame["onet_soc_code_2019"].str[:7])

    # One row per distinct title found in the rolling DataFrame
    unique_titles = pct_tasks_soc_structure_df.drop_duplicates(subset="title")[["title"]]

    return unique_titles.merge(
        crosswalk[["title", "soc_code_2019"]],
        how="left",
        on="title",
    )

# Load the 2010 -> 2019 SOC crosswalk and build the title -> 2019 SOC code lookup
soc_crosswalk_df = pd.read_csv("../extra_data/2010_to_2019_soc_crosswalk.csv")
title_and_2019_soc_df = add_updated_soc_code(pct_tasks_soc_structure_df, soc_crosswalk_df)


In [63]:
# Optional save to csv and show df for inspection

# title_and_2019_soc_df.to_csv("../merged_data_files/title_and_2019_soc.csv", index=False)
# title_and_2019_soc_df

### 3.2: Add 2024 National Wage Data

In [64]:
def add_nat_wage_2024(title_and_2019_soc_df, nat_wage_df, scraped_wage_df) -> pd.DataFrame:
    """
    Description:
        Adds national hourly and annual median wages to the title / 2019 SOC code lookup.
        Wages are resolved in this order:
          1. exact match on `soc_code_2019`;
          2. the OEWS "broad" group sharing the first 6 characters of the code;
          3. the scraped `MedianSalary` (annual only), multiplied by 1.24;
          4. converting between hourly and annual with 2080 hours (52 weeks * 40 hours).
        Rows sharing a title are then given the mean of that title's values.

        Side effect: `scraped_wage_df` gains a `title` column copied from `JobName`.

    Args:
        title_and_2019_soc_df (pd.DataFrame): DataFrame from the previous step (`title`, `soc_code_2019`).
        nat_wage_df (pd.DataFrame): OEWS national data with `OCC_CODE`, `O_GROUP`, `H_MEDIAN`, `A_MEDIAN`.
        scraped_wage_df (pd.DataFrame): Scraped wage data with `JobName` and `MedianSalary`.

    Returns:
        pd.DataFrame: Input rows plus `5_digit_soc`, `h_median_national` and `a_median_national`.
    """
    wage_cols = ["H_MEDIAN", "A_MEDIAN"]
    broad_key = "5_digit_soc"

    # Trim the OEWS table to the needed columns; non-numeric wage entries become NaN
    oews = nat_wage_df[["OCC_CODE", "O_GROUP"] + wage_cols].rename(columns={"OCC_CODE": "soc_code_2019"})
    for col in wage_cols:
        oews[col] = pd.to_numeric(oews[col], errors="coerce")

    # 1. Exact match on the detailed SOC code
    result = title_and_2019_soc_df.merge(oews, how="left", on="soc_code_2019")

    # 2. Broad-group fallback, keyed on the first 6 characters of the code
    result[broad_key] = result["soc_code_2019"].astype(str).str[:6]
    oews[broad_key] = oews["soc_code_2019"].astype(str).str[:6]
    broad_wages = oews.loc[oews["O_GROUP"] == "broad", [broad_key] + wage_cols]

    lacks_any_wage = result["H_MEDIAN"].isna() | result["A_MEDIAN"].isna()
    # One fallback row per title so the merge back on title cannot multiply rows
    broad_lookup = (
        result[lacks_any_wage]
        .merge(broad_wages, on=broad_key, how="left", suffixes=("", "_fallback"))
        .drop_duplicates(subset="title")
    )
    result = result.merge(
        broad_lookup[["title", "H_MEDIAN_fallback", "A_MEDIAN_fallback"]],
        on="title",
        how="left",
    )
    for col in wage_cols:
        result[col] = result[col].fillna(result[f"{col}_fallback"])

    # 3. Scraped-salary fallback for rows that still have neither wage
    scraped_wage_df["title"] = scraped_wage_df["JobName"]
    lacks_all_wages = result["H_MEDIAN"].isna() & result["A_MEDIAN"].isna()
    scraped_lookup = (
        result[lacks_all_wages]
        .merge(scraped_wage_df[["title", "MedianSalary"]], on="title", how="left")
        .drop_duplicates(subset="title")
    )
    result = result.merge(scraped_lookup[["title", "MedianSalary"]], on="title", how="left")

    # Scale the scraped salary by the inflation factor before using it as the annual median
    inflation_factor = 1.24
    result["A_MEDIAN"] = result["A_MEDIAN"].fillna(result["MedianSalary"] * inflation_factor)

    # 4. Derive whichever of annual / hourly is still missing from the other (2080 hours per year)
    result["A_MEDIAN"] = result["A_MEDIAN"].fillna(result["H_MEDIAN"] * 2080)
    result["H_MEDIAN"] = result["H_MEDIAN"].fillna(result["A_MEDIAN"] / 2080)

    # Average across rows that share a title (titles split over several SOC codes)
    for source_col, final_col in (("H_MEDIAN", "h_median_national"), ("A_MEDIAN", "a_median_national")):
        result[final_col] = result.groupby("title")[source_col].transform("mean")

    helper_cols = ["H_MEDIAN", "A_MEDIAN", "H_MEDIAN_fallback", "A_MEDIAN_fallback", "MedianSalary", "O_GROUP"]
    return result.drop(columns=helper_cols).reset_index(drop=True)


# Load the national OEWS 2024 file and the scraped wage file, then attach national wages to the title lookup
nat_wage_2024_df = pd.read_csv("../extra_data/oews_national_2024.csv")
scraped_wage_df = pd.read_csv("../extra_data/scraped_wage_data.csv")
titles_and_nat_wage_2024_df = add_nat_wage_2024(title_and_2019_soc_df, nat_wage_2024_df, scraped_wage_df)


In [65]:
# Optional save to csv and show df for inspection

# titles_and_nat_wage_2024_df.to_csv("../merged_data_files/titles_and_nat_wage_2024.csv", index=False)
# titles_and_nat_wage_2024_df

### 3.3: Add 2024 State Wage Data

In [66]:
def add_state_wage_2024(titles_and_nat_wage_df, state_wage_df) -> pd.DataFrame:
    """
    Description:
        Adds Utah hourly and annual median wages to the title lookup. Utah rows are selected with
        `AREA_TITLE == "Utah"` and matched on `soc_code_2019`. A missing annual or hourly value is
        first derived from the other (2080 hours per year); anything still missing falls back to
        the national medians. Rows sharing a title receive the mean of that title's values.

    Args:
        titles_and_nat_wage_df (pd.DataFrame): DataFrame from the previous step; must contain
            `title`, `soc_code_2019`, `h_median_national` and `a_median_national`.
        state_wage_df (pd.DataFrame): State-level OEWS data with `OCC_CODE`, `H_MEDIAN`,
            `A_MEDIAN` and `AREA_TITLE`.

    Returns:
        pd.DataFrame: Input rows plus `h_median_utah` and `a_median_utah`.
    """
    state_cols = ("h_median_state", "a_median_state")

    # Utah rows only, with the wage columns renamed so they do not clash with national ones
    in_utah = state_wage_df["AREA_TITLE"] == "Utah"
    utah_wages = state_wage_df.loc[in_utah, ["OCC_CODE", "H_MEDIAN", "A_MEDIAN", "AREA_TITLE"]].rename(
        columns={
            "OCC_CODE": "soc_code_2019",
            "H_MEDIAN": "h_median_state",
            "A_MEDIAN": "a_median_state",
        }
    )
    # Non-numeric wage entries become NaN
    for col in state_cols:
        utah_wages[col] = pd.to_numeric(utah_wages[col], errors="coerce")

    result = titles_and_nat_wage_df.merge(utah_wages, how="left", on="soc_code_2019")

    # Derive annual from hourly (and then hourly from annual) using 2080 hours = 52 weeks * 40 hours
    annual = result["a_median_state"].fillna(result["h_median_state"] * 2080)
    hourly = result["h_median_state"].fillna(annual / 2080)

    # Whatever is still missing falls back to the national figures
    result["a_median_state"] = annual.fillna(result["a_median_national"])
    result["h_median_state"] = hourly.fillna(result["h_median_national"])

    # Average across rows that share a title
    for source_col, final_col in (("h_median_state", "h_median_utah"), ("a_median_state", "a_median_utah")):
        result[final_col] = result.groupby("title")[source_col].transform("mean")

    return result.drop(columns=["h_median_state", "a_median_state", "AREA_TITLE"])


# Load the state-level OEWS 2024 file and attach Utah wages to the title lookup
state_wage_df_2024 = pd.read_csv("../extra_data/oews_states_2024.csv")
titles_nat_and_state_wage_2024_df = add_state_wage_2024(titles_and_nat_wage_2024_df, state_wage_df_2024)

In [67]:
# Optional save to csv and show df for inspection

# titles_nat_and_state_wage_2024_df.to_csv("../merged_data_files/titles_nat_and_state_wage_2024.csv", index=False)
# titles_nat_and_state_wage_2024_df

### 3.4: Add 2024 National Employment Data

In [68]:
def add_nat_emp_2024(titles_nat_and_state_wage_df, nat_emp_df) -> pd.DataFrame:
    """
    Description:
        Adds national total employment to the title lookup. Employment is matched on
        `soc_code_2019`; rows without a value fall back to the OEWS "broad" group sharing the
        same `5_digit_soc`. Each row's employment is then divided by the number of non-null
        SOC codes its title has and summed per title (i.e. averaged across a title's codes
        when all of them are present).

        Note: the per-title sum uses pandas' default behaviour, so a title whose rows all lack
        employment data ends up with 0 rather than NaN.

    Args:
        titles_nat_and_state_wage_df (pd.DataFrame): DataFrame from the previous step; must
            contain `title`, `soc_code_2019` and `5_digit_soc`.
        nat_emp_df (pd.DataFrame): OEWS national data with `OCC_CODE`, `TOT_EMP` and `O_GROUP`.

    Returns:
        pd.DataFrame: Input rows plus `emp_total_national`.
    """
    # Trim the OEWS table to the needed columns; non-numeric employment entries become NaN
    oews_emp = nat_emp_df[["OCC_CODE", "TOT_EMP", "O_GROUP"]].rename(columns={"OCC_CODE": "soc_code_2019"})
    oews_emp["TOT_EMP"] = pd.to_numeric(oews_emp["TOT_EMP"], errors="coerce")

    # Exact match on the detailed SOC code
    result = titles_nat_and_state_wage_df.merge(oews_emp, how="left", on="soc_code_2019")

    # Broad-group fallback, keyed on the first 6 characters of the code
    oews_emp["5_digit_soc"] = oews_emp["soc_code_2019"].astype(str).str[:6]
    broad_emp = oews_emp.loc[oews_emp["O_GROUP"] == "broad", ["5_digit_soc", "TOT_EMP"]]

    # One fallback row per title so the merge back on title cannot multiply rows
    broad_lookup = (
        result[result["TOT_EMP"].isna()]
        .merge(broad_emp, on="5_digit_soc", how="left", suffixes=("", "_fallback"))
        .drop_duplicates(subset="title")
    )
    result = result.merge(broad_lookup[["title", "TOT_EMP_fallback"]], on="title", how="left")
    result["TOT_EMP"] = result["TOT_EMP"].fillna(result["TOT_EMP_fallback"])

    # Split each row's employment by the title's code count, then total it per title
    codes_per_title = result.groupby("title")["soc_code_2019"].transform("count")
    result["TOT_EMP_adj"] = result["TOT_EMP"] / codes_per_title
    result["emp_total_national"] = result.groupby("title")["TOT_EMP_adj"].transform("sum")

    helper_cols = ["TOT_EMP_fallback", "TOT_EMP", "O_GROUP", "TOT_EMP_adj"]
    return result.drop(columns=helper_cols).reset_index(drop=True)


# Re-read the national OEWS 2024 file (same path as the national wage step) and attach national employment
nat_emp_df_2024 = pd.read_csv("../extra_data/oews_national_2024.csv")
titles_wage_nat_emp_2024_df = add_nat_emp_2024(titles_nat_and_state_wage_2024_df, nat_emp_df_2024)

In [69]:
# Optional save to csv and show df for inspection

# titles_wage_nat_emp_2024_df.to_csv("../merged_data_files/titles_wage_nat_emp_2024.csv", index=False)
# titles_wage_nat_emp_2024_df

### 3.5: Add 2024 State Employment Data

In [70]:
def add_state_emp_2024(titles_wage_nat_emp_df, state_emp_df) -> pd.DataFrame:
    """
    Description:
        Adds Utah total employment to the title lookup. Utah rows (`AREA_TITLE == "Utah"`) are
        matched on `soc_code_2019`. Rows without a Utah value are estimated as
        `emp_total_national` times Utah's share of employment, rounded, where the share is the
        Utah "00-0000" total divided by the sum of the "00-0000" totals over all areas in the file.
        Each row's value is then divided by the title's count of non-null SOC codes and summed per title.

        Side effect: `state_emp_df["TOT_EMP"]` is converted to numeric in place.

    Args:
        titles_wage_nat_emp_df (pd.DataFrame): DataFrame from the previous step; must contain
            `title`, `soc_code_2019` and `emp_total_national`.
        state_emp_df (pd.DataFrame): State-level OEWS data with `OCC_CODE`, `TOT_EMP` and `AREA_TITLE`.

    Returns:
        pd.DataFrame: Input rows plus `emp_total_utah`.
    """
    # Non-numeric employment entries become NaN (done on the caller's DataFrame)
    state_emp_df["TOT_EMP"] = pd.to_numeric(state_emp_df["TOT_EMP"], errors="coerce")

    in_utah = state_emp_df["AREA_TITLE"] == "Utah"
    all_occupations = state_emp_df["OCC_CODE"] == "00-0000"

    utah_emp = state_emp_df.loc[in_utah, ["OCC_CODE", "TOT_EMP", "AREA_TITLE"]].rename(
        columns={"OCC_CODE": "soc_code_2019"}
    )

    # Exact match on the detailed SOC code
    result = titles_wage_nat_emp_df.merge(utah_emp, how="left", on="soc_code_2019")

    # Utah's share of the all-occupations ("00-0000") employment across every area in the file
    total_nat_emp = state_emp_df.loc[all_occupations, "TOT_EMP"].sum()
    total_utah_emp = state_emp_df.loc[all_occupations & in_utah, "TOT_EMP"].iloc[0]
    utah_share = float(total_utah_emp) / float(total_nat_emp)

    # Estimate missing Utah employment from the national figure
    estimated = (result["emp_total_national"] * utah_share).round()
    result["TOT_EMP"] = result["TOT_EMP"].fillna(estimated)

    # Split each row's employment by the title's code count, then total it per title
    codes_per_title = result.groupby("title")["soc_code_2019"].transform("count")
    result["TOT_EMP_adj"] = result["TOT_EMP"] / codes_per_title
    result["emp_total_utah"] = result.groupby("title")["TOT_EMP_adj"].transform("sum")

    return result.drop(columns=["TOT_EMP", "AREA_TITLE", "TOT_EMP_adj"]).reset_index(drop=True)


# Re-read the state-level OEWS 2024 file (same path as the state wage step) and attach Utah employment
state_emp_2024_df = pd.read_csv("../extra_data/oews_states_2024.csv")
titles_wage_all_emp_2024_df = add_state_emp_2024(titles_wage_nat_emp_2024_df, state_emp_2024_df)

In [71]:
# Optional save to csv and show df for inspection

# titles_wage_all_emp_2024_df.to_csv("../merged_data_files/titles_wage_all_emp_2024.csv", index=False)
# titles_wage_all_emp_2024_df

### 3.6: Merge 2024 Wage and Employment Data Into Task Data

In [72]:
def wage_emp_to_tasks_2024(titles_wage_all_emp_df, pct_tasks_soc_structure_df) -> pd.DataFrame:
    """
    Description:
        Joins the per-title 2024 wage and employment figures onto the task table. The wage /
        employment lookup is reduced to one row per title (first occurrence), its SOC helper
        columns are removed, and the joined wage / employment columns are renamed with a
        `_2024` suffix. Neither input is modified.

    Args:
        titles_wage_all_emp_df (pd.DataFrame): DataFrame from the previous step.
        pct_tasks_soc_structure_df (pd.DataFrame): Task table from step 2; must contain `title`.

    Returns:
        pd.DataFrame: Task rows with the 2024 wage and employment columns added.
    """
    # One row per title, without the SOC helper columns used only for matching
    per_title = titles_wage_all_emp_df.drop_duplicates(subset="title").drop(
        columns=["5_digit_soc", "soc_code_2019"]
    )

    final_names = {
        "h_median_national": "h_med_nat_2024",
        "a_median_national": "a_med_nat_2024",
        "h_median_utah": "h_med_ut_2024",
        "a_median_utah": "a_med_ut_2024",
        "emp_total_national": "emp_tot_nat_2024",
        "emp_total_utah": "emp_tot_ut_2024",
    }

    with_wages = pct_tasks_soc_structure_df.merge(per_title, how="left", on="title")
    return with_wages.rename(columns=final_names)
    

# Join the per-title 2024 wage and employment figures onto the task table from step 2
task_wage_emp_2024_df = wage_emp_to_tasks_2024(titles_wage_all_emp_2024_df, pct_tasks_soc_structure_df)

In [73]:
# Optional save to csv and show df for inspection

# task_wage_emp_2024_df.to_csv("../merged_data_files/task_wage_emp_2024.csv", index=False)
# task_wage_emp_2024_df

## Step 4: Add 2015 Wage and Employment Data

### 4.1: Add 2015 National Wage Data

In [74]:
def add_nat_wage_2015(pct_tasks_soc_structure_df, nat_wage_df) -> pd.DataFrame:
    """
    Description:
        Builds a one-row-per-title lookup of titles and 2010 SOC codes (first 7 characters of
        `soc_code_2010`) and attaches 2015 national hourly and annual median wages.
        Wages are resolved in this order:
          1. exact match on the SOC code, deriving annual from hourly or hourly from annual
             with 2080 hours (52 weeks * 40 hours) when only one is present;
          2. the OEWS "broad" group sharing the first 6 characters of the code;
          3. the matched row's mean wages (`H_MEAN` / `A_MEAN`).
        Nominal values are also multiplied by 1.36 to produce the `*_real` columns.

    Args:
        pct_tasks_soc_structure_df (pd.DataFrame): DataFrame from Step 2 (`title`, `soc_code_2010`).
        nat_wage_df (pd.DataFrame): OEWS 2015 national data with `OCC_CODE`, `OCC_GROUP`,
            `H_MEDIAN`, `A_MEDIAN`, `H_MEAN` and `A_MEAN`.

    Returns:
        pd.DataFrame: One row per title with national wage data from 2015 added
        (`h_med_nat_nominal`, `a_med_nat_nominal`, `5_digit_soc`, `h_med_nat_real`, `a_med_nat_real`).
    """
    numeric_cols = ["H_MEDIAN", "A_MEDIAN", "H_MEAN", "A_MEAN"]
    broad_key = "5_digit_soc"

    # One row per title, keyed by the 7-character SOC code
    title_codes = (
        pct_tasks_soc_structure_df[["title", "soc_code_2010"]]
        .drop_duplicates(subset="title")
        .reset_index(drop=True)
    )
    title_codes["soc_code_2010"] = title_codes["soc_code_2010"].str[:7]

    # Trim the OEWS table to the needed columns; non-numeric wage entries become NaN
    oews = nat_wage_df[["OCC_CODE", "OCC_GROUP"] + numeric_cols].rename(columns={"OCC_CODE": "soc_code_2010"})
    for col in numeric_cols:
        oews[col] = pd.to_numeric(oews[col], errors="coerce")

    # 1. Exact match, then derive a missing annual / hourly value from the other (2080 hours per year)
    result = title_codes.merge(oews, how="left", on="soc_code_2010")
    result["A_MEDIAN"] = result["A_MEDIAN"].fillna(result["H_MEDIAN"] * 2080)
    result["H_MEDIAN"] = result["H_MEDIAN"].fillna(result["A_MEDIAN"] / 2080)

    # 2. Broad-group fallback, keyed on the first 6 characters of the code
    result[broad_key] = result["soc_code_2010"].astype(str).str[:6]
    oews[broad_key] = oews["soc_code_2010"].astype(str).str[:6]
    broad_wages = oews.loc[oews["OCC_GROUP"] == "broad", [broad_key, "H_MEDIAN", "A_MEDIAN"]]

    lacks_any_wage = result["H_MEDIAN"].isna() | result["A_MEDIAN"].isna()
    # One fallback row per title so the merge back on title cannot multiply rows
    broad_lookup = (
        result[lacks_any_wage]
        .merge(broad_wages, on=broad_key, how="left", suffixes=("", "_fallback"))
        .drop_duplicates(subset="title")
    )
    result = result.merge(
        broad_lookup[["title", "H_MEDIAN_fallback", "A_MEDIAN_fallback"]],
        on="title",
        how="left",
    )

    # Apply the broad-group fallback first, then 3. the mean wage as a last resort
    for median_col, mean_col in (("H_MEDIAN", "H_MEAN"), ("A_MEDIAN", "A_MEAN")):
        filled = result[median_col].fillna(result[f"{median_col}_fallback"])
        result[median_col] = filled.fillna(result[mean_col])

    # Final column names; remove the helper columns
    result = result.rename(
        columns={"H_MEDIAN": "h_med_nat_nominal", "A_MEDIAN": "a_med_nat_nominal"}
    ).drop(columns=["H_MEDIAN_fallback", "A_MEDIAN_fallback", "H_MEAN", "A_MEAN", "OCC_GROUP"])

    # Present-value columns: nominal wages scaled by the inflation factor
    inflation_factor = 1.36
    result["h_med_nat_real"] = result["h_med_nat_nominal"] * inflation_factor
    result["a_med_nat_real"] = result["a_med_nat_nominal"] * inflation_factor

    return result.reset_index(drop=True)


# Load the national OEWS 2015 file and build the per-title 2015 national wage lookup
nat_wage_df_2015 = pd.read_csv("../extra_data/oews_national_2015.csv")
titles_and_nat_wage_2015_df = add_nat_wage_2015(pct_tasks_soc_structure_df, nat_wage_df_2015)

In [75]:
# Optional save to csv and show df for inspection

# titles_and_nat_wage_2015_df.to_csv("../merged_data_files/titles_and_nat_wage_2015.csv", index=False)
# titles_and_nat_wage_2015_df

### 4.2: Add 2015 State Wage Data

In [76]:
def add_state_wage_2015(titles_and_nat_wage_df, state_wage_df) -> pd.DataFrame:
    """
    Description:
        Adds 2015 Utah hourly and annual median wages to the title lookup. Utah rows are selected
        with `ST == "UT"` and matched on `soc_code_2010`. A missing annual or hourly value is first
        derived from the other (2080 hours per year); anything still missing falls back to the
        national nominal medians. Nominal values are also multiplied by 1.36 to produce the
        `*_real` columns.

    Args:
        titles_and_nat_wage_df (pd.DataFrame): DataFrame from the previous step; must contain
            `soc_code_2010`, `h_med_nat_nominal` and `a_med_nat_nominal`.
        state_wage_df (pd.DataFrame): State-level OEWS 2015 data with `OCC_CODE`, `H_MEDIAN`,
            `A_MEDIAN` and `ST`.

    Returns:
        pd.DataFrame: Input rows plus `h_med_utah_nominal`, `a_med_utah_nominal`,
        `h_med_utah_real` and `a_med_utah_real`.
    """
    # Utah rows only, with the wage columns renamed so they do not clash with national ones
    in_utah = state_wage_df["ST"] == "UT"
    utah_wages = state_wage_df.loc[in_utah, ["OCC_CODE", "H_MEDIAN", "A_MEDIAN", "ST"]].rename(
        columns={
            "OCC_CODE": "soc_code_2010",
            "H_MEDIAN": "h_median_state",
            "A_MEDIAN": "a_median_state",
        }
    )
    # Non-numeric wage entries become NaN
    for col in ("h_median_state", "a_median_state"):
        utah_wages[col] = pd.to_numeric(utah_wages[col], errors="coerce")

    result = titles_and_nat_wage_df.merge(utah_wages, how="left", on="soc_code_2010")

    # Derive annual from hourly (and then hourly from annual) using 2080 hours = 52 weeks * 40 hours
    annual = result["a_median_state"].fillna(result["h_median_state"] * 2080)
    hourly = result["h_median_state"].fillna(annual / 2080)

    # Whatever is still missing falls back to the national nominal figures
    result["a_median_state"] = annual.fillna(result["a_med_nat_nominal"])
    result["h_median_state"] = hourly.fillna(result["h_med_nat_nominal"])

    # Final column names; remove the state marker column
    result = result.rename(
        columns={"h_median_state": "h_med_utah_nominal", "a_median_state": "a_med_utah_nominal"}
    ).drop(columns=["ST"])

    # Present-value columns: nominal wages scaled by the inflation factor
    inflation_factor = 1.36
    result["h_med_utah_real"] = result["h_med_utah_nominal"] * inflation_factor
    result["a_med_utah_real"] = result["a_med_utah_nominal"] * inflation_factor

    return result.reset_index(drop=True)


# Load the state-level OEWS 2015 file and attach Utah wages to the 2015 title lookup
state_wage_df_2015 = pd.read_csv("../extra_data/oews_states_2015.csv")
titles_nat_and_state_wage_2015_df = add_state_wage_2015(titles_and_nat_wage_2015_df, state_wage_df_2015)

In [77]:
# Optional save to csv and show df for inspection

# titles_nat_and_state_wage_2015_df.to_csv("../merged_data_files/titles_nat_and_state_wage_2015.csv", index=False)
# titles_nat_and_state_wage_2015_df

### 4.3: Add 2015 National Employment Data

In [78]:
def add_nat_emp_2015(titles_nat_and_state_wage_df, nat_emp_df) -> pd.DataFrame:
    """
    Description:
        Adds 2015 national total employment to the title lookup. Employment is matched on
        `soc_code_2010`; rows without a value fall back to the OEWS "broad" group sharing the
        same `5_digit_soc`.

    Args:
        titles_nat_and_state_wage_df (pd.DataFrame): DataFrame from the previous step; must
            contain `title`, `soc_code_2010` and `5_digit_soc`.
        nat_emp_df (pd.DataFrame): OEWS 2015 national data with `OCC_CODE`, `TOT_EMP` and `OCC_GROUP`.

    Returns:
        pd.DataFrame: Input rows plus `emp_tot_nat` (NaN where neither match succeeds).
    """
    # Trim the OEWS table to the needed columns; non-numeric employment entries become NaN
    oews_emp = nat_emp_df[["OCC_CODE", "TOT_EMP", "OCC_GROUP"]].rename(columns={"OCC_CODE": "soc_code_2010"})
    oews_emp["TOT_EMP"] = pd.to_numeric(oews_emp["TOT_EMP"], errors="coerce")

    # Exact match on the detailed SOC code
    result = titles_nat_and_state_wage_df.merge(oews_emp, how="left", on="soc_code_2010")

    # Broad-group fallback, keyed on the first 6 characters of the code
    oews_emp["5_digit_soc"] = oews_emp["soc_code_2010"].astype(str).str[:6]
    broad_emp = oews_emp.loc[oews_emp["OCC_GROUP"] == "broad", ["5_digit_soc", "TOT_EMP"]]

    # One fallback row per title so the merge back on title cannot multiply rows
    broad_lookup = (
        result[result["TOT_EMP"].isna()]
        .merge(broad_emp, on="5_digit_soc", how="left", suffixes=("", "_fallback"))
        .drop_duplicates(subset="title")
    )
    result = result.merge(broad_lookup[["title", "TOT_EMP_fallback"]], on="title", how="left")
    result["TOT_EMP"] = result["TOT_EMP"].fillna(result["TOT_EMP_fallback"])

    # Final column name; remove the helper columns
    result = result.rename(columns={"TOT_EMP": "emp_tot_nat"}).drop(columns=["TOT_EMP_fallback", "OCC_GROUP"])
    return result.reset_index(drop=True)


# Re-read the national OEWS 2015 file (same path as the national wage step) and attach national employment
nat_emp_df_2015 = pd.read_csv("../extra_data/oews_national_2015.csv")
titles_wage_nat_emp_2015_df = add_nat_emp_2015(titles_nat_and_state_wage_2015_df, nat_emp_df_2015)

In [79]:
# Optional save to csv and show df for inspection

# titles_wage_nat_emp_2015_df.to_csv("../merged_data_files/titles_wage_nat_emp_2015.csv", index=False)
# titles_wage_nat_emp_2015_df

### 4.4: Add 2015 State Employment Data

In [80]:
def add_state_emp_2015(titles_wage_nat_emp_df, state_emp_df) -> pd.DataFrame:
    """
    Returns DataFrame with occupation titles along with their Utah employment data from 2015.

    Utah employment is taken from the state OEWS rows where ``ST == "UT"``, matched on the
    SOC code. Occupations with no Utah figure are estimated as national employment
    (``emp_tot_nat``) multiplied by Utah's share of total employment, rounded to a whole number.

    Args:
        titles_wage_nat_emp_df (pd.DataFrame): DataFrame from previous step. Must contain
            ``soc_code_2010`` and ``emp_tot_nat``.
        state_emp_df (pd.DataFrame): DataFrame of OEWS state data from 2015 with columns
            ``OCC_CODE``, ``TOT_EMP`` and ``ST``. NOTE: its ``TOT_EMP`` column is converted
            to numeric in place (non-numeric placeholders become NaN).

    Returns:
        pd.DataFrame: DataFrame with state employment data from 2015 added as ``emp_tot_utah``.

    Raises:
        IndexError: If ``state_emp_df`` has no all-occupations ("00-0000") row for Utah.
    """

    # Change emp columns to floats
    state_emp_df["TOT_EMP"] = pd.to_numeric(state_emp_df["TOT_EMP"], errors="coerce")

    # Get only the Utah rows and the columns needed
    emp_df_trimmed = state_emp_df.loc[
        state_emp_df["ST"] == "UT", ["OCC_CODE", "TOT_EMP", "ST"]
    ].rename(columns={"OCC_CODE": "soc_code_2010"})

    # Initial merge on detailed SOC codes.
    # Use the DataFrame passed in as an argument rather than a notebook-level global,
    # so the function works on whatever table the caller supplies.
    merged = titles_wage_nat_emp_df.merge(
        emp_df_trimmed,
        on="soc_code_2010",
        how="left"
    )

    # Fill remaining missing values with national data by multiplying by the proportion of state employment to national employment.
    # The "national" total here is the sum of every state's all-occupations ("00-0000") row.
    is_all_occupations = state_emp_df["OCC_CODE"] == "00-0000"
    total_nat_emp = state_emp_df.loc[is_all_occupations, "TOT_EMP"].sum()
    total_utah_emp = state_emp_df.loc[is_all_occupations & (state_emp_df["ST"] == "UT"), "TOT_EMP"].iloc[0]
    utah_share = float(total_utah_emp) / float(total_nat_emp)
    merged.loc[merged["TOT_EMP"].isna(), "TOT_EMP"] = (merged["emp_tot_nat"] * utah_share).round()

    # Rename and drop columns for cleanup
    merged = merged.rename(columns={"TOT_EMP": "emp_tot_utah"}).drop(columns=["ST"])

    return merged.reset_index(drop=True)


# Load the 2015 state-level OEWS extract and add Utah employment to the
# table that already carries 2015 wages and national employment.
state_emp_2015_df = pd.read_csv("../extra_data/oews_states_2015.csv")
titles_wage_all_emp_2015_df = add_state_emp_2015(titles_wage_nat_emp_2015_df, state_emp_2015_df)

In [81]:
# Optional save to csv and show df for inspection

# titles_wage_all_emp_2015_df.to_csv("../merged_data_files/titles_wage_all_emp_2015.csv", index=False)
# titles_wage_all_emp_2015_df

### 4.5: Merge 2015 Wage and Employment Data Into Task Data

In [82]:
def wage_emp_to_tasks_2015(titles_wage_all_emp_df, pct_tasks_soc_structure_df) -> pd.DataFrame:
    """
    Attaches the 2015 wage and employment figures to the task-level data.

    The title-level table is reduced to one row per title, stripped of its SOC code
    columns, left-joined onto the task table by ``title``, and its wage/employment
    columns are given a ``_2015`` suffix (with ``utah`` shortened to ``ut``).

    Args:
        titles_wage_all_emp_df (pd.DataFrame): Title-level 2015 wage and employment table
            from the previous step. It is not modified.
        pct_tasks_soc_structure_df (pd.DataFrame): Task-level table to extend.

    Returns:
        pd.DataFrame: Task-level table with the 2015 wage and employment columns added.
    """

    # Final names for the 2015 columns
    names_2015 = {
        "h_med_nat_nominal": "h_med_nat_nominal_2015",
        "a_med_nat_nominal": "a_med_nat_nominal_2015",
        "h_med_nat_real": "h_med_nat_real_2015",
        "a_med_nat_real": "a_med_nat_real_2015",
        "h_med_utah_nominal": "h_med_ut_nominal_2015",
        "a_med_utah_nominal": "a_med_ut_nominal_2015",
        "h_med_utah_real": "h_med_ut_real_2015",
        "a_med_utah_real": "a_med_ut_real_2015",
        "emp_tot_nat": "emp_tot_nat_2015",
        "emp_tot_utah": "emp_tot_ut_2015",
    }

    # One row per title, without the SOC code columns (the join key is the title)
    one_row_per_title = (
        titles_wage_all_emp_df
        .drop_duplicates(subset="title")
        .drop(columns=["soc_code_2010", "5_digit_soc"])
    )

    tasks_with_2015 = pct_tasks_soc_structure_df.merge(one_row_per_title, on="title", how="left")

    return tasks_with_2015.rename(columns=names_2015)
    

# Join the 2015 title-level wage/employment figures onto the task-level table
# `task_wage_emp_2024_df` built earlier in the notebook.
tasks_all_wage_emp_df = wage_emp_to_tasks_2015(titles_wage_all_emp_2015_df, task_wage_emp_2024_df)

In [83]:
# Optional save to csv and show df for inspection

# tasks_all_wage_emp_df.to_csv("../merged_data_files/tasks_all_wage_emp.csv", index=False)
# tasks_all_wage_emp_df

## Step 5: Adjust Employment Columns

In [84]:
def adjust_emp(tasks_all_wage_emp_df) -> pd.DataFrame:
    """
    Reallocates employment numbers based on the relative percent of Claude conversations, as we have some duplicate
    6 digit SOC codes but different titles.

    Each title receives a share of its 6-digit SOC group's employment equal to the title's
    summed ``pct_normalized`` divided by the group's summed ``pct_normalized``. The group's
    employment is taken as the maximum value of the column within the group. Percent-of-workforce
    columns (``emp_pct_*``) are then derived from the reallocated totals, counting each title once.

    The input DataFrame is not modified; the work is done on a copy so that re-running
    the cell does not apply the share allocation a second time to already-adjusted numbers.

    Args:
        tasks_all_wage_emp_df (pd.DataFrame): DataFrame from Step 4.5. Must contain
            ``soc_code_2010``, ``title`` and ``pct_normalized``; any of the
            ``emp_tot_{nat,ut}_{2024,2015}`` columns that are present are adjusted.

    Returns:
        pd.DataFrame: New DataFrame with reallocated employment numbers and ``emp_pct_*`` columns.
    """

    df = tasks_all_wage_emp_df.copy()

    # 6-digit SOC to remove decimals (e.g., '11-1011.03' -> '11-1011')
    df["soc6"] = df["soc_code_2010"].astype(str).str[:7]

    # share of each title within its 6-digit SOC based on pct_normalized
    # (NaN/inf if a group's pct_normalized sums to zero)
    title_pct_sum = df.groupby(["soc6", "title"])["pct_normalized"].transform("sum")
    soc6_pct_sum = df.groupby("soc6")["pct_normalized"].transform("sum")
    df["soc6_share"] = title_pct_sum / soc6_pct_sum

    # Employment total column -> percent-of-workforce column derived from it
    pct_map = {
        "emp_tot_nat_2024": "emp_pct_nat_2024",
        "emp_tot_ut_2024": "emp_pct_ut_2024",
        "emp_tot_nat_2015": "emp_pct_nat_2015",
        "emp_tot_ut_2015": "emp_pct_ut_2015",
    }

    # columns to allocate (only those that exist will be processed)
    emp_cols = [c for c in pct_map if c in df.columns]

    # Calculate the correct employment numbers by multiplying each by their share in the 6 digit SOC group
    for c in emp_cols:
        soc6_tot = df.groupby("soc6")[c].transform("max")
        df[c] = (soc6_tot * df["soc6_share"]).round()

    # Create percent-of-workforce columns from the reallocated totals
    for tot_col in emp_cols:
        # Count each title once when computing the workforce total
        total_sum = df[["title", tot_col]].drop_duplicates("title")[tot_col].sum()
        df[pct_map[tot_col]] = (df.groupby("title")[tot_col].transform("first") / total_sum) * 100

    return df.drop(columns=["soc6", "soc6_share"])


# Apply the Step 5 employment reallocation to the merged task/wage/employment table.
tasks_wage_emp_final_df = adjust_emp(tasks_all_wage_emp_df)


In [85]:
# Optional save to csv and show df for inspection

# tasks_wage_emp_final_df.to_csv("../merged_data_files/tasks_wage_emp_final.csv", index=False)
# tasks_wage_emp_final_df

## Step 6: Add Task Rating Data

### 6.1: Bring In 2025 and 2015 Task Rating Data

In [135]:
# Frequency mapping. Assuming a 52-week year with 5 working days per week (260 working days), each survey response category is converted to occurrences per working day:
# 1 Once per year or less (Assuming 1 time per year)
# 2 More than once per year (Assuming 3 times per year)
# 3 More than once per month (Assuming 48 times per year, 3 times per month)
# 4 More than once per week (Assuming 130 times per year, 2.5 times per week)
# 5 Daily
# 6 Several times per day (Assuming 3 times per day)
# 7 Hourly or more often (Assuming 12 times per day, 1.5 times per hour)
# frequency_weights = {
#     1: 1 / 260,
#     2: 3 / 260,
#     3: 48 / 260,
#     4: 130 / 260,
#     5: 1,
#     6: 3,
#     7: 12
# }


def _extract_scale_ratings(task_ratings_df, scale_id, prefix) -> pd.DataFrame:
    """Return the rows of one rating scale with the value/CI columns renamed to `prefix`, `prefix`_lower, `prefix`_upper."""
    scale_df = task_ratings_df.loc[
        task_ratings_df["Scale ID"] == scale_id,
        ["O*NET-SOC Code", "Title", "Task ID", "Task",
         "Data Value", "Lower CI Bound", "Upper CI Bound"],
    ]
    return scale_df.rename(columns={
        "Data Value": prefix,
        "Lower CI Bound": f"{prefix}_lower",
        "Upper CI Bound": f"{prefix}_upper"
    })


def add_task_ratings(task_ratings_df, weights=None) -> pd.DataFrame:
    """
    Description:
        Takes frequency, relevance, and importance from May 2025 and Oct 2015 task ratings data from O*NET.
        Uses frequency mapping weights to get a single number for frequency: for every task, each
        frequency category's value (a percentage) is multiplied by the category's weight, divided
        by 100, and summed over the categories. The same is done for the CI bounds.

        Only tasks that have at least one usable frequency ("FT") row appear in the result;
        importance ("IM") and relevance ("RT") values are left-joined onto them.

    Args:
        task_ratings_df (pd.DataFrame): DataFrame with the O*NET Task Rating data from 2025 or 2015.
            It is not modified.
        weights (dict, optional): Mapping of integer frequency category -> weight. Defaults to the
            notebook-level ``frequency_weights``. Categories missing from the mapping contribute nothing.

    Returns:
        pd.DataFrame: One row per (O*NET-SOC Code, Title, Task ID, Task) with ``freq_*``,
        ``importance*`` and ``relevance*`` columns plus ``task_normalized`` (lowercased, trimmed task text).
    """

    if weights is None:
        weights = frequency_weights

    key_cols = ["O*NET-SOC Code", "Title", "Task ID", "Task"]

    # Get freq rows, drop unusable ones, generate freq aggregates
    freq_df = task_ratings_df[task_ratings_df["Scale ID"] == "FT"].copy()

    # Drop rows without category or invalid categories.
    # Converting through to_numeric first means float-like strings such as "1.0" are accepted.
    freq_df = freq_df.assign(Category=pd.to_numeric(freq_df["Category"], errors="coerce"))
    freq_df = freq_df.dropna(subset=["Category"]).copy()
    freq_df["Category"] = freq_df["Category"].astype(int)

    # Apply weights (look the weight up once per row and reuse it for all three columns)
    row_weight = freq_df["Category"].map(weights)
    freq_df["freq_mean"] = freq_df["Data Value"] * row_weight / 100
    freq_df["freq_lower"] = freq_df["Lower CI Bound"] * row_weight / 100
    freq_df["freq_upper"] = freq_df["Upper CI Bound"] * row_weight / 100

    # Sum across categories to get per-task total
    freq_agg = freq_df.groupby(key_cols).agg({
        "freq_mean": "sum",
        "freq_lower": "sum",
        "freq_upper": "sum"
    }).reset_index()

    # Get importance and relevance ratings
    importance_df = _extract_scale_ratings(task_ratings_df, "IM", "importance")
    relevance_df = _extract_scale_ratings(task_ratings_df, "RT", "relevance")

    # Merge ratings
    merged_ratings = freq_agg.merge(importance_df, on=key_cols, how="left")
    merged_ratings = merged_ratings.merge(relevance_df, on=key_cols, how="left")

    merged_ratings["task_normalized"] = merged_ratings["Task"].str.lower().str.strip()

    return merged_ratings


# Load each O*NET task-ratings release and collapse it to one row per task
# (weighted frequency plus importance and relevance).
task_ratings_2025_df = pd.read_csv("../extra_data/task_ratings_may_2025.csv")
ratings_cleaned_2025_df = add_task_ratings(task_ratings_2025_df)

task_ratings_2015_df = pd.read_csv("../extra_data/task_ratings_oct_2015.csv")
ratings_cleaned_2015_df = add_task_ratings(task_ratings_2015_df)

In [136]:
# Optional save to csv and show df for inspection

# ratings_cleaned_2025_df.to_csv("../merged_data_files/ratings_cleaned_2025.csv", index=False)
# ratings_cleaned_2025_df

In [137]:
# Optional save to csv and show df for inspection

# ratings_cleaned_2015_df.to_csv("../merged_data_files/ratings_cleaned_2015.csv", index=False)
# ratings_cleaned_2015_df

### 6.2: Merge Rating Values Into Tasks

In [138]:
def merge_task_ratings(tasks_wage_emp_final_df, ratings_df) -> pd.DataFrame:
    """
    Description:
        This function merges the task data with the ratings data of a single year. Some values are
        missing after the merge (tasks with no matching rating keep NaN in the rating columns).

        Matching rule:
          * A task whose normalized text occurs once in the task data AND at most once in the
            ratings data is matched on the task text alone.
          * Every other task (its text is repeated in either table) is matched on task text
            AND occupation title, so ratings from a different occupation are never attached.

        The split is decided from both tables together. Deciding it independently per table
        would leave a task that is unique in one table but repeated in the other without any
        match, even when a rating for the same task and title exists.

        Neither input DataFrame is modified.

    Args:
        tasks_wage_emp_final_df (pd.DataFrame): DataFrame from Step 5 (needs ``task`` and ``title``).
        ratings_df (pd.DataFrame): DataFrame containing cleaned task ratings (single year);
            needs ``Task``, ``Title`` and the rating columns.

    Returns:
        pd.DataFrame: Merged DataFrame with task ratings values unfilled. Rows matched on task
        text alone come first, followed by the rows matched on task text and title.
    """

    rating_cols = ["freq_mean", "freq_lower", "freq_upper",
                   "importance", "importance_lower", "importance_upper",
                   "relevance", "relevance_lower", "relevance_upper"]

    # Work on copies so the helper columns are not written onto the caller's frames
    df = tasks_wage_emp_final_df.copy()
    ratings = ratings_df.copy()

    # Normalize the join keys
    df["task_normalized"] = df["task"].apply(normalize_text)
    ratings["task_normalized"] = ratings["Task"].apply(normalize_text)
    df["title_normalized"] = df["title"].str.lower().str.strip()
    ratings["title_normalized"] = ratings["Title"].str.lower().str.strip()

    # Normalized task texts that appear more than once in each table
    task_counts = df["task_normalized"].value_counts()
    repeated_in_tasks = task_counts[task_counts > 1].index
    task_counts_ratings = ratings["task_normalized"].value_counts()
    repeated_in_ratings = task_counts_ratings[task_counts_ratings > 1].index

    # A task needs the title to disambiguate if its text is repeated in either table
    needs_title = (
        df["task_normalized"].isin(repeated_in_tasks)
        | df["task_normalized"].isin(repeated_in_ratings)
    )
    ratings_is_unique = ~ratings["task_normalized"].isin(repeated_in_ratings)

    # Merge on task text only for unambiguous tasks
    merged_unique = df[~needs_title].merge(
        ratings.loc[ratings_is_unique, rating_cols + ["task_normalized"]],
        on=["task_normalized"],
        how="left"
    )

    # Merge on both title and task for ambiguous ones
    merged_duplicate = df[needs_title].merge(
        ratings[rating_cols + ["task_normalized", "title_normalized"]],
        on=["task_normalized", "title_normalized"],
        how="left"
    )

    merged = pd.concat([merged_unique, merged_duplicate], ignore_index=True)

    return merged


# Attach each year's cleaned ratings to the same Step 5 task table; unmatched tasks keep NaN ratings.
tasks_final_2025_unfilled_df = merge_task_ratings(tasks_wage_emp_final_df, ratings_cleaned_2025_df)
tasks_final_2015_unfilled_df = merge_task_ratings(tasks_wage_emp_final_df, ratings_cleaned_2015_df)



In [None]:
# Optional save to csv and show df for inspection

# tasks_final_2025_unfilled_df.to_csv("../merged_data_files/tasks_final_2025_unfilled.csv", index=False)
# tasks_final_2025_unfilled_df

In [None]:
# Optional save to csv and show df for inspection

# tasks_final_2015_unfilled_df.to_csv("../merged_data_files/tasks_final_2015_unfilled.csv", index=False)
# tasks_final_2015_unfilled_df

### 6.3: Fill Missing Task Rating Values

In [140]:
def fill_missing_ratings(tasks_final_unfilled_df) -> pd.DataFrame:
    """
    Description:
        This function fills missing task rating values.

        For each rating column, a missing value is replaced by:
          1. the mean of that column over the same occupation title, if the title has at least
             3 observed (non-missing) values; otherwise
          2. the mean of that column over the same ``major_occ_category``, if one exists;
          3. otherwise it is left missing.

        The counts and means are taken from the observed data before anything is filled, so
        imputed values never count toward the 3-value threshold and the result does not
        depend on row order.

        Two flag columns are added: ``imputed_rating_mean`` is True when a point estimate
        (``freq_mean``, ``importance``, ``relevance``) was filled, ``imputed_rating_ci`` is
        True when a lower/upper bound column was filled.

    Args:
        tasks_final_unfilled_df (pd.DataFrame): DataFrame from previous step (ratings of a single
            year). Needs ``title``, ``major_occ_category`` and the nine rating columns.
            It is not modified.

    Returns:
        pd.DataFrame: Copy of the input with missing rating values filled and the two flag columns added.
    """

    rating_cols = ["freq_mean", "freq_lower", "freq_upper",
                   "importance", "importance_lower", "importance_upper",
                   "relevance", "relevance_lower", "relevance_upper"]
    point_estimate_cols = ("freq_mean", "importance", "relevance")
    min_observed_per_title = 3

    df = tasks_final_unfilled_df.copy()

    # Flags marking rows where a value was imputed
    df["imputed_rating_mean"] = False
    df["imputed_rating_ci"] = False

    # Loop through each metric
    for col in rating_cols:
        missing = df[col].isna()
        if not missing.any():
            continue

        # Occupation-level mean and number of observed values, looked up per row
        title_stats = df.groupby("title")[col].agg(["mean", "count"])
        occ_mean = df["title"].map(title_stats["mean"])
        occ_count = df["title"].map(title_stats["count"]).fillna(0)

        # Major occupation category mean as the fallback
        major_occ_mean = df["major_occ_category"].map(
            df.groupby("major_occ_category")[col].mean()
        )

        # Use the title mean when it rests on enough observations, else the category mean
        fill_value = occ_mean.where(occ_count >= min_observed_per_title, major_occ_mean)

        # Only rows that are missing and have a usable replacement are touched
        to_fill = missing & fill_value.notna()
        df.loc[to_fill, col] = fill_value[to_fill]

        flag_col = "imputed_rating_mean" if col in point_estimate_cols else "imputed_rating_ci"
        df.loc[to_fill, flag_col] = True

    return df


# Impute the missing rating values separately for each ratings year.
tasks_final_2025_filled_df = fill_missing_ratings(tasks_final_2025_unfilled_df)
tasks_final_2015_filled_df = fill_missing_ratings(tasks_final_2015_unfilled_df)

In [None]:
# Optional save to csv and show df for inspection

# tasks_final_2025_filled_df.to_csv("../merged_data_files/tasks_final_2025_filled.csv", index=False)
# tasks_final_2025_filled_df

In [None]:
# Optional save to csv and show df for inspection

# tasks_final_2015_filled_df.to_csv("../merged_data_files/tasks_final_2015_filled.csv", index=False)
# tasks_final_2015_filled_df

### 6.4 Merge 2015 and 2025 Task Ratings To One DataFrame

In [142]:
def merge_task_ratings_to_one_df(base_df, add_df, base_year, add_year):
    """
    Description:
        Combines two single-year rating tables into one wide table. The rating and imputation-flag
        columns of each table get a ``_<year>`` suffix, and the suffixed columns of ``add_df`` are
        left-joined onto ``base_df``.

        Rows whose ``task_normalized`` occurs once within their own table are joined on
        ``task_normalized`` alone; rows whose ``task_normalized`` is repeated are joined on
        ``task_normalized`` and ``title_normalized``.

    Args:
        base_df (pd.DataFrame): Filled rating table that provides all non-rating columns.
        add_df (pd.DataFrame): Filled rating table whose rating columns are added.
        base_year (int): Year used to suffix the rating columns of ``base_df`` (e.g., 2025).
        add_year (int): Year used to suffix the rating columns of ``add_df`` (e.g., 2015).

    Returns:
        pd.DataFrame: ``base_df`` rows (unique-task rows first, then repeated-task rows) with the
        rating columns of both years.
    """

    year_specific_cols = [
        "freq_mean", "freq_lower", "freq_upper",
        "importance", "importance_lower", "importance_upper",
        "relevance", "relevance_lower", "relevance_upper",
        "imputed_rating_mean", "imputed_rating_ci",
    ]

    # Suffix the year-specific columns of each table with its year
    base_df = base_df.rename(columns={name: f"{name}_{base_year}" for name in year_specific_cols})
    add_df = add_df.rename(columns={name: f"{name}_{add_year}" for name in year_specific_cols})
    added_cols = [f"{name}_{add_year}" for name in year_specific_cols]

    # Split each table by whether its normalized task text is repeated within that table
    base_occurrences = base_df["task_normalized"].map(base_df["task_normalized"].value_counts())
    base_repeated = base_occurrences > 1
    add_occurrences = add_df["task_normalized"].map(add_df["task_normalized"].value_counts())
    add_repeated = add_occurrences > 1

    # Tasks that occur once: the task text alone identifies the row
    single_part = base_df[~base_repeated].copy().merge(
        add_df[~add_repeated].copy()[added_cols + ["task_normalized"]],
        on=["task_normalized"],
        how="left"
    )

    # Repeated tasks: the title is needed as well to pick the right row
    repeated_part = base_df[base_repeated].copy().merge(
        add_df[add_repeated].copy()[added_cols + ["task_normalized", "title_normalized"]],
        on=["task_normalized", "title_normalized"],
        how="left"
    )

    return pd.concat([single_part, repeated_part], ignore_index=True)



# Combine both years into one table: the 2025 table is the base and the 2015 rating columns are added to it.
tasks_final_uncleaned_df = merge_task_ratings_to_one_df(tasks_final_2025_filled_df, tasks_final_2015_filled_df, 2025, 2015)



In [None]:
# Optional save to csv and show df for inspection

# tasks_final_uncleaned_df.to_csv("../merged_data_files/tasks_final_uncleaned.csv", index=False)
# tasks_final_uncleaned_df

Unnamed: 0,task,task_normalized,soc_code_2010,title,task_id,task_type,n_responding,date,domain_source,n_occurrences,...,freq_lower_2015,freq_upper_2015,importance_2015,importance_lower_2015,importance_upper_2015,relevance_2015,relevance_lower_2015,relevance_upper_2015,imputed_rating_mean_2015,imputed_rating_ci_2015
0,Direct or conduct studies or research on issue...,direct or conduct studies or research on issue...,11-1011.00,Chief Executives,8848.0,Core,87.0,07/2014,Incumbent,1.0,...,0.061440,1.146565,3.33,3.03,3.63,75.11,63.36,84.04,False,False
1,"Review and analyze legislation, laws, or publi...",review and analyze legislation laws or public ...,11-1011.00,Chief Executives,20461.0,Core,87.0,07/2014,Incumbent,1.0,...,0.125447,1.759347,3.64,3.33,3.95,73.15,59.40,83.53,False,False
2,Review reports submitted by staff members to r...,review reports submitted by staff members to r...,11-1011.00,Chief Executives,8830.0,Core,87.0,07/2014,Incumbent,1.0,...,0.317070,1.475519,3.82,3.57,4.07,98.87,92.25,99.84,False,False
3,"Serve as liaisons between organizations, share...",serve as liaisons between organizations shareh...,11-1011.00,Chief Executives,8840.0,Supplemental,87.0,07/2014,Incumbent,1.0,...,0.076612,0.654576,3.56,3.26,3.86,56.76,42.19,70.25,False,False
4,"Direct, plan, or implement policies, objective...",direct plan or implement policies objectives o...,11-1011.00,Chief Executives,8826.0,Core,87.0,07/2014,Incumbent,1.0,...,0.617652,4.047700,4.39,4.24,4.54,95.84,87.35,98.71,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4465,Wet concrete surface and rub with stone to smo...,wet concrete surface and rub with stone to smo...,47-2053.00,Terrazzo Workers and Finishers,9848.0,Supplemental,61.0,07/2015,Incumbent,2.0,...,0.157735,2.244911,3.22,2.43,4.02,53.13,36.76,68.85,False,False
4466,Program electronic equipment.,program electronic equipment,51-6062.00,"Textile Cutting Machine Setters, Operators, an...",14161.0,Supplemental,109.0,06/2007,Incumbent,2.0,...,1.158993,8.286697,3.45,2.88,4.03,53.25,38.91,67.07,False,False
4467,Program electronic equipment.,program electronic equipment,51-6063.00,"Textile Knitting and Weaving Machine Setters, ...",14178.0,Supplemental,85.0,06/2007,Incumbent,2.0,...,0.762112,6.269766,4.21,3.96,4.46,43.04,33.75,52.85,False,False
4468,Plan and formulate flight activities and test ...,plan and formulate flight activities and test ...,53-2011.00,"Airline Pilots, Copilots, and Flight Engineers",10586.0,Supplemental,52.0,07/2012,Incumbent,2.0,...,0.322073,7.329848,3.60,3.20,4.01,33.44,26.78,40.83,False,False


## Step 7: Final Cleanup

In [None]:
#     # Replace placeholders with NaN
#     placeholder_values = ["#", "*", "", "n/a", "na", "--"]
#     merged.replace(placeholder_values, pd.NA, inplace=True)

#     # Drop fully empty columns
#     merged.dropna(axis=1, how="all", inplace=True)

#     # Drop 'occ_code' and 'task_name'
#     merged.drop(columns=["occ_code", "task_name", "title_normalized", "title_normalized_x", "title_normalized_y"], inplace=True, errors="ignore")

#     # Reorder columns: make 'task' and 'task_normalized' first
#     cols = merged.columns.tolist()
#     for col in ["task_normalized", "task"]:
#         if col in cols:
#             cols.insert(0, cols.pop(cols.index(col)))
#     merged = merged[cols]

In [None]:
# TODO: Add a cell at the top of the notebook that holds the adjustable variables (e.g., the inflation number). Step 4.1 and Step 3.2 currently each have one.
# TODO: Add a place that creates the folder for the merged data files.
# TODO: Settle on a definite definition of what a "broad category" is.
# TODO: Remove the NLP (spaCy) imports.

## Extra Data (OLD)

In [13]:

#MIGHT NEED THESE REFERENCES FOR STEP 3

# # Only fill NaNs from fallback columns
# merged["H_MEDIAN"] = merged["H_MEDIAN"].fillna(merged["H_MEDIAN_fallback"])
# merged["A_MEDIAN"] = merged["A_MEDIAN"].fillna(merged["A_MEDIAN_fallback"])




# merged["2_digit_soc"] = merged["soc_code_2019"].astype(str).str[:2]     
# wage_df_trimmed["2_digit_soc"] = wage_df_trimmed["soc_code_2019"].astype(str).str[:2]
# wage_df_trimmed_fallback_2nd = wage_df_trimmed[wage_df_trimmed["O_GROUP"] == "major"]
# merged_fallback_2nd = merged[merged["H_MEDIAN"].isna() | merged["A_MEDIAN"].isna()]

# fallback_merge_2nd = merged_fallback_2nd.merge(
#     wage_df_trimmed_fallback_2nd[["2_digit_soc", "H_MEDIAN", "A_MEDIAN"]],
#     on="2_digit_soc", how="left",
#     suffixes=("", "_fallback2nd")
# )

# merged = merged.merge(
#     fallback_merge_2nd[["title", "H_MEDIAN_fallback2nd", "A_MEDIAN_fallback2nd"]],
#     on="title",
#     how="left"
# )

# # Only fill NaNs from fallback columns
# merged["H_MEDIAN"] = merged["H_MEDIAN"].fillna(merged["H_MEDIAN_fallback2nd"])
# merged["A_MEDIAN"] = merged["A_MEDIAN"].fillna(merged["A_MEDIAN_fallback2nd"])



# merged["h_median_final"] = merged.groupby("title")["H_MEDIAN"].transform("mean")
# merged["a_median_final"] = merged.groupby("title")["A_MEDIAN"].transform("mean")







# merged["2_digit_soc"] = merged["soc_code_2019"].astype(str).str[:2]     
# wage_df_trimmed["2_digit_soc"] = wage_df_trimmed["soc_code_2019"].astype(str).str[:2]
# wage_df_trimmed_fallback_2nd = wage_df_trimmed[wage_df_trimmed["O_GROUP"] == "major"]
#scraper_wage_df["title"] = scraper_wage_df["JobName"].apply(normalize_text)

In [14]:

# def add_emp_wage_data(df) -> pd.DataFrame:
#     """
#     Description:
#         This loads in the employment wage data and merges it into the given dataframe with the desired columns on the occupation code.
#         If a row doesn't match, we will fall back to merging on occupation title.
#         All column names in the resulting DataFrame will be lowercase.

#     Args:
#         df (pd.DataFrame): Input the df with the ONET and Claude data merged.

#     Returns:
#         pd.DataFrame: Merged DataFrame with employment and wage data
#     """
#     emp_wage_df = pd.read_csv("../extra_data/emp_wage_national.csv")

#     # Standardize for merges
#     df["occ_group_code"] = df["occ_group_code"].str[:7]
#     df["title_normalized"] = df["title"].str.lower().str.strip()
#     emp_wage_df["occ_title_normalized"] = emp_wage_df["OCC_TITLE"].str.lower().str.strip()

#     wage_cols = [
#             "OCC_CODE", "AREA_TITLE", "TOT_EMP", "EMP_PRSE", "JOBS_1000",
#             "LOC_QUOTIENT", "PCT_TOTAL", "PCT_RPT", "H_MEAN", "A_MEAN",
#             "MEAN_PRSE", "H_PCT10", "H_PCT25", "H_MEDIAN", "H_PCT75", "H_PCT90",
#             "A_PCT10", "A_PCT25", "A_MEDIAN", "A_PCT75", "A_PCT90", "ANNUAL", "HOURLY", "occ_title_normalized"
#         ]

#     # Perform merge
#     merged_df = pd.merge(
#         df,
#         emp_wage_df[wage_cols],
#         left_on="occ_group_code",
#         right_on="OCC_CODE",
#         how="left"
#     )

#     merged_matched = merged_df[merged_df["TOT_EMP"].notna()]
#     unmatched = merged_df[merged_df["TOT_EMP"].isna()]
#     unmatched = unmatched.drop(columns=wage_cols, errors="ignore")

#     merged_unmatched = pd.merge(
#         unmatched,
#         emp_wage_df[wage_cols],
#         left_on="title_normalized",
#         right_on="occ_title_normalized",
#         how="left"
#     )

#     final_merged = pd.concat([merged_matched, merged_unmatched], ignore_index=True)
#     final_merged.drop(columns=["title_normalized", "occ_title_normalized"], inplace=True, errors="ignore")


#     # Convert all column names to lowercase
#     final_merged.columns = [col.lower() for col in final_merged.columns]

#     return final_merged

# task_emp_wage_df = add_emp_wage_data(task_soc_pct_all)
# #display(task_emp_wage_df)
# print("tot_emp missing:", task_emp_wage_df["tot_emp"].isna().sum())
# print(task_emp_wage_df.loc[task_emp_wage_df["tot_emp"].isna(), "title"].unique())



In [15]:
# #Task ratings processing

# def add_task_ratings():
#     """
#     Description:
#         This function reads the task ratings from a CSV file, processes them to extract frequency, importance, and relevance ratings,
#         and merges them into a single DataFrame with the desired structure.

#     Args:
#         None. The task ratings are read directly from "../extra_data/task_ratings.csv".
    
#     Returns:
#         pd.DataFrame: Merged DataFrame with task ratings including frequency, importance, and relevance.
#     """
    

#     task_ratings_df = pd.read_csv("../extra_data/task_ratings.csv")


# # Frequency mapping. Assuming a 52 week year with 5 working days per week, these are the corresponding survey frequency categories:
# # 1 Once per year or less (Assuming 1 time per year)
# # 2 More than once per year (Assuming 3 times per year)
# # 3 More than once per month (Assuming 48 times per year, 4 times per month)
# # 4 More than once per week (Assuming 130 times per year, 2.5 times per week)
# # 5 Daily
# # 6 Several times per day (Assuming 3 times per day)
# # 7 Hourly or more often (Assuming 12 times per day, 1.5 times per hour)
#     frequency_weights = {
#         1: 1 / 260,
#         2: 3 / 260,
#         3: 48 / 260,
#         4: 130 / 260,
#         5: 1,
#         6: 3,
#         7: 12
#     }


#     # Get freq rows, drop unusable ones, generate freq aggregates
#     freq_df = task_ratings_df[task_ratings_df["Scale ID"] == "FT"].copy()

#     # Drop rows without category or invalid categories
#     freq_df = freq_df[pd.to_numeric(freq_df["Category"], errors='coerce').notnull()]
#     freq_df["Category"] = freq_df["Category"].astype(int)

#     # Apply weights
#     freq_df["freq_mean"] = freq_df["Data Value"] * freq_df["Category"].map(frequency_weights) / 100
#     freq_df["freq_lower"] = freq_df["Lower CI Bound"] * freq_df["Category"].map(frequency_weights) / 100
#     freq_df["freq_upper"] = freq_df["Upper CI Bound"] * freq_df["Category"].map(frequency_weights) / 100

#     # Sum across categories to get per-task total
#     freq_agg = freq_df.groupby(["O*NET-SOC Code", "Title", "Task ID", "Task"]).agg({
#         "freq_mean": "sum",
#         "freq_lower": "sum",
#         "freq_upper": "sum"
#     }).reset_index()


#     # Get importance and relevance ratings
#     importance_df = task_ratings_df[task_ratings_df["Scale ID"] == "IM"].copy()
#     importance_df = importance_df[["O*NET-SOC Code", "Title", "Task ID", "Task", 
#                                 "Data Value", "Lower CI Bound", "Upper CI Bound"]]
#     importance_df = importance_df.rename(columns={
#         "Data Value": "importance",
#         "Lower CI Bound": "importance_lower",
#         "Upper CI Bound": "importance_upper"
#     })

#     relevance_df = task_ratings_df[task_ratings_df["Scale ID"] == "RT"].copy()
#     relevance_df = relevance_df[["O*NET-SOC Code", "Title", "Task ID", "Task", 
#                                 "Data Value", "Lower CI Bound", "Upper CI Bound"]]
#     relevance_df = relevance_df.rename(columns={
#         "Data Value": "relevance",
#         "Lower CI Bound": "relevance_lower",
#         "Upper CI Bound": "relevance_upper"
#     })


#     # Merge ratings
#     merged_ratings = freq_agg.merge(importance_df, on=["O*NET-SOC Code", "Title", "Task ID", "Task"], how="left")
#     merged_ratings = merged_ratings.merge(relevance_df, on=["O*NET-SOC Code", "Title", "Task ID", "Task"], how="left")


#     merged_ratings["task_normalized"] = merged_ratings["Task"].str.lower().str.strip()


#     return merged_ratings

# ratings_df = add_task_ratings()
# #display(ratings_df.reset_index(drop=True))

In [16]:
# #Merge all and final cleanup

# def batch_lemmatize(texts):
#     """
#     Efficiently lemmatize a list of strings using spaCy's nlp.pipe().
#     Skips punctuation, whitespace, and possessives.
#     """
#     if not texts:
#         return []
    
#     # Handle empty/null strings
#     processed_texts = [str(text).strip() if text and str(text).strip() else " " for text in texts]
    
#     cleaned = []
#     try:
#         for doc in nlp.pipe(processed_texts, batch_size=1000, disable=["ner", "parser"]):
#             lemmas = [
#                 token.lemma_ for token in doc
#                 if not token.is_punct and not token.is_space and token.text != "'s"
#             ]
#             result = " ".join(lemmas).strip()
#             cleaned.append(result if result else "")
#     except Exception as e:
#         print(f"Error in batch_lemmatize: {e}")
#         raise
    
#     return cleaned

# def merge_all_and_cleanup(df, ratings_df):
#     """
#     Description:
#         This function merges the task data with the ratings data and performs final cleanup.
    
#     Args:
#         df (pd.DataFrame): DataFrame containing task data.
#         ratings_df (pd.DataFrame): DataFrame containing task ratings.
    
#     Returns:
#         pd.DataFrame: Final merged DataFrame with all necessary information.
#     """
    
#     # Normalize task names

#     # Apply batch lemmatization
#     df["task_normalized"] = batch_lemmatize(df["task"].tolist())
#     ratings_df["task_normalized"] = batch_lemmatize(ratings_df["Task"].tolist())

#     df["title_normalized"] = df["title"].str.lower().str.strip()
#     ratings_df["title_normalized"] = ratings_df["Title"].str.lower().str.strip()

#     # Count how many times each normalized task appears
#     task_counts = df["task_normalized"].value_counts()

#     # Boolean mask for duplicate vs. unique tasks
#     is_duplicate = df["task_normalized"].isin(task_counts[task_counts > 1].index)
#     is_unique = ~is_duplicate

#     # Split the dataframe
#     df_duplicate_tasks = df[is_duplicate].copy()
#     df_unique_tasks = df[is_unique].copy()

#     # Count how many times each normalized task appears
#     task_counts_ratings = ratings_df["task_normalized"].value_counts()

#     # Boolean mask for duplicate vs. unique tasks
#     is_duplicate_ratings = ratings_df["task_normalized"].isin(task_counts_ratings[task_counts_ratings > 1].index)
#     is_unique_ratings = ~is_duplicate_ratings

#     # Split the dataframe
#     df_duplicate_tasks_ratings = ratings_df[is_duplicate_ratings].copy()
#     df_unique_tasks_ratings = ratings_df[is_unique_ratings].copy()

#     # Merge on unique tasks
#     merged_unique = df_unique_tasks.merge(
#         df_unique_tasks_ratings[[
#             "freq_mean", "freq_lower", "freq_upper",
#             "importance", "importance_lower", "importance_upper",
#             "relevance", "relevance_lower", "relevance_upper",
#             "task_normalized", "title_normalized"
#         ]],
#         on=["task_normalized"],
#         how="left"
#     )


#     # Merge on both title and task
#     merged_duplicate = df_duplicate_tasks.merge(
#         df_duplicate_tasks_ratings[[
#             "freq_mean", "freq_lower", "freq_upper",
#             "importance", "importance_lower", "importance_upper",
#             "relevance", "relevance_lower", "relevance_upper",
#             "task_normalized", "title_normalized"
#         ]],
#         on=["task_normalized", "title_normalized"],
#         how="left"
#     )

#     merged = pd.concat([merged_unique, merged_duplicate], ignore_index=True)

#     # Replace placeholders with NaN
#     placeholder_values = ["#", "*", "", "n/a", "na", "--"]
#     merged.replace(placeholder_values, pd.NA, inplace=True)

#     # Drop fully empty columns
#     merged.dropna(axis=1, how="all", inplace=True)

#     # Drop 'occ_code', 'task_name', and the helper 'title_normalized' columns (any that are absent are ignored)
#     merged.drop(columns=["occ_code", "task_name", "title_normalized", "title_normalized_x", "title_normalized_y"], inplace=True, errors="ignore")

#     # Reorder columns: make 'task' and 'task_normalized' first
#     cols = merged.columns.tolist()
#     for col in ["task_normalized", "task"]:
#         if col in cols:
#             cols.insert(0, cols.pop(cols.index(col)))
#     merged = merged[cols]

#     return merged

# task_final = merge_all_and_cleanup(task_emp_wage_df, ratings_df)
# task_final.to_csv("../new_data/tasks_final.csv", index=False)
# #display(task_final.reset_index(drop=True))
