# Unified ETL Pipeline

This notebook reproduces the steps from the Python ETL scripts in a single,
well-documented workflow. It generates the same output tables as the original
scripts for ONET, AEI and CAGED data.

## 1. Setup

In [None]:
import unicodedata
import pandas as pd
from huggingface_hub import hf_hub_download

In [3]:
def strip_accents(text: str) -> str:
    """Remove accent marks from ``text``.

    The string is decomposed with Unicode NFKD normalisation and every
    character whose Unicode category is ``Mn`` is discarded; all remaining
    characters are joined back together in their original order.
    """
    kept = []
    for char in unicodedata.normalize("NFKD", text):
        if unicodedata.category(char) == "Mn":
            # Combining mark produced by the decomposition: drop it.
            continue
        kept.append(char)
    return "".join(kept)

## 2. Loading

In [74]:
# Crosswalk table from the project's config folder. Later cells join it to the
# CAGED data on `cbo_subgroup` and to the AEI occupations through `onet_code`.
CW_ONET_CBO = pd.read_csv(
    filepath_or_buffer="data/config/cbo_onet_crosswalks.csv",
    engine="python",
    encoding="utf-8",
)

In [None]:
def load_aei(file_name: str, release: str = "release_2025_03_27") -> pd.DataFrame:
    """Load a CSV file from Anthropic's Economic Index dataset on Hugging Face.

    The file is fetched from the ``Anthropic/EconomicIndex`` dataset repository
    with ``hf_hub_download`` and then parsed with ``pandas.read_csv``.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the release folder,
        e.g. ``"task_pct_v2.csv"``.
    release : str, optional
        Name of the release folder inside the repository. Defaults to
        ``"release_2025_03_27"``, the release this notebook was written for.

    Returns
    -------
    pandas.DataFrame
        The contents of the requested CSV file.
    """
    local_path = hf_hub_download(
        repo_id="Anthropic/EconomicIndex",
        filename=f"{release}/{file_name}",
        repo_type="dataset",
    )
    return pd.read_csv(local_path)

# Anthropic Economic Index tables. Methodology:
# https://arxiv.org/pdf/2503.04761#page=16&zoom=100,110,189

# Two columns: task_name (as per ONET) and pct (percentage of conversations
# attributed to the task).
tasks_aei = load_aei(file_name="task_pct_v2.csv")

# Attributes each conversation to a mode of use, per task.
aut_aug_by_task = load_aei(file_name="automation_vs_augmentation_by_task.csv")

# Official ONET tables, read from the same release folder.

# Task metadata, linking each task to occupations.
onet_df = load_aei(file_name="onet_task_statements.csv")

# SOC structure; the ONET ETL section derives the minor group lookup from it.
soc_structure = load_aei(file_name="SOC_Structure.csv")

In [5]:
# Load CAGED
#
# The parquet file read below is the saved result of the query in
# data/config/caged_bd_national.SQL. To re-run that query against the original
# data source, uncomment the snippet below (it reads the billing project from
# the BILLING_ID environment variable).
#
# import os
# import basedosdados as bd
# from dotenv import load_dotenv
#
# load_dotenv()
# LIMIT = False
#
# billing_id = os.getenv('BILLING_ID')
# with open('data/config/caged_bd_national.SQL') as f:
#     query = f.read()
#     if LIMIT:
#         query += f" LIMIT {LIMIT}"
# df = bd.read_sql(query=query, billing_project_id=billing_id)
# df.to_parquet('data/input/caged_national_UNTREATED.parquet')

caged_raw = pd.read_parquet(path='data/input/caged_national_UNTREATED.parquet')

## 3. ONET ETL

In [6]:
# Builds `s`, the minor group lookup merged into the task table later on:
#   1. renames the title and minor group columns of soc_structure;
#   2. keeps only the rows that have a minor group code;
#   3. truncates that code to its first four characters, the same key that is
#      derived from "O*NET-SOC Code" when the tasks are merged;
#   4. keeps just the key and the title.
# NOTE(review): the title column is named 'major_title' although the rows kept
# are the minor group rows - presumably it holds the minor group title; confirm.
s = (
    soc_structure
        .copy()
        .rename(
            columns={
                'SOC or O*NET-SOC 2019 Title': 'major_title',
                'Minor Group': 'minor_group',
            }
        )
        .dropna(subset=['minor_group'])
        .assign(minor_group=lambda rows: rows['minor_group'].str[:4])
        .loc[:, ['minor_group', 'major_title']]
)

## 4. AEI ETL

Generate AEI occupation level metrics.

This section merges Anthropic Economic Index (AEI) task data with the
Standard Occupational Classification (SOC) structure in order to compute
automation and augmentation ratios by SOC minor group. The resulting
tables are written to `data/output` for further analysis.

### 4.1 Merge AEI tasks with SOC data
This step combines task statements with SOC titles and computes usage percentages per task.

In [None]:
# Merges the ONET task statements with the SOC structure and the AEI usage data.
# The result has one row per occupation/task pair that matched an AEI task,
# with the share of conversations attributed to it and its automation and
# augmentation components.
tasks_df = (
    onet_df
        .copy()
        # Extracts minor and major group codes (first four and two characters respectively)
        .assign(minor_group=lambda x: x["O*NET-SOC Code"].str[:4])
        .assign(major_group=lambda x: x['minor_group'].str[:2])

        # Normalizes task names to match the format in the Anthropic Economic Index data
        .assign(task = lambda x: x['Task'].str.lower().str.strip())

        # Merges with other datasets:
        ## SOC structure, to get the group title (left join: unmatched rows are
        ## kept here and caught by the assertions in the next cell)
        .merge(s, on="minor_group", how="left")

        ## AEI data on the percentage of conversations assigned to each task
        ## (inner join: ONET tasks that do not appear in the AEI data are dropped)
        .merge(tasks_aei, left_on='task', right_on='task_name', how='inner')

        ## AEI data on classification of use by automation vs. augmentation
        .merge(aut_aug_by_task, on='task_name', how='left')

        # Distributes the percentages evenly per occupation and calculates the percentage of total
        # conversations per task and occupation. In future iterations, distribution can be improved and adapted
        .assign(n_occ = lambda x: x.groupby('task')['Title'].transform('nunique'))
        .assign(pct_total = lambda x: 100 * (x['pct'] / x['n_occ']) / (x['pct'] / x['n_occ']).sum())

        # Sums the percentages attributed to subclassifications of automation and augmentation
        # As per the methodology in the AEI paper, augmentation = learning + validation
        # and automation = feedback_loop + directive + task_iteration.
        # Both are weighted by pct_total so that `aug` and `aut` are on the same scale;
        # weighting `aut` by the raw `pct` would count a task shared by several
        # occupations once per occupation and distort the aug/aut ratio computed downstream.
        .assign(aug = lambda x: (x['learning'] + x['validation']) * x['pct_total'])
        .assign(aut = lambda x: (x['feedback_loop'] + x['directive'] + x['task_iteration']) * x['pct_total'])

        # Keeps only the columns of interest
        [['major_group', 'minor_group', 'major_title', 'Title', 'task', 'n_occ', 'pct_total', 'aut', 'aug']]
)


In [8]:
# Asserting that merges and calculations were done correctly

# Every row found its title in the SOC structure lookup.
assert len(tasks_df[tasks_df['major_title'].isna()]) == 0, "rows without a SOC title"

# Every task is linked to at least one occupation.
assert len(tasks_df[tasks_df['n_occ'] == 0]) == 0, "tasks with zero occupations"
assert len(tasks_df[tasks_df['n_occ'].isna()]) == 0, "tasks with missing n_occ"

# pct_total is a floating point column normalised to 100; compare with a
# tolerance, since an exact equality check can fail on rounding error alone.
assert abs(tasks_df['pct_total'].sum() - 100) < 1e-6, "pct_total does not sum to 100"

In [68]:
# Reports how many distinct tasks, minor groups and major groups made it
# into tasks_df.
for label, column in (
    ('Number of unique tasks:', 'task'),
    ('Number of minor groups:', 'minor_group'),
    ('Number of major groups:', 'major_group'),
):
    print(label, tasks_df[column].unique().size)

Number of unique tasks: 3364
Number of minor groups: 91
Number of major groups: 22


### 4.2 Aggregate metrics by occupation
Combine task usage with automation labels and summarize them by SOC minor group.

In [None]:
# occupation_df: task metrics summed up to the minor group level, plus
#   - aug_aut_ratio: summed augmentation over summed automation;
#   - top_10 / bottom_10: 1 for the ten groups with the largest / smallest pct_total;
#   - top_10_aut / top_10_aug: 1 for the ten groups with the largest aut / aug.
# rank(method='first') breaks ties by order of appearance.
occupation_df = (
    tasks_df
        .groupby('minor_group', as_index=False)[['pct_total', 'aug', 'aut']]
        .sum()
        .assign(
            aug_aut_ratio = lambda g: g['aug'] / g['aut'],
            top_10        = lambda g: g['pct_total'].rank(method='first', ascending=False).le(10).astype(int),
            bottom_10     = lambda g: g['pct_total'].rank(method='first', ascending=True).le(10).astype(int),
            top_10_aut    = lambda g: g['aut'].rank(method='first', ascending=False).le(10).astype(int),
            top_10_aug    = lambda g: g['aug'].rank(method='first', ascending=False).le(10).astype(int),
        )
)

occupation_df.to_parquet('data/output/occ_aut_aug_lvl.parquet')

In [None]:
# Creates final_occupations.csv: for each of the four flags, the rows of
# occupation_df where that flag is 1, labelled with the flag name in `cat`.
# A minor group flagged in more than one category appears once per category.
final_occupations = pd.concat(
    [
        occupation_df.loc[occupation_df[flag] == 1].assign(cat=flag)
        for flag in ('top_10', 'bottom_10', 'top_10_aug', 'top_10_aut')
    ],
    axis=0,
)
final_occupations.to_csv('data/output/final_occupations.csv', index=False)

## 5. CAGED ETL

In [82]:
# Aggregates the raw CAGED records into net job movements per year, month,
# CBO subgroup and CBO group, then normalises the text labels.
caged = (
    caged_raw.copy()
    # Rows without a subgroup cannot be linked to the crosswalk later on
    .dropna(subset=['cbo_2002_descricao_subgrupo_principal'])
    .groupby(['ano',
              'mes',
              'cbo_2002_descricao_subgrupo_principal',
              'cbo_2002_descricao_grande_grupo'
            ],
            # Keep rows whose other grouping keys are missing
            dropna=False
    )['saldo_movimentacao'].sum().reset_index()
    # Trims surrounding whitespace on the text columns
    .apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
    .rename(
        columns={
            'saldo_movimentacao': 'net_jobs',
            'ano': 'year',
            'mes': 'month',
            'cbo_2002_descricao_subgrupo_principal': 'cbo_subgroup',
            'cbo_2002_descricao_grande_grupo': 'cbo_group'
        }
    )
    # na_action='ignore' leaves missing labels untouched: the groupby above keeps
    # rows with a missing cbo_group, and strip_accents raises on non-string input
    .assign(cbo_subgroup = lambda x: x['cbo_subgroup'].map(strip_accents, na_action='ignore'))
    .assign(cbo_group = lambda x: x['cbo_group'].map(strip_accents, na_action='ignore'))
)


caged.to_parquet('data/output/caged_national.parquet')


## 6. Combine CAGED and AEI

In [None]:
# Merge CAGED job data with AEI occupation classifications.

# Strip accents from the crosswalk key so it matches the accent-stripped
# `cbo_subgroup` column of `caged`. Re-running this cell is harmless: stripping
# accents from an already stripped string changes nothing.
CW_ONET_CBO["cbo_subgroup"] = CW_ONET_CBO["cbo_subgroup"].apply(strip_accents)

df = (
    caged
        # CBO subgroup -> ONET code
        .merge(CW_ONET_CBO, how="left", on="cbo_subgroup")
        # ONET code -> category; a minor group listed under several categories
        # contributes its net jobs to each of them
        .merge(final_occupations, how="left", left_on="onet_code", right_on="minor_group")
        # Rows that matched no category carry no information for the series
        # (the groupby below would drop them anyway, since NaN keys are excluded)
        .dropna(subset=["cat"])
        # First day of the month, assembled from the numeric components rather
        # than by concatenating an unpadded month string
        .assign(date = lambda x: pd.to_datetime(x[["year", "month"]].assign(day=1)))
        .groupby(["date", "cat"])["net_jobs"].sum().reset_index()
)


# `df` is already aggregated by date and category, so no second groupby is needed.
time_series = df.copy()
time_series.to_csv("data/output/time_series.csv", index=False)