In [33]:
import pandas as pd

In [34]:
# --- Load data ---
# `eurostat` is the raw Eurostat extract; `nace` is the activity lookup that is
# later merged on "Economic Activity" (presumably it carries the "sector"
# column used by the pivot -- confirm against the CSV).
eurostat, nace = (
    pd.read_csv(csv_path)
    for csv_path in (
        'original_data/eurostat.csv',
        'transformed_data/economic_activity_sector.csv',
    )
)

In [35]:
# --- Prepare Eurostat aging data ---
# Keep only the fields used downstream; `.copy()` detaches the result from the
# frame that was read from disk.
selected_cols = ['age', 'sex', 'nace_r2', 'geo', 'TIME_PERIOD', 'OBS_VALUE']
eurostat = eurostat.loc[:, selected_cols].copy()

In [36]:
# Keep only EU members
# Country names as they appear in the `geo` column (27 entries, alphabetical).
eu_countries = [
    'Austria', 'Belgium', 'Bulgaria',
    'Croatia', 'Cyprus', 'Czechia',
    'Denmark', 'Estonia', 'Finland',
    'France', 'Germany', 'Greece',
    'Hungary', 'Ireland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg',
    'Malta', 'Netherlands', 'Poland',
    'Portugal', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden',
]

# NACE labels that are aggregates or carry no usable activity information;
# rows with these labels are filtered out later.
excluded = [
    'No response',
    'Total - all NACE activities',
    'Unknown NACE activity',
]

In [37]:
# Filter relevant observations
# One boolean mask selects the rows to keep:
#   * the two comparison years (2014 and 2024),
#   * the two age bands used for the ratios (15+ as the base, 50+ as the subset),
#   * EU member states only,
#   * no aggregate / unknown NACE rows (`excluded` is defined in the cell above),
#   * per-gender rows only (the "Total" gender aggregate is dropped).
keep_rows = (
    eurostat["TIME_PERIOD"].isin([2014, 2024])
    & eurostat["age"].isin(["15 years or over", "50 years or over"])
    & eurostat["geo"].isin(eu_countries)
    & ~eurostat["nace_r2"].isin(excluded)
    & (eurostat["sex"] != "Total")
)

eurostat = (
    eurostat[keep_rows]
    # Rename by name rather than by position so a change in column order can
    # never silently mislabel the data; errors="raise" fails loudly if an
    # expected source column is missing.
    .rename(
        columns={
            'sex': 'gender',
            'nace_r2': 'Economic Activity',
            'geo': 'country',
            'TIME_PERIOD': 'year',
            'OBS_VALUE': 'num_of_workers',
        },
        errors="raise",
    )
    # Scale the reported values by 1000 (assumes OBS_VALUE is expressed in
    # thousands of persons -- confirm against the Eurostat dataset metadata).
    .assign(num_of_workers=lambda frame: frame["num_of_workers"] * 1000)
    # Attach the columns of `nace` to every row. Left join: activities with no
    # match in `nace` are kept, with NaN in the columns coming from `nace`.
    .merge(nace, how='left', on="Economic Activity")
)

In [38]:
# Pivot for aging ratios
# Result: one row per (country, sector) and one column per (age, year, gender)
# combination; cell values are the summed worker counts.
pivot_rows = ["country", "sector"]
pivot_cols = ["age", "year", "gender"]
eurostat = eurostat.pivot_table(
    values="num_of_workers",
    index=pivot_rows,
    columns=pivot_cols,
    aggfunc="sum",
)

In [39]:
# Turn the row index levels back into ordinary columns before renaming.
eurostat = eurostat.reset_index()

# Flatten the MultiIndex columns in a single pass:
#   (age_group, year, gender) -> "agegroup_year_gender", spaces become
#   underscores, e.g. ("15 years or over", 2014, "Females")
#   -> "15_years_or_over_2014_Females".
# The former index columns have empty lower levels ("country", "", ""), so the
# joined name ends in underscores; rstrip("_") turns "country__" back into
# "country".
eurostat.columns = [
    ("{}_{}_{}".format(*col[:3]) if isinstance(col, tuple) else col)
    .replace(" ", "_")
    .rstrip("_")
    for col in eurostat.columns
]

In [40]:
# Group data by countries
# Sum the per-sector worker counts so that each country has a single row.
workers_15_plus = [
    '15_years_or_over_2014_Females',
    '15_years_or_over_2014_Males',
    '15_years_or_over_2024_Females',
    '15_years_or_over_2024_Males',
]
workers_50_plus = [
    '50_years_or_over_2014_Females',
    '50_years_or_over_2014_Males',
    '50_years_or_over_2024_Females',
    '50_years_or_over_2024_Males',
]
eurostat_gender_countries = (
    eurostat
    .groupby("country")[workers_15_plus + workers_50_plus]
    .sum()
)

In [41]:
# Group data by sectors
# Sum the per-country worker counts so that each sector has a single row.
workers_15_plus = [
    '15_years_or_over_2014_Females',
    '15_years_or_over_2014_Males',
    '15_years_or_over_2024_Females',
    '15_years_or_over_2024_Males',
]
workers_50_plus = [
    '50_years_or_over_2014_Females',
    '50_years_or_over_2014_Males',
    '50_years_or_over_2024_Females',
    '50_years_or_over_2024_Males',
]
eurostat_gender_economic_activities = (
    eurostat
    .groupby("sector")[workers_15_plus + workers_50_plus]
    .sum()
)

In [42]:
# Compute gender ratios
# All indicators are percentages and are added in place to the country table.
countries = eurostat_gender_countries  # short alias for the same DataFrame

# Female share of all workers aged 15+, per year ...
for year in (2014, 2024):
    women = countries[f'15_years_or_over_{year}_Females']
    men = countries[f'15_years_or_over_{year}_Males']
    countries[f"gender_ratio_{year}"] = (women / (men + women)) * 100

# ... and its change from 2014 to 2024, in percentage points.
countries["gender_ratio_2014_2024"] = (
    countries["gender_ratio_2024"] - countries["gender_ratio_2014"]
)

# Share of each gender's 15+ workforce that is aged 50+, per year.
for prefix, gender in (("female", "Females"), ("male", "Males")):
    for year in (2014, 2024):
        countries[f"{prefix}_ratio_{year}"] = (
            countries[f'50_years_or_over_{year}_{gender}']
            / countries[f'15_years_or_over_{year}_{gender}'] * 100
        )

# Change of the 50+ share between 2014 and 2024, in percentage points.
for prefix in ("female", "male"):
    countries[f"{prefix}_diff_2014_2024"] = (
        countries[f"{prefix}_ratio_2024"] - countries[f"{prefix}_ratio_2014"]
    )

In [43]:
# Tidy the country table for export: round every numeric column to two
# decimals, replace missing values (e.g. a 0/0 ratio) with 0, and move
# "country" from the index back into a regular column.
eurostat_gender_countries = (
    eurostat_gender_countries
    .round(decimals=2)
    .fillna(value=0)
    .reset_index()
)

In [44]:
# Persist the country-level indicators; the positional row index is not written.
eurostat_gender_countries.to_csv(
    path_or_buf='transformed_data/eurostat_gender_countries.csv',
    index=False,
)

In [45]:
# Compute gender ratios
# All indicators are percentages and are added in place to the sector table.
sectors = eurostat_gender_economic_activities  # short alias for the same DataFrame

# Female share of all workers aged 15+, per year ...
for year in (2014, 2024):
    women = sectors[f'15_years_or_over_{year}_Females']
    men = sectors[f'15_years_or_over_{year}_Males']
    sectors[f"gender_ratio_{year}"] = (women / (men + women)) * 100

# ... and its change from 2014 to 2024, in percentage points.
sectors["gender_ratio_2014_2024"] = (
    sectors["gender_ratio_2024"] - sectors["gender_ratio_2014"]
)

# Share of each gender's 15+ workforce that is aged 50+, per year.
for prefix, gender in (("female", "Females"), ("male", "Males")):
    for year in (2014, 2024):
        sectors[f"{prefix}_ratio_{year}"] = (
            sectors[f'50_years_or_over_{year}_{gender}']
            / sectors[f'15_years_or_over_{year}_{gender}'] * 100
        )

# Change of the 50+ share between 2014 and 2024, in percentage points.
for prefix in ("female", "male"):
    sectors[f"{prefix}_diff_2014_2024"] = (
        sectors[f"{prefix}_ratio_2024"] - sectors[f"{prefix}_ratio_2014"]
    )

# Tidy for export: two-decimal rounding, missing values (e.g. a 0/0 ratio)
# replaced with 0, and "sector" moved from the index back into a column.
eurostat_gender_economic_activities = (
    sectors
    .round(decimals=2)
    .fillna(value=0)
    .reset_index()
)

In [46]:
# Persist the sector-level indicators; the positional row index is not written.
eurostat_gender_economic_activities.to_csv(
    path_or_buf="transformed_data/eurostat_gender_economic_activities.csv",
    index=False,
)