In [None]:
# 2020 World Pop
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

parquet_file = "../../local.parquet"
table = pq.read_table(parquet_file)

# Columns are selected purely by the "_m_" / "_f_" infix in their names.
male_columns = [col for col in table.column_names if "_m_" in col]
female_columns = [col for col in table.column_names if "_f_" in col]


def sum_column_with_filter(column):
    """Return the scalar total of one column, counting negative values as 0.

    Kept for callers that want a whole-column total; the per-row sums below
    use `sum_rows_non_negative` instead.
    """
    # A typed scalar zero avoids materialising a full-length Python list of zeros.
    zero = pa.scalar(0, type=column.type)
    non_negative_column = pc.if_else(pc.less(column, zero), zero, column)
    return pc.sum(non_negative_column)


def sum_rows_non_negative(source_table, column_names):
    """Return one value per row: the sum across `column_names`.

    Negative values and nulls are counted as 0, so a single missing or
    negative cell does not null out or reduce the whole row total.

    Raises:
        ValueError: if `column_names` is empty (nothing to sum).
    """
    if not column_names:
        raise ValueError("sum_rows_non_negative: no columns to sum")

    row_totals = None
    for name in column_names:
        column = source_table.column(name)
        zero = pa.scalar(0, type=column.type)
        clamped = pc.if_else(pc.less(column, zero), zero, column)
        # pc.add propagates nulls, so fill them before accumulating.
        clamped = pc.fill_null(clamped, zero)
        row_totals = clamped if row_totals is None else pc.add(row_totals, clamped)
    return row_totals


# Row-wise totals: append_column needs one value per table row, not a scalar.
sum_pop_f_2020 = sum_rows_non_negative(table, female_columns)
sum_pop_m_2020 = sum_rows_non_negative(table, male_columns)
sum_pop_2020 = pc.add(sum_pop_f_2020, sum_pop_m_2020)

new_table = table.append_column("sum_pop_f_2020", sum_pop_f_2020)
new_table = new_table.append_column("sum_pop_m_2020", sum_pop_m_2020)
new_table = new_table.append_column("sum_pop_2020", sum_pop_2020)

updated_parquet_file = "updated_local.parquet"
pq.write_table(new_table, updated_parquet_file)

In [None]:
# 2025 World Pop
import pandas as pd

in_path = "in.parquet"
out_path = "out.parquet"

df = pd.read_parquet(in_path)

# hex_id may come back as the index instead of a column; if so, restore it
# as a regular column so it is kept when writing with index=False below.
hex_id_is_index = df.index.name == "hex_id" and "hex_id" not in df.columns
if hex_id_is_index:
    df = df.reset_index()

# Band labels as they appear in the column names: 00, 01, then 05..90 in steps of 5.
age_bands = [0, 1, *range(5, 95, 5)]
female_cols = [f"SUM_f_{age:02d}_2025" for age in age_bands]
male_cols = [f"SUM_m_{age:02d}_2025" for age in age_bands]

# Row-wise totals per sex; min_count=1 leaves the total as NA only when
# every band in the row is missing.
for sex_key, band_cols in (("f", female_cols), ("m", male_cols)):
    df[f"sum_{sex_key}_2025"] = df[band_cols].sum(axis=1, min_count=1)

df.to_parquet(out_path, index=False)

In [None]:
import pyarrow.parquet as pq

parquet_path = "WorldPop_2025_Demographics.parquet"

# Read only the schema so the column list can be inspected
# without loading the table itself.
schema = pq.read_schema(parquet_path)

# Print one column name per line, in schema order.
column_names = schema.names
for column_name in column_names:
    print(column_name)

hex_id
SUM_f_00_2025
SUM_f_01_2025
SUM_f_05_2025
SUM_f_10_2025
SUM_f_15_2025
SUM_f_20_2025
SUM_f_25_2025
SUM_f_30_2025
SUM_f_35_2025
SUM_f_40_2025
SUM_f_45_2025
SUM_f_50_2025
SUM_f_55_2025
SUM_f_60_2025
SUM_f_65_2025
SUM_f_70_2025
SUM_f_75_2025
SUM_f_80_2025
SUM_f_85_2025
SUM_f_90_2025
SUM_m_00_2025
SUM_m_01_2025
SUM_m_05_2025
SUM_m_10_2025
SUM_m_15_2025
SUM_m_20_2025
SUM_m_25_2025
SUM_m_30_2025
SUM_m_35_2025
SUM_m_40_2025
SUM_m_45_2025
SUM_m_50_2025
SUM_m_55_2025
SUM_m_60_2025
SUM_m_65_2025
SUM_m_70_2025
SUM_m_75_2025
SUM_m_80_2025
SUM_m_85_2025
SUM_m_90_2025
SUM_pop_2015
SUM_pop_2016
SUM_pop_2017
SUM_pop_2018
SUM_pop_2019
SUM_pop_2020
SUM_pop_2021
SUM_pop_2022
SUM_pop_2023
SUM_pop_2024
SUM_pop_2025
SUM_pop_2026
SUM_pop_2027
SUM_pop_2028
SUM_pop_2029
SUM_pop_2030
sum_f_2025
sum_m_2025
