# Generate parquet for ingestion

To be ingested into the Space2Stats (S2S) database, data must follow a specific schema. This notebook explores that schema and combines the individual zonal-statistics output files into a single output parquet file.

In [1]:
import sys
import os
import itertools
import boto3

import geopandas as gpd
import pandas as pd
import numpy as np

from scipy.spatial import cKDTree
from shapely.geometry import Point
from operator import itemgetter
from tqdm.notebook import tqdm

sys.path.append("../../../gostrocks/src")

from GOSTrocks.misc import tPrint

In [2]:
# Load a table that is already in the S2S database layout; it serves as the
# reference for the expected columns and for the full list of hex_id values.
good_schema_example = "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/GHS_SMOD_counts_and_pop.parquet"
good_schema = pd.read_parquet(path=good_schema_example)
good_schema.head(5)

Unnamed: 0,hex_id,ghs_11_count,ghs_12_count,ghs_13_count,ghs_21_count,ghs_22_count,ghs_23_count,ghs_30_count,ghs_total_count,ghs_11_pop,ghs_12_pop,ghs_13_pop,ghs_21_pop,ghs_22_pop,ghs_23_pop,ghs_30_pop,ghs_total_pop
0,860000007ffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,86000000fffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,860000017ffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86000001fffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,860000027ffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The schema for tables imported into Space2Stats is simple; only two kinds of column are required:
- hex_id: the h3 grid id of the polygon used in the zonal calculation; the default is h3 level 6
- one column per attribute of interest (for example, sum_pop_f_0_2020), named in the style __f'{aggregation_method}_{attribute_name}'__

# Combine output files into a single parquet

In [3]:
# Find every zonal-statistics parquet file stored under the WorldPop prefix
bucket = 'wbg-geography01'
prefix = 'Space2Stats/h3_stats_data/GLOBAL/WorldPop_2025_Demographics/'
region = 'us-east-1'
# NOTE(review): verify=False disables TLS certificate verification for every
# request made through this client. It is kept as-is because removing it may
# break connections in the environment this notebook was written for; confirm
# whether a CA bundle can be supplied instead.
s3client = boto3.client('s3', region_name=region, verify=False)

# list_objects_v2 returns a limited number of keys per response, so walk all
# pages with the built-in paginator and keep only the keys ending in .parquet
verbose = True
loops = 0
good_res = []
paginator = s3client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    # 'Contents' is absent from a response that matched no keys
    for res in page.get('Contents', []):
        if res['Key'].endswith('.parquet'):
            good_res.append(res['Key'])
    if verbose:
        print(f"Completed loop: {loops}")
    loops += 1

Completed loop: 0




Completed loop: 1




Completed loop: 2




Completed loop: 3




Completed loop: 4




Completed loop: 5




Completed loop: 6




Completed loop: 7




In [4]:
# Start from an empty frame indexed by every hex_id of the reference table, so
# that each value read below lands on the row of its hexagon
out_res = pd.DataFrame(index=good_schema['hex_id'])

# Per-file frames are also kept so individual inputs can be inspected afterwards
all_res = []
for in_file in tqdm(good_res):
    # Keys containing "_t_" are excluded
    # NOTE(review): presumably these are the sex-combined totals - confirm
    if "_t_" in in_file:
        continue

    full_path = f's3://{bucket}/{in_file}'
    curD = pd.read_parquet(full_path)

    # Output column name: "SUM_" plus the 2nd-4th underscore-separated tokens
    # of the file name, with any "_CN" token removed
    sum_col_name = "SUM_" + "_".join(os.path.basename(in_file).split("_")[1:4]).replace("_CN", "")

    keep_cols = [x for x in curD.columns if "SUM" in x] + ['shape_id']
    curD = (
        curD.loc[:, keep_cols]
        # NOTE(review): -1.0 is presumably a no-data sentinel - confirm
        .replace(-1.0, np.nan)
        .rename(columns={"SUM": sum_col_name, "shape_id": "hex_id"})
        .set_index('hex_id')
    )

    # Several files can feed the same column; each one fills only its own rows
    out_res.loc[curD.index, sum_col_name] = curD[sum_col_name]
    all_res.append(curD)

  0%|          | 0/7828 [00:00<?, ?it/s]

In [7]:
# Preview the first 20 rows of the combined table
out_res.head(n=20)

Unnamed: 0_level_0,SUM_f_00_2025,SUM_f_01_2025,SUM_f_05_2025,SUM_f_10_2025,SUM_f_15_2025,SUM_f_20_2025,SUM_f_25_2025,SUM_f_30_2025,SUM_f_35_2025,SUM_f_40_2025,...,SUM_pop_2021,SUM_pop_2022,SUM_pop_2023,SUM_pop_2024,SUM_pop_2025,SUM_pop_2026,SUM_pop_2027,SUM_pop_2028,SUM_pop_2029,SUM_pop_2030
hex_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
860000007ffffff,,,,,,,,,,,...,,,,,,,,,,
86000000fffffff,,,,,,,,,,,...,,,,,,,,,,
860000017ffffff,,,,,,,,,,,...,,,,,,,,,,
86000001fffffff,,,,,,,,,,,...,,,,,,,,,,
860000027ffffff,,,,,,,,,,,...,,,,,,,,,,
86000002fffffff,,,,,,,,,,,...,,,,,,,,,,
860000037ffffff,,,,,,,,,,,...,,,,,,,,,,
860000047ffffff,,,,,,,,,,,...,,,,,,,,,,
86000004fffffff,,,,,,,,,,,...,,,,,,,,,,
860000057ffffff,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Write the combined table to the Space2Stats parquet folder on S3
out_path = f's3://{bucket}/Space2Stats/parquet/GLOBAL/WorldPop_2025_Demographics.parquet'
out_res.to_parquet(out_path)