# Create Sample Parquet Extracts

- Pulls Space2Stats cross-sectional and SPI time-series samples for a small area of interest (AOI) covering Kenya and Uganda
- Cleans the retrieved frames, trims to schema-aligned columns, and writes the cross-sectional and time-series parquet files for local seeding/tests

In [3]:
import geopandas as gpd
import pandas as pd
from space2stats_client import Space2StatsClient

# Shared API client; every fetch in this notebook goes through this instance.
client = Space2StatsClient()

## Build Cross-Country AOI

In [None]:
# ISO3 codes of the countries that make up the sample AOI.
iso3_list = ["KEN", "UGA"]

# One ADM0 boundary frame per country, in iso3_list order. The list is kept
# because the time-series section iterates over it country by country.
countries = [
    client.fetch_admin_boundaries(iso3=code, adm="ADM0") for code in iso3_list
]

# Stack the boundaries into a single frame tagged as EPSG:4326.
stacked = pd.concat(countries, ignore_index=True)
aoi = gpd.GeoDataFrame(stacked, crs="EPSG:4326")

In [4]:
# Collapse the per-country rows into a single feature so each summary request
# covers the whole AOI as one boundary. dissolve() keeps only one row's
# attribute values (the output below shows Kenya's), so the non-geometry
# columns no longer describe the full area.
aoi_union = aoi.dissolve().reset_index(drop=True)
aoi_union

Unnamed: 0,geometry,FID,ISO_A3,ISO_A2,WB_A3,HASC_0,GAUL_0,WB_REGION,WB_STATUS,SOVEREIGN,NAM_0,GlobalID,Shape__Area,Shape__Length
0,"MULTIPOLYGON (((39.40774 -4.70525, 39.40465 -4...",42,KEN,KE,KEN,KE,133,AFR,Member State,KEN,Kenya,edc9ac00-2ebc-4ed1-9996-509777c61847,47.32705,46.140605


# Cross-Sectional Sample Data

In [29]:
# Ask the API which fields it exposes; this list drives the chunked
# get_summary requests made later in the notebook.
fields = client.get_fields()
fields

['sum_built_area_m_2030',
 'pop',
 'pop_flood',
 'pop_flood_pct',
 'sum_pop_f_0_2020',
 'sum_pop_f_10_2020',
 'sum_pop_f_15_2020',
 'sum_pop_f_1_2020',
 'sum_pop_f_20_2020',
 'sum_pop_f_25_2020',
 'sum_pop_f_30_2020',
 'sum_pop_f_35_2020',
 'sum_pop_f_40_2020',
 'sum_pop_f_45_2020',
 'sum_pop_f_50_2020',
 'sum_pop_f_55_2020',
 'sum_pop_f_5_2020',
 'sum_pop_f_60_2020',
 'sum_pop_f_65_2020',
 'sum_pop_f_70_2020',
 'sum_pop_f_75_2020',
 'sum_pop_f_80_2020',
 'sum_pop_m_0_2020',
 'sum_pop_m_10_2020',
 'sum_pop_m_15_2020',
 'sum_pop_m_1_2020',
 'sum_pop_m_20_2020',
 'sum_pop_m_25_2020',
 'sum_pop_m_30_2020',
 'sum_pop_m_35_2020',
 'sum_pop_m_40_2020',
 'sum_pop_m_45_2020',
 'sum_pop_m_50_2020',
 'sum_pop_m_55_2020',
 'sum_pop_m_5_2020',
 'sum_pop_m_60_2020',
 'sum_pop_m_65_2020',
 'sum_pop_m_70_2020',
 'sum_pop_m_75_2020',
 'sum_pop_m_80_2020',
 'sum_pop_f_2020',
 'sum_pop_m_2020',
 'sum_pop_2020',
 'sum_pop_m_2020_v2',
 'sum_pop_f_2020_v2',
 'sum_pop_2020_v2',
 'ghs_11_count',
 'ghs_12_cou

In [None]:
def chunked(seq, n):
    """Yield consecutive slices of ``seq``, each holding at most ``n`` items.

    ``seq`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(seq)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(seq), n)
    yield from (seq[lo:lo + n] for lo in offsets)


# Request the fields in batches of 10 (adjust the batch size as needed) and
# keep each response for the merge step.
all_parts = []

for batch_no, field_chunk in enumerate(chunked(fields, 10)):
    request = {
        "gdf": aoi_union,
        "spatial_join_method": "centroid",
        "fields": field_chunk,
    }
    # Only the first batch asks for hex polygons; later batches skip them.
    if batch_no == 0:
        request["geometry"] = "polygon"

    all_parts.append(gpd.GeoDataFrame(client.get_summary(**request)))

Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...
Fetching data for boundary 1 of 1...


In [None]:
# Combine the per-batch frames into one wide table keyed on the hex id.
hex_key = "hex_id"
summary = all_parts[0]
for part in all_parts[1:]:
    # Columns already present in the running table (other than the join key)
    # are removed from the incoming batch so the merge adds only new fields.
    repeated = [
        col for col in part.columns if col != hex_key and col in summary.columns
    ]
    new_fields = pd.DataFrame(part).drop(columns=repeated)
    summary = summary.merge(new_fields, on=hex_key, how="left")


### Clean Cross-Sectional Columns

In [None]:
# Strip the boundary attributes carried over from the AOI frame, plus the two
# index helper columns, leaving the hex id and the requested fields.
# NOTE(review): aoi_union.columns includes "geometry", so any hex polygon
# column returned for the first batch is presumably dropped here too.
# Confirm that is intended.
boundary_cols = list(aoi_union.columns)
summary_clean = summary.drop(columns=[*boundary_cols, "index_gdf", "index_h3"])
summary_clean.head()

In [14]:
# Persist the cleaned cross-sectional table; index=False keeps the pandas
# index out of the parquet file.
summary_clean.to_parquet("space2stats_sample_cs.parquet", index=False)

# Time-Series Sample Data

In [5]:
# List the fields the client reports for time-series queries.
# NOTE: this rebinds `fields`, replacing the cross-sectional list fetched
# earlier. Re-run that cell before re-running the chunked summary loop.
# The time-series loop in this notebook hard-codes ["spi"] and does not read
# this variable.
fields = client.get_timeseries_fields()
fields

['spi']

In [17]:
# NOTE(review): rebinds `aoi` to Kenya's ADM0 boundary only. No later cell
# visible here reads `aoi` (the time-series loop iterates `countries`), so
# this looks like a leftover. Confirm before removing.
aoi = client.fetch_admin_boundaries(iso3="KEN", adm="ADM0")

In [28]:
# Pull SPI one calendar year at a time for each country boundary.
years = range(2015, 2021)  # 2015–2020 inclusive
dfs = []

for country_pos, boundary in enumerate(countries[:2]):
    for yr in years:
        yearly = client.get_timeseries(
            gdf=boundary,
            spatial_join_method="centroid",
            fields=["spi"],
            start_date=f"{yr}-01-01",
            end_date=f"{yr}-12-31",
            verbose=False,
        )

        # Nothing came back for this country/year; move on.
        if yearly.empty:
            continue

        # Tag rows with their origin so the combined frame stays traceable.
        yearly["country_idx"] = country_pos
        yearly["year"] = yr
        dfs.append(yearly)

# Fall back to an empty frame when every request came back empty.
if dfs:
    result = pd.concat(dfs, ignore_index=True)
else:
    result = pd.DataFrame()
result


Unnamed: 0,FID,ISO_A3,ISO_A2,WB_A3,HASC_0,GAUL_0,WB_REGION,WB_STATUS,SOVEREIGN,NAM_0,GlobalID,Shape__Area,Shape__Length,hex_id,date,spi,area_id,country_idx,year
0,42,KEN,KE,KEN,KE,133,AFR,Member State,KEN,Kenya,edc9ac00-2ebc-4ed1-9996-509777c61847,47.327050,46.140605,866a58007ffffff,2015-01-01T00:00:00+00:00,0.677105,0,0,2015
1,42,KEN,KE,KEN,KE,133,AFR,Member State,KEN,Kenya,edc9ac00-2ebc-4ed1-9996-509777c61847,47.327050,46.140605,866a58007ffffff,2015-02-01T00:00:00+00:00,0.482942,0,0,2015
2,42,KEN,KE,KEN,KE,133,AFR,Member State,KEN,Kenya,edc9ac00-2ebc-4ed1-9996-509777c61847,47.327050,46.140605,866a58007ffffff,2015-03-01T00:00:00+00:00,-0.269795,0,0,2015
3,42,KEN,KE,KEN,KE,133,AFR,Member State,KEN,Kenya,edc9ac00-2ebc-4ed1-9996-509777c61847,47.327050,46.140605,866a58007ffffff,2015-04-01T00:00:00+00:00,0.067042,0,0,2015
4,42,KEN,KE,KEN,KE,133,AFR,Member State,KEN,Kenya,edc9ac00-2ebc-4ed1-9996-509777c61847,47.327050,46.140605,866a58007ffffff,2015-05-01T00:00:00+00:00,0.049126,0,0,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542235,53,UGA,UG,UGA,UG,253,AFR,Member State,UGA,Uganda,5442b7f4-c3e2-4f7c-bbe5-e9d6b1505cc3,19.617091,24.311077,867a4db6fffffff,2020-08-01T00:00:00+00:00,2.018657,0,1,2020
1542236,53,UGA,UG,UGA,UG,253,AFR,Member State,UGA,Uganda,5442b7f4-c3e2-4f7c-bbe5-e9d6b1505cc3,19.617091,24.311077,867a4db6fffffff,2020-09-01T00:00:00+00:00,1.359646,0,1,2020
1542237,53,UGA,UG,UGA,UG,253,AFR,Member State,UGA,Uganda,5442b7f4-c3e2-4f7c-bbe5-e9d6b1505cc3,19.617091,24.311077,867a4db6fffffff,2020-10-01T00:00:00+00:00,0.662396,0,1,2020
1542238,53,UGA,UG,UGA,UG,253,AFR,Member State,UGA,Uganda,5442b7f4-c3e2-4f7c-bbe5-e9d6b1505cc3,19.617091,24.311077,867a4db6fffffff,2020-11-01T00:00:00+00:00,1.424782,0,1,2020


In [29]:
# Keep only the columns written to the time-series sample file; the boundary
# attributes and helper columns are discarded.
needed_cols = ["hex_id", "date", "spi"]
result = result.loc[:, needed_cols].copy()
result

Unnamed: 0,hex_id,date,spi
0,866a58007ffffff,2015-01-01T00:00:00+00:00,0.677105
1,866a58007ffffff,2015-02-01T00:00:00+00:00,0.482942
2,866a58007ffffff,2015-03-01T00:00:00+00:00,-0.269795
3,866a58007ffffff,2015-04-01T00:00:00+00:00,0.067042
4,866a58007ffffff,2015-05-01T00:00:00+00:00,0.049126
...,...,...,...
1542235,867a4db6fffffff,2020-08-01T00:00:00+00:00,2.018657
1542236,867a4db6fffffff,2020-09-01T00:00:00+00:00,1.359646
1542237,867a4db6fffffff,2020-10-01T00:00:00+00:00,0.662396
1542238,867a4db6fffffff,2020-11-01T00:00:00+00:00,1.424782


In [31]:
# Persist the trimmed time-series table without the pandas index.
result.to_parquet("space2stats_sample_ts.parquet", index=False)