# Inspect Generated Zonal Stats
This notebook runs through a series of checks to verify that the generated H3 zonal stats have been calculated correctly. The checks cover the following steps:

1. H1 CSV files on S3
2. Aggregated parquet files on S3
3. S2S database

In [3]:
import json
from typing import Dict

# folium provides flm.LayerControl, which the map cells add to the
# folium map returned by GeoDataFrame.explore().
import folium as flm
import geopandas as gpd
import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
from geojson_pydantic import Feature, Polygon
#from lonboard import Map, ScatterplotLayer
from shapely import from_geojson
from shapely.geometry import shape

from space2stats import StatsTable

In [4]:
# Notebook inputs: the country / admin level to inspect and the S3 objects
# holding the generated statistics.
iso3 = "KEN"
ADM = "ADM0"
s3_csv_file = (
    "s3://wbg-geography01/Space2Stats/h3_stats_data/GLOBAL/"
    "Urbanization_Pop/815b3ffffffffff/GHS_POP_2020_Urban_Breakdown.csv"
)
s3_parquet_file = (
    "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/"
    "NTL_VIIRS_LEN_2012_combined.parquet"
)
# Placeholder list holding a single empty field name.
s2s_fields = [""]


# Fetch the admin boundaries and convert to geojson
def fetch_admin_boundaries(iso3: str, adm: str) -> gpd.GeoDataFrame:
    """Fetch administrative boundaries from the GeoBoundaries API.

    Args:
        iso3: Country code inserted into the API URL (e.g. ``"KEN"``).
        adm: Administrative level inserted into the API URL (e.g. ``"ADM0"``).

    Returns:
        GeoDataFrame read from the ``gjDownloadURL`` given in the API response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within the timeout.
        KeyError: If the response JSON has no ``gjDownloadURL`` entry.
    """
    url = f"https://www.geoboundaries.org/api/current/gbOpen/{iso3}/{adm}/"
    # A timeout keeps the notebook from hanging forever on a stalled
    # connection; requests has no default timeout.
    response = requests.get(url, timeout=60)
    # Fail here with a clear HTTP error instead of later with a confusing
    # JSON-decode or KeyError when the API returns an error page.
    response.raise_for_status()
    return gpd.read_file(response.json()["gjDownloadURL"])

# Download the boundaries for the configured country and admin level.
adm_boundaries = fetch_admin_boundaries(iso3, ADM)
# Serialise to GeoJSON text and parse it back to get plain Python dicts.
geojson_str = adm_boundaries.to_json()
adm_geojson = json.loads(geojson_str)
adm_features = adm_geojson["features"]
# Only the first feature is used as the area of interest in the cells below.
feature = adm_features[0]

# Read in the environment variables
load_dotenv("../../db.env")

True

In [5]:
# Request every field the stats table exposes for the hexagons selected by
# the admin feature, and collect the result in a DataFrame.
with StatsTable.connect() as stats_table:
    all_fields = stats_table.fields()
    data = stats_table.summaries(
        aoi=feature,
        spatial_join_method="touches",
        fields=all_fields,
        geometry="polygon",
    )
    df = pd.DataFrame(data)

# Preview the first rows.
df.head()

AttributeError: module 'h3' has no attribute 'polyfill'

## Assess s3 csv file

In [None]:
# Convert every entry of the "geometry" column into a shapely geometry with
# shapely.geometry.shape (presumably the entries are GeoJSON-like mappings;
# shapely.from_geojson would be the alternative for GeoJSON strings), then
# wrap the table in a GeoDataFrame declared as EPSG:4326.
df["geometry"] = df["geometry"].apply(shape)
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

In [None]:
# Save the database results as GeoJSON in the working directory.
gdf.to_file("kenya_from_db.geojson", driver="GeoJSON")
# To reload the saved file instead of querying the database again:
# gdf = gpd.read_file("kenya_from_db.geojson")

In [None]:
# Interactive choropleth of the sum_pop_m_30_2020 column from the database
# results, classified with natural breaks.
m = gdf.explore(
    column="sum_pop_m_30_2020",
    tooltip="sum_pop_m_30_2020",
    cmap="YlGnBu",
    scheme="naturalbreaks",
    legend=True,
    legend_kwds={"colorbar": True, "caption": "Population", "interval": False},
    style_kwds={"weight": 0, "fillOpacity": 0.8},
    name="Population by Hexagon",
)
# Expanded layer switcher in the top-right corner of the map.
layer_control = flm.LayerControl(position="topright", collapsed=False)
layer_control.add_to(m)
m

## From S3 Parquet

In [None]:
# Aggregated parquet file on S3 that is compared against the database values.
# A local copy can be used instead, e.g.
# join(expanduser("~"), "Downloads", "m_30_2020.parquet").
parquet_file = (
    "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/"
    "WorldPop_2020_Demographics/m_30_2020.parquet"
)

In [None]:
# Load the whole parquet file into a DataFrame.
df_ = pd.read_parquet(parquet_file)

In [None]:
# Keep only rows whose SUM is greater than -1.
# NOTE(review): presumably this drops negative nodata sentinels - confirm.
has_valid_sum = df_["SUM"] > -1
df_ = df_.loc[has_valid_sum].copy()

In [None]:
# Left join: keep every database hexagon and attach the parquet values where
# the parquet "id" equals the database "hex_id".
gdf_s3 = gdf.merge(
    df_,
    how="left",
    left_on="hex_id",
    right_on="id",
)

In [None]:
# Interactive choropleth of the parquet SUM values joined onto the database
# hexagons, classified with natural breaks.
m = gdf_s3.explore(
    column="SUM",
    tooltip="SUM",
    cmap="YlGnBu",
    scheme="naturalbreaks",
    legend=True,
    legend_kwds={"colorbar": True, "caption": "Population", "interval": False},
    style_kwds={"weight": 0, "fillOpacity": 0.8},
    name="Population by Hexagon",
)
# Expanded layer switcher in the top-right corner of the map.
layer_control = flm.LayerControl(position="topright", collapsed=False)
layer_control.add_to(m)
m

## Rerun Local ZS

In [None]:
import os
from os.path import basename
import rasterio as rio
from rasterstats import zonal_stats
from rasterio import features
from affine import Affine
from rasterio.plot import show
from tqdm import tqdm

In [None]:
# Folder holding the input population rasters.
population_folder = "J://Data/GLOBAL/Population/WorldPop_PPP_2020/GLOBAL_1km_Demographics"

# Full paths of every file in that folder whose name ends with "1km.tif".
tif_names = [
    name for name in os.listdir(population_folder) if name.endswith("1km.tif")
]
pop_files = [os.path.join(population_folder, name) for name in tif_names]

In [None]:
# Zones for the local zonal statistics: hexagon id and geometry only.
gdf_zs = gdf[['hex_id', 'geometry']].copy()
# Geometry of the first admin boundary row; used to window the raster reads.
geom = adm_boundaries.geometry.iloc[0]

In [None]:
# For every population raster, sum the pixel values inside each hexagon and
# append the result to gdf_zs as one new "sum_pop_<name>" column.
for pop_file in tqdm(pop_files):
    # Build the output column name from the raster file name: drop the
    # "_1km.tif" suffix and the "global_" prefix, then prepend "sum_pop_".
    # str.strip() treats its argument as a set of characters to trim from
    # both ends, not as a literal affix, so it can eat into the name itself;
    # slice the exact affixes off instead.
    var_name = basename(pop_file)
    if var_name.endswith("_1km.tif"):
        var_name = var_name[: -len("_1km.tif")]
    if var_name.startswith("global_"):
        var_name = var_name[len("global_"):]
    var_name = "sum_pop_" + var_name

    with rio.open(pop_file) as src:
        # Read only the part of band 1 that covers the admin boundary.
        window = features.geometry_window(src, [geom])
        data = src.read(1, window=window)
        # Transform of the windowed array: the dataset transform moved to the
        # window's column/row offset (also correct for rotated rasters).
        affine_wp = src.window_transform(window)

        zs = zonal_stats(
            gdf_zs,
            data,
            affine=affine_wp,
            stats="sum",
            nodata=src.nodata,
        )

    # zonal_stats returns one dict per zone in input order; reuse the index of
    # gdf_zs so the join lines up even if that index is not 0..n-1.
    zs_df = pd.DataFrame(zs, index=gdf_zs.index).rename(columns={"sum": var_name})
    gdf_zs = gdf_zs.join(zs_df)

In [None]:
# Row-wise total of all 36 "sum_pop_<f|m>_<band>_2020" columns (bands 0, 1,
# 5, 10, ..., 80 for each of f and m), stored as "sum_pop_2020_calc_zs".
age_bands = [0, 1] + list(range(5, 85, 5))
# sorted() yields the columns in plain alphabetical order.
pop_columns = sorted(
    f"sum_pop_{sex}_{age}_2020" for sex in ("f", "m") for age in age_bands
)
gdf_zs.loc[:, "sum_pop_2020_calc_zs"] = gdf_zs[pop_columns].sum(axis=1)

In [None]:
# Save the locally computed zonal statistics as GeoJSON in the working directory.
gdf_zs.to_file("kenya_zs_local.geojson", driver="GeoJSON")

In [None]:
# Reload the saved local zonal statistics from disk (presumably so the map
# below can be drawn without re-running the raster loop).
gdf_zs = gpd.read_file("kenya_zs_local.geojson")

In [None]:
# Interactive choropleth of the locally computed sum_pop_m_30_2020 column,
# classified with natural breaks.
m = gdf_zs.explore(
    column="sum_pop_m_30_2020",
    tooltip="sum_pop_m_30_2020",
    cmap="YlGnBu",
    scheme="naturalbreaks",
    legend=True,
    legend_kwds={"colorbar": True, "caption": "Population", "interval": False},
    style_kwds={"weight": 0, "fillOpacity": 0.8},
    name="Population by Hexagon",
)
# Expanded layer switcher in the top-right corner of the map.
layer_control = flm.LayerControl(position="topright", collapsed=False)
layer_control.add_to(m)
m