# Inspect Generated Zonal Stats
This notebook runs through a series of checks to verify that the generated H3 zonal stats have been calculated correctly. The checks cover the following stages:

1. H1 CSV files on S3
2. Aggregated parquet files on S3
3. S2S database

In [55]:
import os, sys, time
import requests
import json
import h3

import geopandas as gpd
import pandas as pd
import folium as flm

from dotenv import load_dotenv
from geojson_pydantic import Feature, Polygon
from lonboard import Map, ScatterplotLayer
from shapely import from_geojson, Polygon
from space2stats import StatsTable
from shapely.geometry import shape, Point, Polygon
from typing import Dict

sys.path.append(r"C:\WBG\Work\Code\GOSTrocks\src")
import GOSTrocks.rasterMisc as rMisc

sys.path.append("../../src")
import global_zonal
import h3_helper

def tPrint(s):
    """Print ``s`` prefixed by the current local time (HH:MM:SS) and a tab."""
    timestamp = time.strftime("%H:%M:%S")
    print("%s\t%s" % (timestamp, s))

In [11]:
# Inputs for the checks below.
# iso3 / ADM are the country code and admin level handed to fetch_admin_boundaries.
iso3, ADM = 'KEN', "ADM0"

# Local folder holding the generated zonal-stats CSVs, and the relative path
# template (filled with an `h1` cell id and a `year`) for a single CSV.
csv_folder = 'C:/WBG/Work/S2S/data/GHSL/'
csv_base = '{h1}/ghsl_built_m_E{year}.csv'

# Fetch the admin boundaries and convert to geojson
def fetch_admin_boundaries(
    iso3: str, adm: str, verify: bool = True, timeout: float = 60.0
) -> gpd.GeoDataFrame:
    """Fetch administrative boundaries from GeoBoundaries API.

    Parameters
    ----------
    iso3 : str
        ISO3 country code inserted into the API URL (e.g. ``"KEN"``).
    adm : str
        Administrative level inserted into the API URL (e.g. ``"ADM0"``).
    verify : bool, default True
        Forwarded to ``requests.get``. TLS certificate verification is on by
        default; pass ``verify=False`` only if you are behind a proxy that
        re-signs traffic and you accept the risk.
    timeout : float, default 60.0
        Seconds to wait for the API before giving up, so a stalled connection
        cannot hang the notebook indefinitely.

    Returns
    -------
    geopandas.GeoDataFrame
        The boundaries read from the ``gjDownloadURL`` listed in the API response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-success status code.
    KeyError
        If the API response does not contain ``gjDownloadURL``.
    """
    url = f"https://www.geoboundaries.org/api/current/gbOpen/{iso3}/{adm}/"
    response = requests.get(url, verify=verify, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    res = response.json()
    return gpd.read_file(res["gjDownloadURL"])

# Download the boundaries for the configured country / admin level.
adm_boundaries = fetch_admin_boundaries(iso3, ADM)
# Round-trip through a GeoJSON string to obtain plain Python dicts/lists.
geojson_str = adm_boundaries.to_json()
adm_geojson = json.loads(geojson_str)
adm_features = adm_geojson["features"]
# Only the first feature is kept; raises IndexError if the collection is empty.
feature = adm_features[0]



In [25]:
# Walk every CSV below csv_folder and report the files whose leading statistics
# columns contain no positive value.
for root, folders, files in os.walk(csv_folder):
    for f in files:
        if not f.endswith(".csv"):
            continue
        cur_path = os.path.join(root, f)
        # The parent directory name is used as the cell id; it is empty for
        # files sitting directly in csv_folder when that path ends with a slash.
        h1 = os.path.basename(root)
        # Characters 1-4 of the last "_"-separated token (e.g. "E1975.csv" -> "1975").
        year = f.split("_")[-1][1:5]
        cur_d = pd.read_csv(cur_path, index_col=0)
        # Column-wise maxima, then the largest of the first four of them.
        max_val = cur_d.max(skipna=True)[:4].max(skipna=True)
        if max_val > 0:
            # NOTE(review): this only leaves the inner loop, so the remaining
            # files of this folder are skipped while os.walk carries on with
            # the next folder. Confirm this is the intended behaviour.
            break
        print(f"no data for {h1} {year}")

no data for  1975
no data for 8001fffffffffff 1975
no data for 8001fffffffffff 1980
no data for 8001fffffffffff 1985
no data for 8001fffffffffff 1990
no data for 8001fffffffffff 1995
no data for 8001fffffffffff 2000
no data for 8001fffffffffff 2005
no data for 8001fffffffffff 2010
no data for 8001fffffffffff 2015
no data for 8001fffffffffff 2020
no data for 8001fffffffffff 2025
no data for 8003fffffffffff 1975
no data for 8003fffffffffff 1980
no data for 8003fffffffffff 1985
no data for 8003fffffffffff 1990
no data for 8003fffffffffff 1995
no data for 8003fffffffffff 2000
no data for 8003fffffffffff 2005
no data for 8003fffffffffff 2010
no data for 8003fffffffffff 2015
no data for 8003fffffffffff 2020
no data for 8003fffffffffff 2025
no data for 8005fffffffffff 1975
no data for 8005fffffffffff 1980
no data for 8005fffffffffff 1985
no data for 8005fffffffffff 1990
no data for 8005fffffffffff 1995
no data for 8005fffffffffff 2000
no data for 8005fffffffffff 2005
no data for 8005fffffffff

# Run test zonal stats

In [37]:
# Configuration for the test zonal-stats run.
h3_level = 6
ghsl_folder = "C:/WBG/Work/data/GHSL"
# os.listdir returns entries in arbitrary order; sort so that ghsl_files[0]
# (used by later cells) refers to the same raster on every run and machine.
ghsl_files = sorted(
    os.path.join(ghsl_folder, f) for f in os.listdir(ghsl_folder) if f.endswith(".tif")
)
out_folder = "C:/WBG/Work/S2S/data/GHSL"

In [31]:
# Build the per-level-0 collections of hexes at `h3_level`.
# NOTE(review): with read_pickle=True the helper appears to load a cached
# pickle file (see the logged message below) - only use pickles you trust.
h3_0_list = h3_helper.generate_lvl0_lists(
    h3_level,
    return_gdf=True,
    buffer0=False,
    read_pickle=True,
    pickle_file="h0_dictionary_of_h6_geodata_frames_land.pickle",
)

Loading pickle file h0_dictionary_of_h6_geodata_frames_land.pickle: it exists True


In [34]:
# Level-0 cell used for the test run below.
sample_h0 = '8007fffffffffff'

# Report how many hexes each level-0 cell contains.
for h3_0_key, h6_list in h3_0_list.items():
    print(f"Processing {h3_0_key}: {len(h6_list)} hexes")

# Hexes belonging to the sample cell; raises KeyError if the cell is absent.
inH = h3_0_list[sample_h0]

Processing 8001fffffffffff: 10516 hexes
Processing 8003fffffffffff: 37740 hexes
Processing 8005fffffffffff: 51141 hexes
Processing 8007fffffffffff: 51656 hexes
Processing 8009fffffffffff: 39466 hexes
Processing 800bfffffffffff: 114016 hexes
Processing 800dfffffffffff: 59002 hexes
Processing 800ffffffffffff: 69309 hexes
Processing 8011fffffffffff: 110447 hexes
Processing 8013fffffffffff: 111635 hexes
Processing 8015fffffffffff: 109233 hexes
Processing 8017fffffffffff: 30219 hexes
Processing 8019fffffffffff: 18011 hexes
Processing 801bfffffffffff: 9786 hexes
Processing 801dfffffffffff: 44 hexes
Processing 801ffffffffffff: 99411 hexes
Processing 8021fffffffffff: 120228 hexes
Processing 8023fffffffffff: 2973 hexes
Processing 8025fffffffffff: 118630 hexes
Processing 8027fffffffffff: 118168 hexes
Processing 8029fffffffffff: 46609 hexes
Processing 802bfffffffffff: 47178 hexes
Processing 802dfffffffffff: 97535 hexes
Processing 802ffffffffffff: 19039 hexes
Processing 8031fffffffffff: 68036 hexe

In [63]:
# Run the numerical zonal statistics for the sample cell against the first
# GHSL raster, then show the per-column maxima of the result stored under
# the out_file key.
out_file = os.path.join(out_folder, f"{sample_h0}_ghsl_stats.csv")
zonal_res = global_zonal.zonal_stats_numerical(
    inH,
    'shape_id',
    ghsl_files[0],
    out_file,
    minVal=0,
    maxVal=100000,
)
zonal_res[out_file].max(skipna=True)

SUM             2549965
MIN                   0
MAX                8792
MEAN         787.026235
id      8607b67b7ffffff
dtype: object

In [57]:
# Write h3 cells and clipped GHSL data to file
temp_folder = os.path.join(out_folder, "temp")
# exist_ok avoids the check-then-create race and is a no-op when the folder exists.
os.makedirs(temp_folder, exist_ok=True)

# Both outputs are only written once; delete the files to force a refresh.
temp_h3_file = os.path.join(temp_folder, f"{sample_h0}_h3.shp")
if not os.path.exists(temp_h3_file):
    inH.to_file(temp_h3_file)

temp_ghsl_file = os.path.join(temp_folder, f"{sample_h0}_ghsl.tif")
if not os.path.exists(temp_ghsl_file):
    rMisc.clipRaster(ghsl_files[0], inH, temp_ghsl_file)

In [61]:
# Inspect the zonal-stats row of a single hexagon.
# In the saved run this hexagon shows NaN for every statistic.
sel_h6_id = '86075d8b7ffffff'
res = zonal_res[out_file]
res[res['id'].eq(sel_h6_id)]

Unnamed: 0,SUM,MIN,MAX,MEAN,id
38111,,,,,86075d8b7ffffff


In [None]:
# Interactive choropleth of the S2S values, coloured by the first entry of s2s_field.
# NOTE(review): s2s_gdf and s2s_field are not defined in the visible cells -
# they must exist in the kernel before this cell is run.
m = s2s_gdf.explore(
    column=s2s_field[0],
    tooltip=s2s_field,
    cmap="YlGnBu",
    scheme="naturalbreaks",
    legend=True,
    legend_kwds={"colorbar": True, "caption": "Population", "interval": False},
    style_kwds={"weight": 0, "fillOpacity": 0.8},
    name="Population by Hexagon",
)
flm.LayerControl('topright', collapsed=False).add_to(m)
m

## Assess S3 CSV file

In [None]:
# Tag every S2S hexagon with its resolution-0 parent cell. The column is called
# 'h1' to match the `h1` placeholder of s3_csv_base.
s2s_df['h1'] = s2s_df['hex_id'].apply(lambda x: h3.cell_to_parent(x, 0))

# Read the CSV of every parent cell and stack them. Assigning inside the loop
# would keep only the last file and silently drop the others whenever the
# hexagons span more than one parent cell.
s3_frames = []
for unq_h1 in s2s_df['h1'].unique():
    s3_file = s3_csv_base.format(h1=unq_h1)
    s3_frames.append(pd.read_csv(s3_file, index_col=0))
# pd.concat raises ValueError when there is nothing to load.
curD = pd.concat(s3_frames)
curD.head()

In [None]:
def get_geom(x):
    """Return the position given by ``h3.cell_to_latlng(x)`` as a Point.

    The two coordinates are swapped so the Point is built as (second, first),
    i.e. x=longitude, y=latitude for a (lat, lng) pair.
    NOTE(review): not called in the visible cells.
    """
    lat, lng = h3.cell_to_latlng(x)
    return Point([lng, lat])
# Attach the S2S geometries to the CSV rows (right join keeps every S2S hexagon)
# and promote the result to a GeoDataFrame in EPSG:4326.
# NOTE(review): this cell rebinds curD from itself, so it is not idempotent -
# every re-run adds another index column. Re-run the loading cell first.
curD = gpd.GeoDataFrame(
    curD.reset_index().merge(s2s_gdf, left_on="id", right_on="hex_id", how='right'),
    geometry='geometry',
    crs=4326,
).reset_index()

In [None]:
# Interactive choropleth of the CSV values joined to the S2S hexagons.
# NOTE(review): an earlier map cell passes column=s2s_field[0] while this one
# passes s2s_field itself - confirm whether s2s_field is a name or a list of names.
m = curD.explore(
    column=s2s_field,
    tooltip=s2s_field,
    cmap="YlGnBu",
    scheme="naturalbreaks",
    legend=True,
    legend_kwds={"colorbar": True, "caption": "Population", "interval": False},
    style_kwds={"weight": 0, "fillOpacity": 0.8},
    name="Population by Hexagon",
)
flm.LayerControl('topright', collapsed=False).add_to(m)
m

## From S3 Parquet

In [None]:
# Load the aggregated parquet file, move its index into a regular column and
# list the resulting column names.
gdf = pd.read_parquet(s3_parquet_file).reset_index()
gdf_columns = gdf.columns.tolist()
gdf_columns

In [None]:
# Rename the first column to 'hex_id' and write the corrected table back to
# the parquet file.
# WARNING: this overwrites the source file. The guard makes the cell safe to
# re-run: once 'hex_id' exists, renaming column 0 again would create a
# duplicate 'hex_id' column and persist it.
if 'hex_id' not in gdf.columns:
    gdf_columns[0] = 'hex_id'
    gdf.columns = gdf_columns
    gdf.to_parquet(s3_parquet_file, index=False)

In [None]:
# Join the parquet values onto the CSV/S2S rows (right join keeps every row
# of curD) and rebuild a GeoDataFrame in EPSG:4326 for mapping.
gdf_s3 = gpd.GeoDataFrame(
    gdf.merge(curD, left_on='hex_id', right_on='id', how='right'),
    geometry='geometry',
    crs=4326,
)
gdf_s3.head()

In [None]:
# Interactive choropleth of the parquet values joined to the hexagon geometries.
# NOTE(review): an earlier map cell passes column=s2s_field[0] while this one
# passes s2s_field itself - confirm whether s2s_field is a name or a list of names.
m = gdf_s3.explore(
    column=s2s_field,
    tooltip=s2s_field,
    cmap="YlGnBu",
    scheme="naturalbreaks",
    legend=True,
    legend_kwds={"colorbar": True, "caption": "Population", "interval": False},
    style_kwds={"weight": 0, "fillOpacity": 0.8},
    name="Population by Hexagon",
)
flm.LayerControl('topright', collapsed=False).add_to(m)
m