# Inspect Generated Zonal Stats
This notebook runs through a series of checks to verify that the generated H3 zonal stats have been calculated correctly. The checks cover the following steps:

1. H1 CSV files on S3
2. Aggregated parquet files on S3
3. S2S database

In [2]:
import os, time, io, json, sys
from typing import Dict

import urllib3
import boto3
import rasterio

import geopandas as gpd
import pandas as pd
import numpy as np
import folium as flm
import GOSTrocks.rasterMisc as rMisc

from dotenv import load_dotenv
from shapely.geometry import shape, box
from geojson_pydantic import Feature, Polygon
from urllib3.exceptions import InsecureRequestWarning
from botocore import UNSIGNED
from botocore.config import Config
from pystac_client import Client
from tqdm.notebook import tqdm

sys.path.append("../../src")

import global_zonal
import h3_helper

urllib3.disable_warnings(InsecureRequestWarning)

def tPrint(s):
    """Print `s` prefixed with the current local time (HH:MM:SS) and a tab."""
    timestamp = time.strftime("%H:%M:%S")
    print(f"{timestamp}\t{s}")

# S3 client that sends unsigned (anonymous) requests and skips TLS certificate verification.
unsigned_config = Config(signature_version=UNSIGNED)
s3_client = boto3.client('s3', verify=False, config=unsigned_config)

In [3]:
# Define input variables
# Local folder that receives one land-cover parquet file per H3 level-0 cell.
base_folder = "C:/WBG/Work/S2S/data/landcover"
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(base_folder, exist_ok=True)

# S3 bucket the land-cover GeoTIFFs are read from.
bucket = "io-10m-annual-lulc"

# Dictionary keyed by level-0 cell label; each value is a GeoDataFrame of level-6 hexes
# (loaded from the pickle file named below).
h3_0_list = h3_helper.generate_lvl0_lists(
    6,
    return_gdf=True,
    buffer0=False,
    read_pickle=True,
    pickle_file="h0_dictionary_of_h6_geodata_frames_land.pickle",
)

Loading pickle file h0_dictionary_of_h6_geodata_frames_land.pickle: it exists True


In [12]:
# Confirm which container type generate_lvl0_lists returned.
type(h3_0_list)

dict

In [20]:
# For every H3 level-0 cell, tabulate 2022 land-cover class counts per hexagon and write
# one parquet file per cell. Cells whose output file already exists are skipped.
catalog = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

for h0_lbl, h0_level in h3_0_list.items():
    out_lc_file = os.path.join(base_folder, f"landcover_{h0_lbl}_2022.parquet")
    if os.path.exists(out_lc_file):
        print(f"File {out_lc_file} already exists, skipping processing")
        continue

    tPrint(f"Processing {h0_lbl} with {len(h0_level)} features")
    # https://pystac-client.readthedocs.io/en/latest/tutorials/pystac-client-introduction.html#API-Search
    query = catalog.search(
        collections=["io-lulc-9-class"],
        datetime="2022-01-01/2022-12-31",
        intersects=h0_level.geometry.union_all(),
    )
    query_items = list(query.item_collection())

    all_res = []
    for lc_feature in query_items:
        lc_label = lc_feature.id.replace("-", "_")
        try:
            obj = s3_client.get_object(Bucket=bucket, Key=f'{lc_label}.tif')
        except Exception as err:
            # Best-effort: skip tiles that cannot be fetched, but report why and do not
            # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
            print(f"Could not find file for {lc_label}, skipping ({err})")
            continue

        raw_data: bytes = obj['Body'].read()
        # The dataset is intentionally left open: later cells reuse `cur_lc`.
        cur_lc = rasterio.open(io.BytesIO(raw_data))
        # Rebind h0_level in the raster's CRS so the join and zonal stats share one CRS.
        if h0_level.crs != cur_lc.crs:
            h0_level = h0_level.to_crs(cur_lc.crs)
        lc_box = gpd.GeoDataFrame(
            pd.DataFrame([[1, box(*cur_lc.bounds)]], columns=['id', 'geometry']),
            crs=cur_lc.crs,
            geometry='geometry',
        )
        # Only hexes touching this tile's bounding box are summarised.
        sel_hexes = gpd.sjoin(h0_level, lc_box, how='inner', predicate='intersects')
        if sel_hexes.empty:
            continue
        lc_res = rMisc.zonalStats(sel_hexes, cur_lc, rastType='C', unqVals=list(range(1, 13)))
        lc_res = pd.DataFrame(lc_res, columns=[f'c_{x}' for x in range(1, 13)])
        lc_res['shape_id'] = sel_hexes['shape_id'].values
        all_res.append(lc_res)

    if len(all_res) > 0:
        cur_h0_res = pd.concat(all_res, ignore_index=True)
        cur_h0_res.set_index('shape_id', inplace=True)
        cur_h0_res['total_lc_cells'] = cur_h0_res.sum(axis=1)
        # .copy() so the column assignment below writes to an independent frame.
        cur_h0_res = cur_h0_res.loc[cur_h0_res['total_lc_cells'] > 0].copy()
        cur_h0_res['hex_id'] = cur_h0_res.index.values

        # A hex that intersects several tiles appears once per tile; sum those rows.
        cur_all_res = []
        for hex_id, hex_rows in tqdm(cur_h0_res.groupby('hex_id')):
            if len(hex_rows) > 1:
                # Sum only the count columns: including the string hex_id column would
                # concatenate ids and turn every count into an object-dtype value.
                summed = hex_rows.drop(columns='hex_id').sum()
                hex_rows = summed.to_frame().T
                hex_rows['hex_id'] = hex_id
            cur_all_res.append(hex_rows)

        if len(cur_all_res) > 0:
            final_h0_res = pd.concat(cur_all_res)
            final_h0_res.to_parquet(out_lc_file, index=False)



09:26:27	Processing 8001fffffffffff with 10516 features
File C:/WBG/Work/S2S/data/landcover\landcover_8001fffffffffff_2022.parquet already exists, skipping processing
09:26:27	Processing 8003fffffffffff with 37740 features
File C:/WBG/Work/S2S/data/landcover\landcover_8003fffffffffff_2022.parquet already exists, skipping processing
09:26:27	Processing 8005fffffffffff with 51141 features
File C:/WBG/Work/S2S/data/landcover\landcover_8005fffffffffff_2022.parquet already exists, skipping processing
09:26:27	Processing 8007fffffffffff with 51656 features
File C:/WBG/Work/S2S/data/landcover\landcover_8007fffffffffff_2022.parquet already exists, skipping processing
09:26:27	Processing 8009fffffffffff with 39466 features
File C:/WBG/Work/S2S/data/landcover\landcover_8009fffffffffff_2022.parquet already exists, skipping processing
09:26:27	Processing 800bfffffffffff with 114016 features
File C:/WBG/Work/S2S/data/landcover\landcover_800bfffffffffff_2022.parquet already exists, skipping processi

0it [00:00, ?it/s]

ValueError: No objects to concatenate

In [16]:
# Preview the first five aggregated rows from the most recently processed level-0 cell.
final_h0_res.head(n=5)

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,total_lc_cells,hex_id,86006a007ffffff,86006a267ffffff
0,162562,0,0,0,0,0,0,101,28170,0,1603,0,192436,860001047ffffff,,
0,419685,0,0,0,0,0,0,6731,238631,0,179,0,665226,860001067ffffff,,
0,314902,0,0,0,0,0,0,1875,399732,0,2321,0,718830,86000106fffffff,,
0,363718,0,0,0,0,0,0,1517,352563,0,0,0,717798,86000114fffffff,,
0,707780,0,0,0,0,0,0,1000,10354,0,0,0,719134,86000116fffffff,,


In [18]:
# Show the first per-hex frame collected by the duplicate-summing loop above.
cur_all_res[0]

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,total_lc_cells,hex_id
0,162562,0,0,0,0,0,0,101,28170,0,1603,0,192436,860001047ffffff


In [None]:
# Categorical zonal stats for every hex in the last processed level-0 cell against the
# last opened land-cover tile (both are left over from the processing loop).
# Tabulate classes 1-12, matching the main loop; range(1, 10) dropped classes 10-12,
# and class 11 holds non-zero counts in the results shown above.
lc_res = rMisc.zonalStats(h0_level, cur_lc, rastType='C', unqVals=list(range(1, 13)))

In [None]:
import odc.stac
import xarray as xr
import rioxarray

In [None]:
# Projection: EPSG:3857 (Web Mercator) is a projected CRS with units in metres
SMepsg = 3857  # https://epsg.io/3857
SMepsg_str = f"epsg:{SMepsg}"

# Spatial resolution (in units of the projection)
datares = 10

# Blank out the CA bundle variable; the original intent was to disable SSL verification
# for requests - TODO confirm this has the intended effect.
os.environ['REQUESTS_CA_BUNDLE'] = ''

# NOTE(review): SMepsg_str and datares are defined above but are not passed to
# odc.stac.load - confirm whether crs/resolution were meant to be supplied.
# https://odc-stac.readthedocs.io/en/latest/_api/odc.stac.load.html
lc_items = query.item_collection()  # items from the last STAC search in the loop above
lcxr = odc.stac.load(
    lc_items,
    chunks={},  # chunked (Dask) loading
    dtype='int',
    geopolygon=h0_level.geometry.iloc[0],  # limit the load to the first hexagon
)


In [None]:
# Convert the loaded land-cover data to a DataFrame and display it.
# NOTE(review): this renders the whole frame rather than a preview - confirm the size is manageable.
lcxr.to_dataframe()

In [None]:
# Clip the loaded raster to the first hexagon, working in EPSG:32658.
# NOTE(review): the EPSG code is hard-coded - confirm it matches the CRS of lcxr.
clip_epsg = 32658
temp_h0 = h0_level.to_crs(clip_epsg)
first_hex_geom = temp_h0.geometry.values[0]
lcxr.rio.clip([first_hex_geom], crs=clip_epsg)

In [None]:
# Walk csv_folder and, folder by folder, report CSV files whose first four column maxima
# contain no positive value. Scanning of a folder stops at the first file that has data.
# NOTE(review): csv_folder is not defined in the cells shown in this notebook.
for root, folders, files in os.walk(csv_folder):
    h1 = os.path.basename(root)  # folder name is used as the h1 label
    for csv_name in files:
        if not csv_name.endswith(".csv"):
            continue
        cur_path = os.path.join(root, csv_name)
        # Characters 1-4 of the last "_"-separated token are taken as the year.
        year = csv_name.rsplit("_", 1)[-1][1:5]
        cur_d = pd.read_csv(cur_path, index_col=0)
        column_maxima = cur_d.max(skipna=True)
        max_val = column_maxima[:4].max(skipna=True)
        if max_val > 0:
            break
        print(f"no data for {h1} {year}")

# Run test zonal stats

In [None]:
# H3 resolution of the hexagons used for the test run.
h3_level = 6
# Folder holding the GHSL GeoTIFFs and the folder that receives the test outputs.
ghsl_folder = "C:/WBG/Work/data/GHSL"
# os.listdir returns entries in arbitrary order; sort so ghsl_files[0] (used below)
# refers to the same raster on every run.
ghsl_files = sorted(
    os.path.join(ghsl_folder, f) for f in os.listdir(ghsl_folder) if f.endswith(".tif")
)
out_folder = "C:/WBG/Work/S2S/data/GHSL"

In [None]:
# Reload the level-0 dictionary of hexagon GeoDataFrames at the configured resolution.
h3_0_list = h3_helper.generate_lvl0_lists(
    h3_level,
    return_gdf=True,
    buffer0=False,
    read_pickle=True,
    pickle_file="h0_dictionary_of_h6_geodata_frames_land.pickle",
)

In [None]:
# List every level-0 cell with its hexagon count, then pick one cell for the test run.
for h0_key, h0_hexes in h3_0_list.items():
    print(f"Processing {h0_key}: {len(h0_hexes)} hexes")

sample_h0 = '8007fffffffffff'
inH = h3_0_list[sample_h0]

In [None]:
# Run numerical zonal stats for the sample cell against the first GHSL raster and show
# the column maxima of the result stored under the output path.
out_file = os.path.join(out_folder, f"{sample_h0}_ghsl_stats.csv")
zonal_res = global_zonal.zonal_stats_numerical(
    inH,
    'shape_id',
    ghsl_files[0],
    out_file,
    minVal=0,
    maxVal=100000,
)
zonal_res[out_file].max(skipna=True)

In [None]:
# Write h3 cells and clipped GHSL data to file
temp_folder = os.path.join(out_folder, "temp")
# exist_ok replaces the separate existence check and its check-then-create race.
os.makedirs(temp_folder, exist_ok=True)

# Each output is written once; existing files are left untouched.
temp_h3_file = os.path.join(temp_folder, f"{sample_h0}_h3.shp")
if not os.path.exists(temp_h3_file):
    inH.to_file(temp_h3_file)

temp_ghsl_file = os.path.join(temp_folder, f"{sample_h0}_ghsl.tif")
if not os.path.exists(temp_ghsl_file):
    rMisc.clipRaster(ghsl_files[0], inH, temp_ghsl_file)

In [None]:
# Look up the zonal-stats row(s) for one level-6 hexagon.
sel_h6_id = '86075d8b7ffffff'
res = zonal_res[out_file]
is_selected = res['id'].eq(sel_h6_id)
res[is_selected]

In [None]:
# Interactive choropleth of the first field in s2s_field, classified with natural breaks.
# NOTE(review): s2s_gdf and s2s_field are not defined in the cells shown in this notebook.
legend_options = dict(colorbar=True, caption='Population', interval=False)
fill_style = dict(weight=0, fillOpacity=0.8)
m = s2s_gdf.explore(
    column=s2s_field[0],
    tooltip=s2s_field,
    cmap='YlGnBu',
    legend=True,
    scheme='naturalbreaks',
    legend_kwds=legend_options,
    style_kwds=fill_style,
    name='Population by Hexagon',
)
layer_control = flm.LayerControl('topright', collapsed=False)
layer_control.add_to(m)
m

## Test data on dev server

In [None]:
# Load the dev database settings from the env file into the process environment.
dev_env_file = "../../../dev_db.env"
load_dotenv(dev_env_file)

In [None]:
# List the fields exposed by the stats table.
# NOTE(review): StatsTable is not imported in the cells shown in this notebook.
with StatsTable.connect() as table:
    fields = table.fields()
fields

In [None]:
# Overall bounds of the admin boundaries; indices 0-3 are used below as
# min x, min y, max x, max y when building the AOI polygon.
# NOTE(review): adm_boundaries is not defined in the cells shown in this notebook.
bbox = adm_boundaries.total_bounds
bbox

In [None]:
# Fields requested from the stats table in the summaries query below.
sel_fields = ['sum_built_area_m_2000', 'sum_built_area_m_2020']
# GeoJSON Feature model with a Polygon geometry and free-form dict properties.
# `Dict` comes from `typing` (imported in the notebook's import cell).
AOIModel = Feature[Polygon, Dict]
bbox = adm_boundaries.total_bounds
min_x, min_y, max_x, max_y = bbox

# Rectangular AOI covering the full extent of adm_boundaries
# (the original note on this cell read "~kenya").
aoi = {
    "type": "Feature",
    "geometry": {
        "type": "Polygon",
        "coordinates": [
            [
                [min_x, min_y],
                [min_x, max_y],
                [max_x, max_y],
                [max_x, min_y],
                [min_x, min_y],  # close the ring on the starting corner
            ]
        ],
    },
    "properties": {"name": "Updated AOI"},
}


feat = AOIModel(**aoi)

In [None]:
# Query the selected fields for hexagons in the AOI (centroid join, point geometry) and
# collect the result into a DataFrame while the connection is still open.
summary_request = dict(
    aoi=feat,
    spatial_join_method="centroid",
    fields=sel_fields,
    geometry="point",
)
with StatsTable.connect() as stats_table:
    data = stats_table.summaries(**summary_request)
    df = pd.DataFrame(data)

df.head()

In [None]:
# Inspect the raw geometry value of the first returned row.
df['geometry'].iat[0]

In [None]:
def get_change(x):
    """Return the relative change in built area between 2000 and 2020 for one row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series or dict) providing
        'sum_built_area_m_2000' and 'sum_built_area_m_2020'.

    Returns
    -------
    float
        (value_2020 - value_2000) / value_2000, or NaN when the 2000 baseline is zero,
        a field is missing, or the values cannot be used in arithmetic.
    """
    try:
        baseline = x['sum_built_area_m_2000']
        current = x['sum_built_area_m_2020']
        # A zero baseline has no defined relative change; numpy scalars would otherwise
        # yield inf (with a warning) instead of raising ZeroDivisionError.
        if baseline == 0:
            return np.nan
        return (current - baseline) / baseline
    except (LookupError, TypeError, ValueError, ArithmeticError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        return np.nan


# Parse the GeoJSON geometry strings into shapely geometries, wrap the table in a
# GeoDataFrame (EPSG:4326) and add the per-row built-area change.
gdf = df.copy()
gdf['geometry'] = gdf['geometry'].map(lambda raw_geojson: shape(json.loads(raw_geojson)))
gdf = gpd.GeoDataFrame(gdf, geometry='geometry', crs=4326)
gdf['built_change'] = gdf.apply(get_change, axis=1)
gdf.head()

In [None]:
# Interactive choropleth of the relative built-area change (quantile classes).
m = gdf.explore(
    column='built_change',
    tooltip='built_change',
    cmap='YlGnBu',
    legend=True,
    scheme='quantiles',
    # The caption previously read 'Population', but the mapped column is built-area change.
    legend_kwds=dict(colorbar=True, caption='Change in built area', interval=False),
    style_kwds=dict(weight=0, fillOpacity=0.8),
    name='Change in built area'
)
flm.LayerControl('topright', collapsed = False).add_to(m)
m

## Assess s3 csv file

In [None]:
# Tag every hexagon with its parent cell and load the matching CSV for each parent.
# NOTE(review): the column is named 'h1' but the parent is taken at resolution 0 - confirm
# which resolution the CSV paths are keyed by.
# NOTE(review): h3, s2s_df and s3_csv_base are not defined in the cells shown in this notebook.
s2s_df['h1'] = s2s_df['hex_id'].apply(lambda x: h3.cell_to_parent(x, 0))

# Keep every CSV: the previous loop rebound curD on each pass, so only the last
# parent's file survived when the hexagons spanned more than one parent cell.
h1_frames = []
for unq_h1 in s2s_df['h1'].unique():
    s3_file = s3_csv_base.format(h1=unq_h1)
    h1_frames.append(pd.read_csv(s3_file, index_col=0))
curD = pd.concat(h1_frames)
curD.head()

In [None]:
def get_geom(x):
    """Return a Point for H3 cell `x`, built from h3.cell_to_latlng with the two values swapped.

    NOTE(review): `h3` and `Point` are not imported in the import cells shown in this
    notebook - confirm they are in scope before calling.
    """
    first, second = h3.cell_to_latlng(x)
    # Point takes the second returned value first, then the first.
    return Point([second, first])
# Right-join the CSV rows onto s2s_gdf (CSV 'id' against 'hex_id') and wrap the result
# in a GeoDataFrame (EPSG:4326). Re-running this cell operates on the already-merged curD.
merged_rows = curD.reset_index().merge(s2s_gdf, left_on="id", right_on="hex_id", how='right')
curD = gpd.GeoDataFrame(merged_rows, geometry='geometry', crs=4326).reset_index()

In [None]:
# Interactive choropleth of the merged S3 CSV values.
# NOTE(review): an earlier map cell passes s2s_field[0] as `column`; here the whole
# s2s_field value is passed - confirm which form is intended.
csv_legend_options = dict(colorbar=True, caption='Population', interval=False)
csv_fill_style = dict(weight=0, fillOpacity=0.8)
m = curD.explore(
    column=s2s_field,
    tooltip=s2s_field,
    cmap='YlGnBu',
    legend=True,
    scheme='naturalbreaks',
    legend_kwds=csv_legend_options,
    style_kwds=csv_fill_style,
    name='Population by Hexagon',
)
csv_layer_control = flm.LayerControl('topright', collapsed=False)
csv_layer_control.add_to(m)
m

## From S3 Parquet

In [None]:
# Read the aggregated GHSL built-area parquet file from the local ingest folder.
# Note: `gdf` is rebound here; it previously held the GeoDataFrame built from the database query.
s3_parquet_file = r"C:\WBG\Work\S2S\ingest\GHSL_built_area_m.parquet"
gdf = pd.read_parquet(s3_parquet_file)
# Disabled alternative: write a copy with NaN replaced by 0 to a "_zero" parquet file.
#gdf.replace(np.nan, 0).to_parquet(s3_parquet_file.replace(".parquet", "_zero.parquet"))
gdf.head()

In [None]:
# Display gdf with missing values filled with NaN. The result is not assigned, so gdf
# itself is unchanged. NOTE(review): confirm whether an assignment was intended.
gdf.fillna(np.nan)

In [None]:
# Rename the first column to 'hex_id' and overwrite the parquet file in place.
# gdf_columns was never defined in the notebook; derive it from the frame so the cell
# runs on a fresh kernel.
# NOTE(review): confirm gdf_columns was meant to be the frame's current column names.
gdf_columns = list(gdf.columns)
gdf_columns[0] = 'hex_id'
gdf.columns = gdf_columns
gdf.to_parquet(s3_parquet_file, index=False)

In [None]:
# Right-join the parquet rows onto the merged CSV frame ('hex_id' against 'id') and
# wrap the result in a GeoDataFrame (EPSG:4326).
parquet_vs_csv = gdf.merge(curD, left_on='hex_id', right_on='id', how='right')
gdf_s3 = gpd.GeoDataFrame(parquet_vs_csv, geometry='geometry', crs=4326)
gdf_s3.head()

In [None]:
# Interactive choropleth of the parquet values joined to the CSV geometries.
# NOTE(review): an earlier map cell passes s2s_field[0] as `column`; here the whole
# s2s_field value is passed - confirm which form is intended.
parquet_legend_options = dict(colorbar=True, caption='Population', interval=False)
parquet_fill_style = dict(weight=0, fillOpacity=0.8)
m = gdf_s3.explore(
    column=s2s_field,
    tooltip=s2s_field,
    cmap='YlGnBu',
    legend=True,
    scheme='naturalbreaks',
    legend_kwds=parquet_legend_options,
    style_kwds=parquet_fill_style,
    name='Population by Hexagon',
)
parquet_layer_control = flm.LayerControl('topright', collapsed=False)
parquet_layer_control.add_to(m)
m