# Comparing h3 cells to administrative divisions

While most of our work has focused on the consistent, global, hexagon grid, the ultimate goal of the S2S work program is to generate a database of geospatial aggregates at the administrative level 2. **We are still in the process of acquiring the admin bounds we are going to publish.**

This notebook focuses on identifying the percentage overlap between our official administrative divisions and our global hexagonal grid.

In [1]:
import sys, os, multiprocessing
import h3ronpy, h3

import pandas as pd
import geopandas as gpd


from tqdm.notebook import tqdm
from h3ronpy.pandas.vector import geodataframe_to_cells, cells_dataframe_to_geodataframe
from h3ronpy import ContainmentMode

from GOSTrocks.misc import tPrint

In [2]:
# Folder layout: inputs live under base_folder, results go to ADM_RESULTS
base_folder = r"C:\WBG\Work\S2S"
admin_folder = os.path.join(base_folder, "data", 'admin')
out_folder = os.path.join(base_folder, "ADM_RESULTS")

# Create the results folder on first use
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

# Get the admin boundaries
admin_file = r"C:\WBG\Work\data\ADMIN\NEW_WB_BOUNDS\WB_GAD_ADM2.shp"
in_admin = gpd.read_file(admin_file)

# Build a per-feature ID of the form <WB_A3>_<1-based row number>
in_admin['gID'] = [
    f'{wb_code}_{row_num}'
    for row_num, wb_code in enumerate(in_admin['WB_A3'], start=1)
]
in_admin.head()

Unnamed: 0,ISO_A3,ISO_A2,WB_A3,HASC_0,HASC_1,HASC_2,GAUL_0,GAUL_1,GAUL_2,WB_REGION,...,ADM2CD_t,ADM2NM,GEOM_SRCE,P_DATE,SALB_DATE,Layer,Shape_Leng,Shape_Area,geometry,gID
0,CHN,CN,CHN,CN,CN.HL,,147295,908,13088,EAP,...,CHN011007,,WB GAD,2020-09-10,,2,12.686519,2.607028,"POLYGON ((133.89558 46.55695, 133.89555 46.545...",CHN_1
1,CHN,CN,CHN,CN,CN.HL,,147295,908,13089,EAP,...,CHN011004,,WB GAD,2020-09-10,,2,8.400738,1.700403,"POLYGON ((132.52369 47.71289, 132.51065 47.683...",CHN_2
2,CHN,CN,CHN,CN,CN.HL,,147295,908,13090,EAP,...,CHN011011,,WB GAD,2020-09-10,,2,10.767553,2.611398,"POLYGON ((133.89558 46.55695, 133.89177 46.549...",CHN_3
3,CHN,CN,CHN,CN,CN.HL,,147295,908,13092,EAP,...,CHN011013,,WB GAD,2020-09-10,,2,13.542786,3.857735,"POLYGON ((130.81428 48.33685, 130.72977 48.307...",CHN_4
4,CHN,CN,CHN,CN,CN.HL,,147295,908,13093,EAP,...,CHN011006,,WB GAD,2020-09-10,,2,16.951572,3.753208,"POLYGON ((129.92461 47.29074, 129.97993 47.268...",CHN_5


In [3]:
def run_country_mp(cntry_label, in_data, out_folder):
    """ Compute, or load from cache, the H3 overlap table for one country

    The result is stored as ``<cntry_label>_h3_intersect.csv`` in out_folder. When
    that file already exists it is read back and nothing is recomputed.

    Parameters
    ----------
    cntry_label : string
        The country label; used in the output file name and the 'cntry' column
    in_data : geopandas.GeoDataFrame
        The admin features to process, one row per feature
    out_folder : string
        The folder in which the per-country CSV is stored

    Returns
    -------
    pandas.DataFrame
        The concatenated get_bounds results for every feature plus a 'cntry'
        column, or the contents of the cached CSV
    """
    csv_path = os.path.join(out_folder, f'{cntry_label}_h3_intersect.csv')

    # Cached result available: reuse it instead of recomputing
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)

    per_feature = []
    progress = tqdm(in_data.iterrows(), total=in_data.shape[0], desc=f'Processing {cntry_label}')
    for _, feature in progress:
        # Wrap the single row in its own GeoDataFrame so get_bounds can consume it
        feature_gdf = gpd.GeoDataFrame(feature.to_frame().T, geometry="geometry", crs=in_data.crs)
        # buffer(0) is applied to every geometry before the H3 lookup
        feature_gdf['geometry'] = feature_gdf['geometry'].apply(lambda geom: geom.buffer(0))
        feature_cells = get_bounds(feature_gdf, 6)
        feature_cells['cntry'] = cntry_label
        per_feature.append(feature_cells)

    country_res = pd.concat(per_feature, ignore_index=True)
    country_res.to_csv(csv_path, index=False)
    return country_res


def get_bounds(in_shp, h3_lvl=6):
    """ Generate a dataframe of the H3 cells touching in_shp and their % overlap

    Parameters
    ----------
    in_shp : geopandas.GeoDataFrame
        The admin feature(s) to process; must carry a 'gID' column, which is
        passed through to the output
    h3_lvl : int
        The H3 level to use for the hexagons, default is 6

    Returns
    -------
    dataframe with the columns 'gID', 'cell' (H3 index as a hex string) and
    'overlap' (1.0 for cells lying within in_shp, otherwise the share of the
    cell's planar area that intersects the union of in_shp)
    """
    cols_to_keep = ['gID', 'cell', 'overlap']
    # extract the H3 cells at the requested level (previously hard-coded to 6)
    cells = geodataframe_to_cells(in_shp, h3_lvl, ContainmentMode.IntersectsBoundary)
    cell_ax = cells_dataframe_to_geodataframe(cells)
    # convert the integer cell index to its hexadecimal string form (drop the '0x' prefix)
    cell_ax['cell'] = cell_ax['cell'].apply(lambda x: hex(x)[2:])
    # Identify contained and overlapping hexes with the admin bounds
    contained_h3 = cell_ax.sjoin(in_shp, predicate='within')
    missed_h3 = cell_ax[~cell_ax['cell'].isin(contained_h3['cell'])]
    # calculate h3 overlap with feature; only the non-contained cells need the
    # intersection, contained cells are set to 1.0 directly
    shp_area = in_shp.union_all()
    cell_ax['overlap'] = 0.0
    cell_ax.loc[contained_h3.index, 'overlap'] = 1.0
    cell_ax.loc[missed_h3.index, 'overlap'] = cell_ax.loc[missed_h3.index, 'geometry'].apply(
        lambda x: x.intersection(shp_area).area / x.area
    )

    return cell_ax.loc[:, cols_to_keep].reset_index(drop=True)


# Try get_bounds on the first admin feature of Kenya
sel_admin = in_admin.loc[in_admin['WB_A3'] == 'KEN']
first_feature = gpd.GeoDataFrame(
    sel_admin.iloc[0].to_frame().T,
    geometry="geometry",
    crs=sel_admin.crs,
)
xx = get_bounds(first_feature, 6)
xx.head(10)



Unnamed: 0,gID,cell,overlap
0,KEN_7788,867a4c187ffffff,0.100703
1,KEN_7788,867a4c197ffffff,0.98902
2,KEN_7788,867a4c19fffffff,0.216256
3,KEN_7788,867a4c1b7ffffff,0.858498
4,KEN_7788,867a4c507ffffff,0.019305
5,KEN_7788,867a4c527ffffff,0.003169
6,KEN_7788,867a4c52fffffff,0.472479
7,KEN_7788,867a4c567ffffff,0.522335
8,KEN_7788,867a4c56fffffff,0.016155
9,KEN_7788,867a4c577ffffff,0.003943


In [None]:
# Not sure why, but this is not working
# NOTE(review): everything between the triple quotes below is a string literal,
# not executed code, so this cell neither assigns `mp_args` nor calls
# `run_country_mp`; it only evaluates to the string itself.
"""
mp_args = []
for cntry, data in in_admin.groupby('WB_A3'):
    mp_args.append([cntry, data, out_folder])

run_country_mp(*mp_args[0])
"""

In [None]:
# Process every country in turn; run_country_mp caches each result as a CSV in
# out_folder, so re-running this cell skips countries that are already done.
# The groups are iterated directly rather than through `mp_args`, which is not
# assigned by any executed code in this notebook.
skip_countries = ['FJI', 'RUS']  # There are topological errors in the current dataset
for cntry, data in tqdm(in_admin.groupby('WB_A3')):
    print(f'Processing {cntry}')
    if cntry not in skip_countries:
        res = run_country_mp(cntry, data, out_folder)

# Debugging

In [None]:
# Country under investigation
cntry_label = 'RUS'
# Keep only that country's admin features and draw them
is_selected = in_admin['WB_A3'] == cntry_label
in_data = in_admin.loc[is_selected]
in_data.plot()

In [None]:
# Write the selected country's admin features to a GeoPackage in out_folder.
# NOTE(review): the file is named '<cntry_label>_h3_intersect.gpkg' although it holds
# admin features, and a later cell writes `cell_ax` to this same path — confirm
# whether the two exports are meant to share one file.
in_data.to_file(os.path.join(out_folder, f'{cntry_label}_h3_intersect.gpkg'), driver='GPKG')

In [None]:
# Same per-feature steps as the loop inside run_country_mp, but without the
# progress bar and without the CSV cache. Each index label is passed to tPrint
# before the feature is processed.
# `idx` and `row` stay bound after the loop and are inspected in the next cells.
all_res = []
for idx, row in in_data.iterrows():
    tPrint(idx)
    # Wrap the single row in a one-row GeoDataFrame that keeps the source CRS
    in_shape = gpd.GeoDataFrame(row.to_frame().T, geometry="geometry", crs=in_data.crs)
    # buffer(0) is applied to the geometry before the H3 lookup
    in_shape['geometry'] = in_shape['geometry'].apply(lambda x: x.buffer(0))                                        
    cur_res = get_bounds(in_shape, 6)
    cur_res['cntry'] = cntry_label
    all_res.append(cur_res)

In [None]:
# Show the `row` most recently bound by the `for idx, row in in_data.iterrows()` loop
row

In [None]:
# Bounding box of that row's geometry
row.geometry.bounds

In [None]:
# Pull one feature by index label and wrap it in a one-row GeoDataFrame
# NOTE(review): 24588 is a hard-coded index label — presumably the feature
# under investigation; confirm it still exists in the current admin file.
sel_row = in_data.loc[24588]
in_shp = gpd.GeoDataFrame(sel_row.to_frame().T, geometry="geometry", crs=in_data.crs)
            

In [None]:
# Boolean mask: True where a `contained_h3` cell id is absent from `cell_ax`
# NOTE(review): in the visible notebook `contained_h3` is only assigned inside
# get_bounds, so it is not a top-level name here — confirm where it should come from.
~contained_h3['cell'].isin(cell_ax['cell'])

In [None]:
# Display `cell_ax`
# NOTE(review): the only top-level assignment to `cell_ax` visible in this notebook
# is in a later cell, so this cell depends on execution order — confirm.
cell_ax

In [None]:
# Write the H3 cells in `cell_ax` to a GeoPackage in out_folder.
# NOTE(review): this is the same path that the earlier `in_data.to_file(...)` cell
# writes to — confirm whether the two exports are meant to share one file.
cell_ax.to_file(os.path.join(out_folder, f'{cntry_label}_h3_intersect.gpkg'), driver='GPKG')

In [None]:
# Inlined overlap calculation with a tPrint call between the steps.
# Unlike get_bounds, every cell goes through the intersection (no 'within' shortcut).
tPrint('Starting')
cols_to_keep = ['gID', 'cell', 'overlap']

# Step 1: H3 level-6 cells for in_shp, with the cell index as a hex string
h3_cells = geodataframe_to_cells(in_shp, 6, ContainmentMode.IntersectsBoundary)
cell_ax = cells_dataframe_to_geodataframe(h3_cells)
cell_ax['cell'] = cell_ax['cell'].apply(lambda cell_id: hex(cell_id)[2:])
tPrint("Generated H3 cells")

# Step 2: share of each cell's area that intersects the unioned feature
shp_area = in_shp.union_all()
cell_geoms = cell_ax['geometry']
cell_ax['overlap'] = cell_geoms.apply(
    lambda hexagon: hexagon.intersection(shp_area).area/hexagon.area
)
tPrint("Calculated overlap")
    

# return cell_ax.loc[:, cols_to_keep].reset_index(drop=True)

In [None]:
# NOTE(review): no top-level assignment to `x` is visible in this notebook (it only
# appears as a lambda parameter), so this cell presumably relies on leftover
# kernel state — confirm or remove.
x.area