In [1]:
import sys, os, multiprocessing, rasterio

import pandas as pd
#import geopandas as gpd
#import numpy as np

from h3 import h3

import GOSTrocks.rasterMisc as rMisc
import GOSTrocks.ntlMisc as ntl
from GOSTrocks.misc import tPrint

sys.path.append("../../src")
import h3_helper
import global_zonal

AWS_S3_BUCKET = 'wbg-geography01'
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN")



In [3]:
h3_1_list = h3_helper.generate_lvl1_lists(6, return_gdf=True, buffer0=True, read_pickle=True, write_pickle=False)

Loading pickle file h1_dictionary_of_h6_geodata_frames.pickle: it exists True


In [34]:
''' Run zonal stats on a continuous raster file using a matching categorical raster 
        file and a list of h3 cells. For each defined category in the categorical 
        raster file, calculate the sum, min, max, mean for that category.

        Parameters
        ----------
        gdf : geopandas.GeoDataFrame
            data frame of polygons to run zonal stats with
        gdf_id : str
            column in gdf with unique id
        raster_file : str
            path to raster file of continuous values to summarize with gdf
        category_raster_file : str
            path to raster file of categorical data to categorize the raster_file
        out_file : str
            path to write results
        categories : list of numbers, optional
            List of unique categories in category_raster_file to summarize, defaults to None.
            One of categories and reclass_dict must be defined
        reclass_dict : dictionary
            describes how to reclass category_raster_file if it is a continuous dataset, defaults to None.
            One of categories and reclass_dict must be defined
        buffer0 : boolean
            Should gdf be buffered by 0 (fixes shapely shapes), defaults to False.
        minVal : int
            minimum value to summarize in raster_file, anything less than minVal is set to 0, defaults to None.
        maxVal : int
            maximum value to summarize in raster_file, anything more than maxVal is set to 0, defaults to None.
        verbose : bool
            print extraneous updates using print statements
    '''
gdf = h3_1_list['81753ffffffffff']
ghsl_folder = "/home/public/Data/GLOBAL/GHSL/"
category_raster_file = os.path.join(ghsl_folder, "SMOD", "GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V1_0.tif")
raster_file = os.path.join(ghsl_folder, "Pop", "GHS_POP_E2020_GLOBE_R2023A_54009_100_V1_0.tif") 
categories = [11,12,13,21,22,23,30]
minVal = 0
maxVal = 1000000
gdf_id = 'shape_id'
verbose = True

In [22]:
#extract category raster to gdf extent
cat_d, cat_profile = rMisc.clipRaster(rasterio.open(category_raster_file), gdf)
# extract raster to gdf extent
rast_d, rast_profile = rMisc.clipRaster(rasterio.open(raster_file), gdf)

In [31]:
# standardize categorical raster to zonal raster
final_zonal_res = []
with rMisc.create_rasterio_inmemory(rast_profile, rast_d) as rast_src:
    with rMisc.create_rasterio_inmemory(cat_profile, cat_d) as cat_src:
        cat_d, cat_profile = rMisc.standardizeInputRasters(cat_src, rast_src, resampling_type='nearest')        
        # Loop through each category
        for cur_cat in categories:
            cur_cat_d = (cat_d == cur_cat) * 1
            cur_rast_d = rast_d * cur_cat_d
            with rMisc.create_rasterio_inmemory(rast_profile, cur_rast_d) as cur_rast_src:
                res = rMisc.zonalStats(gdf, cur_rast_src, minVal=minVal, maxVal=maxVal, verbose=verbose, reProj=True)
                res = pd.DataFrame(res, columns=[f'{cur_cat}_SUM', f'{cur_cat}_MIN', f'{cur_cat}_MAX', f'{cur_cat}_MEAN'])
                res['id'] = gdf[gdf_id].values
                res.set_index('id', inplace=True)
                final_zonal_res.append(res)
ret = pd.concat(final_zonal_res, axis=1)
if verbose:
    tPrint(f'**** finished')


10:03:33	Processing 1000 of 16807
10:03:34	Processing 2000 of 16807
10:03:36	Processing 3000 of 16807
10:03:38	Processing 4000 of 16807
10:03:39	Processing 5000 of 16807
10:03:41	Processing 6000 of 16807
10:03:42	Processing 7000 of 16807
10:03:44	Processing 8000 of 16807
10:03:46	Processing 9000 of 16807
10:03:47	Processing 10000 of 16807
10:03:49	Processing 11000 of 16807
10:03:50	Processing 12000 of 16807
10:03:52	Processing 13000 of 16807
10:03:54	Processing 14000 of 16807
10:03:55	Processing 15000 of 16807
10:03:57	Processing 16000 of 16807


NameError: name 'gdf_id' is not defined

In [33]:
gdf

Unnamed: 0,geometry,shape_id
86752842fffffff,"POLYGON ((-1.13963 7.05408, -1.11190 7.06010, ...",86752842fffffff
8675142cfffffff,"POLYGON ((-5.32976 8.48204, -5.30549 8.46892, ...",8675142cfffffff
86751e367ffffff,"POLYGON ((-5.07600 6.44495, -5.05242 6.43263, ...",86751e367ffffff
867509b27ffffff,"POLYGON ((-1.37872 4.58718, -1.35177 4.59248, ...",867509b27ffffff
867500da7ffffff,"POLYGON ((-3.48467 8.21337, -3.45776 8.21978, ...",867500da7ffffff
...,...,...
867509127ffffff,"POLYGON ((-1.72340 4.69916, -1.69657 4.70450, ...",867509127ffffff
8675041afffffff,"POLYGON ((-3.02919 8.64681, -3.00194 8.65335, ...",8675041afffffff
8675326afffffff,"POLYGON ((-5.46685 9.48987, -5.44223 9.47636, ...",8675326afffffff
86752e8b7ffffff,"POLYGON ((-1.37098 8.05540, -1.34309 8.06172, ...",86752e8b7ffffff
