# S2S Zonal statistics

Zonal statistics are run on the standardized [H3 grid](https://h3geo.org/docs/core-library/restable/); the process is run on a country-by-country basis.

Each zonal statistic is first run against the source dataset as a whole and is then stratified by the urban classification from the European Commission's [GHS-SMOD](https://ghsl.jrc.ec.europa.eu/ghs_smod2019.php). This creates a summary dataset that has the standard zonal stats columns (SUM, MEAN, MAX, MIN) as well as the same columns for urban areas (SUM_urban, MEAN_urban, MAX_urban, MIN_urban).

In [1]:
import sys, os, importlib, math, multiprocessing
import rasterio, geojson

import pandas as pd
import geopandas as gpd
import numpy as np

from h3 import h3
from tqdm import tqdm
from shapely.geometry import Polygon

import GOSTrocks.rasterMisc as rMisc
import GOSTrocks.ntlMisc as ntl
from GOSTrocks.misc import tPrint

sys.path.append("../src")
import h3_helper
import global_zonal

%load_ext autoreload
%autoreload 2



In [None]:
# Folder containing global administrative boundary files.
# NOTE(review): not referenced by any of the cells visible in this notebook.
admin_folder = "/home/wb411133/data/Global/ADMIN/"


In [11]:
# H3 grid resolution.
h3_level = 6

# Admin-0 and admin-2 boundary shapefiles.
adm0_bounds = "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
adm2_bounds = "/home/wb411133/data/Global/ADMIN/Admin2_Polys.shp"

# Output folder for results. exist_ok=True makes the cell safe to re-run and
# avoids the race between checking for the folder and creating it.
out_folder = "/home/wb411133/projects/Space2Stats/Population"
os.makedirs(out_folder, exist_ok=True)

# Global GHS-SMOD raster, intended for the urban stratification described in the
# introduction (not referenced by the cells visible in this notebook).
global_urban = "/home/public/Data/GLOBAL/GHSL/SMOD/GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V1_0.tif"

## Run analysis on population by gender and age

In [3]:
# Folder of WorldPop 2020 1km demographic (age / gender) rasters.
population_folder = "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2020/GLOBAL_1km_Demographics"
# Keep every file whose name ends in "1km.tif". os.listdir returns entries in
# arbitrary order, so sort to make the processing order reproducible between runs.
pop_files = sorted(
    os.path.join(population_folder, x)
    for x in os.listdir(population_folder)
    if x.endswith("1km.tif")
)

In [4]:
# Load the H3 cells to process. The result is iterated with .items() further
# down, so it is a mapping of key -> GeoDataFrame (return_gdf=True); presumably
# the keys are H3 level-0 cells -- confirm in h3_helper.
# Uses the h3_level constant from the configuration cell instead of repeating 6.
h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False)

Loading pickle file h0_dictionary_of_h6_geodata_frames.pickle: it exists True


In [31]:
# Inspect the CRS of the admin-2 layer.
# NOTE(review): inA is only assigned in a later cell (gpd.read_file(adm2_bounds)),
# so on a fresh kernel this cell raises NameError when run in document order.
inA.crs

<Projected CRS: EPSG:3857>
Name: WGS 84 / Pseudo-Mercator
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: World between 85.06°S and 85.06°N.
- bounds: (-180.0, -85.06, 180.0, 85.06)
Coordinate Operation:
- name: Popular Visualisation Pseudo-Mercator
- method: Popular Visualisation Pseudo Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [60]:
# Read the admin-2 and admin-0 boundary polygons into GeoDataFrames
inA = gpd.read_file(adm2_bounds)
inA_0 = gpd.read_file(adm0_bounds)

def try_get_iso3(x, admin0=None):
    """Look up the 'Region' value for an ISO3 code in the admin-0 table.

    Despite its name, this returns a region name, not an ISO3 code.

    Parameters
    ----------
    x : str or None
        ISO3 code to look up.
    admin0 : DataFrame, optional
        Table with 'ISO3' and 'Region' columns. Defaults to the module-level
        ``inA_0`` so existing ``Series.apply(try_get_iso3)`` calls keep working.

    Returns
    -------
    str
        The region of the first row whose 'ISO3' equals ``x``, or 'Other' when
        no row matches, a required column is absent, or the region is missing.
    """
    lookup = inA_0 if admin0 is None else admin0
    try:
        region = lookup.loc[lookup['ISO3'] == x, 'Region'].values[0]
    except (IndexError, KeyError):
        # IndexError: no row matches x (this includes x=None).
        # KeyError: the 'ISO3' or 'Region' column is absent.
        # Anything else (e.g. KeyboardInterrupt, NameError) is allowed to surface.
        print(f'Error getting region for {x}')
        return 'Other'
    # pd.isna treats both None and NaN as a missing region.
    if pd.isna(region):
        return 'Other'
    return region
# Tag every admin-2 polygon with the region looked up from the admin-0 table
inA['WB_REGION'] = inA['ISO3'].apply(try_get_iso3)
# Reproject to EPSG:4326
inA = inA.to_crs(4326)
# Long region names -> short region codes
region_mapping = {
    'Latin America & Caribbean':'LCR',
    'Other':'Other',
    'Sub-Saharan Africa':'AFR',
    'East Asia & Pacific':'EAP',
    'Middle East & North Africa':'MENA',
    'Europe & Central Asia':'ECA',
    'South Asia':'SAR'
}
# NOTE(review): Series.map with a dict yields NaN for any region name that is
# not a key above -- confirm every 'Region' value in the admin-0 file is listed.
inA['WB_REGION'] = inA['WB_REGION'].map(region_mapping)
# Zero-width buffer; presumably used to repair invalid geometries -- confirm
inA['geometry'] = inA['geometry'].buffer(0)
# Drop rows whose geometry is None
inA = inA.loc[~inA['geometry'].apply(lambda x: x is None)]
# NOTE(review): this overwrites the same file that was read at the top of this
# cell, so re-running the cell operates on already-modified data.
inA.to_file(adm2_bounds)

Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None
Error getting region for None


In [29]:
# Destination bucket for the zonal results.
AWS_S3_BUCKET = 'wbg-geography01'
# Credentials are taken from the environment; each is None when the variable is unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
AWS_SESSION_TOKEN = os.environ.get("AWS_SESSION_TOKEN")

def run_zonal(gdf, cur_raster_file, out_file):
    """Run zonal statistics for one raster and upload the table to S3 as CSV.

    Parameters
    ----------
    gdf : GeoDataFrame
        Zones to summarise; must contain a 'shape_id' column, which is written
        to the output as 'id'.
    cur_raster_file : str
        Path of the raster passed to ``rMisc.zonalStats``.
    out_file : str
        Object key inside ``AWS_S3_BUCKET`` where the CSV is written.

    Returns
    -------
    DataFrame
        Columns 'SUM', 'MIN', 'MAX', 'MEAN' and 'id', one row per zone.

    Notes
    -----
    ``minVal=0`` is forwarded to ``rMisc.zonalStats``; presumably it excludes
    raster values below 0 -- confirm in GOSTrocks.
    """
    # Label used in the progress messages: "<parent folder of out_file>-<raster file name>"
    parent_name = os.path.basename(os.path.dirname(out_file))
    raster_name = os.path.basename(cur_raster_file)
    job_label = f'{parent_name}-{raster_name}'
    tPrint(f'Starting {job_label}')

    stats = pd.DataFrame(
        rMisc.zonalStats(gdf, cur_raster_file, minVal=0),
        columns=['SUM', 'MIN', 'MAX', 'MEAN'],
    )
    stats['id'] = gdf['shape_id'].values

    s3_credentials = {
        "key": AWS_ACCESS_KEY_ID,
        "secret": AWS_SECRET_ACCESS_KEY,
        "token": AWS_SESSION_TOKEN,
    }
    stats.to_csv(
        f"s3://{AWS_S3_BUCKET}/{out_file}",
        index=False,
        storage_options=s3_credentials,
    )

    tPrint(f'**** finished {job_label}')
    return stats
    

In [None]:
# set up mp arguments: one [gdf, raster, s3 key] job per (h3_0_list entry, population raster)
arg_list = []
processed_list = []

for h3_0_key, cur_gdf in h3_0_list.items():
    for pop_file in pop_files:
        filename = os.path.basename(f'{pop_file.replace(".tif", "")}_zonal.csv')
        out_s3_key = f'Space2Stats/h3_stats_data/ADM_GLOBAL/WorldPop_2020_Demographics/{h3_0_key}/{filename}'
        # Build the URL the same way run_zonal does; os.path.join is meant for
        # filesystem paths and would insert the OS-specific separator.
        full_path = f"s3://{AWS_S3_BUCKET}/{out_s3_key}"
        try:
            # A readable CSV at the target key means this job has already run.
            pd.read_csv(full_path)
        except Exception:
            # Missing or unreadable output: queue the job. Catching Exception
            # (not a bare except) keeps KeyboardInterrupt able to stop the loop.
            arg_list.append([cur_gdf, pop_file, out_s3_key])
        else:
            processed_list.append(filename)
    tPrint(f'Remaining: {len(arg_list)}\t Processed: {len(processed_list)}')


In [None]:
# Run the queued zonal jobs in parallel. Each arg_list entry is
# [gdf, raster_file, s3_key], which matches run_zonal's positional parameters,
# so starmap can unpack it directly.
# Leave 10 cores free, but never ask for fewer than one worker.
n_workers = max(1, multiprocessing.cpu_count() - 10)
with multiprocessing.Pool(n_workers) as pool:
    res = pool.starmap(run_zonal, arg_list)

# DEBUGGING

In [63]:
# Read the zonal result tables for this dataset back from S3. The displayed
# output shows a dict of table name -> DataFrame with SUM / MIN / MAX / MEAN / id columns.
# NOTE(review): displaying the whole dict prints every table; showing only the
# keys would keep the output short.
all_files = global_zonal.get_global_table_from_s3("WorldPop_2020_Demographics", 
                                                  prefix='Space2Stats/h3_stats_data/ADM_GLOBAL/',
                                                  read_data=True)
all_files

{'global_f_0_2020_1km_zonal':                SUM       MIN         MAX       MEAN     id
 0      9271.396484  0.776875   18.729755   2.788390   2245
 1      4781.694336  0.902219   14.667562   2.646206   2246
 2      8980.358398  0.709510   22.562714   2.098705   2247
 3     12372.816406  0.806483   28.565184   3.262013   2248
 4     11200.134766  0.023843   18.526701   1.417559   2249
 ...            ...       ...         ...        ...    ...
 1345   6294.885742  0.002925   37.945431   1.862943  29170
 1346   4111.004883  0.001787   16.255522   0.609218  29171
 1347  14159.621094  0.241238  200.965500  17.633402  29172
 1348  15714.679688  0.000000   73.089981   9.501016  29173
 1349   8892.969727  0.010115   92.677490   4.588736  29174
 
 [37278 rows x 5 columns],
 'global_f_10_2020_1km_zonal':                SUM       MIN          MAX        MEAN     id
 0     36274.640625  3.037503    73.231514   10.909666   2245
 1     18885.472656  3.586561    57.348728   10.451285   2246
 2    

In [65]:
# Shallow copy of the dict: the DataFrames inside are still shared with all_files
orig_files = all_files.copy()
# Re-read the admin-2 polygons from disk (rebinds inA)
inA = gpd.read_file(adm2_bounds)
for label, inD in orig_files.items():
    # Index the zonal table by 'id' so join() aligns it with inA's index.
    # NOTE(review): assumes 'id' values correspond to inA's index labels -- confirm.
    tempD = inD.set_index('id')
    tempA = inA.join(tempD)
    # Second and third underscore-separated parts of the table name,
    # e.g. 'global_f_0_2020_1km_zonal' -> 'f_0'
    c_label = "_".join(label.split("_")[1:3])
    # Only the zonal SUM is kept, stored under the short label
    inA[c_label] = tempA['SUM']

In [66]:
# Preview the first rows of the admin-2 table
inA.head()

Unnamed: 0,OBJECTID,ISO_A2,WB_ADM1_CO,WB_ADM0_CO,WB_ADM0_NA,WB_ADM1_NA,WB_ADM2_CO,WB_ADM2_NA,Shape_Leng,Shape_Area,...,m_40,m_45,m_50,m_55,m_5,m_60,m_65,m_70,m_75,m_80
0,1,AF,272,1,Afghanistan,Badakhshan,3445,Baharak,426829.104142,4721102000.0,...,4204.877441,2760.053955,2042.870605,1643.195068,8620.248047,1423.365479,768.383423,657.413391,234.051926,194.463745
1,2,AF,272,1,Afghanistan,Badakhshan,3446,Darwaz,360455.172484,4728969000.0,...,3135.317871,2061.960938,1529.525879,1233.140381,6418.612793,1068.314209,579.843018,492.28656,176.273575,147.215302
2,3,AF,272,1,Afghanistan,Badakhshan,3447,Fayzabad,298846.796442,4631564000.0,...,11744.238281,7707.375,5706.112305,4589.418945,24076.332031,3975.566162,2146.647461,1836.886353,653.836792,543.436035
3,4,AF,272,1,Afghanistan,Badakhshan,3448,Ishkashim,316148.305669,2447967000.0,...,767.449646,504.885223,374.65625,302.177094,1570.741211,261.79306,142.223328,120.588257,43.221664,36.128349
4,5,AF,272,1,Afghanistan,Badakhshan,3449,Jurm,359146.510708,5454013000.0,...,4500.88623,2954.054199,2186.755859,1758.863892,9227.073242,1523.58606,822.58667,703.841003,250.55455,208.213867


In [68]:
# List the columns of the admin-2 table.
# NOTE(review): the saved output already includes f_total / m_total / f_youth /
# m_youth, which are created in the next cell -- the cells were executed out of order.
inA.columns

Index(['OBJECTID', 'ISO_A2', 'WB_ADM1_CO', 'WB_ADM0_CO', 'WB_ADM0_NA',
       'WB_ADM1_NA', 'WB_ADM2_CO', 'WB_ADM2_NA', 'Shape_Leng', 'Shape_Area',
       'ISO3', 'WB_REGION', 'geometry', 'f_0', 'f_10', 'f_15', 'f_1', 'f_20',
       'f_25', 'f_30', 'f_35', 'f_40', 'f_45', 'f_50', 'f_55', 'f_5', 'f_60',
       'f_65', 'f_70', 'f_75', 'f_80', 'm_0', 'm_10', 'm_15', 'm_1', 'm_20',
       'm_25', 'm_30', 'm_35', 'm_40', 'm_45', 'm_50', 'm_55', 'm_5', 'm_60',
       'm_65', 'm_70', 'm_75', 'm_80', 'f_total', 'm_total', 'f_youth',
       'm_youth'],
      dtype='object')

In [67]:
# Age-band suffixes, kept in the same order as the zonal columns on inA
age_bands = ['0', '10', '15', '1', '20', '25', '30', '35', '40',
             '45', '50', '55', '5', '60', '65', '70', '75', '80']
f_cols = [f'f_{band}' for band in age_bands]
m_cols = [f'm_{band}' for band in age_bands]
numeric_cols = f_cols + m_cols

# Cast every band to int in one step.
# NOTE(review): astype(int) truncates the fractional part rather than rounding.
inA[numeric_cols] = inA[numeric_cols].astype(int)

# Totals per gender across all age bands
inA['f_total'] = inA[f_cols].sum(axis=1)
inA['m_total'] = inA[m_cols].sum(axis=1)

# "Youth" here is the sum of the '20' and '25' bands
# NOTE(review): confirm this is the intended age range.
inA['f_youth'] = inA[['f_20', 'f_25']].sum(axis=1)
inA['m_youth'] = inA[['m_20', 'm_25']].sum(axis=1)

total_cols = ['f_total', 'm_total', 'f_youth', 'm_youth', 'geometry']

In [70]:
# Identifier columns kept alongside the population counts
good_cols = ['ISO3','WB_REGION','WB_ADM0_NA','WB_ADM1_NA', 'WB_ADM2_NA']
# Preview the identifier, per-band and total columns together
inA.loc[:,good_cols+numeric_cols+total_cols].head()

Unnamed: 0,ISO3,WB_REGION,WB_ADM0_NA,WB_ADM1_NA,WB_ADM2_NA,f_0,f_10,f_15,f_1,f_20,...,m_60,m_65,m_70,m_75,m_80,f_total,m_total,f_youth,m_youth,geometry
0,AFG,SAR,Afghanistan,Badakhshan,Baharak,1999,8950,7148,6070,5263,...,1423,768,657,234,194,56985,70603,8761,12848,"POLYGON ((71.31934 37.24848, 71.31261 37.24059..."
1,AFG,SAR,Afghanistan,Badakhshan,Darwaz,1487,6661,5322,4523,3930,...,1068,579,492,176,147,42579,52670,6555,9588,"POLYGON ((70.99956 38.47933, 71.00709 38.47256..."
2,AFG,SAR,Afghanistan,Badakhshan,Fayzabad,5583,25000,19969,16959,14701,...,3975,2146,1836,653,543,159189,197217,24472,35888,"POLYGON ((70.76822 37.28326, 70.77164 37.28042..."
3,AFG,SAR,Afghanistan,Badakhshan,Ishkashim,363,1630,1302,1107,962,...,261,142,120,43,36,10419,12886,1605,2346,"POLYGON ((71.46068 37.18361, 71.44989 37.18064..."
4,AFG,SAR,Afghanistan,Badakhshan,Jurm,2139,9581,7652,6499,5634,...,1523,822,703,250,208,61002,75574,9378,13753,"POLYGON ((71.18169 36.49196, 71.17219 36.48955..."


In [71]:
# Folder for the exported shapefiles.
# NOTE(review): this rebinds out_folder, which the configuration cell set to a
# different path.
out_folder = "/home/wb411133/temp/ADM2_ZONAL"
# exist_ok=True replaces the check-then-create pattern and is safe to re-run.
os.makedirs(out_folder, exist_ok=True)

# TOTAL_*: identifiers + every per-band column + totals and geometry.
inA.loc[:,good_cols+numeric_cols+total_cols].to_file(os.path.join(out_folder, "TOTAL_adm_zonal_pop.shp"))
# SIMPLE_*: identifiers + totals and geometry only.
inA.loc[:,good_cols+total_cols].to_file(os.path.join(out_folder, "SIMPLE_adm_zonal_pop.shp"))