# Generate H3 geospatial data in an AWS S3 bucket

In [57]:
import sys, os, importlib, math, multiprocessing, boto3, pickle
import rasterio, geojson

import pandas as pd
import geopandas as gpd
import numpy as np

from h3 import h3
from tqdm import tqdm
from shapely.geometry import Polygon

sys.path.insert(0, "/home/wb411133/Code/gostrocks/src")
import GOSTrocks.rasterMisc as rMisc
import GOSTrocks.ntlMisc as ntl
import GOSTrocks.mapMisc as mapMisc
from GOSTrocks.misc import tPrint

sys.path.append("../src")
import h3_helper

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# S3 destination for the generated H3 grids, plus the local folder that is
# handed to country_h3_zonal in generate_grid.
region = "us-east-1"
bucket = "wbg-geography01"
prefix = "Space2Stats/h3_spatial_data"
out_folder = "/home/wb411133/projects/Space2Stats/"

# Client bound to the bucket's region
s3client = boto3.client("s3", region_name=region)

In [None]:
# Read the ADM0 boundary shapefile and add an "ID" column that mirrors the
# row index; generate_grid passes "ID" on as the identifier column name.
admin_bounds = "/home/public/Data/GLOBAL/ADMIN/ADMIN2/HighRes_20230328/shp/WB_GAD_ADM0.shp"
inA = gpd.read_file(admin_bounds)
inA = inA.assign(ID=inA.index)
inA.head()

In [None]:
h3_level = 6
all_args = []

# Queue one [row, level] argument pair per boundary record, echoing the S3
# prefix each record maps to (<prefix>/<WB_REGION>/<ISO_A3>).
for _, row in inA.iterrows():
    cur_prefix = os.path.join(prefix, row["WB_REGION"], row["ISO_A3"])
    # Disabled check kept for reference: try to read the expected output and
    # print its path when it cannot be opened.
    #   out_file = f's3://{bucket}/{cur_prefix}/h3_level_{h3_level}.geojson'
    #   try:
    #       xx = gpd.read_file(out_file)
    #   except:
    #       print(out_file)
    print(cur_prefix)
    all_args.append([row, h3_level])
    

In [None]:
def generate_grid(row, lvl):
    """Generate the H3 grid for one boundary record and write it to S3 as GeoJSON.

    Parameters
    ----------
    row : pandas.Series
        One record of ``inA``; ``WB_REGION``, ``ISO_A3`` and ``geometry`` are
        read here, and ``"ID"`` is passed on as the identifier column name.
    lvl : int
        H3 level, forwarded to ``country_h3_zonal`` and used in the output
        file name.

    Returns
    -------
    The object returned by ``generate_h3_grid()`` on success (presumably a
    GeoDataFrame, since ``to_file`` is called on it), or ``None`` when grid
    generation or the upload fails.
    """
    iso3 = row['ISO_A3']
    cur_prefix = os.path.join(prefix, row['WB_REGION'], iso3)
    out_file = f's3://{bucket}/{cur_prefix}/h3_level_{lvl}.geojson'
    tPrint(f"Starting {cur_prefix}")
    # A Series converts to a one-column frame; transposing yields the
    # single-row frame that is wrapped as a GeoDataFrame.
    cur_gpd = gpd.GeoDataFrame(pd.DataFrame(row).transpose(), geometry='geometry', crs=inA.crs)
    # NOTE(review): `country_zonal` is not imported in the visible cells (only
    # `h3_helper` is) - confirm where this name comes from.
    zonalC = country_zonal.country_h3_zonal(iso3, cur_gpd, "ID", lvl, out_folder)
    try:
        h3_grid = zonalC.generate_h3_grid()
        h3_grid.to_file(out_file, driver="GeoJSON")
    except Exception as err:
        # Best effort per country: report the cause and let the pool carry on.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop the worker.
        tPrint(f"Error processing lvl {lvl} for {iso3}: {err!r}")
        return None
    tPrint(f"Completed {cur_prefix}")
    return h3_grid



In [None]:
# Fan the per-country jobs out over worker processes (at most 70).
# max(1, ...) keeps Pool() valid when all_args is empty, because
# processes=0 raises ValueError; starmap then simply returns [].
n_workers = max(1, min(70, len(all_args)))
with multiprocessing.Pool(processes=n_workers) as pool:
    results = pool.starmap(generate_grid, all_args)

# Aggregate H3 zonal results

In [3]:
# S3 locations for the aggregation step: zonal_prefix is the prefix that gets
# listed for CSV results, out_prefix is where the parquet files are written.
region = "us-east-1"
bucket = "wbg-geography01"
zonal_prefix = "Space2Stats/h3_stats_data/GLOBAL/"
out_prefix = "Space2Stats/parquet/GLOBAL/"

# Client bound to the bucket's region
s3client = boto3.client("s3", region_name=region)

In [9]:
# Search for zonal results
# Page through the S3 listing under zonal_prefix and collect the keys of all
# files ending in "csv", grouped by the third-from-last path component of the
# key (the variable name, e.g. 'Urbanization').
more_results = True
loops = 0
good_res = {}
verbose = True
token = None
while more_results:
    if verbose:
        print(f"Completed loop: {loops}")
    # Only pages after the first carry a continuation token
    list_kwargs = {"Bucket": bucket, "Prefix": zonal_prefix}
    if token is not None:
        list_kwargs["ContinuationToken"] = token
    objects = s3client.list_objects_v2(**list_kwargs)
    more_results = objects['IsTruncated']
    token = objects['NextContinuationToken'] if more_results else None
    loops += 1
    # 'Contents' is absent from the response when no keys match
    for res in objects.get('Contents', []):
        if res['Key'].endswith('csv'):
            cur_variable = res['Key'].split("/")[-3]
            good_res.setdefault(cur_variable, []).append(res['Key'])

Completed loop: 0
Completed loop: 1
Completed loop: 2
Completed loop: 3
Completed loop: 4
Completed loop: 5
Completed loop: 6
Completed loop: 7
Completed loop: 8
Completed loop: 9
Completed loop: 10
Completed loop: 11
Completed loop: 12
Completed loop: 13
Completed loop: 14
Completed loop: 15
Completed loop: 16
Completed loop: 17
Completed loop: 18
Completed loop: 19
Completed loop: 20
Completed loop: 21
Completed loop: 22


In [13]:
good_res.keys()

dict_keys(['Urbanization', 'VIIRS_Monthly_LEN', 'WorldPop_2020_Demographics'])

In [27]:
# Aggregate the WorldPop demographics data
# Group the CSV keys by label: underscore-separated tokens 1-3 of the file
# name (e.g. 'f_0_2020').
pop_files = good_res['WorldPop_2020_Demographics']
pop_yrs = {}
for pFile in pop_files:
    lbl = "_".join(os.path.basename(pFile).split("_")[1:4])
    # setdefault replaces the bare try/except that was used for grouping
    pop_yrs.setdefault(lbl, []).append(pFile)

In [34]:
# For each label, read all of its CSVs from S3, stack them, and write the
# result as one parquet file under out_prefix/WorldPop_2020_Demographics.
for lbl, files in pop_yrs.items():
    out_file = os.path.join('s3://', bucket, out_prefix, 'WorldPop_2020_Demographics', f'{lbl}.parquet')
    csv_uris = [os.path.join("s3://", bucket, key) for key in files]
    pop_dfs = list(map(pd.read_csv, csv_uris))
    all_res = pd.concat(pop_dfs)
    all_res.to_parquet(out_file)
    tPrint(lbl)

15:50:53	f_0_2020
15:52:16	f_10_2020
15:53:36	f_15_2020
15:54:59	f_1_2020
15:56:23	f_20_2020
15:57:44	f_25_2020
15:59:04	f_30_2020
16:00:28	f_35_2020
16:01:51	f_40_2020
16:03:17	f_45_2020
16:04:42	f_50_2020
16:06:10	f_55_2020
16:07:35	f_5_2020
16:09:04	f_60_2020
16:10:30	f_65_2020
16:11:57	f_70_2020
16:13:27	f_75_2020
16:14:58	f_80_2020
16:16:27	m_0_2020
16:17:54	m_10_2020
16:19:20	m_15_2020
16:20:46	m_1_2020
16:22:10	m_20_2020
16:23:34	m_25_2020
16:25:01	m_30_2020
16:26:27	m_35_2020
16:27:50	m_40_2020
16:29:16	m_45_2020
16:30:42	m_50_2020
16:32:08	m_55_2020
16:33:33	m_5_2020
16:34:57	m_60_2020
16:36:21	m_65_2020
16:37:45	m_70_2020
16:39:09	m_75_2020
16:40:35	m_80_2020


In [48]:
# Aggregate the nighttime lights data
# Group the CSV keys by label: the first three underscore-separated tokens of
# the file name (e.g. 'DNB_j01_20230601-20230630').
ntl_files = good_res['VIIRS_Monthly_LEN']
ntl_yrs = {}
for pFile in ntl_files:
    lbl = "_".join(os.path.basename(pFile).split("_")[:3])
    # setdefault replaces the bare try/except that was used for grouping
    ntl_yrs.setdefault(lbl, []).append(pFile)

In [54]:
# For each label, read all of its CSVs from S3, stack them, and write one
# parquet file under out_prefix/NTL_VIIRS_LEN/<year>/<month>/.
for lbl, files in ntl_yrs.items():
    # The last token of the label is a date range such as '20230601-20230630';
    # year and month come from the start of that range.
    date_span = lbl.split("_")[-1]
    year = date_span[:4]
    month = date_span[4:6]
    out_file = os.path.join('s3://', bucket, out_prefix, 'NTL_VIIRS_LEN', year, month, f'{lbl}.parquet')
    csv_uris = [os.path.join("s3://", bucket, key) for key in files]
    pop_dfs = list(map(pd.read_csv, csv_uris))
    all_res = pd.concat(pop_dfs)
    all_res.to_parquet(out_file)
    tPrint(lbl)

09:32:03	DNB_j01_20230601-20230630
09:33:46	DNB_j01_20230701-20230731
09:35:34	DNB_j01_20230901-20230930
09:37:22	DNB_j01_20231001-20231031
09:39:09	DNB_j01_20231101-20231130
09:40:51	DNB_j01_20231201-20231231
09:42:30	DNB_j01_20240101-20240131
09:44:20	DNB_npp_20120119-20120131
09:46:09	DNB_npp_20120201-20120229
09:48:02	DNB_npp_20120301-20120331
09:49:46	DNB_npp_20120401-20120430
09:51:34	DNB_npp_20120501-20120531
09:53:23	DNB_npp_20120601-20120630
09:55:08	DNB_npp_20120701-20120731
09:56:59	DNB_npp_20120801-20120831
09:58:52	DNB_npp_20120901-20120930
10:00:47	DNB_npp_20121001-20121031
10:02:47	DNB_npp_20121101-20121130
10:04:50	DNB_npp_20121201-20121231
10:06:50	DNB_npp_20130101-20130131
10:09:02	DNB_npp_20130201-20130228
10:11:12	DNB_npp_20130301-20130331
10:13:21	DNB_npp_20130401-20130430
10:15:29	DNB_npp_20130501-20130531
10:17:38	DNB_npp_20130601-20130630
10:19:42	DNB_npp_20130701-20221231
10:21:53	DNB_npp_20130801-20130831
10:24:10	DNB_npp_20130901-20130930
10:26:22	DNB_npp_201

In [53]:
# Display the current value of out_file (the most recently built output path)
out_file

's3://wbg-geography01/Space2Stats/parquet/GLOBAL/NTL_VIIRS_LEN/2023/06/DNB_j01_20230601-20230630.parquet'

In [61]:
# Local pickle file to inspect; confirm that it exists before opening it
pickle_path = "/home/wb411133/Code/DECAT_Space2Stats/src/h1_dictionary_of_h6_geodata_frames.pickle"
os.path.exists(pickle_path)

True

In [62]:
# Load the pickled object stored at pickle_path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load pickles from a trusted source.
# NOTE(review): the recorded AttributeError ('_unpickle_block' not found on
# pandas._libs.internals) presumably means the file was written with a
# different pandas version than the one in this environment - confirm.
with open(pickle_path, 'rb') as handle:
    xx = pickle.load(handle)

AttributeError: Can't get attribute '_unpickle_block' on <module 'pandas._libs.internals' from '/home/wb411133/.conda/envs/ee/lib/python3.9/site-packages/pandas/_libs/internals.cpython-39-x86_64-linux-gnu.so'>