http://wiki.worldbank.org/display/GEOS/MEX_AGEBS_Zonal

In [1]:
import sys, os, importlib
import rasterio, ee, geojson

import pandas as pd
import geopandas as gpd
import skimage.graph as graph

from shapely.geometry import box

# The project libraries below are imported from local checkouts, located by
# paths relative to this notebook's working directory.
sys.path.append("../../../../gostrocks/src")
import GOSTRocks.rasterMisc as rMisc
from GOSTRocks.misc import tPrint

sys.path.append("../../../../GEE_Zonal/src")
import gee_tools as gee

sys.path.append("../../../../GOST_Urban/")
from src import UrbanRaster

sys.path.append("../../../../GOSTNets_Raster/src")
import GOSTNets_Raster.market_access as ma

# Initialise the Earth Engine client and create the catalog object that the
# NDVI and CHIRPS cells search by tag and title.
ee.Initialize()
cat = gee.Catalog()

In [2]:
# Define input data
in_folder = "/home/wb411133/data/Country/MEX"
zonal_out_folder = os.path.join(in_folder, "ZONAL_OUTPUTS")
raster_folder = os.path.join(in_folder, "GIS_DATA")
ndvi_folder = os.path.join(zonal_out_folder, "NDVI")
chirps_folder = os.path.join(zonal_out_folder, "CHIRPS")

# Create every output folder up front. exist_ok=True replaces the
# check-then-create pattern, which can fail if the folder appears in between.
for tFolder in [zonal_out_folder, raster_folder, ndvi_folder, chirps_folder]:
    os.makedirs(tFolder, exist_ok=True)

agebs_folder = os.path.join(in_folder, 'AGEB', 'AGEBS')
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the merged AGEBS dataset (and therefore the row-position based chunk file
# names used further down) reproducible between runs.
agebs_files = [os.path.join(agebs_folder, x) for x in sorted(os.listdir(agebs_folder)) if x.endswith(".shp")]

# Global input rasters: population, settlement classification, travel friction
pop_file = "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2020/ppp_2020_1km_Aggregated.tif"
ghsl_file = '/home/public/Data/GLOBAL/URBAN/GHS/GHS_SMOD/GHS_SMOD_POP2015_GLOBE_R2019A_54009_1K_V2_0.tif'
global_access_map = '/home/public/Data/GLOBAL/INFRA/FRICTION_2020/2020_motorized_friction_surface.geotiff'


In [3]:
# Define output data
master_agebs = f"{agebs_folder}.shp"
cur_file = master_agebs

# Every output is named after the master shapefile: its base name with the
# last four characters (the ".shp" suffix) removed.
out_prefix = os.path.basename(cur_file)[:-4]

# Tabular zonal summaries (CSV), written to the zonal output folder
out_ghsl = os.path.join(zonal_out_folder, out_prefix + '_GHSL.csv')
out_pop_summaries = os.path.join(zonal_out_folder, out_prefix + '_Pop2020.csv')
ndvi_zonal = os.path.join(zonal_out_folder, out_prefix + '_NDVI_monthly.csv')
ntl_zonal_csv = os.path.join(zonal_out_folder, out_prefix + '_ntl_zonal_res.csv')
urban_pop = os.path.join(zonal_out_folder, out_prefix + '_UrbanPop2020.csv')
hd_pop = os.path.join(zonal_out_folder, out_prefix + '_HD_UrbanPop2020.csv')
urban_access_res = os.path.join(zonal_out_folder, out_prefix + '_urban_market_access.csv')
hd_urban_access_res = os.path.join(zonal_out_folder, out_prefix + '_hd_urban_market_access.csv')

# Intermediate rasters and vector extents, written to the raster folder
local_pop = os.path.join(raster_folder, out_prefix + '_WorldPop.tif')
local_urban = os.path.join(raster_folder, out_prefix + '_Urban.tif')
local_urban_pop = os.path.join(raster_folder, out_prefix + '_UrbanPop.tif')
local_urban_hd = os.path.join(raster_folder, out_prefix + '_HDUrban.tif')
local_urban_hdpop = os.path.join(raster_folder, out_prefix + '_HDUrbanPop.tif')
local_access_map = os.path.join(raster_folder, out_prefix + '_friction_surface.tif')
urban_extents_file = os.path.join(raster_folder, out_prefix + '_UrbanExtents.shp')
hd_urban_extents_file = os.path.join(raster_folder, out_prefix + '_HD_UrbanExtents.shp')
urban_access = os.path.join(raster_folder, out_prefix + '_access_to_urban.tif')
hd_urban_access = os.path.join(raster_folder, out_prefix + '_access_to_hd_urban.tif')

In [4]:
# Combine the individual AGEBS shapefiles into a single dataset. The merged
# result is cached as master_agebs and simply re-read on later runs.
if not os.path.exists(master_agebs):
    if not agebs_files:
        raise FileNotFoundError(f"No .shp files found in {agebs_folder}")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # with the previous bare try/except that removal silently kept only the
    # last file. ignore_index=True gives the merged rows a unique 0..n-1
    # index, so later index-aligned column assignments cannot misalign.
    final = pd.concat([gpd.read_file(agebs_file) for agebs_file in agebs_files],
                      ignore_index=True)

    final.to_file(master_agebs)
    inD = final
else:
    inD = gpd.read_file(master_agebs)

In [5]:
# Convert the input shapefile into a GEE collection
def gpd_to_gee(inD, id_col):
    ''' Create a google earth engine feature collection from a geopandas object

    Each row's geometry is converted to an ee.Geometry (Polygon first, with
    MultiPolygon as fallback). Rows that cannot be converted are skipped and
    their index label is printed, so the returned collection can hold fewer
    features than inD has rows. Only geometries are placed in the collection;
    the id_col value is read for every row but is not attached to the
    resulting features.

    INPUT
        inD [geopandas dataframe]
        id_col [string] - column name used to ID the features
    RETURNS
        [ee FeatureCollection] - one entry per successfully converted row,
            in the row order of inD
    '''
    all_polys = []
    bad_idx = []
    for idx, row in inD.iterrows():
        try:
            shpJSON = geojson.Feature(geometry=row['geometry'], properties={"ID":row[id_col]})
            coords = shpJSON['geometry']['coordinates']
            try:
                ee_poly = ee.Geometry.Polygon(coords)
            except Exception:
                # presumably raised for multi-part geometries - retry as MultiPolygon
                ee_poly = ee.Geometry.MultiPolygon(coords)
        except Exception:
            # Best effort: report and skip the row. Catching Exception rather
            # than using a bare except lets Ctrl-C still interrupt the loop.
            print(idx)
            bad_idx.append(idx)
        else:
            all_polys.append(ee_poly)
    if bad_idx:
        # Callers pair results with input rows by position, so make skips visible
        print(f"gpd_to_gee: skipped {len(bad_idx)} of {inD.shape[0]} features")
    cur_ee = ee.featurecollection.FeatureCollection(all_polys)
    return cur_ee

def get_zonal_res(res):
    ''' Build a data frame from GEE zonal results (res.getInfo()), with one row
        per feature taken from that feature's 'properties' dictionary
    '''
    return pd.DataFrame([feature['properties'] for feature in res['features']])

# NDVI - Vegetation

In [None]:
# Filter the catalog by the "ndvi" tag, then by the title terms "Landsat" and "32-day"
results = cat.search_tags("ndvi").search_title("Landsat").search_title("32-day")

# The first column of the first search hit identifies the collection
landsat_collection = results.datasets.iloc[0,0]
# Short label: the four characters that follow the first "/" of the collection id
first_slash = landsat_collection.find("/")
lc_id = landsat_collection[first_slash + 1:first_slash + 5]
stat = 'MAX'

# Display the matching datasets
results.datasets

In [None]:
# Run NDVI zonal statistics for inD in chunks of step_count rows. Each chunk is
# written to its own CSV, named after its row range; chunks whose CSV already
# exists are skipped, so the cell can be re-run to resume.
importlib.reload(gee)
step_count = 250
steps = list(range(step_count, inD.shape[0], step_count))
steps.append(inD.shape[0])
start_idx = 0
# Drop any brokenD left over from an earlier run. It is only (re)defined at the
# end of this cell when at least one chunk failed.
try:
    del brokenD
except NameError:
    pass
broken_chunks = []
for end_idx in steps:
    curD = inD.iloc[start_idx:end_idx,]
    out_file = os.path.join(ndvi_folder, f'MASTER_LIST_NDVI_{start_idx}_{end_idx}.csv')
    if not os.path.exists(out_file):
        try:
            tPrint(os.path.basename(out_file))
            cur_ee = gpd_to_gee(curD, 'CVEGEO')
            # Zonal statistics of the selected Landsat NDVI collection
            zs = gee.ZonalStats(collection_id = landsat_collection,
                                target_features = cur_ee,
                                statistic_type = "all",
                                output_name = f"{lc_id}_ndvi_{stat}",
                                scale = 1000,
                                min_threshold = 0.1,
                                water_mask = True,
                                tile_scale = 16
                               )
            zonal_res = zs.runZonalStats()
            res = zonal_res.getInfo()
            pd_res = get_zonal_res(res)
            # Results are paired with the input rows by position
            pd_res["CVEGEO"] = curD["CVEGEO"].values
            pd_res.to_csv(out_file)
        except Exception as err:
            # Keep going with the next chunk, but remember the failed rows and
            # show why they failed. (Exception, not a bare except, so that
            # Ctrl-C still stops the loop.)
            broken_chunks.append(curD)
            tPrint(f'***ERROR{os.path.basename(out_file)}')
            tPrint(repr(err))
    start_idx = end_idx

if broken_chunks:
    # DataFrame.append was removed in pandas 2.0; concatenate the failed
    # chunks in one go instead.
    brokenD = pd.concat(broken_chunks)

In [None]:
# Display the (rows, columns) size of the rows collected in brokenD
brokenD.shape

In [None]:
# Retry the NDVI zonal statistics for the rows in brokenD, using smaller chunks.
importlib.reload(gee)
step_count = 100
steps = list(range(step_count, brokenD.shape[0], step_count))
# The final chunk ends at the last row of brokenD. (This previously used
# inD.shape[0], which put a wrong end index into the last chunk's file name.)
steps.append(brokenD.shape[0])
start_idx = 0
# Drop any stillBrokenD left over from an earlier run. It is only (re)defined
# at the end of this cell when at least one chunk failed again.
try:
    del stillBrokenD
except NameError:
    pass

still_broken_chunks = []
for end_idx in steps:
    curD = brokenD.iloc[start_idx:end_idx,]
    out_file = os.path.join(ndvi_folder, f'MASTER_LIST_NDVI_BROKEN_{start_idx}_{end_idx}.csv')
    if not os.path.exists(out_file):
        try:
            tPrint(os.path.basename(out_file))
            cur_ee = gpd_to_gee(curD, 'CVEGEO')
            # Zonal statistics of the selected Landsat NDVI collection
            zs = gee.ZonalStats(collection_id = landsat_collection,
                                target_features = cur_ee,
                                statistic_type = "all",
                                output_name = f"{lc_id}_ndvi_{stat}",
                                scale = 1000,
                                min_threshold = 0.1,
                                water_mask = True,
                                tile_scale = 16
                               )
            zonal_res = zs.runZonalStats()
            res = zonal_res.getInfo()
            pd_res = get_zonal_res(res)
            # Results are paired with the input rows by position
            pd_res["CVEGEO"] = curD["CVEGEO"].values
            pd_res.to_csv(out_file)
        except Exception as err:
            # Remember the rows that failed again and show why. (Exception,
            # not a bare except, so that Ctrl-C still stops the loop.)
            still_broken_chunks.append(curD)
            tPrint(f'***ERROR{os.path.basename(out_file)}')
            tPrint(repr(err))
    start_idx = end_idx

if still_broken_chunks:
    # DataFrame.append was removed in pandas 2.0; concatenate instead.
    stillBrokenD = pd.concat(still_broken_chunks)

In [None]:
# Check whether the CSV of the most recently processed chunk exists on disk
os.path.exists(out_file)

In [None]:
# Merge all per-chunk NDVI CSVs in ndvi_folder into a single table (final).
ndvi_frames = []
for ndvi_file in os.listdir(ndvi_folder):
    curN = pd.read_csv(os.path.join(ndvi_folder, ndvi_file), index_col=0)
    # Occurrences of the most frequent CVEGEO value, with missing IDs pooled
    # as -1. A count above 1 means the chunk's ID column is not unique.
    bad_id_cnt = curN['CVEGEO'].fillna(-1).value_counts().iloc[0]
    if bad_id_cnt > 1:
        # Restore the IDs from inD, using the row range encoded in the file name
        start_idx = int(ndvi_file.split("_")[-2])
        end_idx = int(ndvi_file.split("_")[-1].replace(".csv", ""))
        # NOTE(review): the *_BROKEN_* files are named after row positions in
        # brokenD, not inD, so this lookup looks wrong for them - confirm.
        curD = inD.iloc[start_idx:end_idx,]
        good_ids = curD['CVEGEO'].values
        curN['CVEGEO'] = good_ids
    tPrint(f"{ndvi_file}: {bad_id_cnt}")
    ndvi_frames.append(curN)

# DataFrame.append was removed in pandas 2.0 (the previous bare try/except then
# silently kept only the last file). pd.concat raises ValueError if the folder
# held no files.
final = pd.concat(ndvi_frames)

In [None]:
# Write the combined NDVI table to the NDVI summary CSV
final.to_csv(ndvi_zonal)

# CHIRPS - Weather 

In [6]:
# Filter the catalog by the "weather" tag, then by the title term "CHIRPS"
results = cat.search_tags("weather").search_title("CHIRPS")

# The first column of the second search hit (row 1) identifies the collection
chirps_collection = results.datasets.iloc[1, 0]
stat = 'MAX'

In [None]:
# Run CHIRPS zonal statistics for inD in chunks of step_count rows. Each chunk
# is written to its own CSV, named after its row range; chunks whose CSV
# already exists are skipped. The loop stops at the first failing chunk.
importlib.reload(gee)
step_count = 100
steps = list(range(step_count, inD.shape[0], step_count))
steps.append(inD.shape[0])
start_idx = 0
# Drop any brokenD left over from an earlier run. It is only (re)defined below
# when a chunk fails.
try:
    del brokenD
except NameError:
    pass
for end_idx in steps:
    curD = inD.iloc[start_idx:end_idx,]
    out_file = os.path.join(chirps_folder, f'MASTER_LIST_CHIRPS_{start_idx}_{end_idx}.csv')
    if not os.path.exists(out_file):
        try:
            tPrint(os.path.basename(out_file))
            cur_ee = gpd_to_gee(curD, 'CVEGEO')
            # Zonal statistics of the selected CHIRPS collection
            zs = gee.ZonalStats(collection_id = chirps_collection,
                                target_features = cur_ee,
                                statistic_type = "sum",
                                output_name = '',
                                scale = 1000,
                                min_threshold = 0,
                                water_mask = True,
                                frequency = 'monthly',
                                temporal_stat = 'max',
                                tile_scale = 16
                               )
            zonal_res = zs.runZonalStats()
            res = zonal_res.getInfo()
            pd_res = get_zonal_res(res)
            # Results are paired with the input rows by position
            pd_res["CVEGEO"] = curD["CVEGEO"].values
            pd_res.to_csv(out_file)
        except Exception as err:
            # Exception, not a bare except, so that Ctrl-C is not recorded as
            # a failed chunk.
            brokenD = curD
            tPrint(f'***ERROR{os.path.basename(out_file)}')
            tPrint(repr(err))
            # Stop at the first failure; curD and brokenD are left holding the
            # failing chunk so it can be inspected.
            break
    start_idx = end_idx

09:51:23	MASTER_LIST_CHIRPS_100_200.csv
09:51:25	MASTER_LIST_CHIRPS_200_300.csv
09:51:28	MASTER_LIST_CHIRPS_300_400.csv
09:51:31	MASTER_LIST_CHIRPS_400_500.csv
09:51:39	MASTER_LIST_CHIRPS_500_600.csv
09:51:48	MASTER_LIST_CHIRPS_600_700.csv
09:51:58	MASTER_LIST_CHIRPS_700_800.csv
09:52:03	MASTER_LIST_CHIRPS_800_900.csv
09:52:07	MASTER_LIST_CHIRPS_900_1000.csv
09:52:09	MASTER_LIST_CHIRPS_1000_1100.csv
09:52:12	MASTER_LIST_CHIRPS_1100_1200.csv
09:52:15	MASTER_LIST_CHIRPS_1200_1300.csv
09:52:17	MASTER_LIST_CHIRPS_1300_1400.csv
09:52:20	MASTER_LIST_CHIRPS_1400_1500.csv
09:52:22	MASTER_LIST_CHIRPS_1500_1600.csv
09:52:25	MASTER_LIST_CHIRPS_1600_1700.csv
09:52:27	MASTER_LIST_CHIRPS_1700_1800.csv
09:52:31	MASTER_LIST_CHIRPS_1800_1900.csv
09:52:34	MASTER_LIST_CHIRPS_1900_2000.csv
09:52:38	MASTER_LIST_CHIRPS_2000_2100.csv
09:52:41	MASTER_LIST_CHIRPS_2100_2200.csv
09:52:44	MASTER_LIST_CHIRPS_2200_2300.csv
09:52:47	MASTER_LIST_CHIRPS_2300_2400.csv
09:52:50	MASTER_LIST_CHIRPS_2400_2500.csv
09:53:01	

10:04:14	MASTER_LIST_CHIRPS_19300_19400.csv
10:04:17	MASTER_LIST_CHIRPS_19400_19500.csv
10:04:19	MASTER_LIST_CHIRPS_19500_19600.csv
10:04:22	MASTER_LIST_CHIRPS_19600_19700.csv
10:04:24	MASTER_LIST_CHIRPS_19700_19800.csv
10:04:27	MASTER_LIST_CHIRPS_19800_19900.csv
10:04:29	MASTER_LIST_CHIRPS_19900_20000.csv
10:04:33	MASTER_LIST_CHIRPS_20000_20100.csv
10:04:38	MASTER_LIST_CHIRPS_20100_20200.csv
10:04:46	MASTER_LIST_CHIRPS_20200_20300.csv
10:04:54	MASTER_LIST_CHIRPS_20300_20400.csv
10:05:07	MASTER_LIST_CHIRPS_20400_20500.csv
10:05:17	MASTER_LIST_CHIRPS_20500_20600.csv
10:05:27	MASTER_LIST_CHIRPS_20600_20700.csv
10:05:35	MASTER_LIST_CHIRPS_20700_20800.csv
10:05:40	MASTER_LIST_CHIRPS_20800_20900.csv
10:05:43	MASTER_LIST_CHIRPS_20900_21000.csv
10:05:52	MASTER_LIST_CHIRPS_21000_21100.csv
10:05:56	MASTER_LIST_CHIRPS_21100_21200.csv
10:06:00	MASTER_LIST_CHIRPS_21200_21300.csv
10:06:03	MASTER_LIST_CHIRPS_21300_21400.csv
10:06:07	MASTER_LIST_CHIRPS_21400_21500.csv
10:06:10	MASTER_LIST_CHIRPS_2150

In [20]:
# Re-run the CHIRPS zonal statistics for the current chunk (curD) directly,
# without the error handling of the chunk loop.
cur_ee = gpd_to_gee(curD, 'CVEGEO')
chirps_kwargs = {
    'collection_id': chirps_collection,
    'target_features': cur_ee,
    'statistic_type': "sum",
    'output_name': '',
    'scale': 1000,
    'min_threshold': 0,
    'water_mask': True,
    'frequency': 'monthly',
    'temporal_stat': 'max',
    'tile_scale': 16,
}
zs = gee.ZonalStats(**chirps_kwargs)
zonal_res = zs.runZonalStats()
res = zonal_res.getInfo()
# Tabulate the results and attach the chunk's IDs by position
pd_res = get_zonal_res(res)
pd_res["CVEGEO"] = curD["CVEGEO"].to_numpy()

# Population Density

In [None]:
# Summarise the population raster (SUM, MIN, MAX, MEAN) within each feature of inD.
# NOTE(review): inD is not reprojected here, unlike in the construction-density
# cell - confirm that inD and pop_file share a CRS at this point.
with rasterio.open(pop_file) as inR:
    # The context manager closes the raster handle once the statistics are done
    res = rMisc.zonalStats(inD, inR, minVal=0)
pop_res = pd.DataFrame(res, columns=['SUM','MIN','MAX','MEAN'])
# Attach IDs by position (as the NDVI/CHIRPS cells do) rather than by index
# label: index alignment fails or misaligns when inD's index is not 0..n-1.
# Assumes zonalStats returns one row per feature, in inD's row order.
pop_res['CVEGEO'] = inD['CVEGEO'].values
pop_res.to_csv(out_pop_summaries)

# Construction Density

In [None]:
# Categorical zonal statistics of the GHS-SMOD raster: one column (c1..c6) per
# raster value 1..6.
ghsl_classes = [1,2,3,4,5,6]
with rasterio.open(ghsl_file) as inR:
    # Reproject the zones to the raster's CRS. inD stays reprojected for the
    # cells that follow.
    inD = inD.to_crs(inR.crs)
    res = rMisc.zonalStats(inD, inR, rastType='C', unqVals = ghsl_classes)
ghsl_res = pd.DataFrame(res, columns = [f'c{x}' for x in ghsl_classes])
# Attach IDs by position (as the NDVI/CHIRPS cells do) rather than by index
# label: index alignment fails or misaligns when inD's index is not 0..n-1.
# Assumes zonalStats returns one row per feature, in inD's row order.
ghsl_res['CVEGEO'] = inD['CVEGEO'].values
ghsl_res.to_csv(out_ghsl)

# Infrastructure Density

# Urbanization

In [None]:
# Clip the global population raster to the study area once; reuse it afterwards
if not os.path.exists(local_pop):
    with rasterio.open(pop_file) as global_pop:
        rMisc.clipRaster(global_pop, inD, local_pop)

# Make sure the zones are in the CRS of the local population raster
with rasterio.open(local_pop) as inR:
    if inD.crs != inR.crs:
        inD = inD.to_crs(inR.crs)

urb = UrbanRaster.urbanGriddedPop(local_pop)

# Urban extents (density 300, total population 5000) and the population inside
# them, summarised per zone.
try:
    urban_extents = urb.calculateUrban(densVal=300, totalPopThresh=5000,
                                       smooth=False, queen=False,
                                       raster=local_urban, raster_pop=local_urban_pop)
    if not os.path.exists(urban_pop):
        res = rMisc.zonalStats(inD, local_urban_pop, minVal=0)
        pop_res = pd.DataFrame(res, columns=['SUM','MIN','MAX','MEAN'])
        # Positional ID assignment; see the note on index alignment below
        pop_res['CVEGEO'] = inD['CVEGEO'].values
        pop_res.to_csv(urban_pop)
    urban_extents.to_file(urban_extents_file)
except Exception as err:
    # Still best effort, but report the cause instead of hiding it
    print("Could not calculate urban popualtion")
    print(repr(err))

# High-density urban extents (density 1500, total population 50000), same outputs
try:
    hd_extents = urb.calculateUrban(densVal=1500, totalPopThresh=50000,
                                    smooth=True, queen=True,
                                    raster=local_urban_hd, raster_pop=local_urban_hdpop)
    res = rMisc.zonalStats(inD, local_urban_hdpop, minVal=0)
    pop_res = pd.DataFrame(res, columns=['SUM','MIN','MAX','MEAN'])
    # IDs are attached by position rather than by index label: index alignment
    # fails or misaligns when inD's index is not 0..n-1. Assumes zonalStats
    # returns one row per feature, in inD's row order.
    pop_res['CVEGEO'] = inD['CVEGEO'].values
    pop_res.to_csv(hd_pop)
    hd_extents.to_file(hd_urban_extents_file)
except Exception as err:
    print("Could not calculate high density urban popualtion")
    print(repr(err))

# Market Access

In [None]:
# Destinations: the centroid of every urban / high-density urban extent
urban_dests = gpd.read_file(urban_extents_file)
urban_dests['geometry'] = urban_dests['geometry'].apply(lambda x: x.centroid)

hd_urban_dests = gpd.read_file(hd_urban_extents_file)
hd_urban_dests['geometry'] = hd_urban_dests['geometry'].apply(lambda x: x.centroid)

# Clip the global friction surface to the bounding box of the AGEBS once
if not os.path.exists(local_access_map):
    # Read the extent into its own variable. Re-assigning inD here changed the
    # global zones (and their CRS) only on runs where the clipped raster did
    # not exist yet, so later cells saw a different inD from run to run.
    agebs_extent = gpd.read_file(master_agebs)
    inD_bounds = gpd.GeoDataFrame(pd.DataFrame([[1,box(*agebs_extent.total_bounds)]],
                                               columns=['id','geometry']),
                                  geometry="geometry", crs=agebs_extent.crs)
    with rasterio.open(global_access_map) as global_friction:
        rMisc.clipRaster(global_friction, inD_bounds, local_access_map)

# The raster handle is closed once both travel-time rasters have been written
with rasterio.open(local_access_map) as inR:
    # First band of the friction surface, scaled by 1000
    # NOTE(review): presumably a unit conversion (per metre -> per km) - confirm
    frictionD = inR.read()[0,:,:] * 1000
    mcp = graph.MCP_Geometric(frictionD)

    travel_costs, traceback = ma.calculate_travel_time(inR, mcp, urban_dests, urban_access)
    travel_costs, traceback = ma.calculate_travel_time(inR, mcp, hd_urban_dests, hd_urban_access)


In [None]:
# Standardize population to market access, then compute the population-weighted
# access value per zone: SUM(population * access) / SUM(population).
# Zones whose population SUM is 0 end up with NaN or inf.
pop_stats = pd.read_csv(out_pop_summaries)

with rasterio.open(local_pop) as inR1, rasterio.open(urban_access) as inR2:
    sPop, metadata = rMisc.standardizeInputRasters(inR1, inR2)

# The same standardized population grid is used for both access rasters, as
# before. The result column is named 'urban_access' in both output files.
for access_raster, out_csv in [(urban_access, urban_access_res),
                               (hd_urban_access, hd_urban_access_res)]:
    # Context managers close the raster handles that were previously left open
    with rasterio.open(access_raster) as inR2:
        access_data = inR2.read()
    pop_access = sPop * access_data

    with rMisc.create_rasterio_inmemory(metadata, pop_access) as pop_access_R:
        res = rMisc.zonalStats(inD, pop_access_R, minVal=0)
        res = pd.DataFrame(res, columns=['SUM','MIN','MAX','MEAN'])

    res['urban_access'] = res['SUM']/pop_stats['SUM']
    res.to_csv(out_csv)

# Nighttime Lights

In [None]:
# Get a list of the VIIRS images in S3. This example leverages the GOST teams S3 bucket
s3_base = 's3://wbgdecinternal-ntl/'
# NOTE(review): the listing file sits under a YEM temp folder although this
# notebook processes MEX - confirm it is the intended listing.
ntl_file_list = "/home/wb411133/temp/YEM/AWS_NTL_S3.txt"
focal_tile = "TILE1"

all_files = []
with open(ntl_file_list, 'r') as in_aws:
    for line in in_aws:
        if focal_tile in line and 'avg_rade9' in line:
            # The last space-separated token of a listing line is the object
            # key. Strip the line ending explicitly: dropping the last
            # character unconditionally cut a real character off the key when
            # the final line had no trailing newline.
            s3_key = line.rstrip("\r\n").split(" ")[-1]
            all_files.append(os.path.join(s3_base, s3_key))

# Preview the first few matches
all_files[:5]

In [None]:
# Run zonal statistics against the admin area: one new column in inD per
# nighttime lights image, holding the zonal SUM.
for cur_tif in all_files:
    res = rMisc.zonalStats(inD, cur_tif, minVal=0.05)
    res = pd.DataFrame(res,columns=['SUM','MIN','MAX','MEAN'])
    # The column is named after the sixth "/"-separated token of the image
    # path. Values are assigned by position (as the NDVI/CHIRPS cells do):
    # index alignment silently misaligns when inD's index is not 0..n-1.
    # Assumes zonalStats returns one row per feature, in inD's row order.
    inD[cur_tif.split("/")[5]] = res['SUM'].values
    tPrint(os.path.basename(cur_tif))

# Attribute table without the geometry column
ntl_res = pd.DataFrame(inD.drop(['geometry'], axis=1))

In [None]:
# Write the nighttime lights zonal table to CSV
ntl_res.to_csv(ntl_zonal_csv)