## Step 6 - Catchment area generation

This notebook convert OD matrices into spatial catchment areas for destinations. The catchment area extents are determined by cutoff time extents (1 hour, 2 hours, etc.). Note that catchments are exclusive -- even if an origin location in reality has effective acces to 2 or more locations, it is only considered part of the nearest destination's catchment.

In [21]:
import os, sys
import time

# data science basics
import pandas as pd
import geopandas as gpd
import numpy as np

# vector data basics
import shapely
from shapely import wkt
from shapely.wkt import loads
from shapely.ops import transform
from shapely.geometry import Point, MultiPoint

# raster data basics
import rasterio
from rasterio.profiles import DefaultGTiffProfile
from rasterio.transform import from_origin
from rasterio.features import rasterize

# other
import pyproj
import geopy

### Setup

Functions

In [22]:
import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

In [23]:
# function for sorting alphanumerically

import re

def sorted_nicely( l ): 
    """ Sort the given iterable in the way that humans expect.""" 
    convert = lambda text: int(text) if text.isdigit() else text 
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
    return sorted(l, key = alphanum_key)

# funciton for sorting matrices smallest to largest, by origin ID then destination ID

def sort_od_matrix(od_matrix):
    
    # sort by O_IDs, then dest node IDs
    od_matrix = od_matrix.sort_values('Unnamed: 0').reindex(sorted_nicely(od_matrix.columns), axis=1)

    # reset O_ID column to the front
    od_matrix = od_matrix[ ['Unnamed: 0'] + [ col for col in od_matrix.columns if col != 'Unnamed: 0' ] ]

    # set the Dest_ID column back to index so the shape is the same as the dWeight shape
    od_matrix.set_index('Unnamed: 0',inplace=True)

Parameters

In [24]:
# pd.set_option('max_columns',None)

In [25]:
simplif_meters = 25

In [26]:
source_epsg = 4326
target_epsg = 3106

In [27]:
# WorldPop data parameters

# constraint_status = 'constrained'
constraint_status = 'unconstrained'

# wp_res = 100
wp_res = 250
# wp_res = '1k'

In [28]:
# Production date for outputs being used

# prod_date = '210312'
# prod_date = '210329'
prod_date = '210503'

File paths

In [29]:
# Local folders
input_pth = r'inputs\\dests'
interm_pth = r'intermediate'
fin_pth = r'final'
res_pth = r'results'

# Shared drive folders
tab_pth = r'../../../Tabular'
geo_pth = r'../../../GEO'
origin_folder = r'..\..\..\GEO\Population'

Define destinations

In [30]:
# # Looping lists

# dest_lst = ['All_cities', 'Minor_cities', 'Dhaka_Chitt',\
#             'Dry_ports', 'River_ports', 'Deep_sea_ports',\
#             'All_SEZs', 'Functioning_SEZs']

dest_lst = ['All_SEZs', 'Functioning_SEZs']

# destination = ['current_PopOrig_all_cities', 'current_PopOrig_deep_sea_ports', 'current_PopOrig_DhakaChitt', 'current_PopOrig_dry_ports', 'current_PopOrig_minor_cities',\
#                'current_CityOrig_all_cities', 'current_CityOrig_deep_sea_ports', 'current_CityOrig_DhakaChitt', 'current_CityOrig_dry_ports', 'current_CityOrig_minor_cities']

In [31]:
# Looping dicts

# dests_time_filt_dct = {'All_cities_PopOrigins' : {}, 'Deep_sea_ports_PopOrigins' : {}, 'Dhaka_Chitt_PopOrigins' : {}, 'Dry_ports_PopOrigins' : {}, 'Minor_cities_PopOrigins' : {},\
#                       'All_cities_CityOrigins' : {}, 'Deep_sea_ports_CityOrigins' : {}, 'Dhaka_Chitt_CityOrigins' : {}, 'Dry_ports_CityOrigins' : {}, 'Minor_cities_CityOrigins': {}}

# dests_pop_time_filt_dct = {'All_cities_PopOrigins' : {}, 'Deep_sea_ports_PopOrigins' : {}, 'Dhaka_Chitt_PopOrigins' : {}, 'Dry_ports_PopOrigins' : {}, 'Minor_cities_PopOrigins' : {},\
#                       'All_cities_CityOrigins' : {}, 'Deep_sea_ports_CityOrigins' : {}, 'Dhaka_Chitt_CityOrigins' : {}, 'Dry_ports_CityOrigins' : {}, 'Minor_cities_CityOrigins': {}}

# dests_time_filt_dct = {'All_cities' : {}, 'Minor_cities' : {}, 'Dhaka_Chitt' : {},\
#                        'Dry_ports' : {}, 'River_ports' : {}, 'Deep_sea_ports': {},\
#                        'All_SEZs' : {}, 'Functioning_SEZs' : {}}

# dests_pop_time_filt_dct = {'All_cities' : {}, 'Minor_cities' : {}, 'Dhaka_Chitt' : {},\
#                            'Dry_ports' : {}, 'River_ports' : {}, 'Deep_sea_ports' : {},\
#                            'All_SEZs' : {}, 'Functioning_SEZs' : {}}

dests_time_filt_dct = {'All_SEZs' : {}, 'Functioning_SEZs' : {}}

dests_pop_time_filt_dct = {'All_SEZs' : {}, 'Functioning_SEZs' : {}}

In [32]:
# # rename stuff that's badly named
# import re

# for key in dests_pop_time_filt_dct.keys():
#     print(re.search('^(.*?_){2}',key).group())

Time filters to loop over (in minutes)

In [33]:
# Minute-wise time cutoffs as needed

# time_filters = [60, 90, 120, 180]
time_filters = [15,30,45,60,90]

### Tabular data transformations

New, per destination

In [34]:
# Loop over each destination, computing all the relevant, filtered and aggregated dataframes for later comptuational usage

for dest,v in dests_pop_time_filt_dct.items():
    
    print(dest)
    
    # read in od grid for calculations

    dest_origs = pd.read_csv(os.path.join(res_pth,prod_date,f'final_od_grid_{dest}_PopOrigins_{constraint_status}_{wp_res}m_res_{simplif_meters}m_simplification.csv'))

    # make dest_origs spatial 

    dest_origs['geometry'] = dest_origs['geometry'].apply(wkt.loads)
    dest_origs_gdf = gpd.GeoDataFrame(dest_origs,geometry='geometry')
    dest_origs_gdf['lon'] = dest_origs_gdf.geometry.x
    dest_origs_gdf['lat'] = dest_origs_gdf.geometry.y

    # Calculate raw filtered origin dataframes, populate to a dict

    raw_time_filt_dct = {}

    for t in time_filters:

        df = dest_origs_gdf[dest_origs_gdf['PLOT_TIME_MINS'] <= t]  
        raw_time_filt_dct.update({t:df})

    # Calculate the aggregate population for the *nearest* destination per time range

    pop_time_filt_dct = {}

    for k, v in raw_time_filt_dct.items():
        df = pd.pivot_table(v,values='VALUE',index='D_ID',aggfunc='sum')\
                .rename(columns={'VALUE' : 'Pop'})\
                .reset_index()
        pop_time_filt_dct.update({k:df})

    # Insert these dics of filtered, aggregated data frames as the values of the master destination dict

    dests_time_filt_dct[dest] = raw_time_filt_dct
    dests_pop_time_filt_dct[dest] = pop_time_filt_dct


All_SEZs
Functioning_SEZs


### Consolidate population per time band per destination and export to a shapefile

In [35]:
for dest_key, val_dct in dests_pop_time_filt_dct.items():
    
    print(dest_key)
    
    dest_gdf = pd.read_csv(os.path.join(fin_pth,prod_date,f'{dest_key}_{constraint_status}_{wp_res}m_res_{simplif_meters}m_simplification_snapped.csv'))
    
#     # rename Destination columns
#     if 'City' in dest_gdf.columns:
#         dest_gdf.rename({'City':'Destination'},axis=1,inplace=True)
#     elif 'RIVER_PORT' in dest.gdf.columns:
#         dest_gdf.rename({'RIVER_PORT':'Destination'},axis=1,inplace=True)
#     else:
#         None

    # load geometry of GDF
    
    dest_gdf['geometry'] = dest_gdf['geometry'].apply(wkt.loads)
    dest_gdf = gpd.GeoDataFrame(dest_gdf,geometry='geometry')
    dest_gdf.rename({'NN':'D_ID'},axis=1,inplace=True)
    dest_gdf.sort_values(by='D_ID',inplace=True)
    
    # Merge the population with the destination GDF, then replace the val_dct with that, renamed for interpretability
    
    for t, val_df in val_dct.items():
        
        time_cutoff = str(t) + 'min'
        
        dest_gdf = pd.merge(dest_gdf,val_df.rename(columns={'Pop' : time_cutoff}),how="left",on='D_ID')
#         print(dest_gdf.head())
#         print(val_df.head())
        val_dct.update({t:dest_gdf[['D_ID','Destination',time_cutoff]].rename(columns={time_cutoff : 'Pop'})})
        
    dest_gdf.to_file(os.path.join(res_pth,prod_date,f"spatial/{dest_key}_catchment_pops.shp"),driver="ESRI Shapefile")

All_SEZs
Functioning_SEZs


In [36]:
dest_gdf.filter(regex='Destination|min').head(10)

Unnamed: 0,Destination,15min,30min,45min,60min,90min
0,"Gazaria: Gajaria Economic Zone, Abdul Monem Ec...",66674.635033,231430.1,340902.3,392615.5,413670.1
1,Gopalganj Sadar: Gopalganj Economic Zone – 2,43343.070404,103099.9,185770.9,274882.2,508091.8
2,Meghna: Cumilla Economic Zone,20550.10336,74731.2,179736.3,312310.2,743862.9
3,"Jamalpur Sadar: Jamalpur Economic Zone, Jamalp...",100939.56839,228651.3,373519.0,636517.0,1328500.0
4,Patiya: Patia Economic Zone,143467.746347,2417314.0,4005090.0,4600692.0,5266895.0
5,Shibalaya: Manikganj Economic Zone (Unused lan...,10672.052635,34379.69,77323.75,130465.7,369779.6
6,Gowainghat: Sylhet Special Economic Zone,5962.576889,31479.78,48144.63,85381.16,261111.6
7,Debiganj: Panchagarh Economic Zone,37004.454521,168589.6,371465.1,585479.9,1137145.0
8,Chunarughat: Habiganj Economic Zone,1773.766033,11879.63,24070.91,37000.87,126369.0
9,Sreepur: Shreepur Economic Zone,45441.737579,284812.6,633658.9,1036795.0,2118544.0


### Convert CSV to raster

Two options -- rasterio way below (https://stackoverflow.com/questions/62472750/how-to-rasterize-a-pandas-dataframe-with-many-points-per-pixel) or gdal_grid method (https://gis.stackexchange.com/questions/254330/python-gdal-grid-correct-use)

also useful : https://gis.stackexchange.com/questions/279953/numpy-array-to-gtiff-using-rasterio-without-source-raster

Define a function for outputting raster catchment extents, optionally with the per-cell population in the second band

In [37]:
def extent_catch(dest,filt_val,filtered_df,pop=False):

    # read in existing worldpop raster to provide metadata conditions for new layers

    with rasterio.open(os.path.join(origin_folder,f'WorldPop/{constraint_status}/bgd_ppp_2020_UNadj_{constraint_status}_{wp_res}m_3106.tif')) as wp_src:
        prof = wp_src.profile
        if pop == False:
            None
        else:
            prof.update(count=2) # set number of bands

    # Rasterize by nearest destination ID and filter out above maximum value

    with rasterio.open(f"results/{prod_date}/spatial/{dest}_{filt_val}min_catch.tif", 'w+',**prof) as out:

        out.nodata = -9999
                       
        # Read in the respective bands for later writing
        out_arr1 = out.read(1)

        # create a generator of geom, value pairs to use in rasterizing, then rasterize
        dest_shapes = ((geom, dest_id) for geom, dest_id in zip(filtered_df.geometry, filtered_df["D_ID"].astype(int)))
        dest_burned = rasterize(shapes=dest_shapes, fill=0, out=out_arr1, transform=out.transform)
        
        # write band
        
        out.write_band(1, dest_burned)
                       
        if pop == False:
            None
        else:
            out_arr2 = out.read(2)

            time_shapes = ((geom, time_to_reach) for geom, time_to_reach in zip(filtered_df.geometry, filtered_df["PLOT_TIME_MINS"].astype(int)))
            time_burned = rasterize(shapes=time_shapes, fill=0, out=out_arr2, transform=out.transform)

            out.write_band(2, time_burned)
            

Loop over each filtered dataframe of origins and output as a raster to the prod_date folder

In [38]:
for dest_key, val_dct in dests_time_filt_dct.items():
    print(dest_key)
    for t, v in val_dct.items():
        print(t)
        extent_catch(dest_key,t,v,pop=True)

All_SEZs
15
30
45
60
90
Functioning_SEZs
15
30
45
60
90


#### Convert raster to polygon

In [39]:
# Define a function to convert rasters to polygons and join in the population covered by each catchment

def catch_rast_to_poly(dest_name, catch_rast, rast_profile, time_filt, dest_pop_df):

    # Start timer
    
    func_start = time.time()

    # open each created raster
    
    with rasterio.open(catch_rast, 'r',**rast_profile) as rast:

        # populate geoms list

        results = (
            {'properties': {'D_ID': v}, 'geometry': s}
            for i, (s, v) 
            in enumerate(
                rasterio.features.shapes(rast.read(1), transform=rast.transform)))

        geoms = list(results)

        # convert to GDF, clean up, and dissolve

        catch_poly = gpd.GeoDataFrame.from_features(geoms)
        catch_poly['D_ID'] = catch_poly['D_ID'].astype(int)
        catch_poly['D_ID'].replace(-9999,0,inplace=True) # replace nulls with 0
        catch_poly = catch_poly.dissolve(by='D_ID')
        
        # join in total population, drop uncovered areas

        catch_poly = pd.merge(catch_poly,dest_pop_df,how='left',on='D_ID')
        catch_poly = catch_poly[catch_poly['D_ID'] != 0]
        catch_poly.crs = f"EPSG:{target_epsg}"
        catch_poly = catch_poly.to_crs(source_epsg)

        # export to shapefile

        catch_poly.to_file(f"results/{prod_date}/spatial/{dest_name}_{time_filt}min_catch_poly.shp",driver="ESRI Shapefile")
        return catch_poly
            
    # Report function time
    
    func_end = time.time()
    print(f'time elapsed for summing {dest_name}')
    print(str((func_end - func_start) / 60) + ' minutes')

Create polygons from the catchment rasters and join in the populations covered for each destination

In [40]:
with rasterio.open(os.path.join(origin_folder,f'WorldPop/{constraint_status}/bgd_ppp_2020_UNadj_{constraint_status}_{wp_res}m_{source_epsg}.tif')) as wp_src:
    
    prof = wp_src.profile
    prof.update(count=2)

    for dest_key, val_dct in dests_pop_time_filt_dct.items():
        
        # Keep track of which destination is being processed
        print(dest_key)
        
        for t, val_df in val_dct.items():
            
            # Keep track of which travel time is being processed
            print(t)
            
            catch_rast = f"results/{prod_date}/spatial/{dest_key}_{t}min_catch.tif"

            catch_rast_to_poly(dest_name=dest_key,catch_rast=catch_rast,rast_profile = prof,time_filt=t,dest_pop_df=val_df)

All_SEZs
15
30
45
60
90
Functioning_SEZs
15
30
45
60
90
