## Country-level INFRA-SAP


- Origins: Population grid (Worldpop downsampled to 500 meters)
- Destinations: Cities, airports, border crossings, and ports

Typical access analysis with two adjustments:
    1. Extract different sets of destinations from OD
    2. Join travel time to origin grid based on "NN with the fastest route" (not necessarily closest NN)

In [2]:
import os, sys, time, importlib

import geopandas as gpd
import pandas as pd
import networkx as nx
sys.path.append('/home/wb514197/Repos/GOSTnets')

import GOSTnets as gn
import rasterio as rio
from osgeo import gdal
import GOSTnets.calculate_od_raw as calcOD
import numpy as np
from shapely.geometry import Point

sys.path.append('/home/wb514197/Repos/INFRA_SAP')
from infrasap import aggregator
from shapely.wkt import loads

In [3]:
# This is a Jupyter Notebook extension which reloads all of the modules whenever you run the code
# This is optional but good if you are modifying and testing source code
%load_ext autoreload
%autoreload 2

In [5]:
country = 'costarica'
iso3 = 'CRI'
epsg = 32616

### Load origins and graph

In [6]:
base_in = "/home/public/Data/PROJECTS/INFRA_SAP"
in_folder = os.path.join(base_in, iso3)

# define data paths
focal_admin2 = os.path.join(in_folder, "admin.shp")
focal_osm = os.path.join(in_folder, f"{country}-latest.osm.pbf")
pop_name = "WP_2020_1km"
wp_1km = os.path.join(in_folder, f"{pop_name}.tif")
urban_extents = os.path.join(in_folder, "urban_extents.shp")
airports = os.path.join(in_folder, "airports.shp")
ports = os.path.join(in_folder, "ports.shp")
borders = os.path.join(in_folder, "borders.shp")
G_path = os.path.join(in_folder, 'graph', f"G_{iso3}.pickle")

base_out = "/home/wb514197/data/INFRA_SAP" # GOT permission denied using public 
out_folder = os.path.join(base_out, iso3)
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

Convert **WP_2020_1km.tif** into a point GeoData frame

In [7]:
out_pop_csv = os.path.join(out_folder, f"{pop_name}.csv")
wp_df = pd.read_csv(out_pop_csv, sep=' ')
wp_df.rename(columns={"Z":"Pop"}, inplace=True)
wp_df = wp_df.loc[wp_df.Pop!=-99999.0].copy()
geoms = [Point(xy) for xy in zip(wp_df.X, wp_df.Y)]
wp_df.drop(["X","Y"], axis=1, inplace=True)
crs = 'EPSG:4326'
origins = gpd.GeoDataFrame(wp_df, crs=crs, geometry=geoms)
origins['pointid'] = origins.index

### Prepare Graph

In [8]:
G_time = nx.read_gpickle(os.path.join(out_folder, 'graph', f'G_{iso3}_Salt.pickle'))

#### Select largest graph (again)

In [9]:
list_of_subgraphs = [G_time.subgraph(c).copy() for c in sorted(nx.strongly_connected_components(G_time), key=len, reverse=True)]

In [10]:
G_largest = list_of_subgraphs[0]

### Prepare destinations

In [20]:
from shapely.wkt import loads

In [16]:
def load_csv(csv_path, geometry = 'geometry', crs = 'epsg:4326'):
    df = pd.read_csv(csv_path, index_col=0)
    df[geometry] = df[geometry].apply(loads)
    gdf = gpd.GeoDataFrame(df, crs = crs)
    return(gdf)

In [17]:
dest_all = load_csv(os.path.join(out_folder, 'destination_all.csv'))

In [21]:
len(origins), len(dest_all)

(62322, 22)

### Snap origins and destinations

#### Snap to origins to 5 nearest nodes

In [22]:
utm = f"EPSG:{epsg}"

In [23]:
%%time
# this function returns a dictionary of origin IDs, with a list of 5 NNs, and a corresponding list of distances
origins_snapped_dict = gn.pandana_snap_to_many(G_largest, origins, source_crs='epsg:4326', target_crs=utm, 
                                               add_dist_to_node_col = True, k_nearest=5, origin_id='pointid')

CPU times: user 9.74 s, sys: 172 ms, total: 9.92 s
Wall time: 9.89 s


In [24]:
dest_snapped = gn.pandana_snap_c(G_largest, dest_all, source_crs='epsg:4326', target_crs=utm,
                                 add_dist_to_node_col = False)

In [25]:
# origins_unique_nn = list(set(origins_snapped['NN']))
dest_nn = list(dest_snapped['NN'])
list_origins_NN = []
for each in origins_snapped_dict.values():
    list_origins_NN += each['NN']
origins_unique_nn = list(set(list_origins_NN))

In [26]:
%%time
curOD = gn.calculate_OD(G_largest, origins_unique_nn, dest_nn, fail_value = 999999999, weight='length')

CPU times: user 233 ms, sys: 905 µs, total: 234 ms
Wall time: 233 ms


In [27]:
curOD[curOD==999999999]

array([], dtype=float64)

In [28]:
curOD.shape

(2471, 22)

In [29]:
od_df = pd.DataFrame(curOD, index=origins_unique_nn, columns=dest_nn)

In [30]:
od_df.head()

Unnamed: 0,1750,new_obj_1163,4763,new_obj_264,new_obj_1030_599_2116,1750.1,new_obj_1754_541_1904,new_obj_481_119_411,5267_50_148,new_obj_1139,...,new_obj_1805_627_2200,new_obj_311,6805,6805.1,6805.2,6805.3,new_obj_1867,new_obj_1867.1,new_obj_1867.2,new_obj_1867.3
8195,159319.697987,71306.821165,5390.790836,143229.577052,66367.946227,159319.697987,202741.704519,170611.029736,322299.145659,3222.975087,...,228337.373992,115338.176101,308883.904514,308883.904514,308883.904514,308883.904514,264777.178387,264777.178387,264777.178387,264777.178387
8196,16463.270385,239256.197692,166431.07818,266251.578821,234317.322754,16463.270385,366535.423191,338560.406263,465913.215579,171167.972418,...,355606.554325,265135.347898,349181.127963,349181.127963,349181.127963,349181.127963,387389.358719,387389.358719,387389.358719,387389.358719
new_obj_1052,234133.125196,9888.607571,79733.174092,207288.842829,11120.867687,234133.125196,140990.214469,93122.215822,339644.479587,74468.822129,...,166585.883943,132683.510028,326229.238441,326229.238441,326229.238441,326229.238441,203025.688338,203025.688338,203025.688338,203025.688338
new_obj_469,140385.128919,94781.714848,22574.272153,116446.164834,89842.83991,140385.128919,226216.598202,194085.92342,316107.801591,28080.585211,...,251812.267676,113924.664168,296648.811758,296648.811758,296648.811758,296648.811758,288252.07207,288252.07207,288252.07207,288252.07207
22,266314.699024,38291.693648,111914.747919,243157.440926,46989.465784,266314.699024,93143.189705,115670.55271,375513.077683,106650.395956,...,118738.859179,168552.108125,362097.836538,362097.836538,362097.836538,362097.836538,155178.663574,155178.663574,155178.663574,155178.663574


For each origin set of 5 (k) possible NN:
    - Add snapping dist (in time) + time to a destination
    - Which destination? min time from all of them won't necessarily work, so we need to find the closest destination for each origin, and select the NN which yields the fastest travel time to that dest
    - closest_dest.idx should match the OD column order.

In [32]:
%%time
closest_dest = gn.pandana_snap_points(origins, dest_all, source_crs='epsg:4326', target_crs=utm,
                                      add_dist_to_node_col=True)

CPU times: user 3.16 s, sys: 40.5 ms, total: 3.2 s
Wall time: 3.2 s


In [33]:
closest_dest = closest_dest.set_index('pointid')

In [34]:
closest_dest.head()

Unnamed: 0_level_0,Pop,geometry,idx,idx_dist
pointid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
177,265.94754,POINT (651070.347 1239876.698),21,605.39366
178,197.396774,POINT (651980.301 1239880.985),21,1128.859165
179,33.11467,POINT (652890.259 1239885.298),21,1959.147076
180,15.100848,POINT (653800.219 1239889.637),21,2838.718257
181,8.140772,POINT (654710.182 1239894.001),21,3732.898671


In [35]:
%%time
fastest_nn = []
fastest_dist = []
custom_speed = 30 # km/h

for pointid, items in origins_snapped_dict.items():
    dest_index = closest_dest.loc[pointid].idx
    nn_list = items['NN']
    dist_list = items['NN_dist']
    total_dist_list = []
    for i in range(0, len(nn_list)):
        dist_snapping = dist_list[i]
#         time_snapping = ((dist_list[i] / 1000) / custom_speed) * 60 * 60
        dist_to_dest = od_df.loc[nn_list[i]].iloc[dest_index]
#         time_to_dest = od_df.loc[nn_list[i]].iloc[dest_index]
        total_dist = dist_snapping+dist_to_dest
#         total_time = time_snapping+time_to_dest
        total_dist_list.append(total_dist)
#         print(f"id: {nn_list[i]}, snapping dist (km): {dist_list[i]/1000:.2f}, time to dest (min): {(total_time/60)/60:.2f}")
    min_pos = total_dist_list.index(min(total_dist_list))
    fastest_nn.append(nn_list[min_pos])
    fastest_dist.append(dist_list[min_pos])
#     origins_snapped_smart.loc[pointid, "NN"] = nn_list[min_pos]
#     origins_snapped_smart.loc[pointid, "NN_dist"] = dist_list[min_pos]

CPU times: user 43.6 s, sys: 39.5 ms, total: 43.6 s
Wall time: 43.6 s


In [38]:
origins_snapped = origins.copy().set_index('pointid')
origins_snapped['NN'] = pd.Series(fastest_nn, index = origins_snapped.index)
origins_snapped['NN_dist'] = pd.Series(fastest_dist, index = origins_snapped.index)
origins_snapped['pointid'] = origins_snapped.index
origins_snapped['NN_dist_hours'] = ((origins_snapped.NN_dist / 1000) / custom_speed)

In [39]:
origins_join = origins_snapped.join(od_df, on='NN')

In [43]:
all(origins_join.columns[6:] == dest_snapped.NN)

True

In [44]:
origins_join_rename = origins_join.copy()
origins_join_rename.columns = pd.MultiIndex.from_arrays([['origin' for each in origins_snapped.columns]+list(dest_snapped.dest_type), origins_snapped.columns.append(dest_snapped.index)])

Add snapping distance

In [46]:
origins_join2 = origins_join_rename.apply(lambda x: (x + origins_join_rename.origin.NN_dist)/1000 if x.name[1] in dest_snapped.index else x)

In [49]:
out_folder

'/home/wb514197/data/INFRA_SAP/CRI'

In [50]:
origins_join2.to_csv(os.path.join(out_folder, 'OD_08_03_Distances.csv'))

### Make rasters of min travel time to each dest

In [53]:
raster_path = wp_1km

In [54]:
output_path = os.path.join(out_folder, "travel_distance")
if not os.path.exists(output_path):
    os.mkdir(output_path)

In [56]:
# CHECK THAT MOST POPULATED CITY IS THE CAPITAL
cap_idx = dest_all.sort_values('Pop', ascending=False).iloc[[0]].index[0]

In [57]:
city_min = pd.DataFrame(origins_join2['city'].min(axis=1), columns=["dist_city"])
ports_min = pd.DataFrame(origins_join2['port'].min(axis=1), columns=["dist_port"])
airports_min = pd.DataFrame(origins_join2['airport'].min(axis=1), columns=["dist_airport"])
borders_min = pd.DataFrame(origins_join2['border'].min(axis=1), columns=["dist_border"])
capital_dist = origins_join2['city'].loc[:,[cap_idx]].rename(columns={cap_idx:'dist_capital'})

In [58]:
origins_dist = origins_snapped.join([city_min, ports_min, airports_min, borders_min, capital_dist])

In [61]:
origins_dist.columns

Index(['Pop', 'geometry', 'NN', 'NN_dist', 'pointid', 'NN_dist_hours',
       'dist_city', 'dist_port', 'dist_airport', 'dist_border',
       'dist_capital'],
      dtype='object')

In [62]:
aggregator.rasterize_gdf(origins_dist, 'dist_city', raster_path, os.path.join(output_path,f"cities_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_port', raster_path, os.path.join(output_path,f"port_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_airport', raster_path, os.path.join(output_path,f"airport_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_border', raster_path, os.path.join(output_path,f"borders_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_capital', raster_path, os.path.join(output_path,f"capital_dist.tif"))