## Country-level INFRA-SAP


- Origins: Population grid (Worldpop downsampled to 500 meters)
- Destinations: Cities, airports, border crossings, and ports

Typical access analysis with two adjustments:
    1. Extract different sets of destinations from OD
    2. Join travel time to origin grid based on "NN with the fastest route" (not necessarily closest NN)

In [1]:
import os, sys, time, importlib

import geopandas as gpd
import pandas as pd
import networkx as nx
sys.path.append('/home/wb514197/Repos/GOSTnets')

import GOSTnets as gn
import GOSTnets.calculate_od_raw as calcOD
from GOSTnets.load_osm import *
import rasterio as rio
from osgeo import gdal
import numpy as np
from shapely.geometry import Point

sys.path.append('/home/wb514197/Repos/INFRA_SAP')
from infrasap import aggregator

from shapely.wkt import loads

%load_ext autoreload
%autoreload 2

In [2]:
country = 'guatemala'
iso3 = 'GTM'
epsg = 26915

### Load origins and graph

In [3]:
base_in = "/home/public/Data/PROJECTS/INFRA_SAP"
in_folder = os.path.join(base_in, iso3)

# define data paths
focal_admin2 = os.path.join(in_folder, "admin.shp")
focal_osm = os.path.join(in_folder, f"{country}-latest.osm.pbf")
pop_name = "WP_2020_1km"
wp_1km = os.path.join(in_folder, f"{pop_name}.tif")
urban_extents = os.path.join(in_folder, "urban_extents.shp")
airports = os.path.join(in_folder, "airports.shp")
ports = os.path.join(in_folder, "ports.shp")
borders = os.path.join(in_folder, "borders.shp")
G_path = os.path.join(in_folder, 'graph', f"G_{iso3}.pickle")

base_out = "/home/wb514197/data/INFRA_SAP" # GOT permission denied using public 
out_folder = os.path.join(base_out, iso3)
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

Convert **WP_2020_1km.tif** into a point GeoData frame

In [4]:
out_pop_csv = os.path.join(out_folder, f"{pop_name}.csv")
wp_df = pd.read_csv(out_pop_csv, sep=' ')
wp_df.rename(columns={"Z":"Pop"}, inplace=True)
wp_df = wp_df.loc[wp_df.Pop!=-99999.0].copy()
geoms = [Point(xy) for xy in zip(wp_df.X, wp_df.Y)]
wp_df.drop(["X","Y"], axis=1, inplace=True)
crs = 'EPSG:4326'
origins = gpd.GeoDataFrame(wp_df, crs=crs, geometry=geoms)
origins['pointid'] = origins.index

### Prepare Graph

In [5]:
G_time = nx.read_gpickle(os.path.join(out_folder, 'graph', f'G_{iso3}_Salt.pickle'))

#### Select largest graph (again)

In [6]:
list_of_subgraphs = [G_time.subgraph(c).copy() for c in sorted(nx.strongly_connected_components(G_time), key=len, reverse=True)]

In [7]:
G_largest = list_of_subgraphs[0]

### Prepare destinations

In [8]:
def load_csv(csv_path, geometry = 'geometry', crs = 'epsg:4326'):
    df = pd.read_csv(csv_path, index_col=0)
    df[geometry] = df[geometry].apply(loads)
    gdf = gpd.GeoDataFrame(df, crs = crs)
    return(gdf)

In [9]:
dest_all = load_csv(os.path.join(out_folder, 'destination_all.csv'))

In [10]:
len(origins), len(dest_all)

(133659, 96)

### Snap origins and destinations

#### Snap to origins to 5 nearest nodes

In [11]:
utm = f"EPSG:{epsg}"

In [12]:
%%time
# this function returns a dictionary of origin IDs, with a list of 5 NNs, and a corresponding list of distances
origins_snapped_dict = gn.pandana_snap_to_many(G_largest, origins, source_crs='epsg:4326', target_crs=utm, 
                                               add_dist_to_node_col = True, k_nearest=5, origin_id='pointid')

CPU times: user 21.8 s, sys: 480 ms, total: 22.3 s
Wall time: 22.6 s


In [13]:
dest_snapped = gn.pandana_snap_c(G_largest, dest_all, source_crs='epsg:4326', target_crs=utm,
                                 add_dist_to_node_col = False)

In [14]:
# origins_unique_nn = list(set(origins_snapped['NN']))
dest_nn = list(dest_snapped['NN'])
list_origins_NN = []
for each in origins_snapped_dict.values():
    list_origins_NN += each['NN']
origins_unique_nn = list(set(list_origins_NN))

In [15]:
%%time
curOD = gn.calculate_OD(G_largest, origins_unique_nn, dest_nn, fail_value = 999999999, weight='length')

CPU times: user 5.9 s, sys: 54.7 ms, total: 5.95 s
Wall time: 5.95 s


In [16]:
curOD[curOD==999999999]

array([], dtype=float64)

In [17]:
curOD.shape

(5703, 96)

In [18]:
od_df = pd.DataFrame(curOD, index=origins_unique_nn, columns=dest_nn)

In [19]:
od_df.head()

Unnamed: 0,new_obj_5260,new_obj_4782,5314_649_1937,25191,new_obj_4446_2635_7840,5313_646_1922,32423,30920,new_obj_3102,11722,...,9240_1094_3290,9240_1094_3290.1,new_obj_5335,new_obj_5335.1,new_obj_5335.2,new_obj_5335.3,new_obj_3954_2614_7785,new_obj_3954_2614_7785.1,new_obj_3954_2614_7785.2,new_obj_3954_2614_7785.3
5,537726.095487,451973.873592,236875.292508,215288.707545,289648.989966,168886.879029,139188.15033,332507.014137,258560.669117,82763.261511,...,356837.29735,356837.29735,616426.635592,616426.635592,616426.635592,616426.635592,143142.641783,143142.641783,143142.641783,143142.641783
new_obj_3380,463579.178203,293114.097548,279502.854334,267667.866158,214550.213518,211514.440855,258633.17992,173647.238093,99700.893073,202208.291101,...,212728.952147,212728.952147,500264.87822,500264.87822,500264.87822,500264.87822,262587.671374,262587.671374,262587.671374,262587.671374
13,548325.342238,377860.261583,338171.176759,326336.188583,299296.377553,270182.76328,245112.079726,258393.402128,184447.057108,188687.190907,...,282723.685341,282723.685341,585011.042255,585011.042255,585011.042255,585011.042255,249066.57118,249066.57118,249066.57118,249066.57118
new_obj_1622,510892.167968,320629.59733,418163.456749,406328.468573,317544.958209,350175.04327,378977.095505,130137.02807,114010.131094,322552.206686,...,131830.617139,131830.617139,527780.378001,527780.378001,527780.378001,527780.378001,382931.586958,382931.586958,382931.586958,382931.586958
31,591476.459169,421011.378514,343690.18443,328257.75984,342447.494484,275701.770951,246041.672046,301544.51906,227598.174039,189616.783227,...,325874.802273,325874.802273,628162.159186,628162.159186,628162.159186,628162.159186,249996.1635,249996.1635,249996.1635,249996.1635


For each origin set of 5 (k) possible NN:
    - Add snapping dist (in time) + time to a destination
    - Which destination? min time from all of them won't necessarily work, so we need to find the closest destination for each origin, and select the NN which yields the fastest travel time to that dest
    - closest_dest.idx should match the OD column order.

In [20]:
%%time
closest_dest = gn.pandana_snap_points(origins, dest_all, source_crs='epsg:4326', target_crs=utm,
                                      add_dist_to_node_col=True)

CPU times: user 6.85 s, sys: 109 ms, total: 6.96 s
Wall time: 6.96 s


In [21]:
closest_dest = closest_dest.set_index('pointid')

In [22]:
closest_dest.head()

Unnamed: 0_level_0,Pop,geometry,idx,idx_dist
pointid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
148,17.833401,POINT (713526.895 1970820.776),47,38167.960297
149,16.891191,POINT (714410.449 1970830.304),47,37662.11206
150,16.950291,POINT (715294.008 1970839.871),47,37170.417996
151,17.671246,POINT (716177.570 1970849.478),47,36693.447402
152,16.73247,POINT (717061.136 1970859.125),47,36231.782056


In [23]:
%%time
fastest_nn = []
fastest_dist = []
custom_speed = 30 # km/h

for pointid, items in origins_snapped_dict.items():
    dest_index = closest_dest.loc[pointid].idx
    nn_list = items['NN']
    dist_list = items['NN_dist']
    total_dist_list = []
    for i in range(0, len(nn_list)):
        dist_snapping = dist_list[i]
#         time_snapping = ((dist_list[i] / 1000) / custom_speed) * 60 * 60
        dist_to_dest = od_df.loc[nn_list[i]].iloc[dest_index]
#         time_to_dest = od_df.loc[nn_list[i]].iloc[dest_index]
        total_dist = dist_snapping+dist_to_dest
#         total_time = time_snapping+time_to_dest
        total_dist_list.append(total_dist)
#         print(f"id: {nn_list[i]}, snapping dist (km): {dist_list[i]/1000:.2f}, time to dest (min): {(total_time/60)/60:.2f}")
    min_pos = total_dist_list.index(min(total_dist_list))
    fastest_nn.append(nn_list[min_pos])
    fastest_dist.append(dist_list[min_pos])
#     origins_snapped_smart.loc[pointid, "NN"] = nn_list[min_pos]
#     origins_snapped_smart.loc[pointid, "NN_dist"] = dist_list[min_pos]

CPU times: user 1min 32s, sys: 138 ms, total: 1min 32s
Wall time: 1min 32s


In [24]:
origins_snapped = origins.copy().set_index('pointid')
origins_snapped['NN'] = pd.Series(fastest_nn, index = origins_snapped.index)
origins_snapped['NN_dist'] = pd.Series(fastest_dist, index = origins_snapped.index)
origins_snapped['pointid'] = origins_snapped.index
origins_snapped['NN_dist_hours'] = ((origins_snapped.NN_dist / 1000) / custom_speed)

In [25]:
origins_join = origins_snapped.join(od_df, on='NN')

In [26]:
all(origins_join.columns[6:] == dest_snapped.NN)

True

In [27]:
origins_join_rename = origins_join.copy()
origins_join_rename.columns = pd.MultiIndex.from_arrays([['origin' for each in origins_snapped.columns]+list(dest_snapped.dest_type), origins_snapped.columns.append(dest_snapped.index)])

Add snapping distance

In [28]:
origins_join2 = origins_join_rename.apply(lambda x: (x + origins_join_rename.origin.NN_dist)/1000 if x.name[1] in dest_snapped.index else x)

In [29]:
out_folder

'/home/wb514197/data/INFRA_SAP/GTM'

In [30]:
origins_join2.to_csv(os.path.join(out_folder, 'OD_08_06_Distances.csv'))

### Make rasters of min travel time to each dest

In [31]:
raster_path = wp_1km

In [32]:
output_path = os.path.join(out_folder, "travel_distance")
if not os.path.exists(output_path):
    os.mkdir(output_path)

In [33]:
# CHECK THAT MOST POPULATED CITY IS THE CAPITAL
cap_idx = dest_all.sort_values('Pop', ascending=False).iloc[[0]].index[0]

In [34]:
city_min = pd.DataFrame(origins_join2['city'].min(axis=1), columns=["dist_city"])
ports_min = pd.DataFrame(origins_join2['port'].min(axis=1), columns=["dist_port"])
airports_min = pd.DataFrame(origins_join2['airport'].min(axis=1), columns=["dist_airport"])
borders_min = pd.DataFrame(origins_join2['border'].min(axis=1), columns=["dist_border"])
capital_dist = origins_join2['city'].loc[:,[cap_idx]].rename(columns={cap_idx:'dist_capital'})

In [35]:
origins_dist = origins_snapped.join([city_min, ports_min, airports_min, borders_min, capital_dist])

In [36]:
origins_dist.columns

Index(['Pop', 'geometry', 'NN', 'NN_dist', 'pointid', 'NN_dist_hours',
       'dist_city', 'dist_port', 'dist_airport', 'dist_border',
       'dist_capital'],
      dtype='object')

In [37]:
aggregator.rasterize_gdf(origins_dist, 'dist_city', raster_path, os.path.join(output_path,f"cities_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_port', raster_path, os.path.join(output_path,f"port_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_airport', raster_path, os.path.join(output_path,f"airport_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_border', raster_path, os.path.join(output_path,f"borders_min_dist.tif"))
aggregator.rasterize_gdf(origins_dist, 'dist_capital', raster_path, os.path.join(output_path,f"capital_dist.tif"))