# Launch INFRA SAP
The World Bank's Geospatial Operational Support team, in collaboration with the Infrastructure Chief Economist's office, has developed a diagnostic toolkit for assessing the state of infrastructure in a country through an assessment of infrastructure, access, connectivity, and commodity flows.

The purpose of this notebook is to launch the data preparation step of the INFRA SAP toolkit. It is principally designed to integrate with the GOST team's high compute cluster, but has been made as flexible as possible to facilitate replication. The steps in data processing require the following input:
1. Administrative boundaries of interest (defines total extent of analysis and level of aggregation)
2. Country ISO3 code

Based on these basic datasets we will extract the following datasets. **These extraction steps are particular to the World Bank's data schema, but the datasets can be supplied directly to later functions if necessary.**

1. Open Street Map
2. WorldPop 2020 gridded population data
3. International airports (from OSM)
4. Major ports (from OSM)
5. Official Border Crossings (from ???)

With these data either extracted or processed we run the following analyses

1. Calculate urban and rural following the GURBA process - LINK
2. Attempt to identify/name urban areas
3. (optional) Re-sample population to 1km

Following these data preparation steps, a sanity check should be performed on the extracted data to ensure major POIs are not missed and that all data have been properly extracted.

In [2]:
import sys, os, shutil, importlib
import rasterio

import geopandas as gpd
import pandas as pd

from shapely.geometry import Point

sys.path.append("../")

import infrasap.wp_helper as wp
import infrasap.osm_extractor as osm
import infrasap.rai_calculator as rai
import infrasap.gsm_rasterizer as gsm_r
import infrasap.infra_helper as helper
import infrasap.rasterMisc as rMisc

In [3]:
# Define baseline data
# iso3 selects the country: it is used to filter the admin boundaries, to pick
# the WorldPop tile and to name the output folder.
iso3 = "ZWE"
# epsg is handed to rai.extract_rai_network further down.
epsg = 4209 #Check out https://epsg.io/ to select epsg - use 3857 if nothing else jibes
focal_osm = '/home/public/Data/PROJECTS/INFRA_SAP/%s/zimbabwe-latest.osm.pbf' % iso3
base_out = "/home/wb411133/temp" #r"J:\Data\PROJECTS\INFRA_SAP"
out_folder = os.path.join(base_out, iso3)
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(out_folder, exist_ok=True)

In [15]:
# --- Administrative boundaries ---
global_boundaries = "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
global_admin2 = "/home/public/Data/GLOBAL/ADMIN/Admin2_Polys.shp"

# --- Population (country-specific WorldPop tile) ---
wp_dataset = "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2020/MOSAIC_ppp_prj_2020/ppp_prj_2020_%s.tif" % iso3
wp_dataset_found = os.path.exists(wp_dataset)
if not wp_dataset_found:
    # Only a warning: the notebook keeps going even when the file is missing.
    print("Check to make sure population dataset exists")

# --- Transport infrastructure ---
border_crossings = "/home/public/Data/GLOBAL/INFRA/BORDERS/border_crossings_phv.shp"#J:\Data\GLOBAL\INFRA
global_ports = "/home/public/Data/GLOBAL/INFRA/PORTS/attributed_ports.shp"
global_airports = "/home/public/Data/GLOBAL/INFRA/AIRPORTS/International_Airports_Volume.shp"

# --- Energy ---
global_power_plants = "/home/public/Data/GLOBAL/ENERGY/global_power_plant_database.csv"
grid_lines = "/home/public/Data/GLOBAL/ENERGY/grid.gpkg"
global_solar = "/home/public/Data/GLOBAL/ENERGY/Renewables/Solar/World_PVOUT_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/PVOUT.tif"
global_wind = "/home/public/Data/GLOBAL/ENERGY/Renewables/Wind/Power Density/global_power_dens.tif"

# --- Mobile coverage: one raster per network generation, keyed by generation ---
gsm_folder = "/home/public/Data/GLOBAL/INFRA/GSMA/2019/MCE/Data_MCE/Global"
gsm_files = {
    gsm: os.path.join(gsm_folder, f"MCE_Global{gsm}_2020.tif")
    for gsm in ['2G', '3G', '4G']
}
openCellID = "/home/public/Data/GLOBAL/INFRA/OPENCELLID/cell_towers_2020-04-15-T000000.csv"

# --- Night-time lights ---
combo_viirs = '/home/public/Data/GLOBAL/NighttimeLights/VIIRS_COMBO_2013_2016_2019.vrt'

In [12]:
# Pick up any edits made to the helper module without restarting the kernel.
importlib.reload(helper)


def _in_out_folder(file_name):
    """Return the path of ``file_name`` inside the country output folder."""
    return os.path.join(out_folder, file_name)


#define output data
# Vector outputs
focal_admin2 = _in_out_folder("admin.shp")
urban_extents = _in_out_folder("urban_extents.shp")
airports = _in_out_folder("airports.shp")
highways = _in_out_folder("highways.shp")
ports = _in_out_folder("ports.shp")
borders = _in_out_folder("borders.shp")
power_plants = _in_out_folder("power_plants.shp")
transmission = _in_out_folder("transmission_lines.shp")
cell_towers = _in_out_folder("OpenCellID.shp")

# Raster outputs
wp_1km = _in_out_folder("WP_2020_1km.tif")
solar_power = _in_out_folder("SOLAR_PVOUT.tif")
wind_power = _in_out_folder("WIND_PowerDens.tif")
combo_ntl = _in_out_folder('combo_ntl.tif')

global_data = gpd.read_file(global_boundaries)
# select out admin2 from global boundaries dataset
# The country subset is cached on disk: it is only re-derived from the global
# admin2 layer when the cached shapefile is missing.
if not os.path.exists(focal_admin2):
    in_bounds = gpd.read_file(global_admin2)
    out_bounds = in_bounds.loc[in_bounds['ISO3'] == iso3]
    # Reproject to WGS84. The epsg keyword replaces the deprecated
    # {'init': 'epsg:4326'} dict, which triggers a FutureWarning with pyproj >= 2.
    out_bounds = out_bounds.to_crs(epsg=4326)
    out_bounds.to_file(focal_admin2)
else:
    out_bounds = gpd.read_file(focal_admin2)


In [6]:
# extract national OSM from global OSM PBF
global_osm = "/home/public/Data/GLOBAL/OSM/GLOBAL/planet-latest_RAILWAYS_INFRA.osm.pbf"
focal_osm_missing = not os.path.exists(focal_osm)
if focal_osm_missing:
    ## BEN: Look into using WGET to download from GeoFabrik
    # The extract is written to the base output folder under the same file name
    # as the expected national PBF.
    temp_osm = os.path.join(base_out, os.path.basename(focal_osm))
    extractor = osm.osmExtraction(
        osmosisCmd="/home/wb411133/Code/Osmosis/bin/osmosis",
        tempFile="/home/wb411133/temp/temp_execution.bat",
    )
    # execute=False: the result is only printed here.
    # NOTE(review): presumably this is the osmosis command to run by hand —
    # confirm against osmExtraction.extractBoundingBox.
    extraction_result = extractor.extractBoundingBox(
        global_osm, focal_admin2, temp_osm, execute=False
    )
    print(extraction_result)

In [13]:
# Extract airports, rails, highways
importlib.reload(osm)
# The highways shapefile acts as the marker that this extraction already ran.
if not os.path.exists(highways):
    pois = osm.load_pois(focal_osm, out_bounds.unary_union)
    for key, value in pois.items():
        # OSM-derived ports/airports get an "OSM_" prefix so they do not collide
        # with ports.shp / airports.shp written from the global datasets.
        layer_name = "OSM_%s" % key if key in ("ports", "airports") else key
        try:
            value.to_file(os.path.join(out_folder, "%s.shp" % layer_name))
        except Exception as err:
            # Best effort: keep exporting the other layers, but report why this
            # one failed instead of hiding the cause behind a bare except.
            print("ERROR processing %s: %s" % (layer_name, err))

In [16]:
# Re-sample WP to 1km
if not os.path.exists(wp_1km):
    # The context manager closes the source raster once resampling is done.
    with rasterio.open(wp_dataset) as inR:
        wp.resample_wp(inR, wp_1km, factor=10)

# Calculate urban extents from 1km WorldPop
if not os.path.exists(urban_extents):
    # Only the calculation needs the raster open; the result is written afterwards.
    with rasterio.open(wp_1km) as wp_1km_raster:
        urban_shp = wp.calculateUrban(wp_1km_raster, smooth=False)
    urban_shp.to_file(urban_extents)

# Work out which of the three point layers still have to be produced, and
# dissolve the admin polygons once (instead of once per layer) only if needed.
missing_layers = {path for path in (airports, ports, borders) if not os.path.exists(path)}
if missing_layers:
    focal_geom = out_bounds.unary_union

# Extract airports
if airports in missing_layers:
    inA = gpd.read_file(global_airports)
    selA = inA.loc[inA.intersects(focal_geom)]
    # Nothing is written when no airport falls inside the country.
    if selA.shape[0] > 0:
        selA.to_file(airports)

# Extract ports
if ports in missing_layers:
    inP = gpd.read_file(global_ports)
    # Label the ports layer as WGS84 using the authority string rather than the
    # deprecated {'init': ...} dict.
    # NOTE(review): presumably the source file lacks usable CRS metadata — confirm.
    inP.crs = "EPSG:4326"
    selP = inP.loc[inP.intersects(focal_geom)]
    if selP.shape[0] > 0:
        selP.to_file(ports)

# Extract borders
if borders in missing_layers:
    all_borders = gpd.read_file(border_crossings)
    # Buffer the country outline by 0.01 (in the units of the boundary CRS) so
    # crossings sitting just outside the border line are still captured.
    focal_borders = all_borders[all_borders.intersects(focal_geom.buffer(0.01))]
    if focal_borders.shape[0] > 0:
        focal_borders.to_file(borders)

# Extract power plants and transmission lines
# Each entry is (output shapefile, helper that does the extraction, global source).
energy_vector_jobs = [
    (power_plants, helper.extract_power_plants, global_power_plants),
    (transmission, helper.extract_transmission_lines, grid_lines),
]
for energy_out_path, energy_extractor, energy_source in energy_vector_jobs:
    # Skip layers that were already written by an earlier run.
    if os.path.exists(energy_out_path):
        continue
    energy_extractor(energy_source, out_bounds).to_file(energy_out_path)

def _clip_raster_if_missing(in_raster_path, out_raster_path):
    """Clip a global raster to the country boundaries unless the output exists.

    The source dataset is opened in a context manager so its file handle is
    released as soon as the clip has been written.
    """
    if os.path.exists(out_raster_path):
        return
    with rasterio.open(in_raster_path) as in_raster:
        rMisc.clipRaster(in_raster, out_bounds, out_raster_path)


# Night-time lights, solar potential and wind power density
_clip_raster_if_missing(combo_viirs, combo_ntl)
_clip_raster_if_missing(global_solar, solar_power)
_clip_raster_if_missing(global_wind, wind_power)

# Extract most recent GSMA coverage data
for key, gsm_file in gsm_files.items():
    out_file = os.path.join(out_folder, f"GSM_{key}.tif")
    _clip_raster_if_missing(gsm_file, out_file)

# Extract Cell towers
if not os.path.exists(cell_towers):
    global_towers = pd.read_csv(openCellID)
    # Query global towers dataset using bounding box
    # total_bounds is (minx, miny, maxx, maxy); the cheap bounding-box filter
    # shrinks the table before the exact point-in-polygon test below.
    b = out_bounds.total_bounds
    total_geom = out_bounds.unary_union

    in_bbox = (
        (global_towers['lon'] > b[0]) & (global_towers['lon'] < b[2]) &
        (global_towers['lat'] > b[1]) & (global_towers['lat'] < b[3])
    )
    country_towers = global_towers.loc[in_bbox]
    country_towers_geom = [Point(x) for x in zip(country_towers['lon'], country_towers['lat'])]
    # "EPSG:4326" replaces the deprecated {'init': 'epsg:4326'} dict.
    country_towers = gpd.GeoDataFrame(country_towers, geometry=country_towers_geom, crs="EPSG:4326")
    country_towers = country_towers.loc[country_towers.intersects(total_geom)]
    # Non-inplace drop: the frame is a filtered selection, so mutating it in
    # place risks pandas' SettingWithCopy behaviour.
    country_towers = country_towers.drop(
        ['mcc','net','cell','lon','lat','range','samples','changeable','created','updated','averageSignal'],
        axis=1,
    )
    country_towers.to_file(cell_towers)

In [9]:
# Copy map file
# The map file is copied into the country output folder under its original name.
in_map_file = "/home/public/Data/PROJECTS/INFRA_SAP/CRI/INFRASTRUCTURE_map.qgz"
map_file_name = os.path.basename(in_map_file)
out_map_file = os.path.join(out_folder, map_file_name)
shutil.copy(in_map_file, out_map_file)

'/home/wb411133/temp/ZWE/INFRASTRUCTURE_map.qgz'

# Calculate RAI

In [None]:
# Reload the rai module so edits to infrasap.rai_calculator take effect without
# restarting the kernel.
importlib.reload(rai)
# Build the road network for the RAI calculation from the national OSM extract,
# passing the EPSG code chosen in the baseline cell.
# NOTE(review): the result is later used via .head(), .crs and .to_crs, so it is
# presumably a GeoDataFrame — confirm against rai.extract_rai_network.
rai_roadnetwork = rai.extract_rai_network(focal_osm, epsg=epsg)

In [None]:
# Preview the first rows of the extracted road network as a quick sanity check.
rai_roadnetwork.head()

In [None]:
importlib.reload(rai)
# Open the population raster in a context manager so the file handle is
# released once the RAI values have been calculated.
with rasterio.open(wp_dataset) as wp_lyr:
    # Bring the road network into the raster's CRS before combining the two.
    if wp_lyr.crs != rai_roadnetwork.crs:
        rai_roadnetwork = rai_roadnetwork.to_crs(wp_lyr.crs)
    # "WB_ADM2_CO" is the column of out_bounds handed to calculate_rai.
    # NOTE(review): presumably the admin2 identifier used for aggregation — confirm.
    rai_vals = rai.calculate_rai(out_bounds, "WB_ADM2_CO", wp_lyr, rai_roadnetwork, out_folder)

In [None]:
# Write the RAI results to CSV. The values live in rai_vals (returned by
# rai.calculate_rai); `rai` itself is the imported module, not the result table.
rai_vals.to_csv(os.path.join(out_folder, "RAI_population.csv"))

# Debugging

In [None]:
in_folder = "/home/wb411133/data/Projects/CoVID/IDN/hd_urban_fishnets"
# Collect every shapefile below in_folder, skipping the "zonal" outputs.
all_files = []
for root, dirs, files in os.walk(in_folder):
    for f in files:
        if f.endswith(".shp") and "zonal" not in f:
            all_files.append(os.path.join(root, f))

# Read all layers first and concatenate once. DataFrame.append was removed in
# pandas 2.0, and wrapping it in a bare except silently kept only the last file.
frames = []
total_rows = 0
for f in all_files:
    inF = gpd.read_file(f)
    frames.append(inF)
    total_rows += inF.shape[0]
    # rows in this file: cumulative rows so far
    print(f"{inF.shape[0]}: {total_rows}")

if not frames:
    # Fail here with a clear message rather than with a NameError on `final`
    # in the next cell.
    raise FileNotFoundError(f"No shapefiles found under {in_folder}")
final = pd.concat(frames)

In [None]:
# Replace the index with a fresh 0..n-1 index; since drop is not set, the old
# index is kept as a regular column.
final = final.reset_index()
# Write the merged layer next to the input folder, as "<in_folder>.shp".
final.to_file("%s.shp" % in_folder)

In [None]:
# Show the (rows, columns) size of the merged layer.
final.shape

In [None]:
# Preview the first rows of the merged layer.
final.head()

In [None]:
# Load the global data centers table from CSV.
in_data_centers = '/home/public/Data/GLOBAL/INFRA/DATA_CENTERS/datacenters.csv'
inD = pd.read_csv(in_data_centers)

In [None]:
# Re-read the raw table: the end of this cell overwrites inD with a GeoDataFrame,
# so reloading here lets the cell be re-run from a clean state.
inD = pd.read_csv(in_data_centers)
def get_numbs(x):
    """Convert a possibly comma-formatted number to an int, or 0 if not possible.

    Args:
        x: Value to convert. Strings may use thousands separators ("1,234");
            numeric values are truncated with ``int``.

    Returns:
        The integer value, or 0 when ``x`` is missing (None/NaN) or cannot be
        interpreted as a number.
    """
    # Only strings carry thousands separators; numeric input is passed through
    # to int() instead of being silently turned into 0.
    if isinstance(x, str):
        x = x.replace(",", "")
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN or non-numeric text; OverflowError: inf.
        return 0

# Numeric version of the 'Colo sqft' column (get_numbs returns 0 for values it
# cannot parse); the function is passed directly, no lambda wrapper needed.
inD['SQFT'] = inD['Colo sqft'].apply(get_numbs)

# Build point geometries from the Long/Lat columns and label them as WGS84,
# using "EPSG:4326" instead of the deprecated {'init': 'epsg:4326'} dict.
inD_geom = [Point(x) for x in zip(inD['Long'].astype(float), inD['Lat'].astype(float))]
inD = gpd.GeoDataFrame(inD, geometry=inD_geom, crs="EPSG:4326")

In [None]:
# Export the data centers layer as GeoJSON.
inD.to_file("/home/wb411133/temp/Data_Centers.geojson", driver="GeoJSON")

In [None]:
# Inspect the raw 'Colo sqft' column that SQFT was derived from.
inD['Colo sqft']