# Launch INFRA SAP
The World Bank's Geospatial Operational Support team, in collaboration with the Infrastructure Chief Economist's office have developed a diagnostic toolkit for assessing the state of infrastucture in a country through an assessment of infrastructure, access, connectivity, and commodity flows.

The purpose of this notebook is to launch the data preparation step of the INFRA SAP toolkit. It is principally designed to integrate with the GOST team's high compute cluster, but has been made as flexible as possible to facilitate replication. The steps in data processing require the following input:
1. Administrative boundaries of interest (defines total extent of analysis and level of aggregation)
2. Country ISO3 code

Based on these basic datasets we will extract the following datasets **these steps are particular to the World Bank's data schema, but can be directly supplied to later functions if necessary**

1. Open Street Map
2. WorldPop 2020 gridded population data
3. International airports (from OSM)
4. Major ports (from OSM)
5. Official Border Crossings (from ???)

With these data either extracted or processed we run the following analyses

1. Calculate urban and rural following the GURBA process - LINK
2. Attempt to identify/name urban areas
3. (optional) Re-sample population to 1km

Following these data preparation steps a sanity check should be performed on the extracted data to ensure major POIs are not missed and that all data have been properly extracted

In [1]:
import sys, os, importlib
import rasterio

import geopandas as gpd
import pandas as pd

from shapely.geometry import Point

sys.path.append("../")

import infrasap.wp_helper as wp
import infrasap.osm_extractor as osm
#import infrasap.rai_calculator as rai
import infrasap.gsm_rasterizer as gsm_r
import infrasap.infra_helper as helper
import infrasap.GOSTRocks.rasterMisc as rMisc

In [27]:
# Define baseline data
iso3 = "URY"
global_boundaries =  "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
global_admin2 = r"/home/public/Data/GLOBAL/ADMIN/Admin2_Polys.shp"
global_osm = "/home/public/Data/GLOBAL/OSM/GLOBAL/planet-latest.osm.pbf"
wp_dataset = "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2020/MOSAIC_ppp_prj_2020/ppp_prj_2020_%s.tif" % iso3
if not os.path.exists(wp_dataset):
    print("Check to make sure population dataset exists")
border_crossings = "/home/public/Data/GLOBAL/INFRA/BORDERS/border_crossings_phv.shp"#J:\Data\GLOBAL\INFRA
global_power_plants = "/home/public/Data/GLOBAL/ENERGY/global_power_plant_database.csv"
grid_lines = "/home/public/Data/GLOBAL/ENERGY/grid.gpkg"
global_solar = "/home/public/Data/GLOBAL/ENERGY/Renewables/Solar/World_PVOUT_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/PVOUT.tif"
global_wind = "/home/public/Data/GLOBAL/ENERGY/Renewables/Wind/Power Density/global_power_dens.tif"

In [29]:
importlib.reload(helper)

epsg = 32722
base_out = "/home/public/Data/PROJECTS/INFRA_SAP" #r"J:\Data\PROJECTS\INFRA_SAP"
out_folder = os.path.join(base_out, iso3)
if not os.path.exists(out_folder):
    os.makedirs(out_folder)
#define output data
focal_admin2 = os.path.join(out_folder, "admin.shp")
focal_osm = os.path.join(out_folder, "national_complete.osm.pbf")
wp_1km = os.path.join(out_folder, "WP_2020_1km.tif")
urban_extents = os.path.join(out_folder, "urban_extents.shp")
airports = os.path.join(out_folder, "airports.shp")
ports = os.path.join(out_folder, "ports.shp")
borders = os.path.join(out_folder, "borders.shp")
power_plants = os.path.join(out_folder, "power_plants.shp")
transmission = os.path.join(out_folder, "transmission_lines.shp")
solar_power = os.path.join(out_folder, "SOLAR_PVOUT.tif")
wind_power = os.path.join(out_folder, "WIND_PowerDens.tif")

global_data = gpd.read_file(global_boundaries)    
# select out admin2 from global boundaries dataset
if not os.path.exists(focal_admin2):
    in_bounds = gpd.read_file(global_admin2)
    out_bounds = in_bounds.loc[in_bounds['ISO3'] == iso3]
    out_bounds = out_bounds.to_crs({'init':'epsg:4326'})
    out_bounds.to_file(focal_admin2)
else:
    out_bounds = gpd.read_file(focal_admin2)
    
# extract national OSM from global OSM PBF
if not os.path.exists(focal_osm):
    ## BEN: Look into using WGET to download from GeoFabrik    
    extractor = osm.osmExtraction(osmosisCmd = "/home/wb411133/Code/Osmosis/bin/osmosis", tempFile = "/home/wb411133/temp/temp_execution.bat")
    print(extractor.extractBoundingBox(global_osm, focal_admin2, focal_osm, execute=False))

# Re-sample WP to 1km
if not os.path.exists(wp_1km):
    inR = rasterio.open(wp_dataset)
    wp.resample_wp(inR, wp_1km, factor=10)
    
# Calculate urban extents from 1km WorldPop
if not os.path.exists(urban_extents):
    urban_shp = wp.calculateUrban(rasterio.open(wp_1km), smooth=False)
    urban_shp.to_file(urban_extents)
    
# Extract airports and ports and rails from OSM
if not os.path.exists(airports):
    pois = osm.load_pois(focal_osm, out_bounds.unary_union)
    for key, value in pois.items():
        value.to_file(os.path.join(out_folder, "%s.shp" % key))

# Extract borders
if not os.path.exists(borders):
    all_borders = gpd.read_file(border_crossings)
    focal_borders = all_borders[all_borders.intersects(out_bounds.unary_union.buffer(0.01))]
    focal_borders.to_file(borders)

# Extract power plants
if not os.path.exists(power_plants):
    pp = helper.extract_power_plants(global_power_plants, out_bounds)
    pp.to_file(power_plants)
    

# Extract transmission lines
if not os.path.exists(transmission):
    lines = helper.extract_transmission_lines(grid_lines, out_bounds)
    lines.to_file(transmission)

if not os.path.exists(solar_power):
    rMisc.clipRaster(rasterio.open(global_solar), out_bounds, solar_power)
    
if not os.path.exists(wind_power):
    rMisc.clipRaster(rasterio.open(global_wind), out_bounds, wind_power)
    
# Extract and rasterize GSM
gsm_folder = "/home/public/Data/GLOBAL/INFRA/GSMA/"
gsm_shapefiles = []

for root, dirs, files in os.walk(gsm_folder):
    for f in files:
        if (f[-4:] == ".shp") and not ("Inclusions" in f):
            gsm_shapefiles.append(os.path.join(root, f))

gsm_process = gsm_r.gsm_rasterizer(gsm_shapefiles, out_folder)
gsm_process.initial_read_in()
gsm_process.extract_country_vectors(iso3, global_data, out_folder = out_folder)
gsm_process.rasterize_gsm_vectors()

In [28]:
rMisc.clipRaster?

In [337]:
# Extract and rasterize GSM
gsm_folder = "/home/public/Data/GLOBAL/INFRA/GSMA/"
gsm_shapefiles = []

for root, dirs, files in os.walk(gsm_folder):
    for f in files:
        if (f[-4:] == ".shp") and not ("Inclusions" in f) and not ("National_Rasters" in root):
            gsm_shapefiles.append(os.path.join(root, f))
gsm_shapefiles

['/home/public/Data/GLOBAL/INFRA/GSMA/2016/Global_3G_201612.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2016/Global_4G_201612.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2016/Global_GSM_201612.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2018/MCE_201812_3G.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2018/MCE_201812_4G.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2018/MCE_201812_GSM.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2017/Data/Global_3G_201712.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2017/Data/Global_4G_201712.shp',
 '/home/public/Data/GLOBAL/INFRA/GSMA/2017/Data/Global_GSM_201712.shp']

In [343]:
importlib.reload(gsm_r)
iso3 = "URY"
gsm_process = gsm_r.gsm_rasterizer(gsm_shapefiles, out_folder)
gsm_process.initial_read_in()
gsm_process.extract_country_vectors(iso3, global_data, out_folder = out_folder)
gsm_process.rasterize_gsm_vectors()

NameError: name 'gsm_process' is not defined

In [345]:
base_data = gsm_process.gsm_data

In [349]:
importlib.reload(gsm_r)

gsm_process = gsm_r.gsm_rasterizer(gsm_shapefiles, out_folder)
gsm_process.gsm_data = base_data
gsm_process.extract_country_vectors(iso3, global_data, out_folder = out_folder)
gsm_process.rasterize_gsm_vectors()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  select.to_file(out_file)
TopologyException: Input geom 1 is invalid: Ring Self-intersection at or near point -58.44865462599995 -34.039201843999933 at -58.44865462599995 -34.039201843999933


# Calculate RAI

In [238]:
importlib.reload(rai)
rai_roadnetwork = rai.extract_rai_network(focal_osm, epsg=epsg)


NameError: name 'out_folder' is not defined

In [241]:
importlib.reload(rai)
rai = rai.calculate_rai(out_bounds, "GID_2", rasterio.open(wp_dataset), rai_roadnetwork, out_folder)

In [230]:
rai.to_csv(os.path.join(out_folder, "RAI_population.csv"))

# Processing cell tower data

In [3]:
# simplify the global open cell id data
base_data = "/home/public/Data/GLOBAL/INFRA/OPENCELLID/cell_towers_2020-04-15-T000000.csv"

inD = pd.read_csv(base_data)

Unnamed: 0,radio,mcc,net,area,cell,unit,lon,lat,range,samples,changeable,created,updated,averageSignal
0,UMTS,262,2,801,86355,0,13.285512,52.522202,1000,7,1,1282569574,1300155341,0
1,GSM,262,2,801,1795,0,13.276907,52.525714,5716,9,1,1282569574,1300155341,0
2,GSM,262,2,801,1794,0,13.285064,52.524,6280,13,1,1282569574,1300796207,0
3,UMTS,262,2,801,211250,0,13.285446,52.521744,1000,3,1,1282569574,1299466955,0
4,UMTS,262,2,801,86353,0,13.293457,52.521515,1000,2,1,1282569574,1291380444,0


In [None]:
global_data = gpd.read_file(global_boundaries)
global_data['incomeG'].value_counts()

In [10]:
inD_geom = [Point(x) for x in zip(inD['lon'], inD['lat'])]
inG = gpd.GeoDataFrame(inD, geometry=inD_geom, crs = {'init':'epsg:4326'})
inG = inG.to_crs(global_data.crs)

In [13]:
joined_towers = gpd.sjoin(inG, global_data, how='left', op='within', lsuffix='p_', rsuffix='g_')

In [24]:
ury_towers = joined_towers.loc[joined_towers['ISO3'] == 'URY']
ury_towers.to_file("/home/wb411133/temp/URY_towers.shp")

In [23]:
lac_towers = joined_towers.loc[joined_towers['Region'] == 'Latin America & Caribbean']
lac_towers.drop(['index_g_', 'OBJECTID', 'ISO_A2', 'WB_ADM0_CO','WB_ADM0_NA', 'Shape_Leng', 'Shape_Area', 'UN_m49', 'Region','incomeG', 'lendingC', 'FID_100'], axis=1, inplace=True)

lac_towers.to_file("/home/wb411133/temp/LAC_towers.shp")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


# Debugging

In [13]:
in_folder = "/home/wb411133/data/Projects/CoVID/IDN/hd_urban_fishnets"
all_files = []
for root, dirs, files in os.walk(in_folder):
    for f in files:
        if f[-4:] == ".shp" and not "zonal" in f:
            all_files.append(os.path.join(root, f))

try:
    del(final)
except:
    pass
for f in all_files:
    inF  = gpd.read_file(f)
    try:
        final = final.append(inF)
    except:
        final = inF
    print(f"{inF.shape[0]}: {final.shape[0]}")

6705: 6705
1881: 8586
1981: 10567
1666: 12233
923: 13156
1322: 14478
855: 15333
632: 15965
318: 16283
422: 16705
288: 16993
388: 17381
449: 17830
569: 18399
254: 18653
481: 19134
330: 19464
378: 19842
220: 20062
215: 20277
136: 20413
168: 20581
178: 20759
187: 20946
160: 21106
163: 21269
163: 21432
114: 21546
147: 21693
134: 21827
130: 21957
116: 22073
99: 22172
118: 22290
155: 22445
144: 22589
128: 22717
108: 22825
83: 22908
98: 23006
93: 23099
69: 23168
112: 23280
88: 23368
82: 23450
63: 23513
58: 23571
126: 23697
111: 23808
85: 23893
96: 23989
86: 24075
60: 24135
92: 24227
58: 24285
100: 24385
96: 24481
80: 24561
83: 24644
65: 24709
56: 24765
52: 24817
74: 24891
51: 24942
72: 25014
73: 25087
56: 25143
69: 25212
50: 25262
39: 25301
45: 25346
50: 25396
48: 25444
52: 25496
53: 25549
39: 25588
50: 25638
58: 25696
38: 25734
73: 25807
62: 25869
46: 25915
44: 25959
43: 26002
33: 26035
48: 26083
37: 26120
33: 26153
41: 26194
31: 26225
35: 26260
35: 26295
57: 26352
40: 26392
41: 26433
39: 26

In [14]:
final = final.reset_index()
final.to_file("%s.shp" % in_folder)

In [10]:
final.shape

(47790, 3)

In [12]:
final.head()

Unnamed: 0,FID,geohash,geometry
0,0,,POLYGON ((11788594.88015904 -657995.4158944433...
1,1,,POLYGON ((11788594.88015904 -658995.4158944433...
2,2,,POLYGON ((11788594.88015904 -659995.4158944433...
3,3,,POLYGON ((11788594.88015904 -660995.4158944433...
4,4,,POLYGON ((11788594.88015904 -661995.4158944433...
