This notebook collects the key steps taken to produce the training dataset, which:
* is aggregated to a 1 km x 1 km grid (from block-level data)
* has point-of-interest (POI) features represented as surfaces

## dependencies

In [None]:
!pip install -q rasterio geopandas

import pandas as pd
import numpy as np
import pathlib
import rasterio as rio
import geopandas as gpd
import re
import sys
sys.path.insert(0, '../utils')

import geoutils

In [None]:
def sub(text, start, end):
    """Return the part of ``text`` from index ``start`` up to, but excluding, ``end``.

    Either bound may be ``None`` to leave that side of the slice open.
    """
    window = slice(start, end)
    return text[window]

def clean_name(text):
    """Turn a display name into a lower-case, underscore-separated identifier.

    The text is lower-cased, every character other than ``a``-``z`` and the
    space is removed (this includes digits, punctuation and accented letters),
    and each remaining space becomes an underscore.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub('[^a-z ]', '', lowered)
    return letters_and_spaces.replace(' ', '_')

## workspace

In [None]:
# Workspace layout; all paths are relative to the notebook's working directory.
data_dir = 'data/'
feats_dir = data_dir + 'features/'   # feature rasters are copied here by the data cell
inds_dir = data_dir + 'indicators/'

dirs = [feats_dir, inds_dir]
for dir_ in dirs:
    # Path objects are not used as context managers here: that support was a
    # deprecated no-op and has been removed in Python 3.13.
    path = pathlib.Path(dir_)
    # parents=True creates data/ as needed; exist_ok=True makes re-running the
    # cell a no-op, so no separate exists() check is required.
    path.mkdir(parents=True, exist_ok=True)

In [None]:
# Raster features sampled further down; each name is looked up as
# 2018_<area>_<feature>.tif inside feats_dir.
features = ['vegetation', 'aridity', 'temperature', 'nighttime_lights', 'population']

# Area tag used in the raster file names.
area = 'colombia'

# NOTE(review): BBOX is not referenced in the visible cells; the value order
# looks like [min lon, max lat, max lon, min lat] — confirm before relying on it.
BBOX = [
    -73.17020892181104,
    11.560920839000062,
    -72.52724612099996,
    10.948171764015513,
]

## data

In [None]:
# Download the admin-boundary GeoPackage into the working directory.
!gsutil cp gs://immap-masks/admin_boundaries/admin_bounds.gpkg .
# Copy every 2018 feature raster for `area` into feats_dir (-m runs the copies in parallel).
!gsutil -m cp gs://immap-wash-training/features/2018_{area}_*.tif {feats_dir}
# Download and unpack the block-level shapefile archive.
!gsutil cp gs://immap-wash-training/indicators/Manzanas_urbano.zip .
!unzip Manzanas_urbano.zip
# Block-level layer; the indicators cell reads its cod_dane column.
gdf = gpd.read_file('Manzanas_urbano/Manzanas_urbano.shp')
# Admin boundaries; the indicators cell reads admin2Pcod, admin1Name and admin2RefN.
adm = gpd.read_file('admin_bounds.gpkg', driver = 'GPKG')

## indicators

1. Generate 1 km x 1 km grid boxes for the whole of Colombia using QGIS (grid_1x1km.gpkg)
2. Run the cell below to produce wash_indicators_by_block.csv
3. Upload wash_indicators_by_block to BigQuery (BQ), then process it using indicator_labelled_grid.sql

In [None]:
adm['adm2_code'] = adm['admin2Pcod'].apply(sub, args=(2,None)).astype(int)
gdf['adm2_code'] = gdf['cod_dane'].apply(sub, args=(0,5)).astype(int)

gdf2 = pd.merge(gdf, adm[['adm2_code', 'admin1Name', 'admin2RefN']], on = 'adm2_code', how = 'left')
rnm = {
    'd_mc_acued': 'perc_hh_no_water_supply',
    'd_mc_alcan': 'perc_hh_no_sewage',
    'd_mc_sanit': 'perc_hh_no_toilet',
    'admin1Name': 'adm1_name', 
    'admin2RefN': 'adm2_name',
}

gdf2.rename(columns=rnm, inplace = True)
gdf2.dropna(inplace = True)
gdf2 = gdf2.to_crs('EPSG:4326')

gdf2.to_file('wash_indicators_by_block.gpkg', driver = 'GPKG')
gdf2.to_csv('wash_indicators_by_block.csv', index = False)
!gsutil cp wash_indicators_by_block.gpkg gs://immap-wash-training/indicators/

## features

In [None]:
# generate surfaces from points of interest
# NOTE(review): get_depts and process_by_dept are not defined or imported by
# name in the visible cells (only `import geoutils` is) — presumably they live
# in ../utils; confirm, otherwise this cell raises NameError on a fresh kernel.
depts = get_depts()  # NOTE(review): `depts` is not used again in the visible cells — confirm process_by_dept reads it
pois = ['road', 'waterway', 'commercial', 'restaurant', 'hospital', 'airport']

# One pass per POI type; the print is a progress marker.
for poi in pois:
    print(f'Processing {poi}')
    process_by_dept(poi)

In [None]:
# pierce through rasters: sample each feature raster at every row of gdf2
#
# Coordinates come from gdf2 itself, the frame that receives the new columns.
# gdf2 has had rows dropped and was reprojected to EPSG:4326, so sampling from
# gdf would give a value list of a different length (and in another CRS).
# NOTE(review): assumes gdf2 geometries are Points (.x/.y) and that the rasters
# are in EPSG:4326 as well — confirm.
coords = [(geom.x, geom.y) for geom in gdf2['geometry']]

for feature in features:
    tif_file = feats_dir + f'2018_{area}_{feature}.tif'

    # Perform point sampling
    # https://github.com/thinkingmachines/lulc/blob/master/notebooks/develop/02%20Clean%20data.ipynb
    # The context manager closes the dataset once sampling is done.
    with rio.open(tif_file) as raster:
        # sample() yields one array per coordinate, in order; keep the first band.
        pxl = [val[0] for val in raster.sample(coords)]

    # Add column to geodataframe
    gdf2[feature] = pxl

## impute

Blank (missing) pixel values are replaced with the median of the same admin-1 unit (1834 pixels, 12%).

In [None]:
# https://stackoverflow.com/questions/19966018/pandas-filling-missing-values-by-mean-in-each-group
impute_cols = [
    'vegetation',
    'aridity',
    'temperature',
]

for col in impute_cols:
    df[col] = df.groupby("adm1_name")[col].transform(lambda x: x.fillna(x.median()))

# format for r-inla code
df['adm1_name_cln'] = df['adm1_name'].apply(clean_name)
df.rename(columns={'left': 'x', 'top': 'y'}, inplace=True)

df.to_csv('20200725_dataset.csv', index = False)
!gsutil cp 20200725_dataset.csv gs://immap-wash-training/

## checks

In [None]:
# NOTE(review): both counts are hard-coded — presumably filled rows / total
# rows; confirm their source and compute them from the data instead.
354129/407851 # fill rate

0.868280327864833

In [None]:
indicators = [
    'perc_hh_no_toilet',
    'perc_hh_no_water_supply',
    'perc_hh_no_sewage',
]

# Share of gdf2 rows whose indicator is exactly zero. The mean of the boolean
# mask equals count / number of rows and is computed vectorised, instead of
# iterating the Series element by element with the builtin sum().
for i in indicators:
    print((gdf2[i] == 0.0).mean())

0.6969805823869663
0.78861248214513
0.707035879449575


In [None]:
# Summary statistics of the numeric columns, one row per column.
gdf2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ipm,353545.0,22.348797,22.379941,0.0,4.938272,15.789474,33.333333,100.0
d_u_dpto,353545.0,36.835956,26.973339,5.0,11.0,25.0,66.0,99.0
d_u_mpio,353545.0,247.589625,291.055329,1.0,1.0,109.0,470.0,980.0
d_ua_clase,353545.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
d_c_sanita,353545.0,0.980981,3.800692,0.0,0.0,0.0,1.0,348.0
d_c_acuedu,353545.0,1.279967,5.593081,0.0,0.0,0.0,0.0,422.0
d_c_alcant,353545.0,2.386372,7.688756,0.0,0.0,0.0,1.0,463.0
d_c_basura,353545.0,1.05584,4.354098,0.0,0.0,0.0,0.0,259.0
d_c_aguaco,353545.0,7.319617,17.623938,0.0,1.0,4.0,8.0,3669.0
d_c_freq_b,353545.0,9.933842,24.81491,0.0,0.0,2.0,11.0,1965.0


In [None]:
# gdf2's admin1Name column is renamed to adm1_name when the frame is built, so
# the old attribute no longer exists on a top-to-bottom run.
print(gdf2['adm1_name'].nunique())
gdf2['adm1_name'].value_counts()

Bogotá D.C.                  43331
Antioquia                    39496
Valle del Cauca              35689
Atlántico                    23408
Cundinamarca                 22163
Santander                    22077
Bolívar                      20701
Norte de Santander           16396
Córdoba                      15421
Tolima                       14758
Magdalena                    14655
Cesar                        13461
Boyacá                       13073
Meta                         12812
Huila                        12400
Sucre                        10055
Nariño                        9989
Caldas                        8795
Risaralda                     8719
La Guajira                    8533
Cauca                         8468
Quindío                       6747
Casanare                      6552
Caquetá                       5049
Arauca                        3814
Putumayo                      3700
Chocó                         3623
Guaviare                       881
Vichada             

In [None]:
# Rows with a missing perc_hh_no_toilet, counted per adm1_name.
# NOTE(review): gdf2 is built with dropna(), so on a fresh top-to-bottom run
# this selection is empty; the saved output presumably predates that step — confirm.
missing_toilet = gdf2['perc_hh_no_toilet'].isna()
gdf2.loc[missing_toilet, 'adm1_name'].value_counts()

Bogotá D.C.                  4679
Antioquia                    4662
Valle del Cauca              4447
Santander                    3541
Cundinamarca                 3002
Atlántico                    2819
Bolívar                      2560
Boyacá                       2084
Norte de Santander           2021
Meta                         2001
Tolima                       1924
Huila                        1879
Córdoba                      1754
Magdalena                    1680
Cesar                        1628
Nariño                       1495
Casanare                     1351
La Guajira                   1301
Sucre                        1278
Cauca                        1269
Risaralda                    1121
Caldas                       1120
Quindío                       830
Chocó                         828
Caquetá                       627
Arauca                        564
Putumayo                      517
Vichada                       172
Guaviare                      140
Guainía       

In [None]:
# gdf2's admin2RefN column is renamed to adm2_name when the frame is built, so
# the old attribute no longer exists on a top-to-bottom run.
gdf2['adm2_name'].nunique()

1011