This notebook collects the key steps taken to produce the training dataset, which:
* is aggregated to a 1 km × 1 km grid (from block-level data)
* has POI (point-of-interest) features as surfaces
* has features for urban area characteristics

## dependencies

In [1]:
# !pip install -q rasterio geopandas

import pandas as pd
import numpy as np
import pathlib
import rasterio as rio
import geopandas as gpd
from shapely.wkt import loads
from tqdm import tqdm
import re
import sys
sys.path.insert(0, '../utils')

import geoutils
import bqutils

In [2]:
def clean_name(text):
    """Turn free text into a lowercase, underscore-separated identifier.

    The text is lowercased, every character that is not a-z or a space is
    removed (digits, punctuation and accented letters are dropped, not
    transliterated), and each remaining space becomes an underscore.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub('[^a-z ]', '', lowered)
    return letters_and_spaces.replace(' ', '_')

## workspace

In [3]:
# Workspace layout: all paths are relative to the notebook's working directory.
data_dir = '../data/'
feats_dir = data_dir + 'features/'
inds_dir = data_dir + 'indicators/'

# Make sure the working directories exist before anything is downloaded or
# read. `exist_ok=True` already makes mkdir a no-op for existing directories,
# so no separate existence check is needed. Path objects are deliberately not
# used in a `with` statement: that form was deprecated and is removed in
# Python 3.13.
dirs = [feats_dir, inds_dir]
for dir_ in dirs:
    pathlib.Path(dir_).mkdir(parents=True, exist_ok=True)

In [4]:
# Area tag that appears in the raster file names (2018_<area>_<feature>.tif).
area = 'colombia'

# Bounding box of an area of interest; not referenced by the cells shown here.
# NOTE(review): order looks like [west, north, east, south] — confirm.
BBOX = [
    -73.17020892181104,
    11.560920839000062,
    -72.52724612099996,
    10.948171764015513,
]

# Raster layers sampled at each grid-cell centroid.
features = [
    'vegetation', 'aridity_cgiarv2', 'temperature', 'nighttime_lights',
    'population', 'elevation', 'urban_index', 'nearest_highway',
]

## data

In [None]:
# One-off downloads of the raw inputs from GCS into the local workspace.
# Uncomment and run once on a fresh checkout.
# !gsutil -m cp gs://immap-wash-training/features/2018_{area}_*.tif {feats_dir}
# !gsutil cp gs://immap-wash-training/features/urban_area_features.csv {feats_dir}
# !gsutil cp gs://immap-masks/admin_boundaries/admin_bounds.gpkg {feats_dir}
# !gsutil cp gs://immap-wash-training/indicators/Manzanas_urbano.zip {feats_dir}
# NOTE(review): without `-d`, unzip extracts into the current working directory,
# while the indicators cell reads the shapefile from {feats_dir}Manzanas_urbano/ —
# confirm the archive layout or add `-d {feats_dir}`.
# !unzip {feats_dir}Manzanas_urbano.zip

## indicators

In [6]:
# One-off regeneration of the labelled grid, kept for reference:
# blocks = gpd.read_file(feats_dir + 'Manzanas_urbano/Manzanas_urbano.shp')
# adm = gpd.read_file(feats_dir + 'admin_bounds.gpkg', driver = 'GPKG')
# geoutils.generate_blocks_geopackage(blocks, adm)
# bqutils.run_sql('../scripts/indicator_labelled_grid.sql')
#
# One-off download of the result:
# !gsutil cp gs://immap-wash-training/indicators/indicator_labelled_grid.csv {inds_dir}

# Load the labelled grid; the centroid column arrives as WKT text and is
# parsed into shapely geometries in place.
labelled_grid_csv = inds_dir + 'indicator_labelled_grid.csv'
df = pd.read_csv(labelled_grid_csv)
df['centroid_geometry'] = df['centroid_geometry'].apply(loads)

# Wrap as a GeoDataFrame whose active geometry is the centroid, then declare
# its coordinates as EPSG:4326.
gdf = gpd.GeoDataFrame(df, geometry='centroid_geometry')
gdf = gdf.set_crs('EPSG:4326')

## features

In [7]:
# points of interest - generate surfaces
# POI categories used as features ('road' is currently left out).
pois = ['waterway', 'commercial', 'restaurant', 'hospital', 'airport']  # 'road',

# One-off surface generation, kept for reference:
# depts = get_depts()
# for poi in pois:
#     print(f'Processing {poi}')
#     geoutils.process_by_dept(poi)

# Raster layer name for each POI category.
poi_features = [f'clipped_nearest_{poi}' for poi in pois]

In [8]:
# satellite image derived - pierce through rasters
# For every raster layer, read the pixel value under each grid-cell centroid
# and store it on `gdf` as a new column named after the layer.
geom_col = 'centroid_geometry'

# The centroids are identical for every layer, so build the (x, y) list once
# instead of walking the frame row by row inside the loop.
coords = [(pt.x, pt.y) for pt in gdf[geom_col]]

for feature in tqdm(poi_features + features):
    tif_file = feats_dir + f'2018_{area}_{feature}.tif'

    # Open inside a context manager so each dataset is closed as soon as it
    # has been sampled, rather than leaving one open handle per layer.
    with rio.open(tif_file) as raster:
        # sample() yields one array of band values per coordinate, in input
        # order; only the first band is kept.
        pxl = [val[0] for val in raster.sample(coords)]

    # Add column to geodataframe (POI layers lose their 'clipped_' prefix)
    col_name = feature.replace('clipped_', '')
    gdf[col_name] = pxl

100%|██████████| 13/13 [00:34<00:00,  2.63s/it]


In [9]:
# urban area features - join to existing
# One-off regeneration, kept for reference:
# bqutils.run_sql('../scripts/urban_area_features.sql')

# Load the urban-area feature table and discard its geometry column.
ua_feats = pd.read_csv(feats_dir + 'urban_area_features.csv')
ua_feats = ua_feats.drop(columns=['geometry'])

In [17]:
# lagged features
# One-off download, kept for reference:
# !gsutil cp gs://immap-wash-training/grid/grid_1x1km_wfeatures_lagged.csv {feats_dir}
lag_feats = pd.read_csv(feats_dir + 'grid_1x1km_wfeatures_lagged.csv')

# Keep only the join key and the lagged columns. A plain prefix test is used
# instead of a regex: the pattern 'lag_*' means "lag followed by zero or more
# underscores", so it would also select any column that merely contains "lag".
cols = ['id'] + [text for text in lag_feats.columns if text.startswith('lag_')]
lag_feats = lag_feats[cols]

In [36]:
# master table
# Left-join the urban-area features and then the lagged features onto the
# sampled grid, keyed on `id`, so every row of `gdf` is kept.
# NOTE(review): nothing here checks that `id` is unique in `ua_feats` and
# `lag_feats`; duplicate ids on the right side would multiply grid rows.
df_ = pd.merge(gdf, ua_feats, how = 'left', on = 'id')
df = pd.merge(df_, lag_feats, how = 'left', on = 'id')

# add night time lights mean
# The per-urban-area mean is broadcast back onto the rows by index alignment:
# the grouped result is indexed by pixelated_urban_area_id, so `df` is given
# the same index before the assignment.
mean_col = df.groupby('pixelated_urban_area_id')['nighttime_lights'].mean() # don't reset the index!
df = df.set_index('pixelated_urban_area_id') # make the same index here
df['nighttime_lights_area_mean'] = mean_col

# format for R-INLA
# Expose the centroid coordinates as plain numeric x / y columns.
df['x'] = df['centroid_geometry'].x
df['y'] = df['centroid_geometry'].y

# Turn pixelated_urban_area_id back into an ordinary column (it becomes the
# first column), then show the shape and a two-row preview.
train_df = df.reset_index()
print(train_df.shape)
train_df.head(2)

(7574, 65)


Unnamed: 0,pixelated_urban_area_id,id,geometry,perc_hh_no_toilet,perc_hh_no_water_supply,perc_hh_no_sewage,d_mc_basur,d_mc_aguac,d_mc_freq_,d_mc_pare,...,lag_aridity_cgiarv2,lag_temperature,lag_nighttime_lights,lag_population,lag_elevation,lag_urban_index,lag_nearest_highway,nighttime_lights_area_mean,x,y
0,862.0,417475,"POLYGON((-75.5123828117681 5.05751500688412, -...",0.018677,0.020431,0.030647,0.029925,0.150449,0.793726,0.221855,...,17281.00001,14980.00001,47.137354,75.599635,2032.50001,31.37501,381.252504,23.124894,-75.507891,5.062007
1,83.0,187318,"POLYGON((-76.4376475501431 7.23143798441016, -...",0.190164,0.213115,0.209836,0.062295,0.409836,0.760656,0.501639,...,21134.00001,15023.68751,0.48376,1.064785,175.37501,6.87501,727.6364,0.808125,-76.433156,7.23593


In [38]:
train_df.to_csv(data_dir + '20200821_dataset.csv', index = False)
!gsutil cp {data_dir}20200821_dataset.csv gs://immap-wash-training/training/

Copying file://../data/20200821_dataset.csv [Content-Type=text/csv]...
/ [1 files][  7.6 MiB/  7.6 MiB]                                                
Operation completed over 1 objects/7.6 MiB.                                      


## checks

In [None]:
# NOTE(review): both counts are hard-coded and their origin is not shown in
# the cells of this notebook — confirm what each one counts before relying on
# this ratio.
354129/407851 # fill rate

0.868280327864833

In [None]:
# Share of rows in which each indicator is exactly zero.
# NOTE(review): `gdf2` is not defined in any cell visible here — it presumably
# comes from a removed cell; confirm its origin so this check survives
# Restart & Run All.
indicators = [
    'perc_hh_no_toilet',
    'perc_hh_no_water_supply',
    'perc_hh_no_sewage',
]

n_rows = gdf2.shape[0]
for indicator in indicators:
    zero_count = (gdf2[indicator] == 0.0).sum()
    print(zero_count / n_rows)

0.6969805823869663
0.78861248214513
0.707035879449575


In [None]:
# Summary statistics, one row per column.
# NOTE(review): `gdf2` is not defined in any cell visible here — confirm where
# it comes from.
gdf2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ipm,353545.0,22.348797,22.379941,0.0,4.938272,15.789474,33.333333,100.0
d_u_dpto,353545.0,36.835956,26.973339,5.0,11.0,25.0,66.0,99.0
d_u_mpio,353545.0,247.589625,291.055329,1.0,1.0,109.0,470.0,980.0
d_ua_clase,353545.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
d_c_sanita,353545.0,0.980981,3.800692,0.0,0.0,0.0,1.0,348.0
d_c_acuedu,353545.0,1.279967,5.593081,0.0,0.0,0.0,0.0,422.0
d_c_alcant,353545.0,2.386372,7.688756,0.0,0.0,0.0,1.0,463.0
d_c_basura,353545.0,1.05584,4.354098,0.0,0.0,0.0,0.0,259.0
d_c_aguaco,353545.0,7.319617,17.623938,0.0,1.0,4.0,8.0,3669.0
d_c_freq_b,353545.0,9.933842,24.81491,0.0,0.0,2.0,11.0,1965.0


In [None]:
# Number of distinct admin1Name values, followed by the row count per value.
# NOTE(review): `gdf2` is not defined in any cell visible here — confirm where
# it comes from.
admin1_names = gdf2['admin1Name']
print(admin1_names.nunique())
admin1_names.value_counts()

BogotÃ¡ D.C.                 43331
Antioquia                    39496
Valle del Cauca              35689
AtlÃ¡ntico                   23408
Cundinamarca                 22163
Santander                    22077
BolÃ­var                     20701
Norte de Santander           16396
CÃ³rdoba                     15421
Tolima                       14758
Magdalena                    14655
Cesar                        13461
BoyacÃ¡                      13073
Meta                         12812
Huila                        12400
Sucre                        10055
NariÃ±o                       9989
Caldas                        8795
Risaralda                     8719
La Guajira                    8533
Cauca                         8468
QuindÃ­o                      6747
Casanare                      6552
CaquetÃ¡                      5049
Arauca                        3814
Putumayo                      3700
ChocÃ³                        3623
Guaviare                       881
Vichada             

In [None]:
# Rows with a missing perc_hh_no_toilet value, counted per adm1_name.
# NOTE(review): this cell reads `adm1_name`, whereas the preceding check reads
# `admin1Name` from the same frame — confirm which column `gdf2` actually has.
toilet_missing = gdf2['perc_hh_no_toilet'].isna()
gdf2.loc[toilet_missing, 'adm1_name'].value_counts()

BogotÃ¡ D.C.                 4679
Antioquia                    4662
Valle del Cauca              4447
Santander                    3541
Cundinamarca                 3002
AtlÃ¡ntico                   2819
BolÃ­var                     2560
BoyacÃ¡                      2084
Norte de Santander           2021
Meta                         2001
Tolima                       1924
Huila                        1879
CÃ³rdoba                     1754
Magdalena                    1680
Cesar                        1628
NariÃ±o                      1495
Casanare                     1351
La Guajira                   1301
Sucre                        1278
Cauca                        1269
Risaralda                    1121
Caldas                       1120
QuindÃ­o                      830
ChocÃ³                        828
CaquetÃ¡                      627
Arauca                        564
Putumayo                      517
Vichada                       172
Guaviare                      140
GuainÃ­a      

In [None]:
# Number of distinct admin2RefN values.
# NOTE(review): `gdf2` is not defined in any cell visible here — confirm where
# it comes from.
gdf2['admin2RefN'].nunique()

1011