This notebook compares the 2019 metro-area aggregates from GEIH against the 2019 grid-level model predictions aggregated to the same metro areas.

In [1]:
import re
import numpy as np
import pandas as pd
from math import sqrt
import geopandas as gpd
import rasterio as rio
from shapely.wkt import loads
from tqdm import tqdm

import sys
sys.path.insert(0, '../utils')
from settings import *
import geoutils
import modelutils



In [2]:
def generate_satellite_features(gdf, year = 2018):
    '''
    Generates features derived from satellite images by piercing through rasters using the centroids of the grid from gdf

    For every feature the matching GeoTIFF in feats_dir is opened, sampled at each
    centroid, and the first value returned for each point is stored in a new column.
    POI distance rasters ('clipped_nearest_*') are always read from the 2018 files;
    the other rasters are read from the files prefixed with `year`.

    Args
        gdf (GeoDataFrame): indicator labelled grid with a 'centroid_geometry' column of points
        year (int): year prefix of the satellite feature rasters to read
    Returns
        gdf (GeoDataFrame): indicator labelled grid with features (same object, modified in place)
    '''
    # satellite image derived - pierce through rasters
    geom_col = 'centroid_geometry'
    satellite_features_ = satellite_features + ['nearest_highway']
    pois_ = ['waterway', 'commercial', 'restaurant', 'hospital', 'airport']
    poi_features_ = ['clipped_nearest_' + poi for poi in pois_]

    # The sampling locations are the same for every raster, so build them once
    coords = [(point.x, point.y) for point in gdf[geom_col]]

    for feature in tqdm(poi_features_ + satellite_features_):
        if feature in satellite_features_:
            tif_file = feats_dir + f'{year}_{area}_{feature}.tif'
        else:
            tif_file = feats_dir + f'2018_{area}_{feature}.tif'

        # Perform point sampling; the context manager closes the raster
        # even if sampling raises
        with rio.open(tif_file) as raster:
            pxl = [val[0] for val in raster.sample(coords)]

        # Add column to geodataframe (POI columns drop the 'clipped_' prefix)
        col_name = feature.replace('clipped_','')
        gdf[col_name] = pxl
    return gdf

In [23]:
# Download the metro-area grid into data_dir
!gsutil cp gs://immap-wash-training/grid/grids_in_metro_areas.csv {data_dir}
# Download every raster stored with a 2019_ prefix into feats_dir
!gsutil cp gs://immap-wash-training/features/2019_*.tif {feats_dir}
# Copy the 2018 aridity and nearest-highway rasters under a 2019_ file name.
# generate_satellite_features builds paths as '{year}_{area}_{feature}.tif', so a
# year = 2019 run looks for these names (nearest_highway is always in its feature
# list; presumably aridity_cgiarv2 is in satellite_features - TODO confirm in settings).
!gsutil cp gs://immap-wash-training/features/2018_colombia_aridity_cgiarv2.tif {feats_dir}2019_colombia_aridity_cgiarv2.tif
!gsutil cp gs://immap-wash-training/features/2018_colombia_nearest_highway.tif {feats_dir}2019_colombia_nearest_highway.tif

Copying gs://immap-wash-training/features/2018_colombia_nearest_highway.tif...
/ [1 files][ 12.4 MiB/ 12.4 MiB]                                                
Operation completed over 1 objects/12.4 MiB.                                     


In [3]:
# Name of the column that stores each grid cell's centroid
geom_col = 'centroid_geometry'

# Read the grid, parse the centroid column from WKT text into geometry objects,
# and use that column as the GeoDataFrame's geometry
df = pd.read_csv(data_dir + 'grids_in_metro_areas.csv')
df[geom_col] = df[geom_col].map(loads)
gdf = gpd.GeoDataFrame(df, geometry=geom_col)

## Generate data for 2019

In [6]:
# Use the cached 2019 dataset when it is available; otherwise rebuild it from the
# rasters so the notebook also runs from a fresh checkout.
dataset_2019_csv = data_dir + '20200902_dataset_2019.csv'
try:
    test_df = pd.read_csv(dataset_2019_csv)
except FileNotFoundError:
    gdf = generate_satellite_features(gdf, year = 2019)
    test_df = geoutils.generate_training_data(gdf)
    cols = ['id', 'metro_id', 'geometry'] + poi_features + satellite_features
    print(test_df.shape)
    print('Complete cases: ' + str(test_df.dropna(subset = cols).shape[0]/test_df.shape[0]))
    test_df.to_csv(dataset_2019_csv)
    # Read the file back so test_df has the same layout as on a cached run
    # (the CSV is written with its index, which comes back as an 'Unnamed: 0' column)
    test_df = pd.read_csv(dataset_2019_csv)
print(test_df.shape)
test_df.head(3)

(26542, 42)


Unnamed: 0.1,Unnamed: 0,pixelated_urban_area_id,id,geometry,adm1_name,adm2_name,centroid_geometry,metro_id,nearest_waterway,nearest_commercial,...,lag_aridity_cgiarv2,lag_temperature,lag_nighttime_lights,lag_population,lag_elevation,lag_urban_index,lag_nearest_highway,nighttime_lights_area_mean,x,y
0,0,,18290623,"POLYGON ((-76.4798939023438 3.4579949602661, -...",Valle del Cauca,Cali,POINT (-76.4787677773021 3.45686883565232),9,397.931305,2778.615723,...,,,,,,,,,-76.478768,3.456869
1,1,,17961848,"POLYGON ((-76.5744884023438 3.4557427102661, -...",Valle del Cauca,Cali,POINT (-76.57336227725401 3.4546165853639),9,296.926178,3046.067139,...,,,,,,,,,-76.573362,3.454617
2,2,,18110633,"POLYGON ((-76.5316956523438 3.3363734602661, -...",Valle del Cauca,Cali,POINT (-76.53056952747269 3.33524733538718),9,1662.000244,670.668213,...,,,,,,,,,-76.53057,3.335247


## Train full model on 2018

In [5]:
# Load the dataset the model is trained on (the 2018 data, per the section title)
train_df = pd.read_csv(data_dir + '20200830_dataset.csv')
# Quick sanity check of the number of rows and columns
print(train_df.shape)

(57143, 45)


In [23]:
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor

def model(train_df, test_df):
    '''
    Trains a random forest on train_df for each indicator and predicts that indicator for test_df

    Args
        train_df (DataFrame): training rows with the `features` columns and one column per entry in `indicators`
        test_df (DataFrame): rows to predict, with the `features` columns
    Returns
        test_df (DataFrame): copy of test_df with one 'pred_<indicator>' column added per indicator
        top_features (DataFrame): columns feature, importance and indicator; within each
            indicator the rows are sorted by descending importance
    '''
    # The regressor stays a module-level name as before; after the call it holds
    # the fit for the last indicator.
    global clf
    clf = RandomForestRegressor(random_state=42)

    # Work on a copy so the frame passed in by the caller is not modified
    test_df = test_df.copy()

    # Scaling depends only on the feature columns, not on the indicator,
    # so fit and apply the scaler once instead of once per indicator.
    X_train_raw = train_df[features]
    feature_names = list(X_train_raw.columns)
    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(test_df[features])

    feats = []
    for indicator in tqdm(indicators):
        clf.fit(X_train, train_df[indicator])
        test_df['pred_' + indicator] = clf.predict(X_test)

        # Importance of every feature for this indicator, most important first
        top_features = (
            pd.DataFrame({'feature': feature_names,
                          'importance': list(clf.feature_importances_)})
            .sort_values(by=['importance'], ascending = False)
        )
        top_features['indicator'] = indicator
        feats.append(top_features)

    return test_df, pd.concat(feats, axis = 0).reset_index(drop = True)

In [24]:
# Fit on train_df and predict every indicator for test_df: test_df comes back with
# 'pred_<indicator>' columns, top_features holds the per-indicator feature importances
test_df, top_features = model(train_df, test_df)

100%|██████████| 3/3 [02:49<00:00, 56.48s/it]


In [25]:
# Save the feature importances; the path has no data_dir prefix, so the file goes
# to the current working directory
top_features.to_csv('top_features_2018.csv', index = False)

In [27]:
# Save the grid-level predictions; the path has no data_dir prefix, so the file goes
# to the current working directory
test_df.to_csv('20200902_predictions2019.csv', index = False)

## Aggregate grid predictions to metro areas

In [12]:
# TODO: column names are quite confusing..
# Pair each indicator with the name of its count column
# ('perc_' removed from the indicator name, 'pred_' prefixed)
count_names = [(ind, 'pred_' + ind.replace('perc_', '')) for ind in indicators]

# Grid level: population multiplied by the grid prediction gives the estimated count
for ind, count_col in count_names:
    test_df[count_col] = test_df['population'] * test_df['pred_' + ind]

# Metro level: add up population and estimated counts over the grids of each metro area
hh_cols = [count_col for _, count_col in count_names]
metro_inputs = test_df[['metro_id', 'population'] + hh_cols]
pred_metro = metro_inputs.groupby('metro_id').agg('sum').reset_index()

# Express the summed counts as a percentage of the summed metro population
for ind, count_col in count_names:
    pred_metro['pred_' + ind] = pred_metro[count_col] / pred_metro['population'] * 100

## Compare to actual values

In [13]:
# The download below is commented out, so the file must already be in data_dir
# !gsutil cp gs://immap-wash-training/indicators/20200831_GEIH_Metro_Areas.csv {data_dir}
# Metro-area values from GEIH that the predictions are compared against
true_metro = pd.read_csv(data_dir + '20200831_GEIH_Metro_Areas.csv')

In [14]:
# Column names of the GEIH metro-area file -> English names used in this notebook.
# The order of the pairs is kept because it determines the column order when the
# keys are used to select columns.
spanish = dict([
    ('personas', 'population'),
    ('c_acueduct', 'hh_no_water_supply'),
    ('c_alcantar', 'hh_no_sewage'),
    ('c_sanitari', 'hh_no_toilet'),
    ('mc_acueduc', 'perc_hh_no_water_supply'),
    ('mc_alcanta', 'perc_hh_no_sewage'),
    ('mc_sanitar', 'perc_hh_no_toilet'),
])

In [16]:
# Render floats with six decimals in the tables displayed from here on
pd.set_option('display.float_format', lambda x: '%.6f' % x)

# Left side: metro-level predictions ordered by metro_id
df1 = pred_metro.sort_values(by='metro_id')

# Right side: GEIH values restricted to the mapped columns, renamed to English
geih_cols = ['OBJECTID', 'geometry'] + list(spanish)
df2 = true_metro[geih_cols].rename(columns=spanish)

# Join on metro_id == OBJECTID and drop the duplicate key. Both sides have a
# 'population' column, so the result carries population_x (predictions side)
# and population_y (GEIH side).
cons = (
    df1.merge(df2, left_on='metro_id', right_on='OBJECTID')
       .drop(columns='OBJECTID')
)

In [17]:
# Save the merged predicted-vs-GEIH table (written with its index column)
cons.to_csv(data_dir + 'metro_area_predictions.csv')

In [19]:
# Summary statistics of the merged table, one row per column
cons.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
metro_id,23.0,12.0,6.78233,1.0,6.5,12.0,17.5,23.0
population_x,23.0,108508.836067,215455.78219,4214.355219,26101.636974,34534.670718,73406.275534,1014886.483427
pred_hh_no_water_supply,23.0,2772.956998,3473.681796,245.863356,700.053979,1535.060648,4084.265454,16251.313486
pred_hh_no_toilet,23.0,2863.466448,5043.560494,439.973639,758.382561,938.111044,2198.128549,23544.487635
pred_hh_no_sewage,23.0,4841.63324,5587.954825,541.900974,1338.311817,2942.768915,6351.286273,24547.009584
pred_perc_hh_no_water_supply,23.0,6.05031,10.976936,1.543219,1.917897,2.76854,4.904179,54.523666
pred_perc_hh_no_toilet,23.0,4.444147,7.264464,1.874824,2.522608,2.761598,3.237787,37.554561
pred_perc_hh_no_sewage,23.0,9.730462,12.154118,2.341395,3.124794,5.7662,11.832298,59.680417
population_y,23.0,338537.868565,582228.04575,31503.0,88674.19,129538.3,250785.55,2675260.3
hh_no_water_supply,23.0,2634.329333,4443.995598,46.500273,225.19278,1405.232,2397.1995,20213.9


In [23]:
# Compare the GEIH value of each indicator with its metro-level prediction.
# NOTE(review): in the recorded output 'r2' equals 'correlation' squared - confirm
# which definition modelutils.calculate_metrics uses.
for indicator in indicators:
    print(indicator)
    observed = cons[indicator]
    predicted = cons['pred_' + indicator]
    print(modelutils.calculate_metrics(observed, predicted))

perc_hh_no_water_supply
{'correlation': 0.9879473050986259, 'r2': 0.9760398776516375, 'mae': 2.979931112106002, 'rmse': 3.600394335884224}
perc_hh_no_toilet
{'correlation': 0.997570708467809, 'r2': 0.9951473183929664, 'mae': 2.664030078428854, 'rmse': 5.335070704540623}
perc_hh_no_sewage
{'correlation': 0.9753960126412986, 'r2': 0.9513973814765444, 'mae': 4.292977846858964, 'rmse': 6.62399054204633}


In [25]:
# population_x is the grid population summed per metro area (WorldPop, per the original note);
# population_y is the GEIH population. The summed grid population covers only part of
# the GEIH population for each metro area.
cons[['metro_id', 'population_x', 'population_y']]

Unnamed: 0,metro_id,population_x,population_y
0,1,28204.572802,99614.157
1,2,24916.08408,122349.8
2,3,10639.144145,54069.61
3,4,4214.355219,31503.0
4,5,15868.288083,65391.59
5,6,223601.905329,458908.5
6,7,1014886.483427,2675260.3
7,8,78840.586622,331984.6
8,9,251332.831071,840107.3
9,10,71876.750022,262341.7
