This notebook compares the 2020 GEIH aggregates (actual values) with the aggregated 2020 model predictions **for metro areas**

In [1]:
import re
import numpy as np
import pandas as pd
from math import sqrt
import geopandas as gpd
import rasterio as rio
from shapely.wkt import loads
from tqdm import tqdm

import sys
sys.path.insert(0, '../utils')
from settings import *
import geoutils
import modelutils



In [2]:
def generate_satellite_features(gdf, year = 2018):
    '''
    Generates features derived from satellite images by piercing through rasters using the centroids of the grid from gdf

    For every feature raster, the value of the first band under each grid centroid is read and
    stored as a new column. gdf is modified in place and is also returned.

    Relies on the names satellite_features, feats_dir and area, which are not defined in this
    function (presumably provided by `from settings import *`).

    Args
        gdf (GeoDataFrame): indicator labelled grid; needs a 'centroid_geometry' column holding points (objects with .x and .y)
        year (int): year prefix of the satellite rasters to sample. The POI distance rasters
            ('clipped_nearest_*') are always read from the 2018 files, whatever year is passed
    Returns
        gdf (GeoDataFrame): indicator labelled grid with features
    '''
    # satellite image derived - pierce through rasters
    geom_col = 'centroid_geometry'
    tifs_with_250m = ['nighttime_lights', 'population', 'elevation', 'urban_index']
    satellite_features_ = [f + '_250m' if f in tifs_with_250m else f for f in satellite_features] + ['nearest_highway']
    pois_ = ['waterway', 'commercial', 'restaurant', 'hospital', 'airport']
    poi_features_ = ['clipped_nearest_' + poi for poi in pois_]

    # The sampling points are the same for every raster, so extract them only once
    coords = [(point.x, point.y) for point in gdf[geom_col]]

    for feature in tqdm(poi_features_ + satellite_features_):
        # POI rasters only exist for 2018; satellite rasters follow the requested year
        raster_year = year if feature in satellite_features_ else 2018
        tif_file = feats_dir + f'{raster_year}_{area}_{feature}.tif'

        # Perform point sampling; the context manager closes the raster even if sampling fails
        with rio.open(tif_file) as raster:
            pxl = [val[0] for val in raster.sample(coords)]

        # Add column to geodataframe
        col_name = feature.replace('clipped_','')
        gdf[col_name] = pxl

    # remove _250m suffix
    feats_250m = [f + '_250m' for f in tifs_with_250m]
    gdf.columns = [f[:-5] if f in feats_250m else f for f in gdf.columns]

    return gdf

In [3]:
# Download the 2020 metro-area grid and every 2020_* feature raster from the project bucket.
!gsutil cp gs://immap-wash-training/grid/grids_in_metro_areas_2020.csv {data_dir}
!gsutil cp gs://immap-wash-training/features/2020_*.tif {feats_dir}
# The cgiarv2 aridity and nearest-highway rasters are taken from their 2018 files and saved under
# 2020_* names, presumably so that generate_satellite_features(year = 2020), which builds paths of
# the form {year}_{area}_{feature}.tif, can find them.
!gsutil cp gs://immap-wash-training/features/2018_colombia_aridity_cgiarv2.tif {feats_dir}2020_colombia_aridity_cgiarv2.tif
!gsutil cp gs://immap-wash-training/features/2018_colombia_nearest_highway.tif {feats_dir}2020_colombia_nearest_highway.tif

Copying gs://immap-wash-training/grid/grids_in_metro_areas_2020.csv...
/ [1 files][  6.8 MiB/  6.8 MiB]                                                
Operation completed over 1 objects/6.8 MiB.                                      
Copying gs://immap-wash-training/features/2020_colombia_aridity.tif...
Copying gs://immap-wash-training/features/2020_colombia_elevation.tif...        
Copying gs://immap-wash-training/features/2020_colombia_nighttime_lights.tif... 
Copying gs://immap-wash-training/features/2020_colombia_population.tif...       
\ [4 files][ 22.9 MiB/ 22.9 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://immap-wash-training/features/2020_colombia_temperature.tif...
Copying gs://immap-wash-training/features/20

In [2]:
# Load the 2020 metro-area grid and turn its WKT centroid column into real geometries,
# so the table can be used as a GeoDataFrame whose active geometry is the centroid.
geom_col = 'centroid_geometry'
grid_csv = data_dir + 'grids_in_metro_areas_2020.csv'

df = pd.read_csv(grid_csv)
df[geom_col] = df[geom_col].apply(loads)  # WKT text -> shapely geometry
gdf = gpd.GeoDataFrame(df, geometry = geom_col)

## Generate data for 2020

In [5]:
# Build (or reload) the 2020 feature table.
# Sampling the rasters is slow (about 1.5 minutes in the recorded run), so the result is cached
# as a CSV and only regenerated when that file is missing. The same path is used for writing
# and reading, so a regenerated dataset is always the one that gets loaded.
dataset_2020_csv = data_dir + '20200916_dataset_2020.csv'

try:
    test_df = pd.read_csv(dataset_2020_csv)
except FileNotFoundError:
    gdf = generate_satellite_features(gdf, year = 2020)
    generated_df = geoutils.generate_training_data(gdf)
    cols = ['id', 'metro_id', 'geometry'] + poi_features + satellite_features
    print(generated_df.shape)
    # share of rows with no missing value in any identifier or feature column
    complete_share = generated_df.dropna(subset = cols).shape[0]/generated_df.shape[0]
    print('Complete cases %: ' + str(100*complete_share))
    # index = False keeps the row index from coming back as an extra unnamed column
    generated_df.to_csv(dataset_2020_csv, index = False)
    # read the file back so both branches hand over exactly what is stored on disk
    test_df = pd.read_csv(dataset_2020_csv)

print(test_df.shape)
test_df.head(3)

100%|██████████| 13/13 [01:30<00:00,  6.95s/it]


(26542, 41)
Complete cases: 1.0
(26542, 41)


Unnamed: 0,pixelated_urban_area_id,id,geometry,adm1_name,adm2_name,centroid_geometry,metro_id,nearest_waterway,nearest_commercial,nearest_restaurant,...,lag_aridity_cgiarv2,lag_temperature,lag_nighttime_lights,lag_population,lag_elevation,lag_urban_index,lag_nearest_highway,nighttime_lights_area_mean,x,y
0,,18141899,"POLYGON ((-76.52268665234379 3.4399769602661, ...",Valle del Cauca,Cali,POINT (-76.52156 3.43885),9,584.187866,538.910583,590.252197,...,,,,,,,,,-76.521561,3.438851
1,,18118423,"POLYGON ((-76.5294434023438 3.4219589602661, -...",Valle del Cauca,Cali,POINT (-76.52832 3.42083),9,183.715317,758.190247,340.253296,...,,,,,,,,,-76.528317,3.420833
2,,18204539,"POLYGON ((-76.5046686523438 3.4039409602661, -...",Valle del Cauca,Cali,POINT (-76.50354 3.40281),9,608.579407,2086.701416,614.631348,...,,,,,,,,,-76.503543,3.402815


## Train full model on 2018 (this run loads the saved 2018 models and predicts on the 2020 grid)

In [6]:
# Training table; used further down to fit the feature scaler.
train_csv = data_dir + '20200830_dataset.csv'
train_df = pd.read_csv(train_csv)
print(train_df.shape)

(57143, 45)


In [7]:
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
try:
    import joblib
except ImportError:
    # sklearn.externals.joblib was removed in scikit-learn 0.23; it is only used as a
    # fallback for old environments that lack the standalone joblib package
    from sklearn.externals import joblib

def model(train_df, test_df):
    '''
    Predicts every indicator for test_df with the saved 2018 models and collects feature importances

    The scaler is fitted on the feature columns of train_df and applied to test_df. For each
    entry of indicators, the pickled model 'model_<indicator>_2018_250mv2.pkl' is loaded from
    model_dir and its predictions are stored in test_df as 'pred_<indicator>'.

    Relies on the names indicators, features and model_dir, which are not defined in this
    function (presumably provided by `from settings import *`). The last loaded model is left
    in the global variable clf.

    Args
        train_df (DataFrame): training data; only its feature columns are used, to fit the scaler
        test_df (DataFrame): data to predict on; modified in place (gains the 'pred_<indicator>' columns)
    Returns
        test_df (DataFrame): the same object that was passed in, with the prediction columns
        (DataFrame): one row per feature and indicator with columns 'feature', 'importance' and
            'indicator', sorted by descending importance within each indicator
    '''
    global clf
    # Placeholder so that clf is defined even when indicators is empty; it is replaced by the
    # loaded model for every indicator. To retrain instead of loading, fit it on the scaled
    # training features and train_df[indicator], then persist it with joblib.dump.
    clf = RandomForestRegressor(random_state=42)

    # The scaled features do not depend on the indicator, so scale once outside the loop
    scaler = RobustScaler()
    scaler.fit(train_df[features])
    X_test = scaler.transform(test_df[features])

    feats = []
    for indicator in tqdm(indicators):
        # NOTE: joblib.load unpickles the file, so only load model files from a trusted source
        clf = joblib.load(model_dir + 'model_' + indicator + '_2018_250mv2.pkl')
        test_df['pred_' + indicator] = clf.predict(X_test)

        feature_importances = pd.DataFrame({'feature': list(train_df[features].columns)
                                            , 'importance': list(clf.feature_importances_)})
        top_features = (feature_importances
                            .sort_values(by=['importance'], ascending = False))
        top_features['indicator'] = indicator
        feats.append(top_features)

    return test_df, pd.concat(feats, axis = 0).reset_index(drop = True)

In [8]:
# Predict every indicator on the 2020 grid; model() adds the pred_* columns to test_df in place
# and also returns the per-indicator feature importances.
test_df, top_features = model(train_df, test_df)
# top_features.to_csv('top_features_2018.csv', index = False)
# NOTE(review): the export below is dated 20200908 and has no data_dir prefix, while the
# aggregation cell reads data_dir + '20200916_predictions2020.csv' - confirm which file is current.
# test_df.to_csv('20200908_predictions2020.csv', index = False)

100%|██████████| 3/3 [02:52<00:00, 57.53s/it]


## Aggregate grid predictions to metro areas

In [4]:
# Reload the stored grid-level predictions and attach the metro area of each grid cell.
# Cells whose id has no metro_id in the grid table are dropped.
raw = pd.read_csv(data_dir + '20200916_predictions2020.csv')
metro_lookup = df[['id', 'metro_id']]
test_df = raw.merge(metro_lookup, how = 'left', on = 'id')
test_df = test_df.dropna(subset = ['metro_id'])

In [5]:
# TODO: column names are quite confusing..
# 'pred_perc_<x>' is the predicted share per grid cell, 'pred_<x>' the count derived from it.

# estimate number of households in grid with wash access: grid population times predicted share
hh_cols = []
for indicator in indicators:
    count_col = 'pred_' + indicator.replace('perc_', '')
    test_df[count_col] = test_df['population']*test_df['pred_' + indicator]
    hh_cols.append(count_col)

# sum household count (and population) by area
grid_totals = test_df[['metro_id', 'population'] + hh_cols]
pred_metro = grid_totals.groupby('metro_id').sum().reset_index()

# calculate new percentage hh no access from the metro-level totals
for indicator in indicators:
    count_col = 'pred_' + indicator.replace('perc_', '')
    pred_metro['pred_' + indicator] = 100*pred_metro[count_col] / pred_metro['population']

## Compare to actual values

In [7]:
# !gsutil cp gs://immap-wash-training/indicators/20200908_GEIH_Metro_Areas_2020.csv {data_dir}
true_metro = pd.read_csv(data_dir + '20200908_GEIH_Metro_Areas_2020.csv')

# GEIH column name -> indicator name used in this notebook
spanish = {
    'personas': 'population',
    'c_acueduct': 'hh_no_water_supply',
    'c_alcantar': 'hh_no_sewage',
    'c_sanitari': 'hh_no_toilet',
    'mc_acueduc': 'perc_hh_no_water_supply',
    'mc_alcanta': 'perc_hh_no_sewage',
    'mc_sanitar': 'perc_hh_no_toilet',
}

# show floats with six decimals (this changes the display option for the whole session)
pd.set_option('display.float_format', lambda x: '%.6f' % x)

# predictions per metro area, ordered by metro id
df1 = pred_metro.sort_values('metro_id')

# actual values per metro area, restricted to the mapped columns and renamed
geih_cols = ['OBJECTID', 'geometry'] + list(spanish)
df2 = true_metro[geih_cols].rename(columns = spanish)

# Join predictions and actuals on the metro id. All columns of both tables are kept, so the
# shared 'population' column comes out as population_x (predictions) and population_y (GEIH).
cons = df1.merge(df2, left_on = 'metro_id', right_on = 'OBJECTID')
cons = cons.drop(columns = 'OBJECTID')

cons.to_csv(data_dir + 'metro_area_predictions_2020.csv')

In [14]:
# summary statistics, one row per column of the comparison table
cons.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
metro_id,23.0,12.0,6.78233,1.0,6.5,12.0,17.5,23.0
population_x,23.0,107149.969808,214364.257533,4020.803968,25539.515156,33526.462081,70427.639944,1008162.692571
pred_hh_no_water_supply,23.0,2984.992345,3785.497309,375.242262,823.163019,1610.638819,3926.330589,17931.156332
pred_hh_no_toilet,23.0,3033.848579,5380.217932,448.503569,878.714122,963.708753,2311.318848,25176.602262
pred_hh_no_sewage,23.0,4900.815668,6037.823177,720.291448,1378.069475,2532.055679,5720.063764,27909.73945
pred_perc_hh_no_water_supply,23.0,6.324619,10.314735,1.778597,2.440661,3.772738,5.268309,51.860185
pred_perc_hh_no_toilet,23.0,4.669408,7.197834,2.18646,2.670107,3.145573,3.48311,37.500391
pred_perc_hh_no_sewage,23.0,9.686499,12.163954,2.768377,3.519571,6.177095,10.076741,61.612739
population_y,23.0,343968.0,599073.573509,32560.0,89878.0,132743.0,251821.5,2769346.0
hh_no_water_supply,23.0,2999.466329,5444.379756,0.0,123.989871,629.05378,3014.15315,19516.64


In [15]:
# Score the metro-level predictions against the GEIH values, one indicator at a time.
# NOTE(review): in the recorded output 'r2' equals 'correlation' squared (e.g. 0.9898**2 = 0.9798),
# so calculate_metrics presumably reports the squared correlation rather than the coefficient
# of determination - confirm in modelutils before quoting it.
for indicator in indicators:
    print(indicator)
    observed = cons[indicator]
    predicted = cons['pred_' + indicator]
    print(modelutils.calculate_metrics(observed, predicted))

perc_hh_no_water_supply
{'correlation': 0.9898469593061956, 'r2': 0.9797970028477212, 'mae': 3.1752462527092917, 'rmse': 3.572032230475504}
perc_hh_no_toilet
{'correlation': 0.9945016296831816, 'r2': 0.989033491442504, 'mae': 3.3844946321084297, 'rmse': 5.805581135625987}
perc_hh_no_sewage
{'correlation': 0.9870616086123918, 'r2': 0.9742906191964825, 'mae': 4.82288391705089, 'rmse': 6.160428343871404}


In [16]:
# population_x (summed from the grid, WorldPop) only covers part of population_y (GEIH),
# so the two totals are not expected to match
population_cols = ['metro_id', 'population_x', 'population_y']
cons[population_cols]

Unnamed: 0,metro_id,population_x,population_y
0,1,27416.224103,99866.0
1,2,23470.617219,121065.0
2,3,10642.130358,55073.0
3,4,4020.803968,32560.0
4,5,14548.79535,66999.0
5,6,225499.037853,462727.0
6,7,1008162.692571,2769346.0
7,8,77551.116188,341502.0
8,9,249623.219799,845629.0
9,10,69249.107465,258850.0
