This notebook generates 2020 predictions for urban and rural areas.

In [1]:
import re
import numpy as np
import pandas as pd
from math import sqrt
import geopandas as gpd
import rasterio as rio
from shapely.wkt import loads
from tqdm import tqdm

import sys
sys.path.insert(0, '../utils')
from settings import *
import geoutils
import modelutils



In [2]:
def generate_satellite_features(gdf, year = 2018):
    '''
    Generates features derived from satellite images by piercing through rasters using the centroids of the grid from gdf

    Each feature is read from feats_dir + '<year>_<area>_<feature>.tif'. The
    'clipped_nearest_*' POI rasters are always read from the 2018 files, whatever
    `year` is. The names `satellite_features`, `feats_dir` and `area` are not
    defined in this notebook (presumably they come from `from settings import *`).

    Args
        gdf (GeoDataFrame): indicator labelled grid; needs a 'centroid_geometry' column of points
        year (int): year prefix of the satellite-feature rasters to sample
    Returns
        gdf (GeoDataFrame): the same object, modified in place, with one added column per feature
    '''
    # satellite image derived - pierce through rasters
    geom_col = 'centroid_geometry'
    satellite_features_ = satellite_features + ['nearest_highway']
    pois_ = ['waterway', 'commercial', 'restaurant', 'hospital', 'airport']
    poi_features_ = ['clipped_nearest_' + poi for poi in pois_]

    # The sampling points are identical for every raster, so build them once
    # instead of re-reading them row by row for each feature.
    coords = [(point.x, point.y) for point in gdf[geom_col]]

    for feature in tqdm(poi_features_ + satellite_features_):
        if feature in satellite_features_:
            tif_file = feats_dir + f'{year}_{area}_{feature}.tif'
        else:
            tif_file = feats_dir + f'2018_{area}_{feature}.tif'

        # Perform point sampling; the context manager closes the dataset so
        # file handles are not leaked across the loop. val[0] is the first band.
        with rio.open(tif_file) as raster:
            pxl = [val[0] for val in raster.sample(coords)]

        # Add column to geodataframe (POI columns drop the 'clipped_' prefix)
        col_name = feature.replace('clipped_','')
        gdf[col_name] = pxl
    return gdf

In [3]:
# !gsutil cp gs://immap-wash-training/grid/grids_in_urban_and_rural_areas.csv {data_dir}
# !gsutil cp gs://immap-wash-training/features/2020_*.tif {feats_dir}
# !gsutil cp gs://immap-wash-training/features/2018_colombia_aridity_cgiarv2.tif {feats_dir}2020_colombia_aridity_cgiarv2.tif
# !gsutil cp gs://immap-wash-training/features/2018_colombia_nearest_highway.tif {feats_dir}2020_colombia_nearest_highway.tif

In [3]:
# Load the urban/rural grid. Sorting by 'urbanity' before de-duplicating on 'id'
# with keep='last' means the row with the greatest 'urbanity' value survives per id.
df = pd.read_csv(data_dir + 'grids_in_urban_and_rural_areas.csv')
df = df.sort_values(by = 'urbanity')
df = df.drop_duplicates(subset = 'id', keep = 'last')

# Parse the WKT centroid strings into shapely geometries and use them as the
# active geometry of the GeoDataFrame.
geom_col = 'centroid_geometry'
df[geom_col] = df[geom_col].apply(loads)
gdf = gpd.GeoDataFrame(df, geometry = geom_col)

## Generate data for 2020

In [4]:
# The commented-out lines below are the one-off pipeline that built the 2020 feature
# table (raster sampling, training-data generation, dropping incomplete rows) and
# wrote it to CSV. The live code only reloads that cached CSV.
# gdf = generate_satellite_features(gdf, year = 2020)
# test_df = geoutils.generate_training_data(gdf)
# cols = ['id', 'geometry'] + poi_features + satellite_features
# print(test_df.shape)
# test_df = test_df.dropna(subset = cols)
# print('Complete cases %: ' + str(test_df.dropna(subset = cols).shape[0]/test_df.shape[0]*100))
# test_df.to_csv(data_dir + '20200914_dataset_2020.csv')
test_df = pd.read_csv(data_dir + '20200914_dataset_2020.csv')
print(test_df.shape)
test_df.head(3)

(67218, 42)


Unnamed: 0.1,Unnamed: 0,pixelated_urban_area_id,id,geometry,urbanity,centroid_geometry,adm1_name,adm2_name,nearest_waterway,nearest_commercial,...,lag_aridity_cgiarv2,lag_temperature,lag_nighttime_lights,lag_population,lag_elevation,lag_urban_index,lag_nearest_highway,nighttime_lights_area_mean,x,y
0,0,,25303605,"POLYGON ((-74.4618779023438 5.4985334602661, -...",r,POINT (-74.4607517772487 5.49740733543157),cundinamarca,caparrapi,1408.745117,40000.0,...,,,,,,,,,-74.460752,5.497407
1,1,,25272300,"POLYGON ((-74.47088690234379 5.4827677102661, ...",r,POINT (-74.469760777254 5.48164158547759),cundinamarca,caparrapi,2134.79248,40000.0,...,,,,,,,,,-74.469761,5.481642
2,2,,21837755,"POLYGON ((-75.45962465234379 1.0976369602661, ...",r,POINT (-75.4584985274804 1.09651083525103),caquet,milan,3844.412842,40000.0,...,,,,,,,,,-75.458499,1.096511


## Train full model on 2018

In [5]:
# Load the table the model is fitted on (per the section heading, the 2018 data)
# and show its dimensions as a quick sanity check.
train_df = pd.read_csv(data_dir + '20200830_dataset.csv')
print(train_df.shape)

(57143, 45)


In [6]:
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package (a scikit-learn dependency) and fall back
# to the vendored copy only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def model(train_df, test_df):
    '''
    Fits a random forest on train_df and writes its predictions into test_df

    Uses module-level names that are not defined in this notebook (`features`,
    `indicators`, `scaler_dir`; presumably from `from settings import *`).
    Side effects: the fitted estimator is exposed as the global `clf`, and the
    fitted scaler is saved to scaler_dir + 'scaler_2018_250m.pkl'.

    Args
        train_df (DataFrame): rows to fit on; needs the feature and indicator columns
        test_df (DataFrame): rows to predict on; needs the feature columns, modified in place
    Returns
        test_df (DataFrame): the input frame with a 'pred_<indicator>' column added
        top_features (DataFrame): feature / importance / indicator rows, sorted by
            descending importance within each indicator
    '''
    global clf
    clf = RandomForestRegressor(random_state=42)

    # Scaling depends only on the feature columns, not on the indicator, so it
    # is fitted and applied once rather than on every loop iteration.
    scaler = RobustScaler()
    X_train = train_df[features]
    feature_names = list(X_train.columns)
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(test_df[features])

    feats = []
    # NOTE(review): the [0:1] slice models only the first indicator, while a later
    # cell of this notebook reads three 'pred_*' columns from the predictions CSV.
    # Confirm whether the slice is a leftover from debugging.
    for indicator in tqdm(indicators[0:1]):
        y_train = train_df[indicator]

        # clf = joblib.load(model_dir + 'model_' + indicator + '_2018_250m.pkl')
        clf.fit(X_train, y_train)
        test_df['pred_' + indicator] = clf.predict(X_test)

        feature_importances = pd.DataFrame({'feature': feature_names
                                            , 'importance': list(clf.feature_importances_)})
        top_features = (feature_importances
                            .sort_values(by=['importance'], ascending = False))
        top_features['indicator'] = indicator
        feats.append(top_features)

#         joblib.dump(clf, model_dir + 'model_' + indicator + '_2018_250m.pkl')

    # Persist the scaler so later cells can apply the same scaling.
    joblib.dump(scaler, scaler_dir + 'scaler_2018_250m.pkl')

    return test_df, pd.concat(feats, axis = 0).reset_index(drop = True)

In [10]:
# test_df, top_features = model(train_df, test_df)
# top_features.to_csv('top_features_2018.csv', index = False)
# test_df.to_csv(data_dir + '20200914_predictions2020.csv', index = False)

100%|██████████| 1/1 [01:00<00:00, 60.52s/it]


In [7]:
# Reload the scaler that model() saved to this path, so the comparison further down
# applies the same scaling that was used for training.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
scaler = joblib.load(scaler_dir + 'scaler_2018_250m.pkl')

## What changed between 2018 and 2020

In [8]:
# Column to aggregate by, and the columns carried into the 2018-vs-2020 comparison.
# `features` and `indicators` are not defined in this notebook
# (presumably they come from `from settings import *`).
agg_level = 'adm1_name'
keep_cols = [agg_level] + features + indicators

def clean_name(text):
    '''
    Normalises a place name into a lowercase, underscore-separated key

    The text is lowercased, every character other than a-z or a space is
    removed, and the remaining spaces become underscores. Accented letters
    are removed rather than transliterated (e.g. 'Caquetá' -> 'caquet').

    Args
        text (str): raw name
    Returns
        (str): cleaned name
    '''
    lowered = text.lower()
    letters_and_spaces = re.sub('[^a-z ]','', lowered)
    return letters_and_spaces.replace(' ', '_')

In [9]:
# 2018 table: one row per grid id, with department names normalised by clean_name
raw = pd.read_csv(data_dir + '20200830_dataset.csv')
raw = raw.drop_duplicates('id')
raw['adm1_name'] = raw['adm1_name'].apply(clean_name)

# 2020 features as cached on disk
feats_2020 = pd.read_csv(data_dir + '20200914_dataset_2020.csv')

# 2020 predictions: rename the 'pred_*' columns to the plain indicator names so
# they line up with the 2018 columns, then keep only the id and the indicators
preds_2020 = pd.read_csv(data_dir + '20200914_predictions2020.csv')
preds_2020 = preds_2020.rename(columns = {
    'pred_perc_hh_no_water_supply': 'perc_hh_no_water_supply',
    'pred_perc_hh_no_toilet': 'perc_hh_no_toilet',
    'pred_perc_hh_no_sewage': 'perc_hh_no_sewage',
})
preds_2020 = preds_2020[['id', 'perc_hh_no_water_supply', 'perc_hh_no_toilet', 'perc_hh_no_sewage']]

In [18]:
# join
wash_grid_2018_ = raw
wash_grid_2020_ = pd.merge(feats_2020, preds_2020, on = 'id')

print(wash_grid_2018_.shape)
print(wash_grid_2020_.shape)
# filter to 2018 grids only for comparability
wash_grid_2020_ = wash_grid_2020_[wash_grid_2020_['id'].isin(list(wash_grid_2018_['id'].unique()))]
print(wash_grid_2020_.shape)

# scale features except population
wash_grid_2018 = wash_grid_2018_[keep_cols].copy()
wash_grid_2020 = wash_grid_2020_[keep_cols].copy()

wash_grid_2018.loc[:,features] = scaler.transform(wash_grid_2018[features])
wash_grid_2020.loc[:,features] = scaler.transform(wash_grid_2020[features])

wash_grid_2018['population'] = wash_grid_2018_['population']
wash_grid_2020['population'] = wash_grid_2020_['population']

# standardize naming
to_replace = {'laguajira': 'la_guajira','valledelcauca': 'valle_del_cauca'}
wash_grid_2018['adm1_name'] = wash_grid_2018['adm1_name'].replace(to_replace)
wash_grid_2020['adm1_name'] = wash_grid_2020['adm1_name'].replace(to_replace)

# get median for everything except population
agg_type = {
    'vegetation': 'median',
    'aridity_cgiarv2': 'median',
    'temperature': 'median',
    'nighttime_lights': 'median',
    'population': 'sum', ###
    'elevation': 'median',
    'urban_index': 'median',
    'nearest_waterway': 'median',
    'nearest_commercial': 'median',
    'nearest_restaurant': 'median',
    'nearest_hospital': 'median',
    'nearest_airport': 'median',
    'nearest_highway': 'median',
    'perc_hh_no_water_supply': 'median',
    'perc_hh_no_toilet': 'median',
    'perc_hh_no_sewage': 'median',
}
wash_metro_2018 = wash_grid_2018.groupby(agg_level).agg(agg_type).reset_index()
wash_metro_2020 = wash_grid_2020.groupby(agg_level).agg(agg_type).reset_index()

# combine (wide format)
wash_agg = pd.merge(
    wash_metro_2018, wash_metro_2020, left_on = agg_level, right_on = agg_level, suffixes = ['', '_2020']
    , how = 'left'
)

# convert to long
df_ = wash_agg.set_index('adm1_name').stack().reset_index()
df_.columns = ['adm1_name', 'feature', 'value']
df_['year'] = 2018
for i, row in df_.iterrows():
    if row.feature[-5:] == '_2020':
        df_.loc[i, 'year'] = 2020
        df_.loc[i, 'feature'] = df_.loc[i, 'feature'][:-5]

df_.to_csv('wash_agg.csv', index = False)
df_.tail(3)

Unnamed: 0,adm1_name,feature,value,year
1053,vichada,perc_hh_no_water_supply,0.176869,2020
1054,vichada,perc_hh_no_toilet,0.078483,2020
1055,vichada,perc_hh_no_sewage,0.722966,2020


In [34]:
# wash_agg still holds the string column 'adm1_name'. Restrict the median to numeric
# columns explicitly: pandas >= 2.0 no longer drops non-numeric columns silently
# and raises a TypeError instead.
wash_agg.median(numeric_only=True)

vegetation                          0.123851
aridity_cgiarv2                     0.123958
temperature                         0.091185
nighttime_lights                   -0.081389
population                      38136.793795
elevation                          -0.403423
urban_index                        -0.366667
nearest_waterway                    0.101871
nearest_commercial                  0.161975
nearest_restaurant                  0.064027
nearest_hospital                    0.086090
nearest_airport                    -0.019912
nearest_highway                     0.067636
perc_hh_no_water_supply             0.006579
perc_hh_no_toilet                   0.012558
perc_hh_no_sewage                   0.038235
vegetation_2020                    -0.004327
aridity_cgiarv2_2020                0.116220
temperature_2020                    0.213526
nighttime_lights_2020              -0.075808
population_2020                 47619.434995
elevation_2020                     -0.402200
urban_inde