In [16]:
year = 2020

This notebook generates predictions for the relevant year, for both urban and rural areas.

## Imports and Setup

In [2]:
import re
import numpy as np
import pandas as pd
from math import sqrt
import geopandas as gpd
import rasterio as rio
from shapely.wkt import loads
from tqdm import tqdm

import sys
sys.path.insert(0, '../utils')
from settings import *
import geoutils
import modelutils



## Download data from Cloud Storage

In [None]:
# !gsutil cp gs://immap-wash-training/grid/grids_in_urban_and_rural_areas.csv {data_dir}
# !gsutil cp gs://immap-wash-training/features/2020_*.tif {feats_dir}
# !gsutil cp gs://immap-wash-training/features/2019_*.tif {feats_dir}
# !gsutil cp gs://immap-wash-training/features/2018_colombia_aridity_cgiarv2.tif {feats_dir}2020_colombia_aridity_cgiarv2.tif
# !gsutil cp gs://immap-wash-training/features/2018_colombia_nearest_highway.tif {feats_dir}2020_colombia_nearest_highway.tif

## Load data to memory

In [18]:
df = (pd.read_csv(data_dir + 'grids_in_urban_and_rural_areas.csv')
      .sort_values(by = 'urbanity')
      .drop_duplicates(subset = 'id', keep = 'last'))
geom_col = 'centroid_geometry'
df[geom_col] = df[geom_col].apply(loads)
gdf = gpd.GeoDataFrame(df, geometry = geom_col)

## Generate data for particular year

In [19]:
gdf = geoutils.generate_satellite_features(gdf, year = year)
test_df = geoutils.generate_training_data(gdf)
cols = ['id', 'geometry'] + poi_features + satellite_features
print(test_df.shape)
test_df = test_df.dropna(subset = cols)
print('Complete cases %: ' + str(test_df.dropna(subset = cols).shape[0]/test_df.shape[0]*100))
test_df.to_csv(data_dir + f'20200916_dataset_{year}.csv')
# test_df = pd.read_csv(data_dir + f'20200916_dataset_{year}.csv')
print(test_df.shape)
test_df.head(3)

100%|██████████| 13/13 [03:39<00:00, 16.90s/it]


(67612, 41)
Complete cases %: 100.0
(67131, 41)


Unnamed: 0,pixelated_urban_area_id,id,geometry,urbanity,centroid_geometry,adm1_name,adm2_name,nearest_waterway,nearest_commercial,nearest_restaurant,...,lag_aridity_cgiarv2,lag_temperature,lag_nighttime_lights,lag_population,lag_elevation,lag_urban_index,lag_nearest_highway,nighttime_lights_area_mean,x,y
0,,25303605,"POLYGON ((-74.4618779023438 5.4985334602661, -...",r,POINT (-74.46075 5.49741),cundinamarca,caparrapi,1408.745117,40000.0,25372.03125,...,,,,,,,,,-74.460752,5.497407
1,,25272300,"POLYGON ((-74.47088690234379 5.4827677102661, ...",r,POINT (-74.46976 5.48164),cundinamarca,caparrapi,2134.79248,40000.0,21935.181641,...,,,,,,,,,-74.469761,5.481642
2,,21837755,"POLYGON ((-75.45962465234379 1.0976369602661, ...",r,POINT (-75.45850 1.09651),caquet,milan,3844.412842,40000.0,6694.293945,...,,,,,,,,,-75.458499,1.096511


## Train full model on 2018

In [12]:
df = pd.read_csv(data_dir + '20200916_dataset.csv')
df['population'] = df['population'].fillna(0)
df = df.dropna().reset_index(drop = True)
train_df = df.copy()
print(train_df.shape)

(57036, 45)


In [21]:
test_df, top_features = modelutils.model_rollout(train_df, test_df)
# top_features.to_csv('top_features_2018.csv', index = False)
test_df.to_csv(data_dir + f'20200916_predictions{year}.csv', index = False)

100%|██████████| 3/3 [00:14<00:00,  4.93s/it]


## what changed

In [None]:
scaler = joblib.load(scaler_dir + 'scaler_2018_250mv2.pkl')

In [None]:
agg_level = 'adm1_name'
keep_cols = [agg_level] + features + indicators

def clean_name(text):
    return re.sub('[^a-z ]','', text.lower()).replace(' ', '_')

In [None]:
raw = pd.read_csv(data_dir + '20200830_dataset.csv').drop_duplicates('id')
raw['adm1_name'] = raw['adm1_name'].apply(clean_name)

feats_2020 = pd.read_csv(data_dir + '20200914_dataset_2020.csv')
preds_2020 = pd.read_csv(data_dir + '20200914_predictions2020.csv').rename(columns = {
    'pred_perc_hh_no_water_supply': 'perc_hh_no_water_supply',
    'pred_perc_hh_no_toilet': 'perc_hh_no_toilet',
    'pred_perc_hh_no_sewage': 'perc_hh_no_sewage',
})[['id', 'perc_hh_no_water_supply', 'perc_hh_no_toilet', 'perc_hh_no_sewage']]

In [None]:
# join
wash_grid_2018_ = raw
wash_grid_2020_ = pd.merge(feats_2020, preds_2020, on = 'id')

print(wash_grid_2018_.shape)
print(wash_grid_2020_.shape)
# filter to 2018 grids only for comparability
wash_grid_2020_ = wash_grid_2020_[wash_grid_2020_['id'].isin(list(wash_grid_2018_['id'].unique()))]
print(wash_grid_2020_.shape)

# scale features except population
wash_grid_2018 = wash_grid_2018_[keep_cols].copy()
wash_grid_2020 = wash_grid_2020_[keep_cols].copy()

wash_grid_2018.loc[:,features] = scaler.transform(wash_grid_2018[features])
wash_grid_2020.loc[:,features] = scaler.transform(wash_grid_2020[features])

wash_grid_2018['population'] = wash_grid_2018_['population']
wash_grid_2020['population'] = wash_grid_2020_['population']

# standardize naming
to_replace = {'laguajira': 'la_guajira','valledelcauca': 'valle_del_cauca'}
wash_grid_2018['adm1_name'] = wash_grid_2018['adm1_name'].replace(to_replace)
wash_grid_2020['adm1_name'] = wash_grid_2020['adm1_name'].replace(to_replace)

# get median for everything except population
agg_type = {
    'vegetation': 'median',
    'aridity_cgiarv2': 'median',
    'temperature': 'median',
    'nighttime_lights': 'median',
    'population': 'sum', ###
    'elevation': 'median',
    'urban_index': 'median',
    'nearest_waterway': 'median',
    'nearest_commercial': 'median',
    'nearest_restaurant': 'median',
    'nearest_hospital': 'median',
    'nearest_airport': 'median',
    'nearest_highway': 'median',
    'perc_hh_no_water_supply': 'median',
    'perc_hh_no_toilet': 'median',
    'perc_hh_no_sewage': 'median',
}
wash_metro_2018 = wash_grid_2018.groupby(agg_level).agg(agg_type).reset_index()
wash_metro_2020 = wash_grid_2020.groupby(agg_level).agg(agg_type).reset_index()

# combine (wide format)
wash_agg = pd.merge(
    wash_metro_2018, wash_metro_2020, left_on = agg_level, right_on = agg_level, suffixes = ['', '_2020']
    , how = 'left'
)

# convert to long
df_ = wash_agg.set_index('adm1_name').stack().reset_index()
df_.columns = ['adm1_name', 'feature', 'value']
df_['year'] = 2018
for i, row in df_.iterrows():
    if row.feature[-5:] == '_2020':
        df_.loc[i, 'year'] = 2020
        df_.loc[i, 'feature'] = df_.loc[i, 'feature'][:-5]

df_.to_csv('wash_agg.csv', index = False)
df_.tail(3)

Unnamed: 0,adm1_name,feature,value,year
1053,vichada,perc_hh_no_water_supply,0.176869,2020
1054,vichada,perc_hh_no_toilet,0.078483,2020
1055,vichada,perc_hh_no_sewage,0.722966,2020


In [None]:
wash_agg.median()

vegetation                          0.123851
aridity_cgiarv2                     0.123958
temperature                         0.091185
nighttime_lights                   -0.081389
population                      38136.793795
elevation                          -0.403423
urban_index                        -0.366667
nearest_waterway                    0.101871
nearest_commercial                  0.161975
nearest_restaurant                  0.064027
nearest_hospital                    0.086090
nearest_airport                    -0.019912
nearest_highway                     0.067636
perc_hh_no_water_supply             0.006579
perc_hh_no_toilet                   0.012558
perc_hh_no_sewage                   0.038235
vegetation_2020                    -0.004327
aridity_cgiarv2_2020                0.116220
temperature_2020                    0.213526
nighttime_lights_2020              -0.075808
population_2020                 47619.434995
elevation_2020                     -0.402200
urban_inde