In [16]:
year = 2020

This notebook predicts for the relevant year using the 2018 model, for both urban and rural areas.

## Imports and Setup

In [2]:
import re
import numpy as np
import pandas as pd
from math import sqrt
import geopandas as gpd
import rasterio as rio
from shapely.wkt import loads
from tqdm import tqdm

import sys
sys.path.insert(0, '../utils')
from settings import *
import geoutils
import modelutils



## Download data from Cloud Storage

In [None]:
# !gsutil cp gs://immap-wash-training/grid/grids_in_urban_and_rural_areas.csv {data_dir}
# !gsutil cp gs://immap-wash-training/features/2020_*.tif {feats_dir}
# !gsutil cp gs://immap-wash-training/features/2019_*.tif {feats_dir}
# !gsutil cp gs://immap-wash-training/features/2018_colombia_aridity_cgiarv2.tif {feats_dir}2020_colombia_aridity_cgiarv2.tif
# !gsutil cp gs://immap-wash-training/features/2018_colombia_nearest_highway.tif {feats_dir}2020_colombia_nearest_highway.tif

## Load grids for prediction

In [None]:
df = (pd.read_csv(data_dir + 'grids_in_urban_and_rural_areas.csv')
      .sort_values(by = 'urbanity')
      .drop_duplicates(subset = 'id', keep = 'last'))
geom_col = 'centroid_geometry'
df[geom_col] = df[geom_col].apply(loads)
gdf = gpd.GeoDataFrame(df, geometry = geom_col)

## Load 2018 data

In [12]:
df = pd.read_csv(data_dir + '20200916_dataset.csv')
df['population'] = df['population'].fillna(0)
df = df.dropna().reset_index(drop = True)
train_df = df.copy()
print(train_df.shape)

(57036, 45)


## Generate data for particular year

In [19]:
gdf = geoutils.generate_satellite_features(gdf, year = year)
test_df = geoutils.generate_training_data(gdf)
cols = ['id', 'geometry'] + poi_features + satellite_features
print(test_df.shape)
test_df = test_df.dropna(subset = cols)
print('Complete cases %: ' + str(test_df.dropna(subset = cols).shape[0]/test_df.shape[0]*100))
test_df.to_csv(data_dir + f'20200916_dataset_{year}.csv')
# test_df = pd.read_csv(data_dir + f'20200916_dataset_{year}.csv')
print(test_df.shape)
test_df.head(3)

100%|██████████| 13/13 [03:39<00:00, 16.90s/it]


(67612, 41)
Complete cases %: 100.0
(67131, 41)


Unnamed: 0,pixelated_urban_area_id,id,geometry,urbanity,centroid_geometry,adm1_name,adm2_name,nearest_waterway,nearest_commercial,nearest_restaurant,...,lag_aridity_cgiarv2,lag_temperature,lag_nighttime_lights,lag_population,lag_elevation,lag_urban_index,lag_nearest_highway,nighttime_lights_area_mean,x,y
0,,25303605,"POLYGON ((-74.4618779023438 5.4985334602661, -...",r,POINT (-74.46075 5.49741),cundinamarca,caparrapi,1408.745117,40000.0,25372.03125,...,,,,,,,,,-74.460752,5.497407
1,,25272300,"POLYGON ((-74.47088690234379 5.4827677102661, ...",r,POINT (-74.46976 5.48164),cundinamarca,caparrapi,2134.79248,40000.0,21935.181641,...,,,,,,,,,-74.469761,5.481642
2,,21837755,"POLYGON ((-75.45962465234379 1.0976369602661, ...",r,POINT (-75.45850 1.09651),caquet,milan,3844.412842,40000.0,6694.293945,...,,,,,,,,,-75.458499,1.096511


## Train full model on 2018

In [21]:
test_df, top_features = modelutils.model_rollout(train_df, test_df)

100%|██████████| 3/3 [00:14<00:00,  4.93s/it]


## Save outputs to local

In [None]:
top_features.to_csv('top_features_2018.csv', index = False)
test_df.to_csv(data_dir + f'20200916_predictions{year}.csv', index = False)