In [1]:
# Year this notebook run is for.
# NOTE(review): `year` is not referenced again in the cells visible here — confirm where it is consumed.
year = 2018

This notebook generates predictions for the year set in `year` above using the 2018 model, covering both urban and rural areas.

## Imports and Setup

In [2]:
import re
import numpy as np
import pandas as pd
from math import sqrt
import geopandas as gpd
import rasterio as rio
from shapely.wkt import loads
from tqdm import tqdm
from datetime import datetime, timedelta
from joblib import Parallel, delayed
from itertools import repeat
import os

# ignore warnings
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

import sys
sys.path.insert(0, '../utils')
from settings import *
import geoutils
import modelutils

## Set directories

In [3]:
# Per-department working folders, all nested under data_dir.
# NOTE(review): data_dir is not defined in this notebook; presumably it is
# provided by `from settings import *` — confirm.
dept_dir = f'{data_dir}by_dept/'
grid250_dir = f'{dept_dir}grid_250x250m/'    # destination of the grid rsync
feats250_dir = f'{dept_dir}features/'
preds250_dir = f'{dept_dir}predictions/'     # failure markers and output .tif files live here

## Download data from Cloud Storage

In [10]:
# Fetch the inputs from Cloud Storage. Comments are kept on their own lines so
# that nothing extra is passed to the shell commands.
# NOTE(review): the years below are hard-coded (2020, 2019, 2018) and do not
# follow the `year` variable set at the top of the notebook — confirm intent.
# NOTE(review): feats_dir is not defined in this notebook (only feats250_dir
# is); presumably it comes from `from settings import *` — confirm.

# Mirror the 250 m grid folder from the bucket into grid250_dir
# (-r recurses into sub-folders, -m runs the transfers in parallel).
!gsutil -m rsync -r gs://immap-wash-training/grid/grid_250x250m/ {grid250_dir}
# Copy the two grid CSV files into data_dir.
!gsutil cp gs://immap-wash-training/grid/grid_250x250m_wadmin.csv {data_dir}
!gsutil cp gs://immap-wash-training/grid/grids_in_urban_and_rural_areas.csv {data_dir}
# Copy every feature raster whose name starts with 2020_ or 2019_ into feats_dir.
!gsutil cp gs://immap-wash-training/features/2020_*.tif {feats_dir}
!gsutil cp gs://immap-wash-training/features/2019_*.tif {feats_dir}
# Copy the 2018 aridity and nearest-highway rasters under 2020_-prefixed names,
# presumably so the 2020 feature set can reuse these layers — TODO confirm.
# These run after the 2020_*.tif copy above, so they would overwrite any
# same-named files fetched by it.
!gsutil cp gs://immap-wash-training/features/2018_colombia_aridity_cgiarv2.tif {feats_dir}2020_colombia_aridity_cgiarv2.tif
!gsutil cp gs://immap-wash-training/features/2018_colombia_nearest_highway.tif {feats_dir}2020_colombia_nearest_highway.tif

## Get list of admin bounds to iterate over

In [4]:
# Department (admin level 1) identifiers to iterate over, kept in sorted order.
# The strings are used verbatim as arguments to the modelutils calls and in
# output file names, so they must not be "corrected".
# To leave some departments out, subtract them before sorting, e.g.
#   set(...) - {'amazonas', 'bogot_dc'}
adm1s = sorted([
    'amazonas',
    'antioquia',
    'arauca',
    'atlntico',
    'bogot_dc',
    'bolvar',
    'boyac',
    'caldas',
    'caquet',
    'casanare',
    'cauca',
    'cesar',
    'choc',
    'crdoba',
    'cundinamarca',
    'guaina',
    'guaviare',
    'huila',
    'la_guajira',
    'magdalena',
    'meta',
    'nario',
    'norte_de_santander',
    'putumayo',
    'quindo',
    'risaralda',
    'san_andrs_y_providencia',
    'santander',
    'sucre',
    'tolima',
    'valle_del_cauca',
    'vaups',
    'vichada',
])

## Load 2018 data

In [5]:
# Read the dataset CSV from data_dir, keep an independent copy under the name
# train_df, and print the copy's (rows, columns) shape.
df = pd.read_csv(f'{data_dir}20200916_dataset.csv')
train_df = df.copy()
print(train_df.shape)

(57036, 45)


## Rollout by department chunk (takes 24 hours)

For each department, predict on 30K rows (chunk) at a time

In [None]:
# Roll out predictions one department at a time. A failure in one department
# must not abort the whole long-running loop, so it is recorded as a marker
# file in preds250_dir and the loop moves on to the next department.
for adm1 in tqdm(adm1s):
    try:
        modelutils.predict_by_chunk(adm1)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the cell.
        # The marker keeps its original name; it now also stores the error
        # so the failure can be diagnosed afterwards.
        with open(preds250_dir + f'failed-{adm1}.txt', 'w') as marker:
            marker.write(f'{type(exc).__name__}: {exc}\n')

  0%|          | 0/33 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [02:04, 124.77s/it][A
2it [04:11, 125.31s/it][A
3it [06:14, 124.72s/it][A

## Combine chunks to one raster (takes 1h30m)

Runs for all departments; Amazonas alone took 7 mins.

In [6]:
# Call modelutils.gpkgs_to_raster once per department, with verbose output off.
# NOTE(review): this iterates over every entry in adm1s, including any
# department for which the rollout cell wrote a failed-<adm1>.txt marker —
# confirm gpkgs_to_raster copes with missing or partial chunks.
for adm1 in tqdm(adm1s):
    modelutils.gpkgs_to_raster(adm1, verbose = False)

100%|██████████| 33/33 [1:41:16<00:00, 184.13s/it]


## Copy to gcs folder

In [4]:
outnames = [
    fname for fname in os.listdir(preds250_dir) 
    if '.tif' in fname
]
outnames.sort()

for outname in tqdm(outnames):
    !gsutil -q cp {preds250_dir}{outname} gs://immap-output/20201008/

100%|██████████| 99/99 [03:39<00:00,  2.22s/it]
