# Model Prediction
This notebook trains the models, generates per-area predictions using a sliding-window approach, ensembles the resulting rasters, and uploads the outputs to GCS.

## Imports and Setup

In [1]:
# Core imports used throughout the notebook.
import os
import joblib
import pandas as pd
import numpy as np

# Plotting libraries; seaborn's "whitegrid" style is applied globally.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils at the front of the module search path so the project-local
# helper modules imported below can be found.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Quiet the notebook output: the root logger is limited to ERROR and above,
# and every Python warning is suppressed.
# NOTE(review): the blanket warnings filter also hides library deprecation
# warnings -- confirm this is intended.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as `random_state` when training the models.
SEED = 42

# Automatically reload imported modules before each cell executes, so edits
# to the helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Location

In [2]:
# Dataset version tag; it is embedded in the dataset filename and in the
# names of the output rasters.
version = '20200509'

# Base directories. Every directory string keeps its trailing slash because
# paths in this notebook are built by plain string concatenation.
data_dir = "../data/"
model_dir = '../models/'
output_dir = "../outputs/probmaps/"
input_file = data_dir + '{}_dataset.csv'.format(version)

images_dir = data_dir + 'images/'
indices_dir = data_dir + 'indices/'
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
tmp_dir = data_dir + 'tmp/'

# Create every directory this notebook writes into. images_dir and
# indices_dir are included because the download cell copies many files into
# them, which requires the destinations to already exist as directories.
# exist_ok=True makes the cell safe to re-run.
for directory in (output_dir, model_dir, tmp_dir, images_dir, indices_dir):
    os.makedirs(directory, exist_ok=True)

In [4]:
# Copy every .tif under the 20200531 folder of the immap-images and
# immap-indices buckets into images_dir and indices_dir respectively
# (-q: quiet output, -m: parallel copy).
# NOTE(review): the bucket folder (20200531) differs from `version`
# (20200509) -- confirm the mismatch is intended.
# NOTE(review): a multi-file copy presumably needs the destination
# directories to exist beforehand -- verify they are created before this cell runs.
!gsutil -q -m cp gs://immap-images/20200531/*.tif {images_dir}
!gsutil -q -m cp gs://immap-indices/20200531/*.tif {indices_dir}

## Load Dataset

In [3]:
# Read the versioned dataset CSV and renumber its rows from zero.
data = pd.read_csv(input_file)
data = data.reset_index(drop=True)

# Every column except the trailing three is used as a model feature; in the
# preview below the trailing three are target, uid and area.
features = list(data.columns[:-3])

print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (1029869, 69)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,target,uid,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.438625,0.151655,0.209614,0.479385,-0.665204,-0.060444,0.948025,3,39,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.43545,0.151655,0.213526,0.48833,-0.66301,-0.059064,0.952352,3,39,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.447333,0.145385,0.222971,0.473118,-0.661291,-0.060496,0.911748,3,39,0


## Instantiate Model

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# L1-regularised logistic regression.
# The solver is named explicitly: 'warn' was only a placeholder default in
# scikit-learn 0.21.x (the version this notebook was written against), where
# it resolved to 'liblinear'. Spelling it out keeps the same solver, supports
# the L1 penalty, and avoids relying on a sentinel value.
# random_state makes the fit reproducible, since liblinear shuffles the data.
lr = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    random_state=SEED
)

# Random forest; seeded with the notebook-wide SEED (same value as before)
# so both estimators share one source of randomness.
rf = RandomForestClassifier(
    n_estimators=800,
    max_depth=12,
    min_samples_split=15,
    min_samples_leaf=2,
    random_state=SEED
)

## Train and Save Models

In [None]:
# Estimators to fit, keyed by the stem of the file each fitted model is
# saved under in model_dir.
clfs = {
    'model_LR_30k': lr,  # trains for 30mins
    'model_RF_30k': rf,  # trains for <2mins
}
for model_label, estimator in clfs.items():
    print(model_label)
    # NOTE(review): this rebinds the notebook-level `features` on every
    # iteration, so the prediction cell sees whatever the last call returned
    # -- confirm that is the intended feature list for all models.
    model, features = model_utils.train_model(
        estimator,
        data,
        num_neg_samples=30000,
        random_state=SEED,
    )
    # Persist the fitted model as <model_dir><label>.sav.
    joblib.dump(model, '{}{}.sav'.format(model_dir, model_label))

## Load Model

In [7]:
# Suffixes of the saved models; each one is stored as
# <model_dir>model_<name>.sav. `models` keeps the same order as `model_names`.
model_names = ['LR_30k', 'RF_30k']

# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
models = [
    joblib.load(model_dir + 'model_' + name + '.sav')
    for name in model_names
]

## Prediction

In [8]:
# An area name is the text before the first underscore of an image filename.
# Only .tif files are considered, so stray directory entries (checkpoint
# folders, hidden files, ...) cannot turn into bogus areas, and the names
# are sorted so the processing order is the same on every run.
areas = sorted({
    image.split('_')[0]
    for image in os.listdir(images_dir)
    if image.endswith('.tif')
})
area_dict = geoutils.get_filepaths(areas, images_dir, indices_dir)

# One output folder per model, created once up front rather than on every
# (area, model) iteration.
for model_name in model_names:
    os.makedirs(output_dir + model_name + '/', exist_ok=True)

for area in areas:
    for model, model_name in zip(models, model_names):
        out_dir = output_dir + model_name + '/'
        output = '{}{}_{}_{}.tif'.format(
            out_dir, version, area, model_name
        )
        # Skip rasters that already exist so the cell can be re-run to
        # resume after an interruption.
        if os.path.isfile(output):
            continue
        # NOTE(review): `features` is whatever the dataset-loading or
        # training cell last bound -- confirm it matches the columns each
        # model was fitted on.
        # NOTE(review): grid_blocks=9 presumably yields the 81 steps seen in
        # the progress output; the meaning of threshold=0 is defined in
        # geoutils -- verify there.
        geoutils.get_preds_windowing(
            area=area,
            area_dict=area_dict,
            model=model,
            tmp_dir=tmp_dir,
            best_features=features,
            output=output,
            grid_blocks=9,
            threshold=0
        )

Processing soacha...: 100%|██████████| 81/81 [01:18<00:00,  1.03it/s]


## Get Ensemble

In [4]:
# An area name is the text before the first underscore of an image filename.
# Only .tif files are considered, so stray directory entries cannot turn
# into bogus areas, and sorting keeps the processing order deterministic.
areas = sorted({
    image.split('_')[0]
    for image in os.listdir(images_dir)
    if image.endswith('.tif')
})

# The ensemble output folder does not depend on the area, so it is created
# once before the loop.
out_dir = output_dir + 'ensembled/'
os.makedirs(out_dir, exist_ok=True)

for area in areas:
    # Per-model prediction rasters for this area, using the same
    # <output_dir><model>/<version>_<area>_<model>.tif layout that the
    # prediction cell writes.
    filename1 = '{0:}{3:}/{1:}_{2:}_{3:}.tif'.format(
        output_dir, version, area, model_names[0]
    )
    filename2 = '{0:}{3:}/{1:}_{2:}_{3:}.tif'.format(
        output_dir, version, area, model_names[1]
    )
    output_file = '{}{}_{}.tif'.format(out_dir, version, area)
    # Merge the two rasters into a single ensembled raster for the area.
    # NOTE(review): how the two rasters are combined is defined in
    # geoutils.get_rasters_merged -- verify there.
    geoutils.get_rasters_merged(
        filename1,
        filename2,
        output_file,
        tmp_dir,
        grid_blocks=9
    )

100%|██████████| 81/81 [00:00<00:00, 82.51it/s]
100%|██████████| 81/81 [00:18<00:00,  4.28it/s]
100%|██████████| 81/81 [00:03<00:00, 26.84it/s]
100%|██████████| 81/81 [00:01<00:00, 44.62it/s]
100%|██████████| 81/81 [00:01<00:00, 53.62it/s]
100%|██████████| 81/81 [00:00<00:00, 86.13it/s]
100%|██████████| 81/81 [00:01<00:00, 44.98it/s]
100%|██████████| 81/81 [00:01<00:00, 52.84it/s]
100%|██████████| 81/81 [00:05<00:00, 14.53it/s]
100%|██████████| 81/81 [00:00<00:00, 86.72it/s]
100%|██████████| 81/81 [00:12<00:00,  6.57it/s]
100%|██████████| 81/81 [00:09<00:00,  8.94it/s]
100%|██████████| 81/81 [00:11<00:00,  7.23it/s]
100%|██████████| 81/81 [00:02<00:00, 31.18it/s]
100%|██████████| 81/81 [00:04<00:00, 18.01it/s]
100%|██████████| 81/81 [00:04<00:00, 17.57it/s]
100%|██████████| 81/81 [00:04<00:00, 18.16it/s]
100%|██████████| 81/81 [00:02<00:00, 30.81it/s]
100%|██████████| 81/81 [00:01<00:00, 43.78it/s]
100%|██████████| 81/81 [00:01<00:00, 53.59it/s]
100%|██████████| 81/81 [00:15<00:00,  5.

## Upload to GCS

In [None]:
for model in os.listdir(model_dir):
    filename = model_dir + model
    !gsutil -q cp {filename} gs://immap-models/

In [6]:
for model_name in ['ensembled']: 
    out_dir = output_dir + model_name + '/'
    for filename in os.listdir(out_dir):
        bucket = 'gs://immap-output/{}/{}/'.format(version, model_name)
        !gsutil -q cp {out_dir + filename} {bucket}