# Model Prediction
This notebook implements prediction using a sliding-window approach.

## Imports and Setup

In [1]:
# Standard library / data stack.
import os
import pandas as pd
import numpy as np

# Plotting libraries and the seaborn theme applied to figures.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils at the front of the import path so the project-local helper
# modules (model_utils, geoutils) can be imported from this notebook.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Quieten the notebook: only ERROR-level log records from the root logger,
# and suppress all Python warnings.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed to the resampling step for reproducibility.
SEED = 42

# Automatically reload edited modules (e.g. the ../utils helpers) before
# each cell execution.
%load_ext autoreload
%autoreload 2

## File Location

In [2]:
# Root folders. The trailing slashes are required: paths below and in later
# cells are built by plain string concatenation.
data_dir = "../data/"
output_dir = "../outputs/"
input_file = data_dir + '20200422_dataset.csv'

# Sub-folders of the data directory handed to the geoutils helpers.
images_dir = data_dir + 'images/'
indices_dir = data_dir + 'indices/'
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
tmp_dir = data_dir + 'tmp/'

# Create the writable folders if they are missing. `exist_ok=True` replaces
# the check-then-create pattern, which can race with another process and is
# a no-op when the folder already exists.
for writable_dir in (output_dir, tmp_dir):
    os.makedirs(writable_dir, exist_ok=True)

# NOTE(review): this value is overwritten in the prediction cell before it
# is used anywhere in the visible notebook.
areas = ['maicao']

## Load Dataset

In [3]:
# Read the dataset from CSV and make sure it carries a fresh 0..n-1 index.
data = pd.read_csv(input_file)
data = data.reset_index(drop=True)

# Report (rows, columns) and preview the first three rows.
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (965034, 113)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,1,3,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,1,3,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,1,3,0


## Resample Dataset

In [4]:
# Proportions requested for each negative category; they sum to 1.
neg_dist = {
    'Formal settlement': (2/5),
    'Unoccupied land': (3/5),
}

# NOTE: `data` is rebound to the resampled frame, so re-running this cell
# operates on the previous output. Re-run the load cell first to start over.
data = model_utils.resample(
    data,
    num_neg_samples=50000,
    neg_dist=neg_dist,
    random_state=SEED,
)

# Report the new (rows, columns) and preview the first three rows.
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (339647, 113)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.1681,0.1326,0.1329,0.1304,0.1571,0.2687,0.3205,0.2784,0.3581,0.037,...,-0.495702,0.213802,0.122766,0.386654,-0.762164,-0.097429,1.005492,1,2,0
1,0.1933,0.1974,0.1994,0.2195,0.2191,0.268,0.3012,0.27,0.328,0.0384,...,-0.417789,0.102637,0.211351,0.520237,-0.748431,-0.021367,1.030898,1,2,0
2,0.1946,0.1782,0.1828,0.1908,0.2174,0.2747,0.3062,0.291,0.3234,0.0307,...,-0.188068,0.079984,0.222447,0.746002,-0.641025,-0.030201,1.056083,1,2,0


## Machine Learning Pipeline

In [11]:
# scikit-learn components used by the feature-selection, tuning and training
# cells below.
from sklearn.linear_model import LogisticRegression
# NOTE(review): LinearSVC is not referenced in the visible cells.
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Same value as the SEED defined in the setup cell; re-declared here.
SEED = 42

## Define Features and Target Label

In [6]:
label = 'target'

# Every column except the last two is a candidate feature (in the frame
# displayed above, the last two columns are `target` and `area`).
features = list(data.columns[:-2])

# Binarise the label: classes 2 and 3 are merged into class 0.
data[label] = data[label].replace({2: 0, 3: 0})

X = data[features]
y = data[label]
# The `area` column is kept separately and later used to build the CV iterator.
splits = data[['area']]

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(y.value_counts()))

Data dimensions: (339647, 113)
Class distribution:
0    318350
1     21297
Name: target, dtype: int64


## Define Best Feature Set

In [9]:
# Base estimator; the same instance is reused by the tuning pipeline below.
clf = LogisticRegression()

# Cross-validation iterator built from the `area` column; the second return
# value is not needed here.
cv, _ = model_utils.get_cv_iterator(splits)

# Options forwarded to the project's RFECV helper (F1 scoring, step of 10).
rfecv_options = dict(scoring='f1', step=10, verbose=0)
best_features = model_utils.rfecv_feature_selection(clf, X, y, cv, **rfecv_options)
best_features

['B1_2015-2016',
 'B2_2015-2016',
 'B7_2015-2016',
 'B12_2015-2016',
 'savi_2015-2016',
 'mndwi_2015-2016',
 'ui_2015-2016',
 'nbai_2015-2016',
 'B1_2017',
 'nbi_2017',
 'mbi_2017',
 'B3_2018',
 'B5_2018',
 'B7_2018',
 'B12_2018',
 'nbi_2018',
 'mbi_2018',
 'B1_2019',
 'B2_2019',
 'B3_2019',
 'B6_2019',
 'B7_2019',
 'B9_2019',
 'B10_2019',
 'B12_2019',
 'savi_2019',
 'mndwi_2019',
 'brba_2019',
 'mbi_2019',
 'B1_2020',
 'B2_2020',
 'B6_2020',
 'B7_2020',
 'B9_2020',
 'B10_2020',
 'B11_2020',
 'B12_2020',
 'savi_2020',
 'mndwi_2020',
 'brba_2020',
 'mbi_2020']

## Hyperparameter Tuning

In [15]:
# Min-max scale the features, then fit the classifier defined in the
# feature-selection cell.
pipe_clf = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', clf)
])

# Candidate values for the classifier's regularisation parameter C.
# NOTE(review): the recorded run selected C=10, the largest value in the
# grid; consider extending the grid upwards.
param_grid = {'classifier__C': [0.001, 0.01, 1.0, 5.0, 10]}

# The search object gets its own name. Rebinding `cv` here would replace the
# cross-validation iterator with the GridSearchCV instance, so re-running
# this cell (or any later use of `cv` as a splitter) would fail.
grid_search = GridSearchCV(
    estimator=pipe_clf,
    param_grid=param_grid,
    cv=cv,
    verbose=1,
    scoring='f1',
    n_jobs=-1
)
grid_search.fit(X[best_features], y)

# Pipeline refit with the best-scoring C.
best_estimator = grid_search.best_estimator_
print(best_estimator)

Fitting 7 folds for each of 5 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  1.2min finished


Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('classifier',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)


## Train Model

In [16]:
# Final fit on the selected features only; missing values are replaced by 0.
y = data[label]
X = data.loc[:, best_features].fillna(0)
# The trailing semicolon suppresses the estimator repr in the cell output.
best_estimator.fit(X, y);

## Prediction: Sliding Window Approach

In [23]:
# Areas to predict in this run. Other areas listed previously:
# 'uribia', 'riohacha', 'maicao', 'cucuta', 'arauca', 'arauquita', 'tibu'
areas = ['villadelrosario', 'soacha']
area_dict = geoutils.get_filepaths(
    areas, images_dir, indices_dir, pos_mask_dir, neg_mask_dir
)
grid_blocks = 9

# Arguments that are identical for every area.
windowing_kwargs = dict(
    area_dict=area_dict,
    model=best_estimator,
    tmp_dir=tmp_dir,
    best_features=best_features,
    grid_blocks=grid_blocks,
    threshold=0,
)

# One output raster per area, written to the outputs folder.
for area in areas:
    output = output_dir + '20200425_{}.tif'.format(area)
    geoutils.get_preds_windowing(area=area, output=output, **windowing_kwargs)

  0%|          | 0/81 [00:00<?, ?it/s]

Reading villadelrosario...


100%|██████████| 81/81 [01:56<00:00,  1.43s/it]


Saving to ../outputs/20200425_villadelrosario.tif...


  0%|          | 0/81 [00:00<?, ?it/s]

Reading soacha...


100%|██████████| 81/81 [03:17<00:00,  2.44s/it]


Saving to ../outputs/20200425_soacha.tif...


In [None]:
for area in areas:
    filename = output_dir + '20200425_{}.tiff'.format(area)
    !gsutil -q cp {filename} gs://immap-output/