# Model Prediction
This notebook trains the model and generates predictions using a sliding-window approach.

## Imports and Setup

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

import logging
import warnings
# Raise the root logger's level to ERROR and ignore every Python warning so
# that cell outputs stay compact.
# NOTE(review): this also hides any deprecation or convergence warnings that
# the libraries used below may emit -- relax while debugging.
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Random seed; passed as random_state to model_utils.resample further down.
SEED = 42

%load_ext autoreload
%autoreload 2

## File Location

In [2]:
# Dataset version tag: it names the input CSV here and is reused when the
# prediction rasters are named in the prediction cell.
version = '20200505'
data_dir = "../data/"
output_dir = "../outputs/"
input_file = data_dir + '{}_dataset.csv'.format(version)

# Sub-directories of data_dir that are later handed to geoutils.
images_dir = data_dir + 'images/'
indices_dir = data_dir + 'indices/'
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
tmp_dir = data_dir + 'tmp/'

# exist_ok=True makes the creation idempotent and removes the window between
# "check that it exists" and "create it" that the exists()/makedirs() pair had.
for directory in (output_dir, tmp_dir):
    os.makedirs(directory, exist_ok=True)

# NOTE: `areas` is reassigned to the full list of areas in the prediction
# cell before it is read anywhere in this notebook.
areas = ['maicao']

## Load Dataset

In [3]:
# Read the versioned dataset, then replace whatever index came back with a
# fresh 0..n-1 range before reporting its size.
data = pd.read_csv(input_file)
data = data.reset_index(drop=True)
print('Data dimensions: {}'.format(data.shape))

# Preview of the first three rows (rendered as the cell output).
data.head(3)

Data dimensions: (1029869, 69)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.438625,0.151655,0.209614,0.479385,-0.665204,-0.060444,0.948025,1,3,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.43545,0.151655,0.213526,0.48833,-0.66301,-0.059064,0.952352,1,3,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.447333,0.145385,0.222971,0.473118,-0.661291,-0.060496,0.911748,1,3,0


## Resample Dataset

In [4]:
# NOTE: this rebinds `data`, so the cell is not idempotent -- running it a
# second time resamples the already-resampled frame. Re-run the load cell
# first if the original rows are needed again.
# Presumably this downsamples the negative classes (the recorded output drops
# from 1,029,869 to 293,756 rows) -- confirm in model_utils.resample.
data = model_utils.resample(
    data,
    num_neg_samples=30000,
    random_state=SEED,
)
print('Data dimensions: {}'.format(data.shape))

# Preview of the first three rows (rendered as the cell output).
data.head(3)

Data dimensions: (293756, 69)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.1748,0.1264,0.1259,0.12075,0.1288,0.2291,0.28325,0.2551,0.31165,0.0396,...,-0.325365,0.336843,0.097095,0.493299,-0.683798,-0.120547,1.149291,1,2,0
1,0.2093,0.1981,0.2023,0.217,0.2178,0.2767,0.3224,0.302,0.3478,0.0353,...,-0.212563,0.098564,0.242471,0.71641,-0.62334,-0.022285,1.042249,1,2,0
2,0.2062,0.20505,0.21575,0.2332,0.2083,0.2836,0.3381,0.3109,0.374,0.03575,...,-0.235786,0.266124,0.158668,0.655587,-0.632521,-0.078219,1.1013,1,2,0


## Machine Learning Pipeline

In [5]:
# scikit-learn estimator, scaler, pipeline and grid-search classes used by
# the feature-selection and tuning cells below.
from sklearn.linear_model import LogisticRegression
# NOTE(review): LinearSVC is not referenced anywhere else in this notebook.
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# NOTE(review): repeats the SEED = 42 assignment from the setup cell; keep the
# two values in sync, or drop this one.
SEED = 42

## Define Features and Target Label

In [6]:
label = 'target'
split_column = 'area'

# Select predictors by name rather than by position: slicing off "the last
# two columns" would silently leak the label or the split column into the
# feature set if the column order of the CSV ever changed.
features = [
    column for column in data.columns
    if column not in (label, split_column)
]

# Fold classes 2 and 3 into class 0 (the recorded run ends up with only the
# labels 0 and 1). Re-running this line is harmless: nothing is left to replace.
data[label] = data[label].replace({2: 0, 3: 0})

# `splits` carries the area id of every row; it is what the cross-validation
# iterator in the feature-selection cell is built from.
splits = data[[split_column]]
X = data[features]
y = data[label]

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(y.value_counts()))

Data dimensions: (293756, 69)
Class distribution:
0    270000
1     23756
Name: target, dtype: int64


## Define Best Feature Set

In [7]:
# Seed the estimator so that any solver that shuffles the data gives the same
# result on every run; the notebook already defines SEED for this purpose.
clf = LogisticRegression(random_state=SEED)

# Cross-validation splits derived from the per-row area ids in `splits`;
# the second return value is not needed here.
cv, _ = model_utils.get_cv_iterator(splits)

# Feature selection scored by F1 over those splits (per the helper's name,
# recursive feature elimination with CV -- confirm in model_utils).
best_features = model_utils.rfecv_feature_selection(
    clf, X, y, cv, scoring='f1', step=10, verbose=0
)
best_features

['B1_2015-2016',
 'B2_2015-2016',
 'B3_2015-2016',
 'B5_2015-2016',
 'B7_2015-2016',
 'B12_2015-2016',
 'savi_2015-2016',
 'mndwi_2015-2016',
 'nbai_2015-2016',
 'B2_2017-2018',
 'B3_2017-2018',
 'B6_2017-2018',
 'B7_2017-2018',
 'B10_2017-2018',
 'B11_2017-2018',
 'B12_2017-2018',
 'savi_2017-2018',
 'mndwi_2017-2018',
 'ui_2017-2018',
 'nbi_2017-2018',
 'mbi_2017-2018',
 'B1_2019-2020',
 'B2_2019-2020',
 'B3_2019-2020',
 'B5_2019-2020',
 'B6_2019-2020',
 'B7_2019-2020',
 'B8_2019-2020',
 'B10_2019-2020',
 'B11_2019-2020',
 'B12_2019-2020',
 'ndvi_2019-2020',
 'savi_2019-2020',
 'mndwi_2019-2020',
 'brba_2019-2020',
 'nbai_2019-2020',
 'mbi_2019-2020']

## Hyperparameter Tuning

In [8]:
# Min-max scaling followed by the classifier; the grid below tunes only the
# regularisation strength C of the 'classifier' step.
pipe_clf = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', clf)
])
param_grid = {'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100, 1000]}

# The search object gets its own name. Rebinding `cv` (the cross-validation
# splits) to the GridSearchCV instance made this cell fail when run a second
# time, because the fitted search would then be passed back in as `cv=`.
grid_search = GridSearchCV(
    estimator=pipe_clf,
    param_grid=param_grid,
    cv=cv,
    verbose=1,
    scoring='f1',
    n_jobs=-1
)
grid_search.fit(X[best_features], y)

# Pipeline refitted on all rows with the best C found by the search.
best_estimator = grid_search.best_estimator_
print(best_estimator)

Fitting 9 folds for each of 8 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.0min finished


Pipeline(memory=None,
         steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('classifier',
                 LogisticRegression(C=100, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)


## Train Model

In [9]:
# Dedicated names for the training matrix: rebinding `X` to the reduced
# frame would make a re-run of the feature-selection cell start from the
# already-selected columns instead of the full feature set.
X_train = data[best_features].fillna(0)  # missing values are replaced by 0
y_train = data[label]

# Trailing ';' suppresses the estimator repr in the cell output.
best_estimator.fit(X_train, y_train);

## Prediction: Sliding Window Approach

In [10]:
# Areas to generate prediction rasters for.
areas = [
    'soacha', 'bogota', 'uribia',
    'riohacha', 'maicao', 'cucuta',
    'arauca', 'arauquita', 'tibu',
]

area_dict = geoutils.get_filepaths(
    areas, images_dir, indices_dir, pos_mask_dir, neg_mask_dir
)
# The recorded progress bars show 81 windows per area, which is consistent
# with a 9 x 9 grid -- confirm in geoutils.get_preds_windowing.
grid_blocks = 9

# Arguments that are the same for every area.
shared_kwargs = dict(
    area_dict=area_dict,
    model=best_estimator,
    tmp_dir=tmp_dir,
    best_features=best_features,
    grid_blocks=grid_blocks,
    threshold=0,
)

# One output raster per area, named <version>_<area>.tif inside output_dir.
for area in areas:
    output = output_dir + '{}_{}.tif'.format(version, area)
    geoutils.get_preds_windowing(area=area, output=output, **shared_kwargs)

100%|██████████| 81/81 [02:04<00:00,  1.53s/it]
100%|██████████| 81/81 [13:14<00:00,  9.81s/it]
100%|██████████| 81/81 [08:17<00:00,  6.14s/it]
100%|██████████| 81/81 [05:47<00:00,  4.29s/it]
100%|██████████| 81/81 [13:17<00:00,  9.84s/it]
100%|██████████| 81/81 [15:03<00:00, 11.15s/it]
100%|██████████| 81/81 [14:52<00:00, 11.02s/it]


In [11]:
#for area in areas:
#    filename = output_dir + '{}_{}.tif'.format(version, area)
#    !gsutil -q cp {filename} gs://immap-output/