# Model Prediction
This notebook trains a logistic regression model and generates predictions over each area using a sliding-window approach.

## Imports and Setup

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

import logging
import warnings
# Keep the notebook output quiet: only ERROR-level (and above) records pass the
# root logger, and every Python warning is suppressed.
# NOTE(review): this also hides warnings emitted by libraries during model
# fitting — lift the filter temporarily when debugging.
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed to the resampling step (the same value is assigned again in the
# "Machine Learning Pipeline" cell).
SEED = 42

# Reload imported modules before each cell execution so edits to the local
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Location

In [2]:
# Root folders for inputs and generated artefacts (relative to this notebook).
data_dir = "../data/"
output_dir = "../outputs/"

# Tabular training dataset read in the "Load Dataset" section.
input_file = data_dir + '20200414_dataset.csv'

# Sub-folders of the data directory handed to the geoutils helpers further down.
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
sentinel_dir = data_dir + 'sentinel2/'
indices_dir = data_dir + 'indices/'
tmp_dir = data_dir + 'tmp/'

# Create the output folder if needed. exist_ok=True makes this safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Areas for which prediction rasters are generated.
areas = ['villadelrosario', 'cucuta', 'arauca']

## Load Dataset

In [3]:
# Read the CSV dataset and make sure the frame carries a clean 0..n-1 index.
data = (
    pd.read_csv(input_file)
    .reset_index(drop=True)
)

# Report the size of the loaded frame, then preview the first rows.
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (698597, 113)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.1597,0.13735,0.1531,0.187,0.209,0.2632,0.30515,0.26965,0.3327,0.0411,...,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,1,3,0
1,0.1597,0.13905,0.1454,0.17845,0.209,0.2632,0.30515,0.26395,0.3327,0.0411,...,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,1,3,0
2,0.16675,0.14875,0.1589,0.18605,0.2258,0.27945,0.3207,0.28085,0.3452,0.0416,...,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,1,3,0


## Resample Dataset

In [4]:
# Proportions handed to model_utils.resample for the two negative categories.
neg_dist = {
    'Formal settlement': 0.4,
    'Unoccupied land': 0.6,
}

# NOTE: `data` is rebound to the resampled frame, so re-running this cell
# without re-running "Load Dataset" resamples the already-resampled data.
data = model_utils.resample(
    data,
    num_neg_samples=30000,
    neg_dist=neg_dist,
    random_state=SEED,
)

# Report the size after resampling, then preview the first rows.
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (170433, 113)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.16795,0.133,0.1379,0.1355,0.1571,0.2686,0.3198,0.2773,0.3579,0.0402,...,-0.495702,0.213802,0.122766,0.386654,-0.762164,-0.097429,1.005492,1,2,0
1,0.1933,0.2011,0.2039,0.2299,0.227,0.268,0.3094,0.2723,0.3335,0.0401,...,-0.417789,0.102637,0.211351,0.520237,-0.748431,-0.021367,1.030898,1,2,0
2,0.1929,0.1867,0.181,0.1958,0.1889,0.2415,0.2695,0.243,0.2892,0.0272,...,-0.188068,0.079984,0.222447,0.746002,-0.641025,-0.030201,1.056083,1,2,0


## Machine Learning Pipeline

In [5]:
# scikit-learn building blocks for the modelling section.
# NOTE(review): LinearSVC and Pipeline are not referenced in the visible cells
# of this notebook — confirm whether they are still needed.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
# Same value as the SEED assigned in the "Imports and Setup" cell; repeated here.
SEED = 42

## Define Features and Target Label

In [6]:
# Name of the label column, plus every column except the last two as candidate
# features.
# NOTE(review): per the preview above, the frame ends with
# 'la_guajira', 'target', 'area' and starts with 'Unnamed: 0', so this slice
# keeps both 'Unnamed: 0' and 'la_guajira' — confirm that is intended.
label = 'target'
features = list(data.columns[:-2])

# Collapse the label to binary: classes 2 and 3 are mapped to 0, all other
# values are left untouched.
binary_mapping = {2:0, 3:0}
data[label] = data[label].replace(binary_mapping)

# Summarise the frame size and the resulting class balance.
print('Data dimensions: {}'.format(data.shape))
class_counts = data['target'].value_counts()
print('Class distribution:\n{}'.format(class_counts))

Data dimensions: (170433, 113)
Class distribution:
0    150000
1     20433
Name: target, dtype: int64


## Define Best Feature Set

In [7]:
# Feature columns used to train the logistic regression model and passed to the
# sliding-window prediction step as `best_features`.
# NOTE(review): presumably the outcome of an earlier feature-selection run that
# is not part of this notebook — confirm the source before editing the list.
lr_best_features = ['B1_2016', 'B3_2016', 'B5_2016', 'B6_2016', 'B7_2016', 'B9_2016', 'B10_2016', 'B11_2016', 'B12_2016', 'ndvi_2016', 'ndbi_2016', 'savi_2016', 'ui_2016', 'nbi_2016', 'brba_2016', 'nbai_2016', 'mbi_2016', 'B1_2017', 'B2_2017', 'B3_2017', 'B8_2017', 'B9_2017', 'B10_2017', 'B11_2017', 'B12_2017', 'mndwi_2017', 'ui_2017', 'nbi_2017', 'brba_2017', 'mbi_2017', 'baei_2017', 'B2_2018', 'B3_2018', 'B4_2018', 'B5_2018', 'B6_2018', 'B7_2018', 'B8_2018', 'B10_2018', 'B11_2018', 'B12_2018', 'ndvi_2018', 'ndbi_2018', 'savi_2018', 'mndwi_2018', 'ui_2018', 'nbi_2018', 'brba_2018', 'nbai_2018', 'mbi_2018', 'baei_2018', 'B1_2019', 'B2_2019', 'B3_2019', 'B5_2019', 'B6_2019', 'B7_2019', 'B8_2019', 'B9_2019', 'B10_2019', 'B11_2019', 'B12_2019', 'ndvi_2019', 'ndbi_2019', 'savi_2019', 'mndwi_2019', 'ui_2019', 'brba_2019', 'nbai_2019', 'mbi_2019', 'B1_2020', 'B2_2020', 'B3_2020', 'B4_2020', 'B5_2020', 'B6_2020', 'B7_2020', 'B8_2020', 'B9_2020', 'B10_2020', 'B11_2020', 'B12_2020', 'ndbi_2020', 'savi_2020', 'mndwi_2020', 'ui_2020', 'nbi_2020', 'brba_2020', 'nbai_2020', 'mbi_2020', 'baei_2020']

## Train Model

In [8]:
# Logistic regression classifier. The notebook-wide SEED is passed explicitly
# so that solvers which shuffle the data give the same model on every run.
lr = LogisticRegression(C=5.0, random_state=SEED)

# Training matrix restricted to the selected features; missing values become 0.
X = data[lr_best_features].fillna(0)
y = data[label]

# Fit the min-max scaler on the training features and rescale them in one step.
# The fitted `scaler` is reused by the prediction step below, so it must stay
# a separate object rather than being folded into the model.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# NOTE(review): warnings are globally suppressed in the setup cell, so a
# solver convergence warning would not be visible here.
# Trailing semicolon suppresses the estimator repr in the cell output.
lr.fit(X, y);

## Prediction: Sliding Window Approach

In [10]:
# Resolve the input file paths for every area from the Sentinel-2 and mask folders.
area_dict = geoutils.get_filepaths(areas, sentinel_dir, pos_mask_dir, neg_mask_dir)

# Grid setting forwarded to the windowing helper. The recorded progress bars
# count 81 steps per area — presumably a 9 x 9 grid; confirm in geoutils.
grid_blocks = 9

# Keyword arguments that are identical for every area.
# NOTE(review): the meaning of threshold=0 is defined by
# geoutils.get_preds_windowing — confirm against that helper.
shared_kwargs = dict(
    area_dict=area_dict,
    model=lr,
    tmp_dir=tmp_dir,
    best_features=lr_best_features,
    scaler=scaler,
    grid_blocks=grid_blocks,
    threshold=0,
)

# Produce one prediction raster per area in the output folder.
for area in areas:
    output = output_dir + '20200414_{}.tif'.format(area)
    geoutils.get_preds_windowing(area=area, output=output, **shared_kwargs)

  0%|          | 0/81 [00:00<?, ?it/s]

Reading villadelrosario...


100%|██████████| 81/81 [00:55<00:00,  1.45it/s]


Saving to ../outputs/20200414_villadelrosario.tif...
Stitching all rasters into one


  0%|          | 0/81 [00:00<?, ?it/s]

Reading cucuta...


100%|██████████| 81/81 [04:08<00:00,  3.07s/it]


Saving to ../outputs/20200414_cucuta.tif...
Stitching all rasters into one


  0%|          | 0/81 [00:00<?, ?it/s]

Reading arauca...


100%|██████████| 81/81 [22:38<00:00, 16.77s/it] 


Saving to ../outputs/20200414_arauca.tif...
Stitching all rasters into one


In [None]:
# Optional upload of the generated rasters to a Google Cloud Storage bucket
# (left disabled).
# NOTE(review): the filename prefix below is '20200331_', whereas the
# prediction cell writes '20200414_<area>.tif'. Align the prefix before
# re-enabling, otherwise the rasters written by this run are not the ones copied.
#for area in areas:
#    filename = output_dir + '20200331_{}.tif'.format(area)
#    !gsutil -q cp {filename} gs://immap-output/