# Model Prediction
This notebook trains the final model and generates predictions using a sliding-window approach.

## Imports and Setup

In [None]:
# Standard library and core data-science stack.
import os
import pandas as pd
import numpy as np

# Plotting libraries; seaborn's "whitegrid" style is applied globally.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils first on the import path so the project helper modules
# (model_utils, geoutils) can be imported by bare name.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Keep notebook output quiet: the root logger only emits ERROR and above,
# and every Python warning is suppressed.
# NOTE(review): the blanket warnings filter also hides deprecation and
# convergence warnings — consider narrowing it while debugging.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as random_state further down in the notebook.
SEED = 42

# Re-import edited modules automatically before each cell executes, so
# changes to the helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## File Location

In [None]:
# Root folders; every other path in this cell is derived from them.
data_dir = "../data/"
output_dir = "../outputs/"
input_file = data_dir + '20200422_dataset.csv'

# Sub-folders of the data directory that are handed to geoutils later on.
images_dir = data_dir + 'images/'
indices_dir = data_dir + 'indices/'
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
tmp_dir = data_dir + 'tmp/'

# exist_ok=True keeps the cell idempotent on re-runs without the racy
# "check, then create" pattern, and raises if the path exists but is a file.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(tmp_dir, exist_ok=True)

# Default area list; later cells re-assign `areas` for their own needs.
areas = ['maicao']

## Load Dataset

In [None]:
# Read the dataset CSV, then renumber the rows 0..n-1 (old index discarded).
data = pd.read_csv(input_file)
data = data.reset_index(drop=True)

# Report the frame's shape and preview the first three rows.
print('Data dimensions: {}'.format(data.shape))
data.head(3)

## Resample Dataset

In [None]:
# Proportions handed to model_utils.resample as `neg_dist`.
# NOTE(review): presumably the share of each negative class among the
# 50,000 sampled negatives — confirm against model_utils.resample.
neg_dist = {
    'Formal settlement': 2 / 5,
    'Unoccupied land': 3 / 5,
}

# Overwrites `data` with the resampled frame; seeded for reproducibility.
data = model_utils.resample(
    data,
    num_neg_samples=50000,
    neg_dist=neg_dist,
    random_state=SEED,
)

print('Data dimensions: {}'.format(data.shape))
data.head(3)

## Machine Learning Pipeline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
SEED = 42

## Define Features and Target Label

In [None]:
label = 'target'

# Every column except the last two is treated as a feature.
# NOTE(review): assumes the final two columns are the non-feature ones
# (e.g. the label and 'area') — confirm against the dataset schema.
features = list(data.columns[:-2])

# Collapse label values 2 and 3 into 0 (other values are left unchanged).
data[label] = data[label].replace({2: 0, 3: 0})

splits = data[['area']]
X = data[features]
y = data[label]

print('Data dimensions: {}'.format(data.shape))
# Use `label` rather than a second hard-coded 'target' so the two cannot drift.
print('Class distribution:\n{}'.format(data[label].value_counts()))

## Define Best Feature Set

In [None]:
# Base estimator used for recursive feature elimination with cross-validation.
# NOTE(review): the original cell left `clf = ` unassigned (a SyntaxError).
# LogisticRegression is one of the two estimators this notebook imports
# (the other is LinearSVC) — confirm which classifier was intended.
clf = LogisticRegression(random_state=SEED)
cv = model_utils.get_cv_iterator()

# `X_train` / `y_train` are never defined in this notebook; the only feature
# matrix and target available are `X` and `y` from the previous section.
# Missing values are filled with 0, matching what the "Train Model" cell does
# before fitting.
best_features = model_utils.rfecv_feature_selection(
    clf, X.fillna(0), y, cv, scoring='f1', step=10, verbose=0
)
best_features

## Hyperparameter Tuning

In [None]:
# Min-max scale the features, then apply the classifier, as one estimator so
# the scaler is re-fit inside every cross-validation fold.
pipe_clf = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', clf)
])

# NOTE(review): the notebook never defined `param_grid`; this search space
# over the regularisation strength C is a placeholder (C exists on both
# LogisticRegression and LinearSVC) — replace with the intended grid.
param_grid = {'classifier__C': [0.01, 0.1, 1.0, 10.0]}

# `inner_cv` and `verbose` were undefined names. A fresh iterator is requested
# from model_utils (same call as the feature-selection cell) instead of
# reusing `cv`, and the search object gets its own name so it no longer
# overwrites that iterator.
grid_search = GridSearchCV(
    estimator=pipe_clf,
    param_grid=param_grid,
    cv=model_utils.get_cv_iterator(),
    verbose=0,
    scoring='f1',
    n_jobs=-1
)

# `y_train` was never defined; fit on `y`. NaNs are filled with 0 to match the
# preprocessing applied in the "Train Model" cell.
grid_search.fit(X[best_features].fillna(0), y)
best_estimator = grid_search.best_estimator_
print(best_estimator)

## Train Model

In [None]:
# Refit the tuned pipeline on all rows, restricted to the selected features.
y = data[label]
X = data[best_features].fillna(0)  # missing feature values become 0

# Trailing semicolon suppresses the estimator repr in the cell output.
best_estimator.fit(X, y);

## Prediction: Sliding Window Approach

In [None]:
import geopandas as gpd

areas = ['maicao', 'riohacha', 'uribia', 'arauca', 'cucuta', 'arauquita', 'tibu']
area_dict = geoutils.get_filepaths(
    areas, images_dir, indices_dir, pos_mask_dir, neg_mask_dir
)

# Load the positive-mask GeoPackage of every area and stack them into a
# single GeoDataFrame with a fresh index.
gdfs = [
    gpd.read_file(area_dict[name]['pos_mask_gpkg'])
    for name in area_dict
]
stacked = pd.concat(gdfs, ignore_index=True)
rdf = gpd.GeoDataFrame(stacked)
#rdf.to_file('positive_training_examples.gpkg', driver='GPKG')

# Total number of rows across all loaded positive masks.
print(len(rdf))

In [None]:
areas = ['villadelrosario', 'soacha', 'arauquita', 'maicao', 'uribia']
area_dict = geoutils.get_filepaths(areas, images_dir, indices_dir, pos_mask_dir, neg_mask_dir)
grid_blocks = 9

# `scaler` was never defined anywhere in the notebook (NameError). The tuned
# estimator is a Pipeline with 'scaler' and 'classifier' steps, so the fitted
# steps are pulled out and passed separately; handing over the whole pipeline
# together with its scaler would scale the features twice.
# NOTE(review): assumes get_preds_windowing applies `scaler.transform` itself
# before calling the model — confirm against geoutils.
scaler = best_estimator.named_steps['scaler']
classifier = best_estimator.named_steps['classifier']

# Only 'villadelrosario' is predicted here; the other entries in `areas` are
# needed for the file-path lookup but are not run by this loop.
for area in ['villadelrosario']:
    output = output_dir + '20200422_{}.tif'.format(area)
    geoutils.get_preds_windowing(
        area=area,
        area_dict=area_dict,
        model=classifier,
        tmp_dir=tmp_dir,
        best_features=best_features,
        scaler=scaler,
        output=output,
        grid_blocks=grid_blocks,
        threshold=0
    )

In [None]:
areas = ['villadelrosario', 'soacha', 'arauquita', 'maicao', 'uribia']
for area in areas:
    filename = output_dir + '20200422_{}.tiff'.format(area)
    !gsutil -q cp {filename} gs://immap-output/