# Model Experimentation: Hyperparameter Optimization
This notebook conducts experiments on hyperparameter optimization.

## Imports and Setup

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

SEED = 42

%load_ext autoreload
%autoreload 2

## File Location

In [2]:
data_dir = "../data/"
input_file = data_dir + '20200326_dataset.csv'

pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
sentinel_dir = data_dir + 'sentinel2/'

areas = ['maicao', 'riohacha', 'uribia']

## Load Dataset

In [3]:
data = pd.read_csv(input_file).reset_index(drop=True)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (334524, 112)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.1597,0.13735,0.1531,0.187,0.209,0.2632,0.30515,0.26965,0.3327,0.0411,...,0.187614,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,3,0
1,0.1597,0.13905,0.1454,0.17845,0.209,0.2632,0.30515,0.26395,0.3327,0.0411,...,0.177058,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,3,0
2,0.16675,0.14875,0.1589,0.18605,0.2258,0.27945,0.3207,0.28085,0.3452,0.0416,...,0.179191,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,3,0


## Resample Dataset
Resamples 30,000 negative examples per area.

In [4]:
data = model_utils.resample(data, num_neg_samples=30000, random_state=SEED)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (104556, 112)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.2262,0.183,0.179,0.1831,0.1886,0.2509,0.2981,0.2653,0.3291,0.0431,...,0.231549,-0.59423,0.211756,0.102407,0.3214,-0.798628,-0.067515,1.196927,3,0
1,0.14955,0.12935,0.127,0.1223,0.13935,0.2424,0.2846,0.2662,0.3236,0.0388,...,0.219911,-0.529955,0.153344,0.208832,0.384915,-0.705785,-0.057656,0.87143,3,0
2,0.1686,0.1597,0.1612,0.1616,0.1855,0.3246,0.3985,0.3127,0.4143,0.0454,...,0.205571,-0.53619,0.136678,0.237628,0.413957,-0.70611,-0.052031,0.864904,3,0


## Machine Learning Pipeline

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
SEED = 42

## Define Features and Target Label

In [6]:
# Get target and feature columns
label = 'target'
features = [column for column in data.columns[:-2] if 'mndwi' not in column]

# Convert target labels to binary
data[label] = data[label].replace({2:0, 3:0})

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data['target'].value_counts()))

Data dimensions: (104556, 112)
Class distribution:
0    90000
1    14556
Name: target, dtype: int64


## Define Best Feature Set
Based on experiments run in `notebooks/03_Feature_Selection.ipynb`.

## LinearSVC

In [72]:
lsvc_best_features = [
    'B2_2016', 'B11_2016', 'B12_2016', 'nbi_2016', 'mbi_2016', 'B1_2017', 'B5_2017', 
    'B11_2017', 'ui_2017', 'B10_2018', 'ui_2018', 'nbai_2018', 'mbi_2018', 'B2_2019', 
    'B6_2019', 'B7_2019', 'B9_2019', 'B10_2019', 'B12_2019', 'ndbi_2019', 'savi_2019', 
    'nbi_2019', 'brba_2019', 'mbi_2019', 'B1_2020', 'B4_2020', 'B6_2020', 'B9_2020', 
    'B12_2020', 'savi_2020'
]

### [Baseline] LinearSVC Results Sans Hyperparamater Optimization

In [73]:
lsvc = LinearSVC(random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[29698   302]
 [  151   559]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     30000
           1       0.65      0.79      0.71       710

    accuracy                           0.99     30710
   macro avg       0.82      0.89      0.85     30710
weighted avg       0.99      0.99      0.99     30710

MAICAO Results: 
- F1 Score: 0.7116
- Kappa Statistics: 0.7042
- Precision: 0.6492
- Recall: 0.7873
- Accuracy: 0.9852

Test set: RIOHACHA
[[29843   157]
 [  737  2764]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     30000
           1       0.95      0.79      0.86      3501

    accuracy                           0.97     33501
   macro avg       0.96      0.89      0.92     33501
weighted avg       0.97      0.97      0.97     33501

RIOHACHA Results: 
- F1 Score: 0.8608
- Kappa Statistics: 0.8462
- Precision: 0.9463
- Recall: 0.7895
- Accuracy: 0.9733

Test 

### Grid Search Cross Validation
Reference: https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29

In [74]:
lsvc = LinearSVC(random_state=SEED)
param_grid = {'classifier__C': [0.001, 0.1, 1, 10, 100, 1000]}
lsvc_grid_search = model_utils.hyperparameter_optimization(
    data, lsvc_best_features, label, lsvc, param_grid, scoring='f1', verbose=2
)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:    6.7s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   13.2s finished


Best Paramaters: {'classifier__C': 100}


### LinearSVC Results with Hyperparameter Optimization

In [75]:
lsvc = LinearSVC(C=100, random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[29741   259]
 [  173   537]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     30000
           1       0.67      0.76      0.71       710

    accuracy                           0.99     30710
   macro avg       0.83      0.87      0.85     30710
weighted avg       0.99      0.99      0.99     30710

MAICAO Results: 
- F1 Score: 0.7131
- Kappa Statistics: 0.7060
- Precision: 0.6746
- Recall: 0.7563
- Accuracy: 0.9859

Test set: RIOHACHA
[[29958    42]
 [  576  2925]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30000
           1       0.99      0.84      0.90      3501

    accuracy                           0.98     33501
   macro avg       0.98      0.92      0.95     33501
weighted avg       0.98      0.98      0.98     33501

RIOHACHA Results: 
- F1 Score: 0.9045
- Kappa Statistics: 0.8943
- Precision: 0.9858
- Recall: 0.8355
- Accuracy: 0.9816

Test 

## Logistic Regression

In [71]:
lr_best_features = [
    'nbi_2016', 'B1_2017', 'B2_2018', 'B12_2018', 'B1_2019', 'B10_2019', 
    'B12_2019', 'savi_2019', 'B1_2020', 'B12_2020'
]

### [Baseline] Logistic Regression Results Sans Hyperparamater Optimization

In [60]:
lr = LogisticRegression()
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[29528   472]
 [  158   552]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     30000
           1       0.54      0.78      0.64       710

    accuracy                           0.98     30710
   macro avg       0.77      0.88      0.81     30710
weighted avg       0.98      0.98      0.98     30710

MAICAO Results: 
- F1 Score: 0.6367
- Kappa Statistics: 0.6265
- Precision: 0.5391
- Recall: 0.7775
- Accuracy: 0.9795

Test set: RIOHACHA
[[29847   153]
 [  679  2822]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     30000
           1       0.95      0.81      0.87      3501

    accuracy                           0.98     33501
   macro avg       0.96      0.90      0.93     33501
weighted avg       0.97      0.98      0.97     33501

RIOHACHA Results: 
- F1 Score: 0.8715
- Kappa Statistics: 0.8579
- Precision: 0.9486
- Recall: 0.8061
- Accuracy: 0.9752

Test 

### Grid Search Cross Validation

In [69]:
param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C':[0.001, 0.009, 0.01, 0.09, 1.0, 5.0, 10, 25]
}
lr_grid_search = model_utils.hyperparameter_optimization(
    data, lr_best_features, label, lr, param_grid, scoring='f1', verbose=2
)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   24.6s finished


Best Paramaters: {'classifier__C': 1.0, 'classifier__penalty': 'l2'}


In [70]:
lr = LogisticRegression(C=1.0, penalty='l2')
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[29528   472]
 [  158   552]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     30000
           1       0.54      0.78      0.64       710

    accuracy                           0.98     30710
   macro avg       0.77      0.88      0.81     30710
weighted avg       0.98      0.98      0.98     30710

MAICAO Results: 
- F1 Score: 0.6367
- Kappa Statistics: 0.6265
- Precision: 0.5391
- Recall: 0.7775
- Accuracy: 0.9795

Test set: RIOHACHA
[[29847   153]
 [  679  2822]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     30000
           1       0.95      0.81      0.87      3501

    accuracy                           0.98     33501
   macro avg       0.96      0.90      0.93     33501
weighted avg       0.97      0.98      0.97     33501

RIOHACHA Results: 
- F1 Score: 0.8715
- Kappa Statistics: 0.8579
- Precision: 0.9486
- Recall: 0.8061
- Accuracy: 0.9752

Test 