# Model Experimentation: Hyperparameter Optimization
This notebook conducts experiments on hyperparameter optimization.

## Imports and Setup

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

SEED = 42

%load_ext autoreload
%autoreload 2

## File Location

In [2]:
data_dir = "../data/"
input_file = data_dir + '20200409_dataset.csv'

pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
sentinel_dir = data_dir + 'sentinel2/'
false_pos_dir = data_dir + 'false_pos/'

areas = ['maicao', 'riohacha', 'uribia']

## Load Dataset

In [3]:
data = pd.read_csv(input_file).reset_index(drop=True)
data = data[(data['area'] == 0) | (data['area'] == 1) | (data['area'] == 2)]
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (384534, 112)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.1597,0.13735,0.1531,0.187,0.209,0.2632,0.30515,0.26965,0.3327,0.0411,...,0.187614,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,3,0
1,0.1597,0.13905,0.1454,0.17845,0.209,0.2632,0.30515,0.26395,0.3327,0.0411,...,0.177058,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,3,0
2,0.16675,0.14875,0.1589,0.18605,0.2258,0.27945,0.3207,0.28085,0.3452,0.0416,...,0.179191,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,3,0


## Resample Dataset

In [4]:
data = model_utils.resample(data, num_neg_samples=30000, false_pos_samples=5000, random_state=SEED)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (104556, 112)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.1809,0.1779,0.1972,0.2371,0.2427,0.3009,0.3346,0.2913,0.3655,0.04435,...,0.114438,-0.388713,0.04039,0.386829,0.574649,-0.60326,-0.010778,0.81093,4,0
1,0.1684,0.1369,0.1425,0.1458,0.1686,0.2696,0.30675,0.2652,0.3501,0.0381,...,0.259128,-0.602441,0.098756,0.199285,0.318486,-0.752879,-0.057824,0.809911,4,0
2,0.1635,0.13655,0.1327,0.1346,0.1311,0.1754,0.2049,0.21,0.2202,0.0356,...,0.215928,-0.506141,0.128252,0.180842,0.403433,-0.754468,-0.048452,0.950226,4,0


## Machine Learning Pipeline

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
SEED = 42

## Define Features and Target Label

In [6]:
# Get target and feature columns
label = 'target'
features = [column for column in data.columns[:-2]]

# Convert target labels to binary
data[label] = data[label].replace({2:0, 3:0, 4:0})

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data['target'].value_counts()))

Data dimensions: (104556, 112)
Class distribution:
0    90000
1    14556
Name: target, dtype: int64


## Logistic Regression

### Feature Selection: Recursive Feature Elimination CV

In [7]:
lr = LogisticRegression()
lr_best_features = model_utils.rfecv_feature_selection(
    lr, data, features, label, scoring='f1', step=10, verbose=0
)
print(lr_best_features)

['B7_2016', 'B10_2016', 'ui_2016', 'nbi_2016', 'B1_2017', 'B2_2017', 'B5_2017', 'savi_2017', 'ui_2017', 'brba_2017', 'mbi_2017', 'B1_2018', 'B2_2018', 'B12_2018', 'ui_2018', 'B1_2019', 'B2_2019', 'B5_2019', 'B6_2019', 'B10_2019', 'B12_2019', 'savi_2019', 'mndwi_2019', 'nbi_2019', 'nbai_2019', 'mbi_2019', 'B1_2020', 'B2_2020', 'B5_2020', 'B8_2020', 'B9_2020', 'B11_2020', 'B12_2020', 'ndvi_2020', 'savi_2020', 'mndwi_2020', 'brba_2020', 'nbai_2020', 'mbi_2020', 'baei_2020']


### [Baseline] Logistic Regression Results Sans Hyperparamater Optimization

In [8]:
lr = LogisticRegression()
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[28248  1752]
 [  295   415]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.97     30000
           1       0.19      0.58      0.29       710

    accuracy                           0.93     30710
   macro avg       0.59      0.76      0.63     30710
weighted avg       0.97      0.93      0.95     30710

MAICAO Results: 
- F1 Score: 0.2885
- Kappa Statistics: 0.2628
- Precision: 0.1915
- Recall: 0.5845
- Accuracy: 0.9333

Test set: RIOHACHA
[[28861  1139]
 [ 1683  1818]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     30000
           1       0.61      0.52      0.56      3501

    accuracy                           0.92     33501
   macro avg       0.78      0.74      0.76     33501
weighted avg       0.91      0.92      0.91     33501

RIOHACHA Results: 
- F1 Score: 0.5630
- Kappa Statistics: 0.5168
- Precision: 0.6148
- Recall: 0.5193
- Accuracy: 0.9158

Test 

### Grid Search Cross Validation

In [9]:
param_grid = {
    'classifier__penalty': ['l2'],
    'classifier__C':[0.001, 0.01, 1.0, 5.0, 10, 100]
}
lr_grid_search = model_utils.hyperparameter_optimization(
    data, lr_best_features, label, lr, param_grid, scoring='f1', verbose=2
)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:    4.2s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    5.9s finished


Best Paramaters: {'classifier__C': 10, 'classifier__penalty': 'l2'}


### Logistic Regression Results with Hyperparameter Optimization

In [15]:
lr = LogisticRegression(C=10)
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[28115  1885]
 [  300   410]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.96     30000
           1       0.18      0.58      0.27       710

    accuracy                           0.93     30710
   macro avg       0.58      0.76      0.62     30710
weighted avg       0.97      0.93      0.95     30710

MAICAO Results: 
- F1 Score: 0.2729
- Kappa Statistics: 0.2463
- Precision: 0.1786
- Recall: 0.5775
- Accuracy: 0.9289

Test set: RIOHACHA
[[29321   679]
 [ 1419  2082]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     30000
           1       0.75      0.59      0.66      3501

    accuracy                           0.94     33501
   macro avg       0.85      0.79      0.82     33501
weighted avg       0.93      0.94      0.93     33501

RIOHACHA Results: 
- F1 Score: 0.6650
- Kappa Statistics: 0.6310
- Precision: 0.7541
- Recall: 0.5947
- Accuracy: 0.9374

Test 

## LinearSVC

In [11]:
lsvc = LinearSVC(random_state=SEED)
lsvc_best_features = model_utils.rfecv_feature_selection(
    lr, data, features, label, scoring='f1', step=10, verbose=0
)
print(lsvc_best_features)

['B1_2017', 'B2_2017', 'ui_2017', 'mbi_2017', 'B1_2019', 'B10_2019', 'mbi_2019', 'B6_2020', 'B11_2020', 'B12_2020']


### [Baseline] LinearSVC Results Sans Hyperparamater Optimization

In [12]:
lsvc = LinearSVC(random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[29065   935]
 [  456   254]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.98     30000
           1       0.21      0.36      0.27       710

    accuracy                           0.95     30710
   macro avg       0.60      0.66      0.62     30710
weighted avg       0.97      0.95      0.96     30710

MAICAO Results: 
- F1 Score: 0.2675
- Kappa Statistics: 0.2457
- Precision: 0.2136
- Recall: 0.3577
- Accuracy: 0.9547

Test set: RIOHACHA
[[29146   854]
 [ 1104  2397]]
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     30000
           1       0.74      0.68      0.71      3501

    accuracy                           0.94     33501
   macro avg       0.85      0.83      0.84     33501
weighted avg       0.94      0.94      0.94     33501

RIOHACHA Results: 
- F1 Score: 0.7100
- Kappa Statistics: 0.6776
- Precision: 0.7373
- Recall: 0.6847
- Accuracy: 0.9416

Test 

### Grid Search Cross Validation
Reference: https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29

In [13]:
lsvc = LinearSVC(random_state=SEED)
param_grid = {'classifier__C': [0.001, 0.1, 1, 10, 100, 1000]}
lsvc_grid_search = model_utils.hyperparameter_optimization(
    data, lsvc_best_features, label, lsvc, param_grid, scoring='f1', verbose=2
)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:   10.7s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   13.2s finished


Best Paramaters: {'classifier__C': 10}


### LinearSVC Results with Hyperparameter Optimization

In [14]:
lsvc = LinearSVC(C=100, random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[28019  1981]
 [  359   351]]
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     30000
           1       0.15      0.49      0.23       710

    accuracy                           0.92     30710
   macro avg       0.57      0.71      0.60     30710
weighted avg       0.97      0.92      0.94     30710

MAICAO Results: 
- F1 Score: 0.2308
- Kappa Statistics: 0.2025
- Precision: 0.1505
- Recall: 0.4944
- Accuracy: 0.9238

Test set: RIOHACHA
[[29323   677]
 [ 1249  2252]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     30000
           1       0.77      0.64      0.70      3501

    accuracy                           0.94     33501
   macro avg       0.86      0.81      0.83     33501
weighted avg       0.94      0.94      0.94     33501

RIOHACHA Results: 
- F1 Score: 0.7005
- Kappa Statistics: 0.6689
- Precision: 0.7689
- Recall: 0.6432
- Accuracy: 0.9425

Test 