# Model Experimentation: Hyperparameter Optimization
This notebook conducts experiments on hyperparameter optimization.

## Imports and Setup

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

SEED = 42

%load_ext autoreload
%autoreload 2

## File Location

In [2]:
data_dir = "../data/"
input_file = data_dir + '20200409_dataset.csv'

pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'
sentinel_dir = data_dir + 'sentinel2/'
false_pos_dir = data_dir + 'false_pos/'

areas = ['maicao', 'riohacha', 'uribia']

## Load Dataset

In [3]:
data = pd.read_csv(input_file).reset_index(drop=True)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (690163, 112)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.1597,0.13735,0.1531,0.187,0.209,0.2632,0.30515,0.26965,0.3327,0.0411,...,0.187614,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,3,0
1,0.1597,0.13905,0.1454,0.17845,0.209,0.2632,0.30515,0.26395,0.3327,0.0411,...,0.177058,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,3,0
2,0.16675,0.14875,0.1589,0.18605,0.2258,0.27945,0.3207,0.28085,0.3452,0.0416,...,0.179191,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,3,0


## Resample Dataset

In [4]:
la_guajira = data['area'].apply(lambda x: model_utils.get_la_guajira(x)) 
data.insert(len(data.columns)-2, 'la_guajira', la_guajira)

data = model_utils.resample(data, num_neg_samples=50000, false_pos_samples=10000, random_state=SEED)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (223439, 113)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.1809,0.1779,0.1972,0.2371,0.2427,0.3009,0.3346,0.2913,0.3655,0.04435,...,-0.388713,0.04039,0.386829,0.574649,-0.60326,-0.010778,0.81093,1,4,0
1,0.1684,0.1369,0.1425,0.1458,0.1686,0.2696,0.30675,0.2652,0.3501,0.0381,...,-0.602441,0.098756,0.199285,0.318486,-0.752879,-0.057824,0.809911,1,4,0
2,0.1635,0.13655,0.1327,0.1346,0.1311,0.1754,0.2049,0.21,0.2202,0.0356,...,-0.506141,0.128252,0.180842,0.403433,-0.754468,-0.048452,0.950226,1,4,0


## Machine Learning Pipeline

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
SEED = 42

## Define Features and Target Label

In [6]:
# Get target and feature columns
label = 'target'
features = [column for column in data.columns[:-2]]

# Convert target labels to binary
data[label] = data[label].replace({2:0, 3:0, 4:0})

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data['target'].value_counts()))

Data dimensions: (223439, 113)
Class distribution:
0    203792
1     19647
Name: target, dtype: int64


## Logistic Regression

### Feature Selection: Recursive Feature Eleimination CV

In [7]:
lr = LogisticRegression()
lr_best_features = model_utils.rfecv_feature_selection(
    lr, data, features, label, scoring='f1', step=10, verbose=0
)
print(lr_best_features)

['B3_2016', 'B4_2016', 'B7_2016', 'B10_2016', 'B11_2016', 'B12_2016', 'ui_2016', 'nbi_2016', 'nbai_2016', 'mbi_2016', 'baei_2016', 'B1_2017', 'B2_2017', 'B5_2017', 'B6_2017', 'B9_2017', 'B11_2017', 'B12_2017', 'mndwi_2017', 'ui_2017', 'mbi_2017', 'B2_2018', 'B7_2018', 'ndbi_2018', 'savi_2018', 'ui_2018', 'nbi_2018', 'mbi_2018', 'baei_2018', 'B1_2019', 'B2_2019', 'B5_2019', 'B6_2019', 'B7_2019', 'B9_2019', 'B10_2019', 'B12_2019', 'savi_2019', 'mndwi_2019', 'nbi_2019', 'nbai_2019', 'mbi_2019', 'B1_2020', 'B2_2020', 'B5_2020', 'B6_2020', 'B7_2020', 'B8_2020', 'B9_2020', 'B10_2020', 'B11_2020', 'B12_2020', 'ndbi_2020', 'savi_2020', 'mndwi_2020', 'ui_2020', 'brba_2020', 'nbai_2020', 'mbi_2020', 'baei_2020', 'la_guajira']


### [Baseline] Logistic Regression Results Sans Hyperparamater Optimization

In [8]:
lr = LogisticRegression()
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[46976  3024]
 [  361   349]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.97     50000
           1       0.10      0.49      0.17       710

    accuracy                           0.93     50710
   macro avg       0.55      0.72      0.57     50710
weighted avg       0.98      0.93      0.95     50710

MAICAO Results: 
- F1 Score: 0.1710
- Kappa Statistics: 0.1513
- Precision: 0.1035
- Recall: 0.4915
- Accuracy: 0.9332

Test set: RIOHACHA
[[37323  1125]
 [ 2406  1095]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     38448
           1       0.49      0.31      0.38      3501

    accuracy                           0.92     41949
   macro avg       0.72      0.64      0.67     41949
weighted avg       0.90      0.92      0.91     41949

RIOHACHA Results: 
- F1 Score: 0.3828
- Kappa Statistics: 0.3401
- Precision: 0.4932
- Recall: 0.3128
- Accuracy: 0.9158

Test 

### Grid Search Cross Validation

In [9]:
param_grid = {
    'classifier__penalty': ['l2'],
    'classifier__C':[0.001, 0.01, 1.0, 5.0, 10]
}
lr_grid_search = model_utils.hyperparameter_optimization(
    data, lr_best_features, label, lr, param_grid, scoring='f1', verbose=2
)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  25 | elapsed:   28.8s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   34.3s finished


Best Paramaters: {'classifier__C': 10, 'classifier__penalty': 'l2'}


### Logistic Regression Results with Hyperparameter Optimization

In [10]:
lr = LogisticRegression(C=10)
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[46331  3669]
 [  296   414]]
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     50000
           1       0.10      0.58      0.17       710

    accuracy                           0.92     50710
   macro avg       0.55      0.75      0.57     50710
weighted avg       0.98      0.92      0.95     50710

MAICAO Results: 
- F1 Score: 0.1728
- Kappa Statistics: 0.1525
- Precision: 0.1014
- Recall: 0.5831
- Accuracy: 0.9218

Test set: RIOHACHA
[[37783   665]
 [ 2241  1260]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     38448
           1       0.65      0.36      0.46      3501

    accuracy                           0.93     41949
   macro avg       0.80      0.67      0.71     41949
weighted avg       0.92      0.93      0.92     41949

RIOHACHA Results: 
- F1 Score: 0.4644
- Kappa Statistics: 0.4307
- Precision: 0.6545
- Recall: 0.3599
- Accuracy: 0.9307

Test 

## LinearSVC

In [11]:
lsvc = LinearSVC(random_state=SEED)
lsvc_best_features = model_utils.rfecv_feature_selection(
    lr, data, features, label, scoring='f1', step=10, verbose=0
)
print(lsvc_best_features)

['B10_2016', 'B1_2017', 'B5_2017', 'B10_2017', 'B12_2017', 'B7_2019', 'B9_2019', 'B1_2020', 'B2_2020', 'B6_2020', 'mbi_2020']


### [Baseline] LinearSVC Results Sans Hyperparamater Optimization

In [12]:
lsvc = LinearSVC(random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[48895  1105]
 [  560   150]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     50000
           1       0.12      0.21      0.15       710

    accuracy                           0.97     50710
   macro avg       0.55      0.59      0.57     50710
weighted avg       0.98      0.97      0.97     50710

MAICAO Results: 
- F1 Score: 0.1527
- Kappa Statistics: 0.1372
- Precision: 0.1195
- Recall: 0.2113
- Accuracy: 0.9672

Test set: RIOHACHA
[[35596  2852]
 [ 1682  1819]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     38448
           1       0.39      0.52      0.45      3501

    accuracy                           0.89     41949
   macro avg       0.67      0.72      0.69     41949
weighted avg       0.91      0.89      0.90     41949

RIOHACHA Results: 
- F1 Score: 0.4452
- Kappa Statistics: 0.3867
- Precision: 0.3894
- Recall: 0.5196
- Accuracy: 0.8919

Test 

### Grid Search Cross Validation
Reference: https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29

In [13]:
lsvc = LinearSVC(random_state=SEED)
param_grid = {'classifier__C': [0.001, 0.1, 1, 10, 100, 1000]}
lsvc_grid_search = model_utils.hyperparameter_optimization(
    data, lsvc_best_features, label, lsvc, param_grid, scoring='f1', verbose=2
)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.9min finished


Best Paramaters: {'classifier__C': 10}


### LinearSVC Results with Hyperparameter Optimization

In [14]:
lsvc = LinearSVC(C=1000, random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[48523  1477]
 [  544   166]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     50000
           1       0.10      0.23      0.14       710

    accuracy                           0.96     50710
   macro avg       0.54      0.60      0.56     50710
weighted avg       0.98      0.96      0.97     50710

MAICAO Results: 
- F1 Score: 0.1411
- Kappa Statistics: 0.1240
- Precision: 0.1010
- Recall: 0.2338
- Accuracy: 0.9601

Test set: RIOHACHA
[[33570  4878]
 [  955  2546]]
              precision    recall  f1-score   support

           0       0.97      0.87      0.92     38448
           1       0.34      0.73      0.47      3501

    accuracy                           0.86     41949
   macro avg       0.66      0.80      0.69     41949
weighted avg       0.92      0.86      0.88     41949

RIOHACHA Results: 
- F1 Score: 0.4661
- Kappa Statistics: 0.3978
- Precision: 0.3429
- Recall: 0.7272
- Accuracy: 0.8610

Test 