# Model Experimentation: Hyperparameter Optimization
This notebook conducts experiments on hyperparameter optimization.

## Imports and Setup

In [1]:
# Core numerical / data-handling stack.
import pandas as pd
import numpy as np
import scipy

# Plotting libraries; seaborn's "whitegrid" style is applied globally here.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils at the front of the module search path before importing the
# project helper modules (presumably they live in ../utils -- confirm).
# The insert must stay ahead of the two imports that follow it.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Keep cell output compact: the root logger only emits ERROR and above, and
# every Python warning is ignored.
# NOTE(review): the blanket "ignore" filter hides all warning categories,
# including any a library might raise about deprecated arguments or solvers
# that did not converge -- consider narrowing it when debugging.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as `random_state` to the resampling and model cells below.
SEED = 42

%load_ext autoreload
%autoreload 2

## File Location

In [2]:
# Identifier of the dataset snapshot to load (it prefixes the CSV file name).
version = '20200504'

# Root data folder and the versioned input table inside it.
data_dir = "../data/"
input_file = data_dir + version + '_dataset.csv'

# Sub-folders of the data directory, each path ending with a slash.
images_dir, indices_dir, pos_mask_dir, neg_mask_dir = (
    data_dir + folder + '/'
    for folder in ('images', 'indices', 'pos_masks', 'neg_masks')
)

## Load Dataset

In [3]:
%%time
# Read the versioned CSV and keep only the rows whose `area` code lies
# strictly between 2 and 7. The index is reset right after reading, i.e.
# before the filter, so the surviving rows keep their original positions
# as index labels.
data = (
    pd.read_csv(input_file)
    .reset_index(drop=True)
    .loc[lambda frame: (frame['area'] > 2) & (frame['area'] < 7)]
)
print('Data dimensions: {}'.format(data.shape))

# Show the first three rows as the cell output.
data.head(3)

Data dimensions: (409453, 69)
CPU times: user 13.5 s, sys: 842 ms, total: 14.4 s
Wall time: 13.6 s


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
398499,0.1602,0.14945,0.1421,0.1376,0.1659,0.1955,0.2259,0.20305,0.2428,0.0261,...,-0.225699,0.12061,0.26335,0.654699,-0.625412,-0.006081,0.987972,0,2,3
398500,0.1602,0.12885,0.1166,0.1181,0.11595,0.1645,0.18655,0.1884,0.22835,0.0261,...,-0.241379,0.160714,0.255061,0.618806,-0.614882,-0.015396,0.957207,0,2,3
398501,0.1602,0.11045,0.0985,0.0858,0.11595,0.1645,0.18655,0.1761,0.22835,0.0261,...,-0.414146,0.160714,0.170823,0.414435,-0.72166,-0.044105,0.945919,0,2,3


## Resample Dataset

In [4]:
# Swap the loaded table for its resampled version, seeded for repeatability.
# `num_neg_samples` presumably caps the negative rows drawn per area -- the
# 80000 negatives reported further down would match 20000 for each of four
# areas; confirm against model_utils.resample.
# NOTE: this cell rebinds `data`, so running it a second time resamples the
# already-resampled table. Re-run the load cell first.
data = model_utils.resample(
    data,
    num_neg_samples=20000,
    random_state=SEED,
)
print('Data dimensions: {}'.format(data.shape))

# Show the first three rows as the cell output.
data.head(3)

Data dimensions: (86291, 69)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.2169,0.30795,0.31485,0.31745,0.2474,0.2689,0.28655,0.3195,0.3073,0.0328,...,-0.173727,0.117432,0.321484,0.762909,-0.536715,-0.01648,0.923687,0,2,3
1,0.16875,0.13265,0.1336,0.1554,0.1393,0.21735,0.2452,0.2416,0.2808,0.0295,...,-0.346062,0.238212,0.163422,0.519504,-0.695356,-0.05276,1.065632,0,2,3
2,0.1592,0.13685,0.12515,0.118,0.13105,0.1546,0.1738,0.1544,0.17275,0.0264,...,-0.303948,0.17497,0.143576,0.527582,-0.729153,-0.037904,1.155219,0,2,3


## Machine Learning Pipeline

In [5]:
# Classifiers compared in the sections below, grouped by scikit-learn module.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# Same value as the SEED assigned in the setup cell; repeated here so the
# modelling cells still see it if this cell is run on its own.
SEED = 42

## Define Features and Target Label

In [6]:
# Name of the column holding the class label.
label = 'target'

# Every column except the label and the `area` grouping column is a model
# input. The columns are excluded by name instead of slicing off the last two
# positions, so a change in column order can never leak the label or the area
# code into the feature matrix. For the column order shown in the table above
# (..., target, area) the resulting list is the same as `data.columns[:-2]`.
features = [column for column in data.columns if column not in (label, 'area')]

# Collapse label values 2 and 3 into 0; the class distribution printed below
# then contains only 0 and 1.
data[label] = data[label].replace({2: 0, 3: 0})

# `splits` carries the area code of every row and is handed to
# model_utils.nested_spatial_cv by the model cells (presumably to build the
# per-area train/test folds -- confirm in model_utils).
splits = data[['area']]
X = data[features]
y = data[label]

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data[label].value_counts()))

Data dimensions: (86291, 69)
Class distribution:
0    80000
1     6291
Name: target, dtype: int64


## Logistic Regression

In [7]:
%%time
# Logistic regression with scikit-learn's default settings; only C is tuned.
lr = LogisticRegression()

# The `classifier__` prefix presumably targets a pipeline step named
# "classifier" built inside nested_spatial_cv -- confirm in model_utils.
param_grid = {
    'classifier__C': [0.001, 0.01, 1.0, 5.0, 10],
}

# The trailing semicolon keeps the returned object out of the cell output.
lr_results = model_utils.nested_spatial_cv(
    lr,
    X,
    y,
    splits=splits,
    param_grid=param_grid,
    verbose=2,
);

Test Set: Arauca
Predicted  False  True  __all__
Actual                         
False      12849  7151    20000
True         515  1783     2298
__all__    13364  8934    22298

               precision    recall  f1-score   support

           0       0.96      0.64      0.77     20000
           1       0.20      0.78      0.32      2298

    accuracy                           0.66     22298
   macro avg       0.58      0.71      0.54     22298
weighted avg       0.88      0.66      0.72     22298

F1 Score: 0.3175
Kappa Statistics: 0.1836
Precision: 0.1996
Recall: 0.7759
Accuracy: 0.6562
ROC AUC: 0.7092

Test Set: Cucuta
Predicted  False  True  __all__
Actual                         
False      19625   375    20000
True        1779   706     2485
__all__    21404  1081    22485

               precision    recall  f1-score   support

           0       0.92      0.98      0.95     20000
           1       0.65      0.28      0.40      2485

    accuracy                           0.9

## LinearSVC

In [10]:
%%time
# Linear support-vector classifier, seeded for repeatability; only C is tuned.
lsvc = LinearSVC(random_state=SEED)

# The `classifier__` prefix presumably targets a pipeline step named
# "classifier" built inside nested_spatial_cv -- confirm in model_utils.
param_grid = {
    'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100, 1000],
}

# The trailing semicolon keeps the returned object out of the cell output.
lsvc_results = model_utils.nested_spatial_cv(
    lsvc,
    X,
    y,
    splits=splits,
    param_grid=param_grid,
    verbose=2,
);

Test Set: Arauca
Predicted  False  True  __all__
Actual                         
False      19298   702    20000
True         905  1393     2298
__all__    20203  2095    22298

               precision    recall  f1-score   support

           0       0.96      0.96      0.96     20000
           1       0.66      0.61      0.63      2298

    accuracy                           0.93     22298
   macro avg       0.81      0.79      0.80     22298
weighted avg       0.93      0.93      0.93     22298

F1 Score: 0.6342
Kappa Statistics: 0.5943
Precision: 0.6649
Recall: 0.6062
Accuracy: 0.9279
ROC AUC: 0.7855

Test Set: Cucuta
Predicted  False  True  __all__
Actual                         
False      19621   379    20000
True        1604   881     2485
__all__    21225  1260    22485

               precision    recall  f1-score   support

           0       0.92      0.98      0.95     20000
           1       0.70      0.35      0.47      2485

    accuracy                           0.9

## Naive Bayes

In [8]:
%%time
# Gaussian naive Bayes is evaluated as-is: the parameter grid, the search type
# and the feature selection are all explicitly passed as None.
gnb = GaussianNB()

# The trailing semicolon keeps the returned object out of the cell output.
gnb_results = model_utils.nested_spatial_cv(
    gnb,
    X,
    y,
    splits=splits,
    param_grid=None,
    search_type=None,
    feature_selection=None,
    verbose=2,
);

Test Set: Arauca
Predicted  False  True  __all__
Actual                         
False      14510  5490    20000
True         628  1670     2298
__all__    15138  7160    22298

               precision    recall  f1-score   support

           0       0.96      0.73      0.83     20000
           1       0.23      0.73      0.35      2298

    accuracy                           0.73     22298
   macro avg       0.60      0.73      0.59     22298
weighted avg       0.88      0.73      0.78     22298

F1 Score: 0.3531
Kappa Statistics: 0.2335
Precision: 0.2332
Recall: 0.7267
Accuracy: 0.7256
ROC AUC: 0.7261

Test Set: Cucuta
Predicted  False  True  __all__
Actual                         
False      19612   388    20000
True        1178  1307     2485
__all__    20790  1695    22485

               precision    recall  f1-score   support

           0       0.94      0.98      0.96     20000
           1       0.77      0.53      0.63      2485

    accuracy                           0.9

## Ridge Classifier

In [9]:
%%time
# Ridge classifier with default settings; only the penalty `alpha` is tuned.
rc = RidgeClassifier()

# Candidate alphas 0.1, 0.2, ..., 1.0 (ten values in steps of 0.1).
param_grid = {
    'classifier__alpha': [tenth / 10.0 for tenth in range(1, 11)],
}

# The trailing semicolon keeps the returned object out of the cell output.
rc_results = model_utils.nested_spatial_cv(
    rc,
    X,
    y,
    splits=splits,
    param_grid=param_grid,
    verbose=2,
);

Test Set: Arauca
Predicted  False  True  __all__
Actual                         
False      19104   896    20000
True        1249  1049     2298
__all__    20353  1945    22298

               precision    recall  f1-score   support

           0       0.94      0.96      0.95     20000
           1       0.54      0.46      0.49      2298

    accuracy                           0.90     22298
   macro avg       0.74      0.71      0.72     22298
weighted avg       0.90      0.90      0.90     22298

F1 Score: 0.4945
Kappa Statistics: 0.4417
Precision: 0.5393
Recall: 0.4565
Accuracy: 0.9038
ROC AUC: 0.7058

Test Set: Cucuta
Predicted  False  True  __all__
Actual                         
False      18760  1240    20000
True         295  2190     2485
__all__    19055  3430    22485

               precision    recall  f1-score   support

           0       0.98      0.94      0.96     20000
           1       0.64      0.88      0.74      2485

    accuracy                           0.9

## Random Forest

In [None]:
%%time
# Search space for the random forest. Keys use the same `classifier__` prefix
# as the grids of the other model cells.
param_grid = {
    # Ten evenly spaced tree counts from 10 to 500, truncated to integers.
    'classifier__n_estimators': [int(x) for x in np.linspace(start=10, stop=500, num=10)],
    # 'auto' was removed from this list: for RandomForestClassifier it was an
    # alias of 'sqrt' (so the grid held the same setting twice), and current
    # scikit-learn releases no longer accept it.
    'classifier__max_features': ['sqrt'],
    'classifier__max_depth': [5, 6, 7, 8, 9, 10],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}

# Seeded so that tree construction is repeatable between runs.
rf = RandomForestClassifier(random_state=SEED)

# `search_type='random'` presumably selects a randomized search over the grid
# instead of an exhaustive one -- confirm in model_utils. The result is stored
# in `rf_results`, matching the other model cells, so it is not thrown away
# after a long run. The trailing semicolon keeps it out of the cell output.
rf_results = model_utils.nested_spatial_cv(
    rf,
    X,
    y,
    splits=splits,
    param_grid=param_grid,
    search_type='random',
    verbose=2,
);