# Model Experimentation: Hyperparameter Optimization
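This notebook runs hyperparameter-optimization experiments for several classifiers (logistic regression, linear SVC, Gaussian naive Bayes, ridge classifier, and random forest), each evaluated with nested spatial cross-validation.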

## Imports and Setup

In [1]:
# Core numerical / data-handling stack.
import pandas as pd
import numpy as np
import scipy

# Plotting libraries; seaborn is switched to its "whitegrid" style.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils at the front of the module search path so the project-local
# helper modules can be imported. This must run before the two imports below.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Raise the root logger threshold to ERROR and silence all Python warnings.
# NOTE(review): the blanket warnings filter also hides any warnings raised while
# fitting the models in later cells -- confirm that this is intended.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as random_state to the resampling and estimator calls in this notebook.
SEED = 42

# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

## File Location

In [2]:
# Version string that prefixes the dataset file name.
version = '20200504'
data_dir = "../data/"

# Dataset CSV: <data_dir><version>_dataset.csv
dataset_name = '{}_dataset.csv'.format(version)
input_file = data_dir + dataset_name

# Sub-directories located directly under data_dir.
images_dir, indices_dir, pos_mask_dir, neg_mask_dir = (
    data_dir + subdir
    for subdir in ('images/', 'indices/', 'pos_masks/', 'neg_masks/')
)

## Load Dataset

In [3]:
%%time
data = pd.read_csv(input_file).reset_index(drop=True)
data = data[data['area'] <= 2]
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (398499, 69)
CPU times: user 26.4 s, sys: 1.23 s, total: 27.6 s
Wall time: 39.6 s


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.438625,0.151655,0.209614,0.479385,-0.665204,-0.060444,0.948025,1,3,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.43545,0.151655,0.213526,0.48833,-0.66301,-0.059064,0.952352,1,3,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.447333,0.145385,0.222971,0.473118,-0.661291,-0.060496,0.911748,1,3,0


## Resample Dataset

In [4]:
# Resample the dataset with the project helper, using the notebook-wide seed.
# The exact meaning of num_neg_samples is defined in model_utils.resample.
# NOTE: `data` is rebound here, so re-running this cell resamples the
# already-resampled frame; re-run the load cell first to start from the full data.
data = model_utils.resample(
    data,
    num_neg_samples=20000,
    random_state=SEED,
)
dims = data.shape
print('Data dimensions: {}'.format(dims))
data.head(3)

Data dimensions: (74398, 69)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.1748,0.1264,0.1259,0.12075,0.1288,0.2291,0.28325,0.2551,0.31165,0.0396,...,-0.325365,0.336843,0.097095,0.493299,-0.683798,-0.120547,1.149291,1,2,0
1,0.2093,0.1981,0.2023,0.217,0.2178,0.2767,0.3224,0.302,0.3478,0.0353,...,-0.212563,0.098564,0.242471,0.71641,-0.62334,-0.022285,1.042249,1,2,0
2,0.2062,0.20505,0.21575,0.2332,0.2083,0.2836,0.3381,0.3109,0.374,0.03575,...,-0.235786,0.266124,0.158668,0.655587,-0.632521,-0.078219,1.1013,1,2,0


## Machine Learning Pipeline

In [5]:
# Estimators compared in the experiments below.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# SEED is defined once in the setup cell. It is deliberately not reassigned here:
# a second assignment would silently override a changed seed for every model cell
# while the resampling step kept using the value from the setup cell.

## Define Features and Target Label

In [6]:
label = 'target'
split_column = 'area'

# Every column except the label and the split column is a feature. Selecting by
# name (instead of dropping the last two columns by position) keeps this correct
# if the column order of the CSV ever changes; for the current layout, where
# 'target' and 'area' are the last two columns, the resulting list is identical.
# NOTE(review): this keeps 'la_guajira' as a feature, and would also keep a saved
# index column such as 'Unnamed: 0' if the CSV contains one -- confirm both are intended.
features = [column for column in data.columns if column not in (label, split_column)]

# Map label values 2 and 3 to class 0; other values are left unchanged.
data[label] = data[label].replace({2: 0, 3: 0})

# `splits` is passed to nested_spatial_cv in the model cells as the `splits` argument.
splits = data[[split_column]]
X = data[features]
y = data[label]

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data[label].value_counts()))

Data dimensions: (74398, 69)
Class distribution:
0    60000
1    14398
Name: target, dtype: int64


## Logistic Regression

In [9]:
%%time
lr =  LogisticRegression()
param_grid = {'classifier__C':[0.001, 0.01, 1.0, 5.0, 10]}
lr_results = model_utils.nested_spatial_cv(lr, X, y, splits=splits, param_grid=param_grid, verbose=2);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      18929  1071    20000
True         118   434      552
__all__    19047  1505    20552

               precision    recall  f1-score   support

           0       0.99      0.95      0.97     20000
           1       0.29      0.79      0.42       552

    accuracy                           0.94     20552
   macro avg       0.64      0.87      0.70     20552
weighted avg       0.97      0.94      0.95     20552

F1 Score: 0.4220
Kappa Statistics: 0.3983
Precision: 0.2884
Recall: 0.7862
Accuracy: 0.9421
ROC AUC: 0.8663

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      19863   137    20000
True         674  2827     3501
__all__    20537  2964    23501

               precision    recall  f1-score   support

           0       0.97      0.99      0.98     20000
           1       0.95      0.81      0.87      3501

    accuracy                           0

## LinearSVC

In [10]:
%%time
lsvc = LinearSVC(random_state=SEED)
param_grid = {'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100, 1000]}
lsvc_results = model_utils.nested_spatial_cv(lsvc, X, y, splits=splits, param_grid=param_grid, verbose=2);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      18887  1113    20000
True          90   462      552
__all__    18977  1575    20552

               precision    recall  f1-score   support

           0       1.00      0.94      0.97     20000
           1       0.29      0.84      0.43       552

    accuracy                           0.94     20552
   macro avg       0.64      0.89      0.70     20552
weighted avg       0.98      0.94      0.95     20552

F1 Score: 0.4344
Kappa Statistics: 0.4110
Precision: 0.2933
Recall: 0.8370
Accuracy: 0.9415
ROC AUC: 0.8907

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      19964    36    20000
True         582  2919     3501
__all__    20546  2955    23501

               precision    recall  f1-score   support

           0       0.97      1.00      0.98     20000
           1       0.99      0.83      0.90      3501

    accuracy                           0

## Naive Bayes

In [7]:
%%time
gnb = GaussianNB()
gnb_results = model_utils.nested_spatial_cv(
    gnb, X, y, splits=splits, param_grid=None, search_type=None, feature_selection=None, verbose=2
);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      17768  2232    20000
True         314   238      552
__all__    18082  2470    20552

               precision    recall  f1-score   support

           0       0.98      0.89      0.93     20000
           1       0.10      0.43      0.16       552

    accuracy                           0.88     20552
   macro avg       0.54      0.66      0.55     20552
weighted avg       0.96      0.88      0.91     20552

F1 Score: 0.1575
Kappa Statistics: 0.1188
Precision: 0.0964
Recall: 0.4312
Accuracy: 0.8761
ROC AUC: 0.6598

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      19941    59    20000
True        2720   781     3501
__all__    22661   840    23501

               precision    recall  f1-score   support

           0       0.88      1.00      0.93     20000
           1       0.93      0.22      0.36      3501

    accuracy                           0

## Ridge Classifier

In [8]:
%%time
rc =  RidgeClassifier()
param_grid = {'classifier__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
rc_results = model_utils.nested_spatial_cv(rc, X, y, splits=splits, param_grid=param_grid, verbose=2);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      18655  1345    20000
True          61   491      552
__all__    18716  1836    20552

               precision    recall  f1-score   support

           0       1.00      0.93      0.96     20000
           1       0.27      0.89      0.41       552

    accuracy                           0.93     20552
   macro avg       0.63      0.91      0.69     20552
weighted avg       0.98      0.93      0.95     20552

F1 Score: 0.4112
Kappa Statistics: 0.3859
Precision: 0.2674
Recall: 0.8895
Accuracy: 0.9316
ROC AUC: 0.9111

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      19986    14    20000
True        2355  1146     3501
__all__    22341  1160    23501

               precision    recall  f1-score   support

           0       0.89      1.00      0.94     20000
           1       0.99      0.33      0.49      3501

    accuracy                           0

## Random Forest

In [None]:
%%time
param_grid = {
    'classifier__n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__max_depth': [5, 6, 7, 8, 9, 10],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}
rf = RandomForestClassifier(random_state=SEED)
model_utils.nested_spatial_cv(rf, X, y, splits=splits, param_grid=param_grid, search_type='random', verbose=2);