# Model Experimentation: Hyperparameter Optimization
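This notebook runs hyperparameter-optimization experiments: several classifiers (logistic regression, linear SVC, Gaussian naive Bayes, ridge, and random forest) are evaluated with nested spatial cross-validation on a binarized target.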

## Imports and Setup

In [1]:
# Core numerical / data-handling libraries.
import pandas as pd
import numpy as np
import scipy

# Plotting libraries; apply seaborn's "whitegrid" style to subsequent figures.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put the project's utils directory first on the import path so the local
# helper modules below resolve (the path is relative to the working directory).
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Restrict the root logger to ERROR and above, and silence all Python warnings.
# NOTE(review): the blanket warnings filter also hides any convergence or
# deprecation warnings raised while fitting models — confirm this is intended.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as `random_state` to the resampling step and the seeded estimators.
SEED = 42

# Enable IPython's autoreload extension in mode 2 so edits to the local helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Location

In [2]:
# Dataset snapshot identifier and the root directory that holds every input.
version = '20200504'
data_dir = "../data/"

# Tabular dataset for the chosen snapshot.
input_file = f'{data_dir}{version}_dataset.csv'

# Sub-directories of the data root (each keeps its trailing slash).
images_dir = f'{data_dir}images/'
indices_dir = f'{data_dir}indices/'
pos_mask_dir = f'{data_dir}pos_masks/'
neg_mask_dir = f'{data_dir}neg_masks/'

## Load Dataset

In [3]:
%%time
data = pd.read_csv(input_file).reset_index(drop=True)
data = data[(data['area'] >= 7)]
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (257643, 69)
CPU times: user 13.5 s, sys: 748 ms, total: 14.2 s
Wall time: 13.6 s


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
807952,0.092,0.0717,0.0716,0.0478,0.0786,0.2476,0.3539,0.3461,0.3973,0.1131,...,-0.352922,0.411034,0.051601,0.406853,-0.704625,-0.193225,1.254652,0,3,7
807953,0.092,0.0709,0.074,0.0501,0.0786,0.2476,0.3539,0.396,0.3973,0.1131,...,-0.348519,0.411034,0.056131,0.442568,-0.702088,-0.189021,1.27465,0,3,7
807954,0.092,0.0731,0.0768,0.0521,0.0803,0.2694,0.403,0.3964,0.4437,0.1131,...,-0.394558,0.433807,0.055931,0.390687,-0.709093,-0.200245,1.200062,0,3,7


## Resample Dataset

In [4]:
# Resample through the project helper, seeded for reproducibility.
# NOTE(review): the recorded class counts suggest `num_neg_samples` is applied
# per negative class — confirm against model_utils.resample.
# This cell overwrites `data` with its own output, so re-running it resamples
# the already-resampled frame; re-run the load cell first to start from the
# full dataset.
data = model_utils.resample(
    data,
    num_neg_samples=20000,
    random_state=SEED,
)
print(f'Data dimensions: {data.shape}')
data.head(3)

Data dimensions: (43342, 69)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.1485,0.1346,0.1214,0.1114,0.1209,0.1953,0.2226,0.2098,0.2327,0.0767,...,-0.080677,0.206858,0.097394,0.79711,-0.650667,-0.077388,1.406637,0,2,7
1,0.1802,0.144,0.1519,0.1643,0.1707,0.1852,0.1887,0.1753,0.2029,0.0938,...,-0.262164,0.056144,0.186027,0.622719,-0.715903,-0.0138,1.169384,0,2,7
2,0.1627,0.1536,0.1517,0.1666,0.171,0.1692,0.184,0.1668,0.1805,0.068,...,-0.166154,0.047283,0.208604,0.795514,-0.69261,-0.003535,1.233077,0,2,7


## Machine Learning Pipeline

In [5]:
# Candidate classifiers compared in this notebook.
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
# SEED is defined once in the setup cell at the top of the notebook. It is
# deliberately not redefined here, so the seed used for resampling and the seed
# used by the estimators cannot drift apart when only one of them is edited.

## Define Features and Target Label

In [6]:
# Name of the column to predict.
label = 'target'

# Every column except the label and the spatial split key (`area`) is a predictor.
# Columns are excluded by name rather than by slicing off the last two positions,
# so the label cannot leak into X if the column order of the input file changes.
# NOTE(review): `la_guajira` stays in the feature set, exactly as it did with the
# positional slice — confirm that is intended.
features = [column for column in data.columns if column not in (label, 'area')]

# Collapse classes 2 and 3 into class 0, leaving a binary 0/1 target.
data[label] = data[label].replace({2: 0, 3: 0})

# `area` is handed to the cross-validation helper as the `splits` argument.
splits = data[['area']]
X = data[features]
y = data[label]

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(y.value_counts()))

Data dimensions: (43342, 69)
Class distribution:
0    40000
1     3342
Name: target, dtype: int64


## Logistic Regression

In [16]:
%%time
lr =  LogisticRegression()
param_grid = {'classifier__C':[0.001, 0.01, 1.0, 5.0, 10]}
lr_results = model_utils.nested_spatial_cv(lr, X, y, splits=splits, param_grid=param_grid, feature_selection=False, search_type=None, verbose=2);

Test Set: Soacha
Predicted  False  True  __all__
Actual                         
False      17517  2483    20000
True          61   442      503
__all__    17578  2925    20503

               precision    recall  f1-score   support

           0       1.00      0.88      0.93     20000
           1       0.15      0.88      0.26       503

    accuracy                           0.88     20503
   macro avg       0.57      0.88      0.60     20503
weighted avg       0.98      0.88      0.92     20503

F1 Score: 0.2579
Kappa Statistics: 0.2254
Precision: 0.1511
Recall: 0.8787
Accuracy: 0.8759
ROC AUC: 0.8773

Test Set: Bogota
Predicted  False  True  __all__
Actual                         
False      19981    19    20000
True        2741    98     2839
__all__    22722   117    22839

               precision    recall  f1-score   support

           0       0.88      1.00      0.94     20000
           1       0.84      0.03      0.07      2839

    accuracy                           0.8

## LinearSVC

In [None]:
%%time
lsvc = LinearSVC(random_state=SEED)
param_grid = {'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100, 1000]}
lsvc_results = model_utils.nested_spatial_cv(lsvc, X, y, splits=splits, param_grid=param_grid, verbose=2);

## Naive Bayes

In [None]:
%%time
gnb = GaussianNB()
gnb_results = model_utils.nested_spatial_cv(
    gnb, X, y, splits=splits, param_grid=None, search_type=None, feature_selection=None, verbose=2
);

## Ridge Classifier

In [None]:
%%time
rc =  RidgeClassifier()
param_grid = {'classifier__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
rc_results = model_utils.nested_spatial_cv(rc, X, y, splits=splits, param_grid=param_grid, verbose=2);

## Random Forest

In [None]:
%%time
param_grid = {
    'classifier__n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__max_depth': [5, 6, 7, 8, 9, 10],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}
rf = RandomForestClassifier(random_state=SEED)
model_utils.nested_spatial_cv(rf, X, y, splits=splits, param_grid=param_grid, search_type='random', verbose=2);