# Model Experimentation: Hyperparameter Optimization
This notebook conducts experiments on hyperparameter optimization.

## Imports and Setup

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

SEED = 42

%load_ext autoreload
%autoreload 2

## File Location

In [3]:
data_dir = "../data/"
input_file = data_dir + '20200422_dataset.csv'

images_dir = data_dir + 'images/'
indices_dir = data_dir + 'indices/'
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'

areas = ['maicao', 'riohacha', 'uribia', 'arauca', 'cucuta', 'arauquita', 'tibu']
value_codes = {value : key.capitalize() for (key,value) in model_utils.VALUE_CODES.items()}

## Load Dataset

In [5]:
data = pd.read_csv(input_file).reset_index(drop=True)
print('Data dimensions: {}'.format(data.shape))
data.head()

Data dimensions: (965034, 113)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,1,3,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,1,3,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,1,3,0
3,0.15895,0.1463,0.1771,0.2424,0.2264,0.28555,0.3268,0.3098,0.3574,0.0416,...,-0.475631,0.073259,0.308045,0.524245,-0.689591,-0.01952,0.905289,1,3,0
4,0.15895,0.15345,0.192,0.2595,0.25225,0.3004,0.3423,0.3351,0.3611,0.0416,...,-0.464686,0.059161,0.352879,0.548867,-0.6733,-0.009437,0.875968,1,3,0


## Resample Dataset

In [6]:
neg_dist = {'Formal settlement': (2/5), 'Unoccupied land': (3/5)}
data = model_utils.resample(data, num_neg_samples=30000, neg_dist=neg_dist, random_state=SEED)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (225124, 113)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.1681,0.1326,0.1329,0.1304,0.1571,0.2687,0.3205,0.2784,0.3581,0.037,...,-0.495702,0.213802,0.122766,0.386654,-0.762164,-0.097429,1.005492,1,2,0
1,0.1933,0.1974,0.1994,0.2195,0.2191,0.268,0.3012,0.27,0.328,0.0384,...,-0.417789,0.102637,0.211351,0.520237,-0.748431,-0.021367,1.030898,1,2,0
2,0.1946,0.1782,0.1828,0.1908,0.2174,0.2747,0.3062,0.291,0.3234,0.0307,...,-0.188068,0.079984,0.222447,0.746002,-0.641025,-0.030201,1.056083,1,2,0


## Machine Learning Pipeline

In [17]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
SEED = 42

## Define Features and Target Label

In [8]:
# Get target and feature columns
label = 'target'
features = [column for column in data.columns[:-2]]

# Convert target labels to binary
data[label] = data[label].replace({2:0, 3:0})

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data['target'].value_counts()))

Data dimensions: (225124, 113)
Class distribution:
0    203827
1     21297
Name: target, dtype: int64


## Logistic Regression

### Feature Selection: Recursive Feature Elimination CV

In [22]:
lr = LogisticRegression()
lr_best_features = model_utils.rfecv_feature_selection(
    lr, data, features, label, scoring='f1', step=5, verbose=0
)
print(lr_best_features)

['B1_2015-2016', 'B2_2015-2016', 'B3_2015-2016', 'B7_2015-2016', 'B12_2015-2016', 'savi_2015-2016', 'mndwi_2015-2016', 'ui_2015-2016', 'nbai_2015-2016', 'B1_2017', 'ui_2017', 'nbi_2017', 'B5_2018', 'B7_2018', 'B12_2018', 'nbi_2018', 'mbi_2018', 'B1_2019', 'B2_2019', 'B3_2019', 'B6_2019', 'B7_2019', 'B9_2019', 'B10_2019', 'B12_2019', 'savi_2019', 'mndwi_2019', 'mbi_2019', 'B1_2020', 'B2_2020', 'B6_2020', 'B7_2020', 'B9_2020', 'B10_2020', 'B11_2020', 'B12_2020', 'savi_2020', 'mndwi_2020', 'brba_2020', 'nbai_2020', 'mbi_2020']


### [Baseline] Logistic Regression Results Sans Hyperparamater Optimization

In [10]:
lr = LogisticRegression()
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[28742  1258]
 [  328   382]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97     30000
           1       0.23      0.54      0.33       710

    accuracy                           0.95     30710
   macro avg       0.61      0.75      0.65     30710
weighted avg       0.97      0.95      0.96     30710

MAICAO Results: 
- F1 Score: 0.3251
- Kappa Statistics: 0.3026
- Precision: 0.2329
- Recall: 0.5380
- Accuracy: 0.9484

Test set: RIOHACHA
[[29902    98]
 [  757  2744]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30000
           1       0.97      0.78      0.87      3501

    accuracy                           0.97     33501
   macro avg       0.97      0.89      0.93     33501
weighted avg       0.97      0.97      0.97     33501

RIOHACHA Results: 
- F1 Score: 0.8652
- Kappa Statistics: 0.8513
- Precision: 0.9655
- Recall: 0.7838
- Accuracy: 0.9745

Test 

### Grid Search Cross Validation

In [11]:
param_grid = {
    'classifier__penalty': ['l2'],
    'classifier__C':[0.001, 0.01, 1.0, 5.0, 10]
}
lr_grid_search = model_utils.hyperparameter_optimization(
    data, lr_best_features, label, lr, param_grid, scoring='f1', verbose=2
)

Fitting 7 folds for each of 5 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   21.6s finished


Best Paramaters: {'classifier__C': 10, 'classifier__penalty': 'l2'}


### Logistic Regression Results with Hyperparameter Optimization

In [12]:
lr = LogisticRegression(C=10, penalty='l2')
results, clfs = model_utils.geospatialcv(data, lr_best_features, label, lr, verbose=2);


Test set: MAICAO
[[28673  1327]
 [  404   306]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97     30000
           1       0.19      0.43      0.26       710

    accuracy                           0.94     30710
   macro avg       0.59      0.69      0.62     30710
weighted avg       0.97      0.94      0.95     30710

MAICAO Results: 
- F1 Score: 0.2612
- Kappa Statistics: 0.2366
- Precision: 0.1874
- Recall: 0.4310
- Accuracy: 0.9436

Test set: RIOHACHA
[[29933    67]
 [  750  2751]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30000
           1       0.98      0.79      0.87      3501

    accuracy                           0.98     33501
   macro avg       0.98      0.89      0.93     33501
weighted avg       0.98      0.98      0.97     33501

RIOHACHA Results: 
- F1 Score: 0.8707
- Kappa Statistics: 0.8574
- Precision: 0.9762
- Recall: 0.7858
- Accuracy: 0.9756

Test 

## LinearSVC

In [13]:
lsvc = LinearSVC(random_state=SEED)
lsvc_best_features = model_utils.rfecv_feature_selection(
    lr, data, features, label, scoring='f1', step=5, verbose=0
)
print(lsvc_best_features)

['B1_2015-2016', 'B4_2015-2016', 'B5_2015-2016', 'B6_2015-2016', 'B11_2015-2016', 'B12_2015-2016', 'mndwi_2015-2016', 'nbi_2015-2016', 'B1_2017', 'B8_2017', 'B10_2017', 'B11_2017', 'B12_2017', 'nbi_2017', 'B3_2018', 'B4_2018', 'B5_2018', 'B6_2018', 'B7_2018', 'mbi_2018', 'B1_2019', 'B3_2019', 'B5_2019', 'B6_2019', 'B7_2019', 'B9_2019', 'B10_2019', 'B12_2019', 'mndwi_2019', 'mbi_2019', 'B1_2020', 'B2_2020', 'B6_2020', 'B7_2020', 'B9_2020', 'B10_2020', 'B11_2020', 'B12_2020', 'savi_2020', 'brba_2020', 'mbi_2020']


### [Baseline] LinearSVC Results Sans Hyperparamater Optimization

In [14]:
lsvc = LinearSVC(random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[28558  1442]
 [  319   391]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     30000
           1       0.21      0.55      0.31       710

    accuracy                           0.94     30710
   macro avg       0.60      0.75      0.64     30710
weighted avg       0.97      0.94      0.95     30710

MAICAO Results: 
- F1 Score: 0.3075
- Kappa Statistics: 0.2836
- Precision: 0.2133
- Recall: 0.5507
- Accuracy: 0.9427

Test set: RIOHACHA
[[29922    78]
 [  733  2768]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30000
           1       0.97      0.79      0.87      3501

    accuracy                           0.98     33501
   macro avg       0.97      0.89      0.93     33501
weighted avg       0.98      0.98      0.97     33501

RIOHACHA Results: 
- F1 Score: 0.8722
- Kappa Statistics: 0.8590
- Precision: 0.9726
- Recall: 0.7906
- Accuracy: 0.9758

Test 

### Grid Search Cross Validation
Reference: https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29

In [15]:
lsvc = LinearSVC(random_state=SEED)
param_grid = {'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100, 1000]}
lsvc_grid_search = model_utils.hyperparameter_optimization(
    data, lsvc_best_features, label, lsvc, param_grid, scoring='f1', verbose=2
)

Fitting 7 folds for each of 8 candidates, totalling 56 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  56 out of  56 | elapsed:  4.4min finished


Best Paramaters: {'classifier__C': 1000}


### LinearSVC Results with Hyperparameter Optimization

In [19]:
lsvc = LinearSVC(C=1000, random_state=SEED)
results, clfs = model_utils.geospatialcv(data, lsvc_best_features, label, lsvc, verbose=2);


Test set: MAICAO
[[29370   630]
 [  420   290]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     30000
           1       0.32      0.41      0.36       710

    accuracy                           0.97     30710
   macro avg       0.65      0.69      0.67     30710
weighted avg       0.97      0.97      0.97     30710

MAICAO Results: 
- F1 Score: 0.3558
- Kappa Statistics: 0.3386
- Precision: 0.3152
- Recall: 0.4085
- Accuracy: 0.9658

Test set: RIOHACHA
[[29981    19]
 [ 1358  2143]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     30000
           1       0.99      0.61      0.76      3501

    accuracy                           0.96     33501
   macro avg       0.97      0.81      0.87     33501
weighted avg       0.96      0.96      0.95     33501

RIOHACHA Results: 
- F1 Score: 0.7568
- Kappa Statistics: 0.7358
- Precision: 0.9912
- Recall: 0.6121
- Accuracy: 0.9589

Test 