# Model Experimentation: Hyperparameter Optimization
This notebook conducts experiments on hyperparameter optimization.

## Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import scipy

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

SEED = 42

%load_ext autoreload
%autoreload 2

## File Location

In [2]:
data_dir = "../data/"
input_file = data_dir + '20200422_dataset.csv'

images_dir = data_dir + 'images/'
indices_dir = data_dir + 'indices/'
pos_mask_dir = data_dir + 'pos_masks/'
neg_mask_dir = data_dir + 'neg_masks/'

areas = ['maicao', 'riohacha', 'uribia', 'arauca', 'cucuta', 'arauquita', 'tibu']

## Load Dataset

In [3]:
%%time
data = pd.read_csv(input_file).reset_index(drop=True)
print('Data dimensions: {}'.format(data.shape))
data.head()

Data dimensions: (965034, 113)
CPU times: user 20.2 s, sys: 2.25 s, total: 22.4 s
Wall time: 28.7 s


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,1,3,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,1,3,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,1,3,0
3,0.15895,0.1463,0.1771,0.2424,0.2264,0.28555,0.3268,0.3098,0.3574,0.0416,...,-0.475631,0.073259,0.308045,0.524245,-0.689591,-0.01952,0.905289,1,3,0
4,0.15895,0.15345,0.192,0.2595,0.25225,0.3004,0.3423,0.3351,0.3611,0.0416,...,-0.464686,0.059161,0.352879,0.548867,-0.6733,-0.009437,0.875968,1,3,0


## Resample Dataset

In [4]:
neg_dist = {'Formal settlement': (2/5), 'Unoccupied land': (3/5)}
data = model_utils.resample(data, num_neg_samples=50000, neg_dist=neg_dist, random_state=SEED)
print('Data dimensions: {}'.format(data.shape))
data.head(3)

Data dimensions: (371297, 113)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.1681,0.1326,0.1329,0.1304,0.1571,0.2687,0.3205,0.2784,0.3581,0.037,...,-0.495702,0.213802,0.122766,0.386654,-0.762164,-0.097429,1.005492,1,2,0
1,0.1933,0.1974,0.1994,0.2195,0.2191,0.268,0.3012,0.27,0.328,0.0384,...,-0.417789,0.102637,0.211351,0.520237,-0.748431,-0.021367,1.030898,1,2,0
2,0.1946,0.1782,0.1828,0.1908,0.2174,0.2747,0.3062,0.291,0.3234,0.0307,...,-0.188068,0.079984,0.222447,0.746002,-0.641025,-0.030201,1.056083,1,2,0


## Machine Learning Pipeline

In [5]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
SEED = 42

## Define Features and Target Label

In [6]:
label = 'target'
features = [column  for column in data.columns[:-2]]
data[label] = data[label].replace({2:0, 3:0})

splits = data[['area']]
X = data[features]
y = data[label]

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data['target'].value_counts()))

Data dimensions: (371297, 113)
Class distribution:
0    350000
1     21297
Name: target, dtype: int64


## Ridge Classifier

In [10]:
%%time
rc =  RidgeClassifier()
param_grid = {'classifier__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
rc_results = model_utils.nested_spatial_cv(rc, X, y, splits=splits, param_grid=param_grid, verbose=2);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      48246  1754    50000
True         132   578      710
__all__    48378  2332    50710

               precision    recall  f1-score   support

           0       1.00      0.96      0.98     50000
           1       0.25      0.81      0.38       710

    accuracy                           0.96     50710
   macro avg       0.62      0.89      0.68     50710
weighted avg       0.99      0.96      0.97     50710

F1 Score: 0.3800
Kappa Statistics: 0.3664
Precision: 0.2479
Recall: 0.8141
Accuracy: 0.9628

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      49989    11    50000
True        2076  1425     3501
__all__    52065  1436    53501

               precision    recall  f1-score   support

           0       0.96      1.00      0.98     50000
           1       0.99      0.41      0.58      3501

    accuracy                           0.96     53501
  

## Naive Bayes

In [31]:
%%time
gnb = GaussianNB()
gnb_results = model_utils.nested_spatial_cv(
    gnb, X, y, splits=splits, param_grid=None, search_type=None, feature_selection=None, verbose=2
);

Test Set: Maicao
Predicted  False   True  __all__
Actual                          
False      30782  19218    50000
True          12    698      710
__all__    30794  19916    50710

               precision    recall  f1-score   support

           0       1.00      0.62      0.76     50000
           1       0.04      0.98      0.07       710

    accuracy                           0.62     50710
   macro avg       0.52      0.80      0.41     50710
weighted avg       0.99      0.62      0.75     50710

F1 Score: 0.0677
Kappa Statistics: 0.0418
Precision: 0.0350
Recall: 0.9831
Accuracy: 0.6208

Test Set: Riohacha
Predicted  False   True  __all__
Actual                          
False      39221  10779    50000
True         857   2644     3501
__all__    40078  13423    53501

               precision    recall  f1-score   support

           0       0.98      0.78      0.87     50000
           1       0.20      0.76      0.31      3501

    accuracy                           0.78   

## Logistic Regression

In [12]:
%%time
lr =  LogisticRegression()
param_grid = {'classifier__C':[0.001, 0.01, 1.0, 5.0, 10]}
lr_results = model_utils.nested_spatial_cv(lr, X, y, splits=splits, param_grid=param_grid, verbose=2);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      47888  2112    50000
True         368   342      710
__all__    48256  2454    50710

               precision    recall  f1-score   support

           0       0.99      0.96      0.97     50000
           1       0.14      0.48      0.22       710

    accuracy                           0.95     50710
   macro avg       0.57      0.72      0.60     50710
weighted avg       0.98      0.95      0.96     50710

F1 Score: 0.2162
Kappa Statistics: 0.1988
Precision: 0.1394
Recall: 0.4817
Accuracy: 0.9511

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      49774   226    50000
True         877  2624     3501
__all__    50651  2850    53501

               precision    recall  f1-score   support

           0       0.98      1.00      0.99     50000
           1       0.92      0.75      0.83      3501

    accuracy                           0.98     53501
  

## LinearSVC
Reference: https://towardsdatascience.com/svm-hyper-parameter-tuning-using-gridsearchcv-49c0bc55ce29

In [13]:
%%time
lsvc = LinearSVC(random_state=SEED)
param_grid = {'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100, 1000]}
lsvc_results = model_utils.nested_spatial_cv(lsvc, X, y, splits=splits, param_grid=param_grid, verbose=2);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      47601  2399    50000
True         376   334      710
__all__    47977  2733    50710

               precision    recall  f1-score   support

           0       0.99      0.95      0.97     50000
           1       0.12      0.47      0.19       710

    accuracy                           0.95     50710
   macro avg       0.56      0.71      0.58     50710
weighted avg       0.98      0.95      0.96     50710

F1 Score: 0.1940
Kappa Statistics: 0.1757
Precision: 0.1222
Recall: 0.4704
Accuracy: 0.9453

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      49927    73    50000
True        1101  2400     3501
__all__    51028  2473    53501

               precision    recall  f1-score   support

           0       0.98      1.00      0.99     50000
           1       0.97      0.69      0.80      3501

    accuracy                           0.98     53501
  

## Random Forest

In [7]:
%%time
param_grid = {
    'classifier__n_estimators': [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__max_depth': [5, 6, 7, 8, 9, 10],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}
rf = RandomForestClassifier(random_state=SEED)
model_utils.nested_spatial_cv(rf, X, y, splits=splits, param_grid=param_grid, search_type='random', verbose=2);

Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      48490  1510    50000
True         286   424      710
__all__    48776  1934    50710

               precision    recall  f1-score   support

           0       0.99      0.97      0.98     50000
           1       0.22      0.60      0.32       710

    accuracy                           0.96     50710
   macro avg       0.61      0.78      0.65     50710
weighted avg       0.98      0.96      0.97     50710

F1 Score: 0.3207
Kappa Statistics: 0.3065
Precision: 0.2192
Recall: 0.5972
Accuracy: 0.9646

Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      49930    70    50000
True        1799  1702     3501
__all__    51729  1772    53501

               precision    recall  f1-score   support

           0       0.97      1.00      0.98     50000
           1       0.96      0.49      0.65      3501

    accuracy                           0.97     53501
  

{'f1_score': [0.32072617246596064,
  0.645552816233643,
  0.023961205667015314,
  0.6793384349543322,
  0.3776754267136278,
  0.25708502024291496,
  0.04602991944764097],
 'kappa': [0.30652178283018716,
  0.6292466746847885,
  0.019888642939203893,
  0.6669749148561039,
  0.3596610877627896,
  0.2531244455792504,
  0.042958874181228346],
 'precision': [0.21923474663908996,
  0.9604966139954854,
  0.9921259842519685,
  0.8467692307692307,
  0.6793372319688109,
  0.7888198757763976,
  0.21978021978021978],
 'recall': [0.5971830985915493,
  0.4861468151956584,
  0.012127045235803657,
  0.5671887881286067,
  0.26153846153846155,
  0.15356711003627568,
  0.02570694087403599],
 'accuracy': [0.964582922500493,
  0.9650660735313359,
  0.8300215267428382,
  0.9752222179834433,
  0.9563846957182189,
  0.9855588565132705,
  0.9836740320611288]}