# Model Experimentation
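This notebook is a minimal template for running ML experiments: it loads and resamples the dataset, evaluates a logistic regression classifier with nested spatial cross-validation, trains a final model on all areas, and exports pixel-level predictions for one area.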

## Imports and Setup

In [1]:
# Core data-handling libraries.
import pandas as pd
import numpy as np

# Plotting libraries; seaborn's "whitegrid" style is applied to all figures.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils at the front of the import path so the project-local helper
# modules below can be imported. The insert must run before those imports.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Keep cell output quiet: the root logger only lets ERROR and above through,
# and every Python warning is suppressed.
# NOTE(review): the blanket warning filter also hides deprecation and
# convergence warnings; consider narrowing it while debugging.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as `random_state` when the dataset is resampled.
SEED = 42

# Automatically reload imported modules before each cell runs, so edits to the
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Locations

In [2]:
# Root folders for inputs and outputs (relative paths).
data_dir, output_dir = "../data/", "../outputs/"

# CSV file holding the tabular dataset.
input_file = data_dir + '20200422_dataset.csv'

# Sub-folders of the data directory that are handed to geoutils.get_filepaths.
images_dir, indices_dir, pos_mask_dir, neg_mask_dir = [
    data_dir + subfolder
    for subfolder in ('images/', 'indices/', 'pos_masks/', 'neg_masks/')
]

# Names of the areas covered by the experiment.
areas = [
    'maicao', 'riohacha', 'uribia', 'arauca',
    'cucuta', 'arauquita', 'tibu',
]

# Invert model_utils.VALUE_CODES into a {code: Capitalized name} lookup,
# used to relabel class counts when they are printed.
value_codes = {
    code: name.capitalize()
    for name, code in model_utils.VALUE_CODES.items()
}

## Load Dataset

In [3]:
# Read the dataset from disk, renumber the rows from zero and replace
# missing values with 0.
data = (
    pd.read_csv(input_file)
    .reset_index(drop=True)
    .fillna(0)
)

# Rows per target class, relabelled with readable class names.
target_counts = data['target'].value_counts()
class_dist = target_counts.rename(index=value_codes)

print('Data dimensions: {}'.format(data.shape))
print('\nClass distribution:\n{}\n'.format(class_dist))

data.head()

Data dimensions: (965034, 113)

Class distribution:
Unoccupied land        813575
Formal settlement      130162
Informal settlement     21297
Name: target, dtype: int64



Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,1,3,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,1,3,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,1,3,0
3,0.15895,0.1463,0.1771,0.2424,0.2264,0.28555,0.3268,0.3098,0.3574,0.0416,...,-0.475631,0.073259,0.308045,0.524245,-0.689591,-0.01952,0.905289,1,3,0
4,0.15895,0.15345,0.192,0.2595,0.25225,0.3004,0.3423,0.3351,0.3611,0.0416,...,-0.464686,0.059161,0.352879,0.548867,-0.6733,-0.009437,0.875968,1,3,0


## Resample Dataset

In [4]:
# Share of each negative class requested from the resampler.
# NOTE(review): the output suggests num_neg_samples applies per area --
# confirm against model_utils.resample.
neg_dist = {'Formal settlement': (2/5), 'Unoccupied land': (3/5)}

# This rebinds `data` to the resampled frame, so re-running the cell resamples
# the already-resampled data rather than the originally loaded dataset.
data = model_utils.resample(
    data,
    num_neg_samples=50000,
    neg_dist=neg_dist,
    random_state=SEED,
)

# Row counts per class and per area, relabelled with readable names.
target_counts = data['target'].value_counts()
class_dist = target_counts.rename(index=value_codes)
area_dist = data['area'].value_counts().rename(index=model_utils.AREA_CODES)

n_rows = len(data)
print('Data dimensions: {}'.format(data.shape))
print('\nArea distribution:\n{}'.format(area_dist))
print('\nClass distribution:\n{}'.format(class_dist))
print('\nClass distribution (normalized):\n{}\n'.format(class_dist / n_rows))

data.head(3)

Data dimensions: (339647, 113)

Area distribution:
Uribia       55759
Riohacha     53501
Cucuta       52665
Maicao       50710
Arauca       47580
Tibu         41027
Arauquita    38405
Name: area, dtype: int64

Class distribution:
Unoccupied land        210000
Formal settlement      108350
Informal settlement     21297
Name: target, dtype: int64

Class distribution (normalized):
Unoccupied land        0.618289
Formal settlement      0.319008
Informal settlement    0.062703
Name: target, dtype: float64



Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.1681,0.1326,0.1329,0.1304,0.1571,0.2687,0.3205,0.2784,0.3581,0.037,...,-0.495702,0.213802,0.122766,0.386654,-0.762164,-0.097429,1.005492,1,2,0
1,0.1933,0.1974,0.1994,0.2195,0.2191,0.268,0.3012,0.27,0.328,0.0384,...,-0.417789,0.102637,0.211351,0.520237,-0.748431,-0.021367,1.030898,1,2,0
2,0.1946,0.1782,0.1828,0.1908,0.2174,0.2747,0.3062,0.291,0.3234,0.0307,...,-0.188068,0.079984,0.222447,0.746002,-0.641025,-0.030201,1.056083,1,2,0


## ML Pipeline

In [5]:
from sklearn.linear_model import LogisticRegression

### Model Training & Evaluation

In [6]:
label = 'target'

# Every column except the trailing two (target and area) is a model input.
features = list(data.columns[:-2])

# Collapse target codes 2 and 3 into 0, leaving a binary 0/1 label.
data[label] = data[label].replace({2: 0, 3: 0})

# Model inputs, labels, and the per-row area code passed as `splits` to the
# cross-validation helper.
X, y = data[features], data[label]
splits = data[['area']]

print('Class distribution:\n{}'.format(data['target'].value_counts()))

Class distribution:
0    318350
1     21297
Name: target, dtype: int64


### Nested Spatial Cross Validation

In [7]:
# Candidate values for C. The 'classifier__' prefix presumably targets a
# pipeline step named 'classifier' built inside nested_spatial_cv -- confirm
# in model_utils.
param_grid = {'classifier__C': [0.001, 0.01, 1.0, 5.0, 10]}
clf = LogisticRegression()

# The trailing semicolon suppresses the notebook echo of the return value.
model_utils.nested_spatial_cv(
    clf,
    X,
    y,
    splits=splits,
    param_grid=param_grid,
    verbose=2,
);

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   31.9s finished


Test Set: Maicao

Predicted  False  True  __all__
Actual                         
False      47790  2210    50000
True         375   335      710
__all__    48165  2545    50710

               precision    recall  f1-score   support

           0       0.99      0.96      0.97     50000
           1       0.13      0.47      0.21       710

    accuracy                           0.95     50710
   macro avg       0.56      0.71      0.59     50710
weighted avg       0.98      0.95      0.96     50710

F1 Score: 0.2058
Kappa Statistics: 0.1881
Precision: 0.1316
Recall: 0.4718
Accuracy: 0.9490

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   19.4s finished


Test Set: Riohacha

Predicted  False  True  __all__
Actual                         
False      49868   132    50000
True         985  2516     3501
__all__    50853  2648    53501

               precision    recall  f1-score   support

           0       0.98      1.00      0.99     50000
           1       0.95      0.72      0.82      3501

    accuracy                           0.98     53501
   macro avg       0.97      0.86      0.90     53501
weighted avg       0.98      0.98      0.98     53501

F1 Score: 0.8183
Kappa Statistics: 0.8075
Precision: 0.9502
Recall: 0.7187
Accuracy: 0.9791

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   24.4s finished


Test Set: Uribia

Predicted  False  True  __all__
Actual                         
False      45358    11    45369
True        6863  3527    10390
__all__    52221  3538    55759

               precision    recall  f1-score   support

           0       0.87      1.00      0.93     45369
           1       1.00      0.34      0.51     10390

    accuracy                           0.88     55759
   macro avg       0.93      0.67      0.72     55759
weighted avg       0.89      0.88      0.85     55759

F1 Score: 0.5065
Kappa Statistics: 0.4549
Precision: 0.9969
Recall: 0.3395
Accuracy: 0.8767

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   30.5s finished


Test Set: Arauca

Predicted  False  True  __all__
Actual                         
False      44850   304    45154
True        1232  1194     2426
__all__    46082  1498    47580

               precision    recall  f1-score   support

           0       0.97      0.99      0.98     45154
           1       0.80      0.49      0.61      2426

    accuracy                           0.97     47580
   macro avg       0.89      0.74      0.80     47580
weighted avg       0.96      0.97      0.96     47580

F1 Score: 0.6086
Kappa Statistics: 0.5927
Precision: 0.7971
Recall: 0.4922
Accuracy: 0.9677

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   29.3s finished


Test Set: Cucuta

Predicted  False  True  __all__
Actual                         
False      48862  1138    50000
True         711  1954     2665
__all__    49573  3092    52665

               precision    recall  f1-score   support

           0       0.99      0.98      0.98     50000
           1       0.63      0.73      0.68      2665

    accuracy                           0.96     52665
   macro avg       0.81      0.86      0.83     52665
weighted avg       0.97      0.96      0.97     52665

F1 Score: 0.6788
Kappa Statistics: 0.6604
Precision: 0.6320
Recall: 0.7332
Accuracy: 0.9649

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.4s finished


Test Set: Tibu

Predicted  False  True  __all__
Actual                         
False      39986   214    40200
True         331   496      827
__all__    40317   710    41027

               precision    recall  f1-score   support

           0       0.99      0.99      0.99     40200
           1       0.70      0.60      0.65       827

    accuracy                           0.99     41027
   macro avg       0.85      0.80      0.82     41027
weighted avg       0.99      0.99      0.99     41027

F1 Score: 0.6454
Kappa Statistics: 0.6387
Precision: 0.6986
Recall: 0.5998
Accuracy: 0.9867

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   26.6s finished


Test Set: Arauquita

Predicted  False  True  __all__
Actual                         
False      37291   336    37627
True         441   337      778
__all__    37732   673    38405

               precision    recall  f1-score   support

           0       0.99      0.99      0.99     37627
           1       0.50      0.43      0.46       778

    accuracy                           0.98     38405
   macro avg       0.74      0.71      0.73     38405
weighted avg       0.98      0.98      0.98     38405

F1 Score: 0.4645
Kappa Statistics: 0.4543
Precision: 0.5007
Recall: 0.4332
Accuracy: 0.9798


Mean F1 Score: 0.5611
Mean Kappa statistic: 0.5423
Mean Precision: 0.6724
Mean Recall: 0.5412
Mean Accuracy: 0.9577



## Train Model

In [8]:
# Fit the final classifier on every row of the current `data` frame, with
# missing feature values replaced by 0.
# NOTE(review): this uses LogisticRegression's default hyperparameters, not
# values selected during cross-validation -- confirm this is intended.
X, y = data[features].fillna(0), data[label]
clf = LogisticRegression().fit(X, y)

## Prediction

In [9]:
# Make pandas treat +/-inf as missing values from here on. This is a global
# option, so it also affects every cell executed after this one (including any
# earlier cell that is re-run).
# NOTE(review): this option is deprecated in recent pandas releases -- confirm
# the installed version still supports it.
pd.set_option('use_inf_as_na', True)
# Collect the file paths for every area from the configured data folders.
area_dict = geoutils.get_filepaths(areas, images_dir, indices_dir, pos_mask_dir, neg_mask_dir)
# Load the data for the 'uribia' area. This rebinds `data`, replacing the
# training frame used by the cells above.
data = geoutils.read_bands(area_dict, 'uribia')
print('Data dimensions: {}'.format(data.shape))
data.head(3)

100%|██████████| 5/5 [00:21<00:00,  4.37s/it]


Data dimensions: (6217512, 111)


Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,ndbi_2020,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [10]:
# Probability of class 1 for every row of the prediction frame; missing
# feature values are replaced by 0 before scoring.
preds = clf.predict_proba(data[features].fillna(0))[:, 1]

# Mark no-data rows with the sentinel -1. A row counts as no-data only when
# every column except the last one is zero (missing values are skipped by the
# sum). Absolute values are summed because the features include negative
# values: with a plain sum, positive and negative entries could cancel to 0 and
# a valid row would be overwritten.
nodata = (data.iloc[:, :-1].abs().sum(axis=1) == 0).to_numpy()
preds[nodata] = -1

# Write the predictions out, using the first image of the area as the source
# passed to geoutils.save_predictions.
geoutils.save_predictions(
    preds,
    image_src=area_dict['uribia']['images'][0],
    output_file=output_dir+'20200423_uribia.tiff'
)