# Model Experiments
Author: Issa

## Imports and Setup

In [1]:
# Core data-handling libraries.
import pandas as pd
import numpy as np

# Plotting libraries; seaborn's "whitegrid" style is applied globally.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils first on the import path so that the project helper modules
# model_utils and geoutils (presumably located there) can be imported.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Raise the root logger's level to ERROR and suppress every Python warning.
# NOTE(review): the blanket warnings filter also hides convergence and
# deprecation warnings raised by the modelling cells below.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as random_state to the sampling and model calls in this notebook.
SEED = 42

# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Locations

In [2]:
# Root folder for all inputs and outputs of this notebook.
data_dir = "../data/"

# Names of the areas covered by the dataset.
areas = ['maicao', 'riohacha', 'uribia']

# Tabular dataset consumed by the experiments.
input_file = "".join((data_dir, '20200326_dataset.csv'))

# Sub-folders of data_dir: positive masks, negative masks, Sentinel-2 imagery.
pos_mask_dir, neg_mask_dir, sentinel_dir = (
    data_dir + subdir
    for subdir in ('pos_masks/', 'neg_masks/', 'sentinel2/')
)

## Load Dataset

In [3]:
# Read the CSV dataset and make sure the frame carries a fresh 0..n-1 index.
data = pd.read_csv(input_file)
data = data.reset_index(drop=True)

# Summarise the frame: shape, then absolute and relative class frequencies.
target_counts = data['target'].value_counts()
print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(target_counts))
print('Class distribution (normalized):\n{}'.format(target_counts / len(data)))
data.head(3)

Data dimensions: (334524, 112)
Class distribution:
3    248172
2     71796
1     14556
Name: target, dtype: int64
Class distribution (normalized):
3    0.741866
2    0.214621
1    0.043513
Name: target, dtype: float64


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.1597,0.13735,0.1531,0.187,0.209,0.2632,0.30515,0.26965,0.3327,0.0411,...,0.187614,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,3,0
1,0.1597,0.13905,0.1454,0.17845,0.209,0.2632,0.30515,0.26395,0.3327,0.0411,...,0.177058,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,3,0
2,0.16675,0.14875,0.1589,0.18605,0.2258,0.27945,0.3207,0.28085,0.3452,0.0416,...,0.179191,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,3,0


## Resample Dataset
Randomly samples 30,000 negative examples (target ≠ 1) per area, without replacement, and keeps every positive example (target = 1).

In [4]:
# For each area, draw 30,000 rows whose target is not 1, without replacement.
# SEED makes the draw repeatable.
not_positive = data['target'] != 1
data_area = [
    data[(data['area'] == area) & not_positive].sample(
        30000, replace=False, random_state=SEED
    )
    for area in data['area'].unique()
]

# Every row with target == 1 is kept as is.
pos_samples = data[data['target'] == 1]
data_area.append(pos_samples)

# Stack the per-area negatives and the positives under a fresh 0..n-1 index.
data = pd.concat(data_area, ignore_index=True)

# Report how the resampled rows are spread over areas and classes.
target_counts = data['target'].value_counts()
print("Target Distribution per Area: \n{}".format(
    data.loc[data['target'] == 1, 'area'].value_counts()
))
print('Area distribution:\n{}'.format(data['area'].value_counts()))
print('Class distribution:\n{}'.format(target_counts))
print('Class distribution (normalized):\n{}'.format(target_counts / len(data)))
data.head(3)

Target Distribution per Area: 
2    10345
1     3501
0      710
Name: area, dtype: int64
Area distribution:
2    40345
1    33501
0    30710
Name: area, dtype: int64
Class distribution:
3    67639
2    22361
1    14556
Name: target, dtype: int64
Class distribution (normalized):
3    0.646916
2    0.213866
1    0.139217
Name: target, dtype: float64


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.2262,0.183,0.179,0.1831,0.1886,0.2509,0.2981,0.2653,0.3291,0.0431,...,0.231549,-0.59423,0.211756,0.102407,0.3214,-0.798628,-0.067515,1.196927,3,0
1,0.14955,0.12935,0.127,0.1223,0.13935,0.2424,0.2846,0.2662,0.3236,0.0388,...,0.219911,-0.529955,0.153344,0.208832,0.384915,-0.705785,-0.057656,0.87143,3,0
2,0.1686,0.1597,0.1612,0.1616,0.1855,0.3246,0.3985,0.3127,0.4143,0.0454,...,0.205571,-0.53619,0.136678,0.237628,0.413957,-0.70611,-0.052031,0.864904,3,0


## Machine Learning Pipeline

In [5]:
# Estimators compared in the experiment cells below.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# NOTE(review): SEED is already set to the same value in the setup cell; this
# re-assignment is redundant and the two values must be kept in sync.
SEED = 42

## Define Features and Target Label

In [6]:
# Label column, plus the predictor columns: everything except the last two
# columns of the frame (target, area) and any column whose name contains
# 'mndwi'.
label = 'target'
features = list(filter(lambda name: 'mndwi' not in name, data.columns[:-2]))

# Binarise the label: classes 2 and 3 are mapped to 0, class 1 is left as is.
data[label] = data[label].replace({2: 0, 3: 0})

# Target vector y and feature matrix X
y = data[label]
X = data[features]

# Settings for the feature-selection step
feature_names = list(X.columns)
num_features = 55

print('Data dimensions: {}'.format(data.shape))
print('Class distribution:\n{}'.format(data[label].value_counts()))

Data dimensions: (104556, 112)
Class distribution:
0    90000
1    14556
Name: target, dtype: int64


## Logistic Regression

In [10]:
# Logistic regression with default settings. The same estimator ranks the
# features (keeping `num_features` of them) and is then evaluated.
# NOTE(review): geospatialcv appears to hold out one area at a time, judging by
# the recorded "Test set: <AREA>" output; confirm in model_utils.
lr = LogisticRegression()
rfe_features = model_utils.get_rfe_features(
    X, y, lr, num_features
)
model_utils.geospatialcv(
    data, rfe_features, label, lr, scale=True
);

Fitting estimator with 105 features.
Fitting estimator with 95 features.
Fitting estimator with 85 features.
Fitting estimator with 75 features.
Fitting estimator with 65 features.

Test set: MAICAO
[[29383   617]
 [  168   542]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     30000
           1       0.47      0.76      0.58       710

    accuracy                           0.97     30710
   macro avg       0.73      0.87      0.78     30710
weighted avg       0.98      0.97      0.98     30710

Accuracy: 0.9744
F1 Score: 0.5800
Precision: 0.4676
Recall: 0.7634
Kappa Statistics: 0.5676

Test set: RIOHACHA
[[29846   154]
 [  901  2600]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     30000
           1       0.94      0.74      0.83      3501

    accuracy                           0.97     33501
   macro avg       0.96      0.87      0.91     33501
weighted avg       0.97     

In [11]:
# Same logistic-regression experiment, but feature selection keeps only 30
# features instead of `num_features`.
lr = LogisticRegression()
rfe_features = model_utils.get_rfe_features(
    X, y, lr, 30
)
model_utils.geospatialcv(
    data, rfe_features, label, lr, scale=True
);

Fitting estimator with 105 features.
Fitting estimator with 95 features.
Fitting estimator with 85 features.
Fitting estimator with 75 features.
Fitting estimator with 65 features.
Fitting estimator with 55 features.
Fitting estimator with 45 features.
Fitting estimator with 35 features.

Test set: MAICAO
[[29465   535]
 [  148   562]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     30000
           1       0.51      0.79      0.62       710

    accuracy                           0.98     30710
   macro avg       0.75      0.89      0.81     30710
weighted avg       0.98      0.98      0.98     30710

Accuracy: 0.9778
F1 Score: 0.6220
Precision: 0.5123
Recall: 0.7915
Kappa Statistics: 0.6111

Test set: RIOHACHA
[[29896   104]
 [  758  2743]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30000
           1       0.96      0.78      0.86      3501

    accuracy                

## Random Forest Classifier

In [12]:
# Random forest: 300 trees of depth <= 8, trained on all cores, seeded with
# SEED. The forest both ranks the features and is evaluated; scaling is
# switched off for this model.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    n_jobs=-1,
    random_state=SEED,
)
rfe_features = model_utils.get_rfe_features(
    X, y, rf, num_features
)
model_utils.geospatialcv(
    data, rfe_features, label, rf, scale=False
);

Fitting estimator with 105 features.
Fitting estimator with 95 features.
Fitting estimator with 85 features.
Fitting estimator with 75 features.
Fitting estimator with 65 features.

Test set: MAICAO
[[29468   532]
 [  226   484]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     30000
           1       0.48      0.68      0.56       710

    accuracy                           0.98     30710
   macro avg       0.73      0.83      0.77     30710
weighted avg       0.98      0.98      0.98     30710

Accuracy: 0.9753
F1 Score: 0.5608
Precision: 0.4764
Recall: 0.6817
Kappa Statistics: 0.5485

Test set: RIOHACHA
[[29990    10]
 [ 1551  1950]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     30000
           1       0.99      0.56      0.71      3501

    accuracy                           0.95     33501
   macro avg       0.97      0.78      0.84     33501
weighted avg       0.96     

## Gaussian Naive Bayes

In [13]:
# Gaussian Naive Bayes is the model being evaluated in this cell.
gnb = GaussianNB()

# Feature ranking uses a logistic regression rather than `gnb`.
# NOTE(review): presumably because GaussianNB exposes neither `coef_` nor
# `feature_importances_`, which scikit-learn's RFE needs to rank features.
# A fresh default estimator is created here so the cell no longer depends on
# the `lr` variable left behind by the Logistic Regression section.
rfe_estimator = LogisticRegression()
rfe_features = model_utils.get_rfe_features(X, y, rfe_estimator, num_features)

# Evaluate GaussianNB on the selected features.
model_utils.geospatialcv(data, rfe_features, label, gnb, scale=True);

Fitting estimator with 105 features.
Fitting estimator with 95 features.
Fitting estimator with 85 features.
Fitting estimator with 75 features.
Fitting estimator with 65 features.

Test set: MAICAO
[[27230  2770]
 [  293   417]]
              precision    recall  f1-score   support

           0       0.99      0.91      0.95     30000
           1       0.13      0.59      0.21       710

    accuracy                           0.90     30710
   macro avg       0.56      0.75      0.58     30710
weighted avg       0.97      0.90      0.93     30710

Accuracy: 0.9003
F1 Score: 0.2140
Precision: 0.1308
Recall: 0.5873
Kappa Statistics: 0.1831

Test set: RIOHACHA
[[29705   295]
 [ 2096  1405]]
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     30000
           1       0.83      0.40      0.54      3501

    accuracy                           0.93     33501
   macro avg       0.88      0.70      0.75     33501
weighted avg       0.92     

## Linear Support Vector Machines

In [14]:
from sklearn.svm import LinearSVC

# Linear SVM seeded with SEED. The same estimator ranks the features (keeping
# `num_features` of them) and is then evaluated.
svc = LinearSVC(random_state=SEED)
rfe_features = model_utils.get_rfe_features(
    X, y, svc, num_features
)
model_utils.geospatialcv(
    data, rfe_features, label, svc, scale=True
);

Fitting estimator with 105 features.
Fitting estimator with 95 features.
Fitting estimator with 85 features.
Fitting estimator with 75 features.
Fitting estimator with 65 features.

Test set: MAICAO
[[29290   710]
 [  184   526]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     30000
           1       0.43      0.74      0.54       710

    accuracy                           0.97     30710
   macro avg       0.71      0.86      0.76     30710
weighted avg       0.98      0.97      0.97     30710

Accuracy: 0.9709
F1 Score: 0.5406
Precision: 0.4256
Recall: 0.7408
Kappa Statistics: 0.5267

Test set: RIOHACHA
[[29855   145]
 [  828  2673]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     30000
           1       0.95      0.76      0.85      3501

    accuracy                           0.97     33501
   macro avg       0.96      0.88      0.91     33501
weighted avg       0.97     

In [15]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# The plain linear SVM ranks the features (30 are kept) ...
svc = LinearSVC(random_state=SEED)
rfe_features = model_utils.get_rfe_features(
    X, y, svc, 30
)

# ... while the model that is evaluated is the SVM wrapped in a probability
# calibrator.
clf = CalibratedClassifierCV(svc)
model_utils.geospatialcv(
    data, rfe_features, label, clf, scale=True
);

Fitting estimator with 105 features.
Fitting estimator with 95 features.
Fitting estimator with 85 features.
Fitting estimator with 75 features.
Fitting estimator with 65 features.
Fitting estimator with 55 features.
Fitting estimator with 45 features.
Fitting estimator with 35 features.

Test set: MAICAO
[[29752   248]
 [  195   515]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     30000
           1       0.67      0.73      0.70       710

    accuracy                           0.99     30710
   macro avg       0.83      0.86      0.85     30710
weighted avg       0.99      0.99      0.99     30710

Accuracy: 0.9856
F1 Score: 0.6993
Precision: 0.6750
Recall: 0.7254
Kappa Statistics: 0.6919

Test set: RIOHACHA
[[29898   102]
 [  948  2553]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     30000
           1       0.96      0.73      0.83      3501

    accuracy                

## Predict on Uribia

In [8]:
# Final model used for the Uribia prediction: a linear SVM wrapped in a
# probability calibrator, trained on the 30 features selected with the plain SVM.
svc = LinearSVC(random_state=SEED)
rfe_features = model_utils.get_rfe_features(X, y, svc, 30)
clf = CalibratedClassifierCV(svc)

# Use dedicated names for the final training matrices instead of overwriting
# the notebook-wide `X` / `y`. Overwriting them left `X` with only the 30
# selected columns, so re-running any earlier experiment cell afterwards
# silently ran feature selection on the reduced matrix.
# Missing feature values are replaced by 0 before fitting.
# NOTE(review): the cross-validation cells call geospatialcv with scale=True,
# whereas this fit uses the features unscaled; confirm that this is intended.
X_final = data[rfe_features].fillna(0)
y_final = data[label]
clf.fit(X_final, y_final);

Fitting estimator with 105 features.
Fitting estimator with 95 features.
Fitting estimator with 85 features.
Fitting estimator with 75 features.
Fitting estimator with 65 features.
Fitting estimator with 55 features.
Fitting estimator with 45 features.
Fitting estimator with 35 features.


In [9]:
# Make pandas treat +/-inf as missing values from here on, so that later
# fillna calls also replace infinities.
# NOTE(review): this option is deprecated in recent pandas releases; confirm
# against the pandas version in use.
pd.set_option('use_inf_as_na', True)

# Collect the raster and mask paths for all areas, then load the Uribia bands.
# From this point on `data` holds the Uribia frame.
area_dict = geoutils.get_filepaths(
    areas, sentinel_dir, pos_mask_dir, neg_mask_dir
)
data = geoutils.read_bands(area_dict, 'uribia')

print('Data dimensions: {}'.format(data.shape))
data.head(3)

100%|██████████| 5/5 [00:22<00:00,  4.40s/it]


Data dimensions: (6217512, 110)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,ndvi_2020,ndbi_2020,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Threshold below which a predicted probability is snapped to exactly 0.
# `eps` was referenced here without being defined anywhere in the notebook,
# which raises NameError on a fresh kernel.
# NOTE(review): machine epsilon is an assumption; confirm the value that was
# originally intended.
eps = np.finfo(float).eps

# Per-row offset: 0.0 where the row's values sum to more than 0, otherwise -1.0.
# NOTE(review): rows summing to 0 are presumably no-data pixels; verify.
values = (data.sum(axis=1) > 0).astype(int) - 1.0

# Probability of the positive class (column 1) for every row, with missing
# feature values replaced by 0.
preds = clf.predict_proba(data[rfe_features].fillna(0))[:, 1]
preds[np.abs(preds) < eps] = 0

# Rows whose offset is -1.0 are shifted below zero, keeping them apart from
# genuine probabilities.
preds = preds + values

In [23]:
# Write the Uribia predictions to a TIFF in data_dir. The first entry of
# Uribia's 'images_cropped' list is passed as the source image.
uribia_image_src = area_dict['uribia']['images_cropped'][0]
uribia_pred_file = data_dir + 'uribia_pred_lsvc.tiff'
geoutils.save_predictions(
    preds,
    image_src=uribia_image_src,
    output_file=uribia_pred_file,
)