# Model Experimentation
This notebook contains a minimal template for running ML experiments. 

## Imports and Setup

In [1]:
# Core data-handling libraries.
import pandas as pd
import numpy as np

# Plotting libraries; apply seaborn's "whitegrid" style to subsequent figures.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils at the front of the import path so the project-local helper
# modules below can be imported; the insert must run before those imports.
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Keep cell output quiet: the root logger only emits ERROR and above, and all
# Python warnings are suppressed.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Single seed passed as random_state to the resampling step and the classifiers.
SEED = 42

# IPython autoreload in mode 2: re-import modules before each cell runs, so
# edits to the helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Locations

In [2]:
# Root folder for all inputs/outputs; every path below is derived from it.
data_dir = "../data/"

# Tabular training dataset.
input_file = f"{data_dir}20200326_dataset.csv"

# Raster folders handed to geoutils.get_filepaths later in the notebook.
pos_mask_dir = f"{data_dir}pos_masks/"
neg_mask_dir = f"{data_dir}neg_masks/"
sentinel_dir = f"{data_dir}sentinel2/"

# Area names covered by the notebook.
areas = [
    'maicao',
    'riohacha',
    'uribia',
]

## Load Dataset

In [3]:
# Read the training table and give it a clean 0..n-1 index.
data = pd.read_csv(input_file).reset_index(drop=True)

# Summarise size and class balance (absolute counts, then share of all rows).
target_counts = data['target'].value_counts()
print(f'Data dimensions: {data.shape}')
print(f'Class distribution:\n{target_counts}')
print(f'Class distribution (normalized):\n{target_counts / len(data)}')

# Preview the first rows.
data.head(3)

Data dimensions: (334524, 112)
Class distribution:
3    248172
2     71796
1     14556
Name: target, dtype: int64
Class distribution (normalized):
3    0.741866
2    0.214621
1    0.043513
Name: target, dtype: float64


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.1597,0.13735,0.1531,0.187,0.209,0.2632,0.30515,0.26965,0.3327,0.0411,...,0.187614,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,3,0
1,0.1597,0.13905,0.1454,0.17845,0.209,0.2632,0.30515,0.26395,0.3327,0.0411,...,0.177058,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,3,0
2,0.16675,0.14875,0.1589,0.18605,0.2258,0.27945,0.3207,0.28085,0.3452,0.0416,...,0.179191,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,3,0


## Resample Dataset
Resamples 30,000 negative examples per area.

In [6]:
# Resample with the shared seed so the draw is repeatable.
# NOTE: this overwrites `data`, so re-running the cell resamples the
# already-resampled frame; re-run the load cell first.
data = model_utils.resample(data, num_neg_samples=30000, random_state=SEED)

# Counts reported below: positives (target == 1) per area, rows per area,
# and rows per class.
positives_per_area = data.loc[data['target'] == 1, 'area'].value_counts()
rows_per_area = data['area'].value_counts()
rows_per_class = data['target'].value_counts()

print(f'Data dimensions: {data.shape}')
print(f"Target Distribution per Area: \n{positives_per_area}")
print(f'Area distribution:\n{rows_per_area}')
print(f'Class distribution:\n{rows_per_class}')
print(f'Class distribution (normalized):\n{rows_per_class / len(data)}')

# Preview the first rows.
data.head(3)

Data dimensions: (104556, 112)
Target Distribution per Area: 
2    10345
1     3501
0      710
Name: area, dtype: int64
Area distribution:
2    40345
1    33501
0    30710
Name: area, dtype: int64
Class distribution:
3    67639
2    22361
1    14556
Name: target, dtype: int64
Class distribution (normalized):
3    0.646916
2    0.213866
1    0.139217
Name: target, dtype: float64


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,target,area
0,0.16905,0.1598,0.1451,0.13975,0.15725,0.219,0.2631,0.2462,0.29745,0.03295,...,0.213062,-0.563341,0.160151,0.14806,0.359208,-0.791975,-0.048659,1.042396,3,0
1,0.1265,0.1005,0.0949,0.0605,0.0964,0.2287,0.3028,0.2927,0.3351,0.0407,...,0.286568,-0.565978,0.286057,0.071664,0.320072,-0.82181,-0.091095,1.300665,3,0
2,0.1753,0.1757,0.2145,0.269,0.2816,0.3463,0.3921,0.3619,0.4239,0.0446,...,0.143441,-0.464328,0.067745,0.347939,0.525376,-0.638106,-0.029467,0.830559,3,0


## ML Pipeline

In [5]:
from sklearn.ensemble import RandomForestClassifier

### Model Training & Evaluation

In [6]:
# Column the classifier is trained to predict.
label = 'target'

# Model inputs: every column except the trailing two (the frame shown above
# ends with 'target' and 'area').
features = list(data.columns[:-2])

# Binarise the target: classes 2 and 3 are merged into 0, class 1 is kept.
data[label] = data[label].replace({2: 0, 3: 0})
print(f'Class distribution:\n{data[label].value_counts()}')

Class distribution:
0    90000
1    14556
Name: target, dtype: int64


### Leave-One-Area-Out Cross Validation

In [7]:
# Seeded random forest used for the cross-validation run.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=SEED,
)

# Run the project's geospatial CV helper; judging by the printed report it
# evaluates on one held-out area at a time. The trailing semicolon hides the
# call's return value from the cell output.
model_utils.geospatialcv(
    data, features, label, clf,
    verbose=2,
);


Test set: MAICAO
[[29511   489]
 [  236   474]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     30000
           1       0.49      0.67      0.57       710

    accuracy                           0.98     30710
   macro avg       0.74      0.83      0.78     30710
weighted avg       0.98      0.98      0.98     30710

MAICAO Results: 
- F1 Score: 0.5666
- Kappa Statistics: 0.5548
- Precision: 0.4922
- Recall: 0.6676
- Accuracy: 0.9764

Test set: RIOHACHA
[[29990    10]
 [ 2075  1426]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     30000
           1       0.99      0.41      0.58      3501

    accuracy                           0.94     33501
   macro avg       0.96      0.70      0.77     33501
weighted avg       0.94      0.94      0.93     33501

RIOHACHA Results: 
- F1 Score: 0.5777
- Kappa Statistics: 0.5503
- Precision: 0.9930
- Recall: 0.4073
- Accuracy: 0.9378

Test 

## Train Model

In [8]:
# Fresh classifier with the same hyper-parameters as the cross-validation run,
# fitted on every row of `data`.
clf = RandomForestClassifier(
    random_state=SEED,
    max_depth=10,
    n_estimators=100,
)

# Missing feature values are replaced by 0 before fitting.
X, y = data[features].fillna(0), data[label]
clf.fit(X, y);

## Prediction

In [9]:
# Make pandas treat +/-inf as missing from here on, so the later fillna(0)
# also zeroes infinities.
# NOTE(review): this option appears to be deprecated in recent pandas releases;
# confirm against the pinned pandas version before upgrading.
pd.set_option('use_inf_as_na', True)

# Resolve the raster file paths for every area, then load the 'uribia' bands
# into a frame. NOTE: this rebinds `data`, replacing the training table.
area_dict = geoutils.get_filepaths(
    areas,
    sentinel_dir,
    pos_mask_dir,
    neg_mask_dir,
)
data = geoutils.read_bands(area_dict, 'uribia')

print(f'Data dimensions: {data.shape}')
data.head(3)

100%|██████████| 5/5 [00:26<00:00,  5.30s/it]


Data dimensions: (6217512, 110)


Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,ndvi_2020,ndbi_2020,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Probabilities smaller than this are snapped to exactly 0.
eps = 1e-5

# Per-row offset: 0.0 where the row's values sum to more than 0, -1.0 otherwise
# (e.g. the all-zero rows shown in the preview above).
values = data.sum(axis=1).gt(0).astype(int) - 1.0

# Second column of predict_proba, i.e. the score for class 1 of the binary
# 0/1 target the model was trained on.
preds = clf.predict_proba(data[features].fillna(0))[:, 1]
preds = np.where(np.abs(preds) < eps, 0, preds)

# Shift rows flagged above by -1 so they end up below the valid [0, 1] range.
preds = preds + values

In [14]:
# Hand the per-row scores from the previous cell to the project's writer,
# using the first cropped 'uribia' image as the source image and writing to
# <data_dir>/uribia_pred.tiff.
# Fix: the original passed `pred`, which is never defined in this notebook
# (NameError on Restart & Run All); the scores are stored in `preds`.
geoutils.save_predictions(
    preds,
    image_src=area_dict['uribia']['images_cropped'][0],
    output_file=data_dir+'uribia_pred.tiff'
)