# Model Experimentation
This notebook contains a minimal template for running ML experiments. 

## Imports and Setup

In [2]:
# Core tabular / numerical libraries.
import pandas as pd
import numpy as np

# Plotting stack; seaborn's "whitegrid" style is applied to subsequent figures.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put '../utils' at the front of the import path *before* importing the helper
# modules below (presumably that is where model_utils and geoutils live).
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Raise the root logger threshold to ERROR and silence every Python warning
# for the rest of the session.
# NOTE(review): the blanket warning filter also hides deprecation and
# convergence warnings raised by later cells.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed handed to randomized steps (e.g. as random_state for resampling).
SEED = 42

# IPython autoreload in mode 2: imported modules are reloaded before each cell
# runs, so edits to the helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## File Locations

In [17]:
# Root folders for inputs and outputs, relative to the notebook's location.
data_dir = "../data/"
output_dir = "../outputs/"

# Tabular dataset read by the loading cell.
input_file = data_dir + '20200415_dataset.csv'

# Raster / mask sub-folders, all located under data_dir.
pos_mask_dir, neg_mask_dir, sentinel_dir, false_pos_dir = (
    data_dir + subfolder
    for subfolder in ('pos_masks/', 'neg_masks/', 'sentinel2/', 'false_pos/')
)

# Areas handled by this notebook. 'arauquita' used to be part of this list
# (between 'cucuta' and 'tibu1') and is deliberately left out.
areas = [
    'maicao',
    'riohacha',
    'uribia',
    'arauca1',
    'cucuta',
    'tibu1',
]

# Invert model_utils.VALUE_CODES so the code becomes the key, mapped to the
# capitalized form of the original key.
value_codes = {
    code: name.capitalize()
    for name, code in model_utils.VALUE_CODES.items()
}

## Load Dataset

In [19]:
# Read the dataset and drop the excluded area. The index is reset *after* the
# filter: resetting a freshly read CSV is a no-op, whereas filtering leaves
# gaps in the row labels, so this is the point where a clean 0..n-1 index is
# actually needed.
data = (
    pd.read_csv(input_file)
    .query("area != 5")  # exclude arauquita
    .reset_index(drop=True)
)

# Row count per target code, relabelled with the readable names in value_codes.
class_dist = data['target'].value_counts().rename(index=value_codes)

print('Data dimensions: {}'.format(data.shape))
print('\nClass distribution:\n{}\n'.format(class_dist))
data.head()

Data dimensions: (830930, 113)

Class distribution:
Unoccupied land        687135
Formal settlement      122535
Informal settlement     21260
Name: target, dtype: int64



Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.1597,0.13735,0.1531,0.187,0.209,0.2632,0.30515,0.26965,0.3327,0.0411,...,-0.509745,0.105128,0.239614,0.449106,-0.718433,-0.042537,0.901237,1,3,0
1,0.1597,0.13905,0.1454,0.17845,0.209,0.2632,0.30515,0.26395,0.3327,0.0411,...,-0.507485,0.105128,0.247826,0.464498,-0.716955,-0.03976,0.91149,1,3,0
2,0.16675,0.14875,0.1589,0.18605,0.2258,0.27945,0.3207,0.28085,0.3452,0.0416,...,-0.524371,0.073259,0.262348,0.446475,-0.722188,-0.033995,0.875915,1,3,0
3,0.16675,0.1524,0.1771,0.2352,0.2258,0.27945,0.3207,0.3074,0.3452,0.0416,...,-0.475631,0.073259,0.308045,0.524245,-0.689591,-0.01952,0.905289,1,3,0
4,0.16095,0.1565,0.192,0.2532,0.25,0.302,0.3419,0.32755,0.36515,0.0421,...,-0.464686,0.059161,0.352879,0.548867,-0.6733,-0.009437,0.875968,1,3,0


## Resample Dataset

In [20]:
# Mix requested for the negative samples; presumably the proportions in which
# model_utils.resample draws each negative class — confirm against the helper.
neg_dist = {'Formal settlement': 0.4, 'Unoccupied land': 0.6}

# NOTE(review): `data` is rebound to the resampled frame, so running this cell
# a second time resamples the already-resampled data.
data = model_utils.resample(
    data,
    num_neg_samples=30000,
    neg_dist=neg_dist,
    random_state=SEED,
)

# Row counts per class and per area, relabelled with readable names.
class_dist = data['target'].value_counts().rename(index=value_codes)
area_dist = data['area'].value_counts().rename(index=model_utils.AREA_CODES)

# One print call with a newline separator emits the same text as four
# consecutive print calls.
print(
    'Data dimensions: {}'.format(data.shape),
    '\nArea distribution:\n{}'.format(area_dist),
    '\nClass distribution:\n{}'.format(class_dist),
    '\nClass distribution (normalized):\n{}\n'.format(class_dist/len(data)),
    sep='\n',
)

data.head()

Data dimensions: (199460, 113)

Area distribution:
Uribia      41131
Riohacha    33501
Cucuta      32665
Arauca1     32426
Maicao      30710
Tibu        29027
Name: area, dtype: int64

Class distribution:
Unoccupied land        108000
Formal settlement       70200
Informal settlement     21260
Name: target, dtype: int64

Class distribution (normalized):
Unoccupied land        0.541462
Formal settlement      0.351950
Informal settlement    0.106588
Name: target, dtype: float64



Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira,target,area
0,0.16795,0.133,0.1379,0.1355,0.1571,0.2686,0.3198,0.2773,0.3579,0.0402,...,-0.495702,0.213802,0.122766,0.386654,-0.762164,-0.097429,1.005492,1,2,0
1,0.1933,0.2011,0.2039,0.2299,0.227,0.268,0.3094,0.2723,0.3335,0.0401,...,-0.417789,0.102637,0.211351,0.520237,-0.748431,-0.021367,1.030898,1,2,0
2,0.1929,0.1867,0.181,0.1958,0.1889,0.2415,0.2695,0.243,0.2892,0.0272,...,-0.188068,0.079984,0.222447,0.746002,-0.641025,-0.030201,1.056083,1,2,0
3,0.1853,0.1911,0.1836,0.1771,0.196,0.2493,0.2814,0.2453,0.3059,0.0383,...,-0.262399,0.063498,0.2805,0.673429,-0.634472,-0.013013,0.966096,1,2,0
4,0.2303,0.2166,0.2266,0.2458,0.2234,0.2643,0.3084,0.2923,0.3338,0.0419,...,-0.281367,0.067882,0.24619,0.654115,-0.636333,-0.040994,0.967376,1,2,0


## ML Pipeline

In [21]:
from sklearn.linear_model import LogisticRegression

### Model Training & Evaluation

In [22]:
# Column holding the class label. 'area' holds the area code and is not a
# model input either.
label = 'target'
non_feature_columns = {label, 'area'}

# Pick the feature columns by name instead of slicing off the last two
# positions, so the selection stays correct if the column order ever changes.
# Original column order is preserved.
features = [column for column in data.columns if column not in non_feature_columns]

# Collapse the label to binary: codes 2 and 3 are merged into 0, code 1 is
# left untouched as the positive class.
data[label] = data[label].replace({2: 0, 3: 0})
print('Class distribution:\n{}'.format(data[label].value_counts()))

Class distribution:
0    178200
1     21260
Name: target, dtype: int64


### Leave-one-area-out Cross Validation

In [23]:
# Evaluate a default-parameter logistic regression with the project's
# geospatial cross-validation helper; the printed report shows one named
# area as the test set per fold. The trailing semicolon suppresses the
# notebook's display of the call's return value.
clf =  LogisticRegression()
model_utils.geospatialcv(data, features, label, clf, verbose=2);


Test set: MAICAO
[[29504   496]
 [  137   573]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     30000
           1       0.54      0.81      0.64       710

    accuracy                           0.98     30710
   macro avg       0.77      0.90      0.82     30710
weighted avg       0.98      0.98      0.98     30710

MAICAO Results: 
- F1 Score: 0.6442
- Kappa Statistics: 0.6340
- Precision: 0.5360
- Recall: 0.8070
- Accuracy: 0.9794

Test set: RIOHACHA
[[29877   123]
 [ 1166  2335]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     30000
           1       0.95      0.67      0.78      3501

    accuracy                           0.96     33501
   macro avg       0.96      0.83      0.88     33501
weighted avg       0.96      0.96      0.96     33501

RIOHACHA Results: 
- F1 Score: 0.7837
- Kappa Statistics: 0.7633
- Precision: 0.9500
- Recall: 0.6670
- Accuracy: 0.9615

Test 

## Train Model

In [16]:
# Fit the final classifier on all rows of `data`; missing feature values are
# replaced with 0 before fitting. `fit` returns the estimator itself, so the
# fitted model can be bound to `clf` directly.
X, y = data[features].fillna(0), data[label]
clf = LogisticRegression().fit(X, y)

## Prediction

In [9]:
# Make pandas treat +/-inf as missing values from this point on.
# NOTE(review): this option is deprecated in recent pandas releases — confirm
# the installed version still supports it.
pd.options.mode.use_inf_as_na = True

# Collect the file paths for every configured area, then read the band table
# for Uribia.
# NOTE(review): this rebinds `data`, replacing the training frame used by the
# earlier cells; re-running those cells afterwards needs a fresh reload.
area_dict = geoutils.get_filepaths(
    areas,
    sentinel_dir,
    pos_mask_dir,
    neg_mask_dir,
)
data = geoutils.read_bands(area_dict, 'uribia')

print('Data dimensions: {}'.format(data.shape))
data.head(3)

100%|██████████| 5/5 [00:21<00:00,  4.30s/it]

Data dimensions: (6217512, 111)





Unnamed: 0,B1_2016,B2_2016,B3_2016,B4_2016,B5_2016,B6_2016,B7_2016,B8_2016,B9_2016,B10_2016,...,ndbi_2020,savi_2020,mndwi_2020,ui_2020,nbi_2020,brba_2020,nbai_2020,mbi_2020,baei_2020,la_guajira
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [10]:
# Predicted probability of class 1 for every row; missing feature values are
# replaced with 0 before scoring.
preds = clf.predict_proba(data[features].fillna(0))[:, 1]

# Rows whose columns (all except the last one) sum to exactly zero are
# overwritten with -1 — presumably a no-data marker for empty pixels; confirm
# against how the saved raster is consumed.
empty_rows = data.iloc[:, :-1].sum(axis=1).eq(0).to_numpy()
preds[empty_rows] = -1

# Write the predictions out, using the first cropped Uribia image as the
# source image passed to the helper.
uribia_image = area_dict['uribia']['images_cropped'][0]
geoutils.save_predictions(
    preds,
    image_src=uribia_image,
    output_file=output_dir+'20200414_uribia.tiff',
)