# Model Experimentation
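This notebook is a minimal template for running ML experiments: it loads and resamples the dataset, evaluates a logistic regression baseline with spatial cross-validation, retrains it on all areas, and writes the predictions for one area to a `.tiff` file.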

## Imports and Setup

In [1]:
# Data handling.
import pandas as pd
import numpy as np

# Plotting; the seaborn "whitegrid" style is applied globally.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Put ../utils first on the module search path *before* importing the
# project-local helpers below (presumably model_utils and geoutils live there).
import sys
sys.path.insert(0, '../utils')
import model_utils
import geoutils

# Keep cell output quiet: only ERROR and above from the root logger, and every
# Python warning suppressed.
# NOTE(review): the blanket warning filter also hides library warnings (e.g.
# convergence or deprecation notices) — consider narrowing it.
import logging
import warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

# Seed passed as `random_state` to the resampling step for reproducibility.
SEED = 42

# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## File Locations

In [2]:
# Root folders, relative to the notebook's working directory.
data_dir = "../data/"
output_dir = "../outputs/"

# Tabular dataset used for training and evaluation.
input_file = '{}20200504_dataset.csv'.format(data_dir)

# Sub-folders of the data directory that hold the per-area inputs.
images_dir = '{}images/'.format(data_dir)
indices_dir = '{}indices/'.format(data_dir)
pos_mask_dir = '{}pos_masks/'.format(data_dir)
neg_mask_dir = '{}neg_masks/'.format(data_dir)

## Load Dataset

In [3]:
# Read the CSV, renumber the rows from 0 and replace missing values with 0.
data = (
    pd.read_csv(input_file)
    .reset_index(drop=True)
    .fillna(0)
)

# Rows per target class, with the numeric codes mapped through
# model_utils.VALUE_CODES for display.
target_counts = data['target'].value_counts()
class_dist = target_counts.rename(index=model_utils.VALUE_CODES)

print('Data dimensions: {}'.format(data.shape))
print('\nClass distribution:\n{}\n'.format(class_dist))

# Preview the first rows.
data.head()

Data dimensions: (1065595, 69)

Class distribution:
Unoccupied land        810995
Formal settlement      230569
Informal settlement     24031
Name: target, dtype: int64



Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.151,0.13225,0.1424,0.1643,0.1899,0.257,0.296,0.2551,0.32295,0.0396,...,-0.438625,0.151655,0.209614,0.479385,-0.665204,-0.060444,0.948025,1,3,0
1,0.151,0.12885,0.1379,0.16185,0.1899,0.257,0.296,0.25755,0.32295,0.0396,...,-0.43545,0.151655,0.213526,0.48833,-0.66301,-0.059064,0.952352,1,3,0
2,0.15895,0.1373,0.15185,0.18915,0.2264,0.28555,0.3268,0.28085,0.3574,0.0416,...,-0.447333,0.145385,0.222971,0.473118,-0.661291,-0.060496,0.911748,1,3,0
3,0.15895,0.1463,0.1771,0.2424,0.2264,0.28555,0.3268,0.3098,0.3574,0.0416,...,-0.406751,0.145385,0.261163,0.554157,-0.63243,-0.047513,0.943184,1,3,0
4,0.15895,0.15345,0.192,0.2595,0.25225,0.3004,0.3423,0.3351,0.3611,0.0416,...,-0.406525,0.131505,0.289682,0.570201,-0.62392,-0.042356,0.918867,1,3,0


## Resample Dataset

In [4]:
# Resample with a fixed seed so the result is reproducible.
# NOTE(review): this overwrites `data`, so re-running the cell resamples the
# already-resampled frame; re-run the load cell first to start from raw data.
# Presumably `num_neg_samples` caps the negative rows drawn per area — confirm
# against model_utils.resample.
data = model_utils.resample(
    data,
    num_neg_samples=20000,
    random_state=SEED,
)

# Row counts per class and per area, with codes mapped to readable names.
class_dist = data['target'].value_counts().rename(index=model_utils.VALUE_CODES)
area_dist = data['area'].value_counts().rename(index=model_utils.AREA_CODES)

# (template, value) pairs printed in order.
report = [
    ('Data dimensions: {}', data.shape),
    ('\nArea distribution:\n{}', area_dist),
    ('\nClass distribution:\n{}', class_dist),
    ('\nClass distribution (normalized):\n{}\n', class_dist/len(data)),
]
for template, value in report:
    print(template.format(value))

data.head(3)

Data dimensions: (204031, 69)

Area distribution:
Uribia       30345
Riohacha     23501
Bogota       22839
Cucuta       22485
Arauca       22298
Arauquita    20778
Tibu         20730
Maicao       20552
Soacha       20503
Name: area, dtype: int64

Class distribution:
Unoccupied land        108723
Formal settlement       71277
Informal settlement     24031
Name: target, dtype: int64

Class distribution (normalized):
Unoccupied land        0.532875
Formal settlement      0.349344
Informal settlement    0.117781
Name: target, dtype: float64



Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira,target,area
0,0.1748,0.1264,0.1259,0.12075,0.1288,0.2291,0.28325,0.2551,0.31165,0.0396,...,-0.325365,0.336843,0.097095,0.493299,-0.683798,-0.120547,1.149291,1,2,0
1,0.2093,0.1981,0.2023,0.217,0.2178,0.2767,0.3224,0.302,0.3478,0.0353,...,-0.212563,0.098564,0.242471,0.71641,-0.62334,-0.022285,1.042249,1,2,0
2,0.2062,0.20505,0.21575,0.2332,0.2083,0.2836,0.3381,0.3109,0.374,0.03575,...,-0.235786,0.266124,0.158668,0.655587,-0.632521,-0.078219,1.1013,1,2,0


## ML Pipeline

In [5]:
from sklearn.linear_model import LogisticRegression

### Model Training & Evaluation

In [6]:
# Name of the column to predict, and of the column identifying each row's area.
label = 'target'
area_column = 'area'

# Select the feature columns by name rather than by position: slicing off "the
# last two columns" would silently leak the label (or the area id) into X if
# the column order ever changed or a column were appended to the dataset.
features = [
    column for column in data.columns
    if column not in (label, area_column)
]

# Collapse the target to a binary label: codes 2 and 3 become 0, leaving 1 as
# the only positive class.
data[label] = data[label].replace({2: 0, 3: 0})

# `splits` carries the area of every row and is handed to the spatial
# cross-validation helper; X / y are the model inputs and the binary target.
splits = data[[area_column]]
X = data[features]
y = data[label]

print('Class distribution:\n{}'.format(y.value_counts()))

Class distribution:
0    180000
1     24031
Name: target, dtype: int64


## Spatial Cross Validation

In [7]:
# Baseline classifier with scikit-learn's default settings.
clf = LogisticRegression()

# Run the project's spatial cross-validation; the printed report shows one
# "Test Set" section per area. The trailing semicolon suppresses the cell's
# return-value display.
model_utils.spatial_cv(
    clf,
    X,
    y,
    splits=splits,
    verbose=2,
);


Test Set: Maicao
Predicted  False  True  __all__
Actual                         
False      18888  1112    20000
True         228   324      552
__all__    19116  1436    20552

               precision    recall  f1-score   support

           0       0.99      0.94      0.97     20000
           1       0.23      0.59      0.33       552

    accuracy                           0.93     20552
   macro avg       0.61      0.77      0.65     20552
weighted avg       0.97      0.93      0.95     20552

F1 Score: 0.3260
Kappa Statistics: 0.2987
Precision: 0.2256
Recall: 0.5870
Accuracy: 0.9348
ROC AUC: 0.7657


Test Set: Riohacha
Predicted  False  True  __all__
Actual                         
False      19965    35    20000
True         954  2547     3501
__all__    20919  2582    23501

               precision    recall  f1-score   support

           0       0.95      1.00      0.98     20000
           1       0.99      0.73      0.84      3501

    accuracy                          

## Train Model

In [8]:
# Refit on the full resampled dataset; this model is the one used for the
# predictions further down. Remaining missing feature values are filled with 0.
X, y = data[features].fillna(0), data[label]

# `fit` returns the estimator itself, so `clf` is the fitted model.
clf = LogisticRegression().fit(X, y)

## Prediction

In [12]:
# Areas for which input file paths are collected.
areas = [
    'maicao', 'riohacha', 'uribia',
    'arauca', 'cucuta', 'tibu',
    'arauquita', 'soacha', 'bogota',
]

# Make pandas treat +/-inf as missing, so later `fillna(0)` calls replace
# infinities as well.
# NOTE(review): this is a global option and is deprecated in recent pandas
# releases — confirm against the pinned pandas version.
pd.set_option('use_inf_as_na', True)

area_dict = geoutils.get_filepaths(
    areas, images_dir, indices_dir, pos_mask_dir, neg_mask_dir
)

# NOTE(review): `data` is rebound here from the training frame to the feature
# frame read for a single area ('soacha').
data = geoutils.read_bands(area_dict, 'soacha')

print('Data dimensions: {}'.format(data.shape))
data.head(3)

100%|██████████| 3/3 [00:18<00:00,  6.19s/it]

Data dimensions: (4215294, 67)





Unnamed: 0,B1_2015-2016,B2_2015-2016,B3_2015-2016,B4_2015-2016,B5_2015-2016,B6_2015-2016,B7_2015-2016,B8_2015-2016,B9_2015-2016,B10_2015-2016,...,ndbi_2019-2020,savi_2019-2020,mndwi_2019-2020,ui_2019-2020,nbi_2019-2020,brba_2019-2020,nbai_2019-2020,mbi_2019-2020,baei_2019-2020,la_guajira
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [13]:
# Per-row class probabilities; keep the second column, i.e. the probability of
# class 1 (the positive label of the binary target).
class_proba = clf.predict_proba(data[features].fillna(0))
preds = class_proba[:, 1]

# Rows where every column except the last sums to zero are flagged with -1
# instead of a probability (presumably empty / no-data pixels — confirm).
row_totals = data.iloc[:, :-1].sum(axis=1)
preds[row_totals == 0] = -1

# Write the predictions out, using the first 'soacha' image as `image_src`.
soacha_image = area_dict['soacha']['images'][0]
geoutils.save_predictions(
    preds,
    image_src=soacha_image,
    output_file=output_dir + '20200504_soacha.tiff',
)