# Development: Applying 3-Category Classifier for _Localewise_ Scoring
Apply a trained model to samples in the catalog in order to assess model performance. Generate confusion matrices and related statistics on a locale-by-locale basis in order to distinguish well- and poorly-performing locales for a given model.  

This notebook addresses only scoring, not mapping.
  
Date: 2019-07-01  
Author: Peter Kerins  

### Import statements
(may be over-inclusive)

In [1]:
# typical, comprehensive imports
import warnings
warnings.filterwarnings('ignore')
#
import os
import sys
import pickle
from pprint import pprint
#
import numpy as np
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
import pandas as pd

from importlib import reload

import tensorflow as tf

import descarteslabs as dl

# Root of the project checkout, taken from the environment; a missing
# ULU_REPO variable raises KeyError here.
ULU_REPO = os.environ["ULU_REPO"]
# Put the repo root on the import path once so the `utils.*` imports resolve.
if ULU_REPO not in sys.path:
    sys.path.append(ULU_REPO)
# ulu_utils = ULU_REPO+'/utils'
# if not ulu_utils in sys.path:
#     sys.path.append(ulu_utils)
print(sys.path)

import utils.util_chips as util_chips
import utils.util_workflow as util_workflow
from utils.catalog_generator import CatalogGenerator
import utils.util_scoring as util_scoring


['/home/Peter.Kerins/anaconda3/envs/geoml/lib/python36.zip', '/home/Peter.Kerins/anaconda3/envs/geoml/lib/python3.6', '/home/Peter.Kerins/anaconda3/envs/geoml/lib/python3.6/lib-dynload', '', '/home/Peter.Kerins/anaconda3/envs/geoml/lib/python3.6/site-packages', '/home/Peter.Kerins/anaconda3/envs/geoml/lib/python3.6/site-packages/IPython/extensions', '/home/Peter.Kerins/.ipython', '/home/Peter.Kerins/UrbanLandUse']


## Preparation

### Set variables

In [2]:
# --- core paths ---
data_root = '/data/phase_iv/'

# --- imagery / catalog filter settings ---
# These values are matched against the catalog columns when selecting chips.
source = 's2'
resolution = 5  # Lx:15 S2:10
resampling = 'bilinear'
processing = None

# --- ground truth source: aue, aue+osm, aue+osm2 ---
label_suffix = 'aue'
label_lot = '0'

# --- tiling ---
tile_resolution = resolution
tile_size = 256
tile_pad = 32

# --- sampling / inference ---
look_window = 17
batch_size = 128

# --- misc ---
s2_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'alpha']  # S2, Lx
suffix = 'BGRNS1S2A'

In [3]:
# Identifier of the saved model to score; also used to build its file name.
model_id = '3cat_14ct_green_2017_2-img-bl'
# is the model a cnn?
unflatten_input = True
# Category codes used for scoring, and how many there are.
categories = list(range(3))
n_cats = len(categories)

***

### Load model

In [4]:
# Category weights are not loaded here; the earlier loading code is kept for reference.
# category_weights_filename = data_root+'models/'+model_id+'_category_weights.pkl'
# category_weights = pickle.load( open( category_weights_filename, "rb" ) )
# weights = list(zip(*category_weights.items())[1])

# Saved network file for `model_id` under <data_root>models/.
network_filename = f'{data_root}models/{model_id}.hd5'
# custom_objects maps the name 'loss' to the string 'categorical_crossentropy'.
# NOTE(review): presumably needed to resolve the loss the model was saved with — confirm.
network = tf.keras.models.load_model(network_filename, custom_objects={'loss': 'categorical_crossentropy'}, compile=True)
# network = K.load_model(network_filename, custom_objects={'loss': 'categorical-crossentropy'})
network.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 17, 17, 6)    0                                            
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 17, 17, 32)   1760        input_2[0][0]                    
__________________________________________________________________________________________________
activation_7 (Activation)       (None, 17, 17, 32)   0           conv2d_4[0][0]                   
________________________________________________________________________

***

## Apply model: score results
Apply model to some set of chips and compare its predictions to the actual LULC values

### Identify desired images

In [5]:
# Maps each place (city) to the list of image identifiers to score.
# Only the entries in the literal are active; alternatives are kept below.
place_images = {
    'hindupur': ['U'],
}
# place_images['hindupur']=['U', 'V', 'W', 'X', 'Y', 'Z']
# place_images['singrauli']=['O','P','Q','R','S','T','U']
# place_images['vijayawada']=['H','I']
# place_images['jaipur']=['T','U','W','X','Y','Z']
# place_images['hyderabad']=['P','Q','R','S','T','U']
# place_images['sitapur']=['Q','R','T','U','V']
# place_images['kanpur']=['AH', 'AK', 'AL', 'AM', 'AN']
# place_images['belgaum']=['P','Q','R','S','T']
# place_images['parbhani']=['T','V','W','X','Y','Z']
# place_images['pune']=['P', 'Q', 'T', 'U', 'S']
# place_images['ahmedabad']= ['Z', 'V', 'W', 'X', 'Y', 'AA']
# place_images['malegaon']=  ['V', 'W', 'X', 'Y', 'Z']
# place_images['kolkata'] =  ['M','N','O','P','Q','R']
# place_images['mumbai']=['P','Q','R','S','U','V']

### Filter catalog to selected chips

In [6]:
# Load the full chip catalog through the project's util_chips helper
# and report how many rows it contains.
df = util_chips.load_catalog()
print(len(df.index))

39281620


In [7]:
# Boolean row selector over the catalog.
# It starts as a genuine bool Series aligned to df's own index. The previous
# uint8 Series over range(len(df)) relied on implicit promotion to bool and on
# df having a 0..n-1 index; without either, df[mask] would not select rows.
mask = pd.Series(False, index=df.index, dtype=bool)

# Select every sample belonging to one of the requested (city, image) pairs.
for place, image_list in place_images.items():
    mask |= (df['city'] == place) & df['image'].isin(image_list)

# straight away remove road samples
mask &= (df['lulc'] != 6)

# filter others according to specifications
mask &= (df['gt_type'] == label_suffix)
mask &= (df['gt_lot'] == int(label_lot))
mask &= (df['source'] == source)
mask &= (df['resolution'] == int(resolution))
mask &= (df['resampling'] == resampling)
# `processing` is compared as lower-cased text, so None is matched as 'none'.
mask &= (df['processing'] == str(processing).lower())

# Number of catalog rows that survive all filters.
print(np.sum(mask))

107891


In [8]:
# Keep only the rows selected by `mask` (place/image selection plus the
# road, ground-truth and imagery filters) and renumber them from 0.
df = df[mask].reset_index(drop=True)
len(df)

107891

### Split catalog into subcatalogs based on city, image, and locale

In [12]:
# Split the filtered catalog into nested subcatalogs; the scoring loop
# indexes the result as catalog_dict[city][image][locale].
catalog_dict = util_chips.create_subcatalogs(df)

### Separate training and validation locales (if desired)
Skip for now

In [None]:
# Pickle files holding the place -> locales assignments to load.
# Only the active list is used; the full set of cities is kept below for reference.
place_locales_paths = ['/data/phase_iv/models/3cat_Hin_U-Z_place_locales.pkl']
# place_locales_paths = ['/data/phase_iv/models/3cat_Ahm_V-AA_place_locales.pkl',
#                        '/data/phase_iv/models/3cat_Bel_P-T_place_locales.pkl'       ,
#                        '/data/phase_iv/models/3cat_Hin_U-Z_place_locales.pkl'       ,
#                        '/data/phase_iv/models/3cat_Hyd_P-U_place_locales.pkl'       ,
#                        '/data/phase_iv/models/3cat_Jai_T-U+W-Z_place_locales.pkl'   ,
#                        '/data/phase_iv/models/3cat_Kan_AH+AK-AN_place_locales.pkl'  ,
#                        '/data/phase_iv/models/3cat_Mal_V-Z_place_locales.pkl'       ,
#                        '/data/phase_iv/models/3cat_Par_T+V-Z_place_locales.pkl',
#                        '/data/phase_iv/models/3cat_Pun_P-Q+S-U_place_locales.pkl',
#                        '/data/phase_iv/models/3cat_Sin_O-U_place_locales.pkl',
#                        '/data/phase_iv/models/3cat_Sit_Q-R+T-V_place_locales.pkl',
#                        '/data/phase_iv/models/3cat_Vij_H-I_place_locales.pkl',
#                        '/data/phase_iv/models/3cat_Kol_M-R_place_locales.pkl',
#                        '/data/phase_iv/models/3cat_Mum_P-V_place_locales.pkl'
#                        ]

In [None]:
# Merge the mappings stored in every listed pickle into one dict;
# on duplicate keys the later file wins (dict.update semantics).
# NOTE: pickle.load executes arbitrary code from the file — only point this at trusted files.
combined_place_locales = {}
for locales_path in place_locales_paths:
    with open(locales_path, "rb") as pkl_file:
        # latin1 encoding is passed explicitly to pickle.load
        # NOTE(review): presumably because the files were pickled under Python 2 — confirm
        place_locales = pickle.load(pkl_file, encoding='latin1')
    combined_place_locales.update(place_locales)
print(combined_place_locales)

In [None]:
# Split the catalog in two using the combined place -> locales mapping, then print both sizes.
# NOTE(review): presumably df_t / df_v are the training and validation subsets — confirm against util_chips.mask_locales
df_t, df_v = util_chips.mask_locales(df, combined_place_locales)
print(len(df_t), len(df_v))

### Generate predictions on a localewise basis
(Handling for separate training & validation locales yet to be decided)  

Create output storage objects

In [13]:
# Column name -> dtype for the per-locale results table.
# The insertion order of this dict is also the column order of the table.
convert_dict = {
    'city': str,
    'image': str,
    'locale': int,
    'confusion': object,
    'accuracy': float,
    'f_open': float,
    'f_nonres': float,
    'f_res': float,
    'f_avg': float,
    'rec_open': float,
}

# Empty, typed results table; rows are added by the scoring loop.
output_df = pd.DataFrame(columns=list(convert_dict)).astype(convert_dict)

# Nested storage filled by the scoring loop: city -> image -> locale -> confusion matrix.
output_dict = {}

# One 3x3 confusion matrix per locale index.
# NOTE(review): 31 is hard-coded — presumably the number of locales; a locale index >= 31 would not fit. Confirm.
output_array = np.zeros((31, 3, 3), dtype='uint16')

Apply model and record performance and statistics locale by locale

In [15]:
# Score the model locale by locale.
#
# For every (city, image, locale) subcatalog: run the network over its samples,
# build a confusion matrix against the catalog labels, derive per-category
# F-beta scores, and record everything in output_dict, output_array and output_df.
#
# Rows are collected in a list and added to output_df with a single concat;
# growing the frame with DataFrame.append on every iteration is quadratic and
# that method no longer exists in pandas 2.x.

# One generator is built over the full catalog and re-pointed at each subcatalog below.
generator = CatalogGenerator(df,remapping='3cat',look_window=look_window,batch_size=batch_size,one_hot=3)
beta = 2  # F-beta weighting; beta > 1 weights recall more heavily than precision
score_records = []  # one dict per scored (city, image, locale)
try:
    for city in catalog_dict:
        output_dict[city] = {}
        for image in catalog_dict[city]:
            output_dict[city][image] = {}
            for locale in catalog_dict[city][image]:
                subcatalog = catalog_dict[city][image][locale]
                generator._set_data(subcatalog)
                predictions = network.predict_generator(generator, steps=generator.steps, verbose=1,
                                            use_multiprocessing=True, max_queue_size=4, workers=4,
                                                       )
                # predicted category = index of the largest output along the last axis
                Yhat = predictions.argmax(axis=-1)
                # NOTE(review): assumes the label series is in the same order as the predictions — confirm
                Y = generator.get_label_series().values
                print(city, image, locale)
                confusion = util_scoring.calc_confusion(Yhat,Y,categories)
                recalls, precisions, accuracy = util_scoring.calc_confusion_details(confusion)
                # per-category F-beta: (1 + beta^2) * P * R / (beta^2 * P + R)
                f_score = (beta**2 + 1) * precisions * recalls / ( (beta**2 * precisions) + recalls )
                f_score_open = f_score[0]
                f_score_nonres = f_score[1]
                f_score_res = f_score[2]
    #             f_score_roads = None#f_score[3]
                f_score_average = np.mean(f_score)

                output_dict[city][image][locale] = confusion
                # indexed by locale only, so a later image/city with the same locale overwrites this slot
                output_array[locale] = confusion
                score_records.append({
                    'city':city,
                    'image':image,
                    'locale':locale,
                    'confusion':confusion,
                    # overall accuracy: correctly classified (diagonal) over all samples
                    'accuracy':confusion.trace()/confusion.sum(),
                    'f_open':f_score_open,
                    'f_nonres':f_score_nonres,
                    'f_res':f_score_res,
                    'f_avg':f_score_average,
                    'rec_open':recalls[0],
                    })
finally:
    # Add whatever was scored, even if the loop was interrupted part-way,
    # so partial results are kept just as with row-by-row appending.
    if score_records:
        output_df = pd.concat(
            [output_df, pd.DataFrame(score_records, columns=output_df.columns)],
            ignore_index=True,
        )
            

hindupur U 0
0 1426
1 41
2 2137
[[1384    4   38]
 [   0   15   26]
 [ 171   10 1956]]
3604 3355 0.9309100998890122
hindupur U 1
0 739
1 1133
2 1516
[[ 616  101   22]
 [  63 1003   67]
 [ 197  305 1014]]
3388 2633 0.7771546635182999
hindupur U 2
0 1058
1 1436
2 846
[[ 970   61   27]
 [ 124 1256   56]
 [  43  117  686]]
3340 2912 0.8718562874251496
hindupur U 3
0 829
1 32
2 1671
[[ 741   31   57]
 [  13   15    4]
 [ 344   98 1229]]
2532 1985 0.7839652448657188
hindupur U 4
0 253
1 117
2 2934
[[ 160   45   48]
 [   6   75   36]
 [  12   13 2909]]
3304 3144 0.9515738498789347
hindupur U 5
0 2274
1 133
2 1360
[[2143   19  112]
 [  32   73   28]
 [ 199   20 1141]]
3767 3357 0.8911600743297053
hindupur U 6
0 1696
1 1111
2 847
[[1617   33   46]
 [ 926  138   47]
 [ 666   33  148]]
3654 1903 0.5207991242474002
hindupur U 7
0 3152
1 0
2 411
[[3062   11   79]
 [   0    0    0]
 [ 297    2  112]]
3563 3174 0.8908223407241089
hindupur U 8
0 2200
1 95
2 1338
[[2115    0   85]
 [   3    5   87]
 [ 

Show results in a dataframe

In [16]:
# Display the per-locale results table (one row per scored city/image/locale).
output_df

Unnamed: 0,city,image,locale,confusion,accuracy,f_open,f_nonres,f_res,f_avg,rec_open
0,hindupur,U,0,"[[1384, 4, 38], [0, 15, 26], [171, 10, 1956]]",0.93091,0.953299,0.388601,0.925435,0.755779,0.970547
1,hindupur,U,1,"[[616, 101, 22], [63, 1003, 67], [197, 305, 10...",0.777155,0.803758,0.844134,0.707409,0.7851,0.833559
2,hindupur,U,2,"[[970, 61, 27], [124, 1256, 56], [43, 117, 686]]",0.871856,0.903334,0.874895,0.825909,0.868046,0.916824
3,hindupur,U,3,"[[741, 31, 57], [13, 15, 4], [344, 98, 1229]]",0.783965,0.839375,0.275735,0.77063,0.62858,0.893848
4,hindupur,U,4,"[[160, 45, 48], [6, 75, 36], [12, 13, 2909]]",0.951574,0.672269,0.62396,0.987508,0.761246,0.632411
5,hindupur,U,5,"[[2143, 19, 112], [32, 73, 28], [199, 20, 1141]]",0.89116,0.934176,0.56677,0.848832,0.783259,0.942392
6,hindupur,U,6,"[[1617, 33, 46], [926, 138, 47], [666, 33, 148]]",0.520799,0.809066,0.148451,0.203913,0.387143,0.95342
7,hindupur,U,7,"[[3062, 11, 79], [0, 0, 0], [297, 2, 112]]",0.890822,0.958853,0.0,0.305177,0.421343,0.971447
8,hindupur,U,8,"[[2115, 0, 85], [3, 5, 87], [28, 26, 1284]]",0.936967,0.966106,0.060827,0.943008,0.656647,0.961364
9,hindupur,U,9,"[[1731, 13, 129], [19, 41, 2], [134, 12, 1305]]",0.908742,0.923101,0.652866,0.901243,0.825737,0.924186


Sort cells according to preferred performance metric

In [19]:
# Order rows by city, then image, then accuracy. sort_values is ascending by
# default, so within each city/image the lowest-accuracy locales come first.
# The sorted frame is only displayed; output_df itself is not modified.
output_df.sort_values(['city','image','accuracy'])

Unnamed: 0,city,image,locale,confusion,accuracy,f_open,f_nonres,f_res,f_avg,rec_open
21,hindupur,U,21,"[[554, 208, 24], [654, 484, 76], [260, 220, 145]]",0.450667,0.600607,0.419556,0.264117,0.428093,0.704835
6,hindupur,U,6,"[[1617, 33, 46], [926, 138, 47], [666, 33, 148]]",0.520799,0.809066,0.148451,0.203913,0.387143,0.95342
23,hindupur,U,23,"[[1126, 72, 289], [261, 47, 563], [368, 49, 789]]",0.550505,0.730884,0.064348,0.610209,0.46848,0.757229
14,hindupur,U,14,"[[1536, 174, 27], [751, 113, 66], [299, 120, 4...",0.593379,0.805538,0.136903,0.558743,0.500395,0.884283
22,hindupur,U,22,"[[1536, 23, 41], [361, 20, 34], [601, 124, 765]]",0.662197,0.863115,0.054735,0.5625,0.49345,0.96
24,hindupur,U,24,"[[894, 21, 83], [87, 118, 7], [477, 45, 842]]",0.72028,0.820184,0.571705,0.659048,0.683646,0.895792
30,hindupur,U,30,"[[1974, 208, 116], [208, 137, 119], [143, 98, ...",0.74032,0.856994,0.297956,0.643049,0.599333,0.859008
16,hindupur,U,16,"[[1461, 69, 236], [82, 129, 19], [371, 78, 1086]]",0.757859,0.813656,0.539298,0.725839,0.692931,0.827293
12,hindupur,U,12,"[[618, 61, 73], [24, 597, 0], [497, 138, 1523]]",0.775418,0.745117,0.910061,0.744525,0.799901,0.821809
1,hindupur,U,1,"[[616, 101, 22], [63, 1003, 67], [197, 305, 10...",0.777155,0.803758,0.844134,0.707409,0.7851,0.833559
