# Core: Train Model
Train a LULC classifier. The number of categories and the remapping can be selected dynamically (e.g., 3-cat vs. full 6-cat vs. roads).

This workflow sidesteps some problems with TensorFlow by simplifying the training and shifting some components (multiple epochs, callback functionality, validation, etc.) to manual coding.

Currently, the workflow calls for training in just two epochs, one fast and one slow. Smaller training datasets in particular, however, may require additional epochs.

Date: 2019-01-18  
Author: Peter Kerins  

## Preparation

### Import all modules

In [None]:
# typical, comprehensive imports
import warnings
warnings.filterwarnings('ignore')
#
import os, sys
import json
import itertools, collections
import pickle
from pprint import pprint

get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import geojson
import fiona
import ogr, gdal

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, History
import h5py

import descarteslabs as dl

ULU_REPO = os.environ["ULU_REPO"]
# Put the repository's utils/ directory and its root on the import path so the
# util_* modules and catalog_generator imported below can be found.
# Each entry is checked on its own: a root that is already on the path (for
# example via PYTHONPATH) must not prevent utils/ from being added, and
# re-running the cell never adds duplicates.
for repo_path in (ULU_REPO+'/utils', ULU_REPO):
    if repo_path not in sys.path:
        sys.path.append(repo_path)
print(sys.path)

import util_descartes
#import util_ml
import util_rasters
import util_vectors
import util_workflow
import util_chips
import util_training
import util_network
import util_scoring
from catalog_generator import CatalogGenerator

### Set all user-defined variables

#### Base variables

In [None]:
# root directory under which chip catalogs, models and pickled metadata are read and written
data_root='/data/phase_iv/'

# value matched against the catalog's 'resolution' column when filtering chips
resolution=5

In [None]:
# name of the subcatalog; used to build the train/validation CSV paths below
subcatalog_name = 'kampala_2img'

# CSV files the training and validation subcatalogs are written to (build_new)
# or read from (not build_new)
path_train = data_root+'models/'+subcatalog_name+'_train.csv'
path_valid = data_root+'models/'+subcatalog_name+'_valid.csv'

In [None]:
# when True, the cleanup cell at the end of the notebook runs `sudo poweroff`
shutdown_system = False

#### Chips variables
Only needed if selecting samples from the master catalog, rather than loading a subcatalog from file.

In [None]:
# True:  construct the subcatalogs by filtering the master catalog (Option A)
# False: load previously saved subcatalogs from path_train / path_valid (Option B)
build_new = True

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook -- confirm whether still needed
processing_level = None
# matched against the catalog's 'source' column
source = 's2'
#image_suffix = 'E'

# band list for the s2 source (the trailing 'alpha' is dropped when building bands_vir)
s2_bands=['blue','green','red','nir','swir1','swir2','alpha']; s2_suffix='BGRNS1S2A'  # S2, Lx
# s1_bands=['vv','vh']; s1_suffix='VVVH'  

# matched against the catalog's 'resampling' column
resampling='bilinear'
# its lower-cased string form ('none') is matched against the catalog's 'processing' column
processing = None

# matched against the catalog's 'gt_type' and 'gt_lot' columns (gt_lot is compared as an int)
label_suffix = 'aue'
label_lot = '0'

In [None]:
# when True, the locale ids listed per place in place_images are left out of the selection
exclude_locales = True

In [None]:
# maps place (city) name -> (list of image ids, list of locale ids to exclude)
place_images = {}

In [None]:
# place_images['hindupur']=['U', 'V', 'W', 'X', 'Y', 'Z'],[13]
# place_images['singrauli']=['O','P','Q','R','S','T','U'],[38]
# place_images['vijayawada']=['H','I'],[68]
# place_images['jaipur']=['T','U','W','X','Y','Z'],[27, 72]
# place_images['hyderabad']=['P','Q','R','S','T','U'],[10, 44, 46, 60, 79, 55, 60]
# place_images['sitapur']=['Q','R','T','U','V'],[2, 27, 43]
# place_images['kanpur']=['AH', 'AK', 'AL', 'AM', 'AN'],[6, 19, 57, 67]
# place_images['belgaum']=['P','Q','R','S','T'],[13]
# place_images['parbhani']=['T','V','W','X','Y','Z'],[10, 42, 54]
# place_images['pune']=['P', 'Q', 'T', 'U', 'S'],[9, 54, 73]
# place_images['ahmedabad']= ['Z', 'V', 'W', 'X', 'Y', 'AA'],[22, 25, 45, 65, 70]
# place_images['malegaon']=  ['V', 'W', 'X', 'Y', 'Z'],[6]
# place_images['kolkata'] =  ['M','N','O','P','Q','R'],[16, 90, 105,  195, 218]
# place_images['mumbai']=['P','Q','R','S','U','V'],[24, 42, 73, 98, 99, 103, 123, 131, 133, 152, 160, 172]
# place_images['coimbatore']=['Q','R','S'],[15, 21, 68, 74]
# place_images['jalna']=['AV','AW','AX'],[12, 20, 31, 34, 42, 44, 65, 69, 73]
# place_images['kozhikode']=['J','K','L'],[41]

In [None]:
# place_images['dhaka']=['A','B','C'],[72]
# place_images['saidpur']=['A','B','C'],[2, 21, 32, 39, 43, 47, 52]
# place_images['rajshahi']=['A','B','C'],[17]
# place_images['lahore']=['A','B','C'],[33, 70]
# place_images['karachi']=['A','B','C'],[20, 29, 62]
# place_images['sialkot']=['A','B','C'],[32, 53]

In [None]:
# place_images['gorgan']=['A','B','C'],[36, 59, 69]
# place_images['qom']=['A','B','C'],[1]
# place_images['tehran']=['A','B','C'],[28, 56, 76]
# place_images['shymkent']=['A','B','C'],[62]
# place_images['pokhara']=['A','B','C'],[25, 28, 31, 49, 51]
# place_images['bukhara']=['A','B','C'],[61]
# place_images['tashkent']=['A','B','C'],[42]

In [None]:
# place_images['culiacan']=['A', 'B'],[1, 9, 29, 51, 61, 66, 71]
# place_images['guadalajara']=['A', 'B'],[22, 47, 65]
# place_images['leon']=['A', 'B'],[3, 16, 27, 36, 38, 46, 67]
# place_images['mexico-city']=['A', 'B'],[0, 13, 57, 75, 112, 183, 198]
# place_images['reynosa']=['A', 'B'],[25, 31, 40, 52, 61]
# place_images['tijuana']=['A', 'B'],[9, 45, 49, 53]
# place_images['merida']=['A', 'B'],[25, 55, 57]
# place_images['monterrey']=['A', 'B'],[1]
# place_images['tuxtla']=['A', 'B'],[3, 20, 58, 64]

In [None]:
# place_images['johannesburg']=['A', 'B',],[76, 125, 132, 178, 118, 166, 187]
# only active place: images A-D; locales 59, 23 and 22 are excluded when exclude_locales is True
place_images['kampala']=['A', 'B','C','D',],[59, 23, 22]
# place_images['kigali']=['A', 'B'],[29, 48]
# place_images['addis-ababa']=['A', 'B'],[32, 44, 43, 66, 65]
# place_images['port-elizabeth']=['A', 'B'],[44, 15, 20, 30]
# place_images['arusha']=['A', 'B'],[0, 8]
# place_images['nakuru']=['A', 'B'],[15, 33, 9, 52, 79]

#### Sample construction variables

In [None]:
# passed to CatalogGenerator as look_window
# NOTE(review): the network input_shape further down hardcodes the same value (17) -- keep in sync
window = 17

In [None]:
# bands stuff outdated! needs to be reconciled with catalog filtering
# will ignore for the moment since this is a bigger fix...
# haven't done any examples yet incorporating additional chips beyond s2
# into construction of a training sample
# all s2 bands except the trailing 'alpha'
bands_vir=s2_bands[:-1]
# the remaining band groups are left unset in this configuration
bands_sar=None
bands_ndvi=None
bands_ndbi=None
bands_osm=None

In [None]:
# needs to be updated completely; bands stuff doesn't make sense right now
# Collect the per-group band selections once, then let the workflow helper
# turn them into the stack label and the feature count.
band_selection = dict(
    bands_vir=bands_vir,
    bands_sar=bands_sar,
    bands_ndvi=bands_ndvi,
    bands_ndbi=bands_ndbi,
    bands_osm=bands_osm,
)
stack_label, feature_count = util_workflow.build_stack_label(**band_selection)
print(stack_label, feature_count)

#### Model & training variables

In [None]:
# identifier used in the filenames of the saved weights, category weights and full network
model_id = '6cat_kampala_2img'
# free-text note passed to util_scoring.record_model_creation
notes = 'just two images from kampala'

In [None]:
# handed to the category-weight calculation and to the sample generators
remapping = None
# number of output nodes of the network; also the one_hot setting of the generators
n_cats = 6
# NOTE(review): reassigned to a hardcoded 7-entry list in the scoring section
categories=[0,1,2,3,4,5,]
# when True, catalog rows with lulc == 6 are dropped while building the subcatalog
exclude_roads = True

In [None]:
# batch size of the sample generators; also recorded with the model
batch_size = 128
# only passed through to record_model_creation in this notebook
balancing = None

# number of epochs for the fast (LR=0.001) and slow (LR=0.0001) training passes
epochs_fast = 1
epochs_slow = 1

# forwarded to fit_generator / predict_generator
max_queue_size = 64
workers = 64

### Specify training & validation samples
Construct subcatalogs containing all target training & validation samples, __or__ load them from file, according to variable `build_new`

#### Option A: Construct subcatalogs by filtering master catalog

In [None]:
if build_new:
    df = util_chips.load_catalog()
    print(len(df.index))
    
    new_places = [
        'dhaka',
        'saidpur',
        'rajshahi',
        'lahore',
        'karachi',
        'sialkot',
        'coimbatore',
        'jalna',
        'kozhikode',
        'bukhara',
        'gorgan',
        'pokhara',
        'qom',
        'shymkent',
        'tashkent',
        'tehran',
        'culiacan',
        'guadalajara',
        'leon',
        'reynosa',
        'tijuana',
        'merida',
        'monterrey',
        'tuxtla',
        'johannesburg',
        'kampala',
        'kigali',
        'addis-ababa',
        'port-elizabeth',
        'arusha',
        'nakuru',
    ]
    
    included_places = list(set(new_places) & set(place_images.keys()))
    
    for place in included_places:
#         print(place)
        place_catalog_path = data_root+'chip_catalog_'+place+'.csv'
        print(place_catalog_path)

        df_place = pd.read_csv(place_catalog_path)
        print('no of chips:', len(df_place))

        df = df.append(df_place, ignore_index=True)
        

    mask = pd.Series(data=np.zeros(len(df.index),dtype='uint8'), index=range(len(df)), dtype='uint8')

    for place,entry in place_images.items():
        image_list = entry[0]
        exclusion_list = entry[1]
        if exclude_locales:
            mask |= (df['city']==place) & (df.image.isin(image_list)) & (~df.locale.isin(exclusion_list))
        else:
            mask |= (df['city']==place) & (df.image.isin(image_list))

    if exclude_roads:
        mask &= (df['lulc']!=6)

    # filter others according to specifications
    mask &= (df['gt_type']==label_suffix)
    mask &= (df['gt_lot']==int(label_lot))
    mask &= (df['source']==source)
    mask &= (df['resolution']==int(resolution))
    mask &= (df['resampling']==resampling)
    mask &= (df['processing']==str(processing).lower())

    print(np.sum(mask))

    df = df[mask]
    df.reset_index(drop=True,inplace=True)
    len(df)



    combined_place_locales = {}
    for place in place_images:
        place_locales_filename = data_root+'models/'+'locales_'+place+'.pkl'
        with open(place_locales_filename, "rb") as f:
            place_locales = pickle.load(f,encoding='latin1')
    #         print(place_locales)
        combined_place_locales.update(place_locales)
    pprint(combined_place_locales)


    df_t, df_v = util_chips.mask_locales(df, combined_place_locales)
    print(len(df_t), len(df_v))

    # save the datasets for future use
    %time df_t.to_csv(path_train,index=False)
    %time df_v.to_csv(path_valid,index=False)

#### Option B: Load existing subcatalog

In [None]:
# Option B: read the previously saved training / validation subcatalogs
if not build_new:
    df_t, df_v = (
        pd.read_csv(subcatalog_path, encoding='utf8')
        for subcatalog_path in (path_train, path_valid)
    )
    print(len(df_t), len(df_v))

### Inspect selected samples

In [None]:
# Category counts (no remapping) and total size for each subcatalog
for heading, subcatalog in (('train:', df_t), ('valid:', df_v)):
    print(heading)
    print(util_training.calc_category_counts(subcatalog,remapping=None), len(subcatalog))
print()
# the full filtered catalog only exists when it was built in this session
if build_new:
    print('all:')
    print(util_training.calc_category_counts(df,remapping=None), len(df))

In [None]:
# Number of training samples per (city, image) pair
samples_per_image = df_t.groupby(['city','image']).size()
samples_per_image = samples_per_image.reset_index().rename(columns={0:'count'})

# lift the row limit so the whole table is printed, then set it to 10 rows
pd.set_option('display.max_rows', None)
print(samples_per_image)
pd.set_option('display.max_rows', 10)

---

## Model

### Build loss function

#### Generate class weighting information

In [None]:
# Per-category weights derived from the training subcatalog.
category_weights = util_training.generate_category_weights(df_t,remapping=remapping,log=False,mu=1.0,max_score=None)
print(category_weights.items())
# Keep only the weight values, in the mapping's iteration order. Unlike the
# zip-transpose idiom, this reads directly and yields an empty tuple
# (instead of raising IndexError) when there are no categories.
weights = tuple(weight for _, weight in category_weights.items())
print(weights)

In [None]:
category_weights_filename = data_root+'models/'+model_id+'_category_weights.pkl'

# An existing file is never overwritten. Say so explicitly (as the network-save
# cell does) so a reused model_id does not silently keep stale weights on disk.
if os.path.exists(category_weights_filename):
    print('Category weights not saved: file already exists at specified path ('+category_weights_filename+')')
else:
    # context manager guarantees the pickle file is flushed and closed
    with open(category_weights_filename, 'wb') as weights_file:
        pickle.dump(category_weights, weights_file)

#### Use weights to create weighted categorical crossentropy loss function

In [None]:
# weighted categorical crossentropy loss built from the per-category weights
loss = util_training.make_loss_function_wcc(weights)

### Build convolutional neural network and prepare it for training

In [None]:
# hardcoded params
# NOTE(review): 17 equals `window`; confirm both stay in sync if either changes
input_shape = (17,17,6)
network = util_network.build_xmodel(
    input_shape=input_shape,
    output_nodes=n_cats,
    input_conv_block=True,
)
# compile with the learning rate of the "fast" pass
util_network.compile_network(network, loss, LR=0.001)

---

## Training

### Conduct "fast" training with high learning rate

#### Create sample "generators" (Keras _sequence_ objects) to serve samples

In [None]:
# Both generators share the same settings and differ only in the subcatalog they serve
generator_options = dict(
    remapping=remapping,
    look_window=window,
    batch_size=batch_size,
    one_hot=n_cats,
)
generator_t = CatalogGenerator(df_t, **generator_options)
generator_v = CatalogGenerator(df_v, **generator_options)

#### Initial training

In [None]:
# train fast
# in-memory variant, kept for reference:
#history_fast = network.fit(X_train, Y_t_cat, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, Y_v_cat), shuffle=True,callbacks=callbacks)
#docs: fit_generator(generator, steps_per_epoch=None, epochs=1, verbose=1, callbacks=None, validation_data=None, validation_steps=None,
                    #class_weight=None, max_queue_size=10, workers=1, use_multiprocessing=False, shuffle=True, initial_epoch=0)
fast_fit_options = dict(
    epochs=epochs_fast,
    callbacks=None,
    steps_per_epoch=generator_t.steps,
    # validation is switched off during training:
    #validation_data=generator_v, validation_steps=generator_v.steps,
    shuffle=True,
    use_multiprocessing=True,
    max_queue_size=max_queue_size,
    workers=workers,
)
history_fast = network.fit_generator(generator_t, **fast_fit_options)

# validation curves (need validation_data to be enabled above)
# plt.plot(history_fast.history['val_acc'])
# plt.show()
# plt.plot(history_fast.history['val_loss'])
# plt.show()

#### Store trained weights

In [None]:
# HDF5 file receiving the weights after the fast pass; reloaded before the slow pass
fast_weights_path = data_root + 'models/' + model_id + '_weights_fast' + '.h5'

In [None]:
# write the fast-pass weights to disk
print(fast_weights_path)
network.save_weights(fast_weights_path)

---

### Rebuild model and conduct "slow" training with lower learning rate

In [None]:
# fresh network built with the same parameters as the one used for the fast pass
network=util_network.build_xmodel(input_shape=(17,17,6),output_nodes=n_cats,input_conv_block=True)

#### Load trained weights and prepare network for additional training

In [None]:
# restore the fast-pass weights, then compile with a ten times lower learning rate
network.load_weights(fast_weights_path)
util_network.compile_network(network, loss, LR=0.0001)

#### Reset generators

In [None]:
# reset the training generator first, then the validation generator
for sample_generator in (generator_t, generator_v):
    sample_generator.reset()

#### Additional training

In [None]:
# train slow: same generator settings as the fast pass, different epoch count
slow_fit_options = dict(
    epochs=epochs_slow,
    callbacks=None,
    steps_per_epoch=generator_t.steps,
    # validation is switched off during training:
    #validation_data=generator_v, validation_steps=generator_v.steps,
    shuffle=True,
    use_multiprocessing=True,
    max_queue_size=max_queue_size,
    workers=workers,
)
history_slow = network.fit_generator(generator_t, **slow_fit_options)

# validation curves (need validation_data to be enabled above)
# plt.plot(history_slow.history['val_acc'])
# plt.show()
# plt.plot(history_slow.history['val_loss'])
# plt.show()

#### Store further trained weights

In [None]:
# HDF5 file receiving the weights after the slow pass
slow_weights_path = data_root + 'models/' + model_id + '_weights_slow' + '.h5'
print(slow_weights_path)
network.save_weights(slow_weights_path)

#### Store entire network object

In [None]:
network_filename = data_root+'models/'+model_id+'.hd5'

# Save the complete network, but never overwrite a file that is already there
if not os.path.exists(network_filename):
    network.save(network_filename)
else:
    print('Cannot save network: file already exists at specified path ('+network_filename+')')

---

## Scoring

### Apply model to training and validation data

In [None]:
#predict_generator(generator, steps=None, max_queue_size=10, workers=1, use_multiprocessing=False, verbose=0)
# settings shared by the training and validation prediction runs
predict_options = dict(
    verbose=1,
    use_multiprocessing=True,
    max_queue_size=max_queue_size,
    workers=workers,
)

# training samples
generator_t.reset()
predictions_t = network.predict_generator(generator_t, steps=generator_t.steps, **predict_options)
print(predictions_t.shape)

# validation samples
generator_v.reset()
predictions_v = network.predict_generator(generator_v, steps=generator_v.steps, **predict_options)
print(predictions_v.shape)

In [None]:
# predicted category = index of the largest score along the last axis
Yhat_t = np.argmax(predictions_t, axis=-1)
print(Yhat_t.shape)
Yhat_v = np.argmax(predictions_v, axis=-1)
print(Yhat_v.shape)

### Extract corresponding _actual_ ground-truth values directly from catalog

In [None]:
# ground-truth labels as plain arrays, taken from each generator's label series
Y_t = generator_t.get_label_series().values
print(Y_t.shape)
Y_v = generator_v.get_label_series().values
print(Y_v.shape)

### Generate typical scoring information

In [None]:
print("evaluate training")
# hardcoded categories
categories=[0,1,2,3,4,5,6]
train_confusion = util_scoring.calc_confusion(Yhat_t,Y_t,categories)
train_recalls, train_precisions, train_accuracy = util_scoring.calc_confusion_details(train_confusion)

# Calculate f-score: F_beta = (beta^2 + 1) * P * R / (beta^2 * P + R)
beta = 2
train_f_numerator = (beta**2 + 1) * train_precisions * train_recalls
train_f_denominator = (beta**2 * train_precisions) + train_recalls
train_f_scores = train_f_numerator / train_f_denominator
train_f_score_average = np.mean(train_f_scores)

# expanding lists to match expected model_record stuff
# (seven placeholder slots each, filled in a later cell)
train_recalls_expanded = [None] * 7
train_precisions_expanded = [None] * 7
train_f_scores_expanded = [None] * 7

In [None]:
print("evaluate validation")
# uses the `categories` list set in the training-evaluation cell
valid_confusion = util_scoring.calc_confusion(Yhat_v,Y_v,categories)
valid_recalls, valid_precisions, valid_accuracy = util_scoring.calc_confusion_details(valid_confusion)

# Calculate f-score: F_beta = (beta^2 + 1) * P * R / (beta^2 * P + R)
beta = 2
valid_f_numerator = (beta**2 + 1) * valid_precisions * valid_recalls
valid_f_denominator = (beta**2 * valid_precisions) + valid_recalls
valid_f_scores = valid_f_numerator / valid_f_denominator
valid_f_score_average = np.mean(valid_f_scores)

# expanding lists to match expected model_record stuff
# (seven placeholder slots each, filled in a later cell)
valid_recalls_expanded = [None] * 7
valid_precisions_expanded = [None] * 7
valid_f_scores_expanded = [None] * 7

In [None]:
# Copy every computed score into the matching slot of its placeholder list.
# The number of training recalls sets how many slots are filled for both splits.
score_slots = (
    (train_recalls_expanded, train_recalls),
    (train_precisions_expanded, train_precisions),
    (train_f_scores_expanded, train_f_scores),
    (valid_recalls_expanded, valid_recalls),
    (valid_precisions_expanded, valid_precisions),
    (valid_f_scores_expanded, valid_f_scores),
)
for slot in range(len(train_recalls)):
    for expanded, scores in score_slots:
        expanded[slot] = scores[slot]

### Record experiment configuration and results

In [None]:
# Record the configuration and the scores of this run. All arguments are
# positional, so their order must match util_scoring.record_model_creation.
# NOTE(review): recalls and precisions are passed in their *_expanded form,
# while the f-scores are passed unexpanded (train_f_scores / valid_f_scores);
# the *_f_scores_expanded lists built above are never used -- confirm which
# form the recorder expects.
util_scoring.record_model_creation(
    model_id, notes, place_images, label_suffix+label_lot, resolution, stack_label, feature_count, 
    window, generator_t.remapping, balancing, 
    network.get_config(), epochs_fast+epochs_slow, batch_size,
    train_confusion, train_recalls_expanded, train_precisions_expanded, train_accuracy,
    train_f_scores, train_f_score_average,
    valid_confusion, valid_recalls_expanded, valid_precisions_expanded, valid_accuracy,
    valid_f_scores, valid_f_score_average, 
    )

---

## Cleanup

In [None]:
# Optionally power the machine off once the run is complete
if shutdown_system:
    spacer = '\n'*4
    rule = "========================"
    banner = (
        spacer,
        rule,
        rule,
        "==== sudo poweroff =====",
        rule,
        rule,
        spacer,
        "!dev-goodbye!",
    )
    for banner_line in banner:
        print(banner_line)

    os.system('sudo poweroff')