# Master model development

## John Brandt

### Last updated: November 1 2019

*  Package loading
*  Hyperparameter definitions
*  Additional layer definitions
*  Model definition
*  Data loading
*  Data preprocessing
*  K means clustering
*  Augment training data
*  Loss definition
*  Equibatch creation
*  Model training
*  Model validation and sanity checks

In [None]:
# Notes
# The model is very sensitive to zoneout prob, do not go above 0.05
# AdaBound seems to perform worse, stick to Adam with step down loss
# 32 x 24 x 32 model overfits after 100 epochs
# Investigate more hypercolumn parametrizations to increase available data at output 
#    while maintaining low dimensionality of filters
# 5e-4 LR worked, investigating 8e-4 for 60% faster training time
# ONLY CHANGE ONE THING AT A TIME

In [None]:
#TODO Remove imports that aren't needed to save RAM
from tqdm import tqdm_notebook, tnrange
import tensorflow as tf

sess = tf.Session()
from keras import backend as K
K.set_session(sess)

import keras
from tensorflow.python.keras.layers import *
from tensorflow.python.keras.layers import ELU
from keras.losses import binary_crossentropy
from tensorflow.python.ops import array_ops
from tensorflow.python.keras.layers import Conv2D, Lambda, Dense, Multiply, Add, Bidirectional, ConvLSTM2D

import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim import conv2d

import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import itertools
from tflearn.layers.conv import global_avg_pool
from tensorflow.contrib.framework import arg_scope
from keras.regularizers import l1
from tensorflow.layers import batch_normalization
from tensorflow.python.util import deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [None]:
%run ../src/zoneout.py
%run ../src/convgru.py
%run ../src/lovasz.py
%run ../src/utils.py
%run ../src/adabound.py
%run ../src/slope.py
%run ../src/dropblock.py

# Hyperparameter definitions

In [None]:
# --- Regularisation / optimiser settings -------------------------------------
# NOTE(review): the notes cell at the top of this notebook warns against a
# zoneout probability above 0.05, yet 0.25 is set here -- confirm the intent.
ZONE_OUT_PROB = 0.25
L2_REG = 0.0
INITIAL_LR = 2e-4
FINAL_LR = 1e-3
BN_MOMENTUM = 0.9      # momentum passed to every batch-normalisation layer
BATCH_SIZE = 16
TRAIN_RATIO = 0.8
TEST_RATIO = 0.2
MAX_DROPBLOCK = 0.85
INP_FILTERS = 17       # size of the last (feature) axis of the input placeholder

# --- Filter counts for the model blocks --------------------------------------
gru_flt = 12
fpa_flt = 16
out_conv_flt = 32


# --- Dataset geometry --------------------------------------------------------
AUGMENTATION_RATIO = 4  # the augmentation cell emits 4 variants per training sample
IMAGE_SIZE = 16
# Sample ids are taken from the file names with the last four characters
# (the extension) stripped; names containing ".DS" are skipped.
existing = [int(x[:-4]) for x in os.listdir('../data/final/') if ".DS" not in x]
N_SAMPLES = len(existing)

LABEL_SIZE = 14

    
# Expected sample counts after augmentation, split by TRAIN_RATIO.
TRAIN_SAMPLES = int((N_SAMPLES * AUGMENTATION_RATIO) * TRAIN_RATIO)
TEST_SAMPLES = int((N_SAMPLES * AUGMENTATION_RATIO) - TRAIN_SAMPLES)
# Prints the un-augmented train count and the remaining (test) count.
print(TRAIN_SAMPLES // AUGMENTATION_RATIO, N_SAMPLES - (TRAIN_SAMPLES // AUGMENTATION_RATIO))

# Additional layer definitions

In [None]:
def conv_bn_elu(inp, is_training, kernel_size, scope, filter_count = 16, pad = True, padding = 'valid'):
    """Apply Conv2D -> ELU -> batch normalisation to a feature map.

    Args:
        inp: input tensor handed to the convolution.
        is_training: flag forwarded to batch normalisation.
        kernel_size: side length of the square convolution kernel.
        scope: name prefix; the batch-norm layer is named ``scope + "bn"``.
        filter_count: number of convolution filters.
        pad: when True and ``kernel_size`` is 3, reflection-pad by one pixel
            before convolving.
        padding: padding mode forwarded to ``Conv2D``.

    Returns:
        The batch-normalised activation tensor.
    """
    # Reflection padding is only ever applied ahead of a 3x3 kernel.
    reflect_first = (kernel_size == 3) and pad
    conv_input = ReflectionPadding2D((1, 1,))(inp) if reflect_first else inp
    convolved = Conv2D(filters = filter_count,
                       kernel_size = (kernel_size, kernel_size),
                       padding = padding,
                       kernel_initializer = 'he_normal')(conv_input)
    activated = ELU()(convolved)
    return Batch_Normalization(activated, training=is_training, scope = scope + "bn")

def td_conv_bn_elu(inp, is_training, scope, filter_count = 16, pad = True, padding = 'valid'):
    """Apply a 1x1 convolution and ELU independently to every timestep.

    Only ``inp`` and ``filter_count`` are used; ``is_training``, ``scope``,
    ``pad`` and ``padding`` are accepted but ignored, and no batch
    normalisation is applied.

    Returns:
        The ELU-activated, time-distributed convolution output.
    """
    pointwise = Conv2D(filters = filter_count, kernel_size = (1, 1),
                       padding = 'SAME', kernel_initializer = 'he_normal')
    projected = TimeDistributed(pointwise)(inp)
    return TimeDistributed(ELU())(projected)

def resnet_block(inp, is_training, scope, flt):
    """Residual block: two (pad -> 3x3 conv -> ELU -> BN -> DropBlock) stages.

    The stage output is added to ``inp`` and the sum is re-weighted by a
    concurrent spatial/channel squeeze-excitation block. DropBlock reads its
    keep probability from the module-level ``keep_rate``.

    Args:
        inp: input tensor; must be addable to a ``flt``-channel tensor.
        is_training: flag forwarded to batch normalisation and DropBlock.
        scope: name prefix for the batch-norm and cSSE layers.
        flt: number of filters in both convolutions.

    Returns:
        The re-weighted residual tensor.
    """
    # Both DropBlock layers are created up front, before any other layer.
    dropblocks = [DropBlock2D(keep_prob = keep_rate, block_size = 3) for _ in range(2)]
    stage_out = inp
    for bn_suffix, dropblock in zip(("bn1", "bn2"), dropblocks):
        stage_out = ReflectionPadding2D((1, 1))(stage_out)
        stage_out = Conv2D(filters = flt, kernel_size = (3, 3),
                           padding = 'valid', kernel_initializer = 'he_normal')(stage_out)
        stage_out = ELU()(stage_out)
        stage_out = Batch_Normalization(stage_out, training=is_training, scope = scope + bn_suffix)
        stage_out = dropblock(stage_out, is_training)
    residual = tf.add(inp, stage_out)
    return csse_block(residual, scope + "csse")
    
    
def fpa(inp, is_training, filter_count):
    """Build a feature-pyramid-attention head on top of ``inp``.

    A 1x1 branch is multiplied by the sum of two upsampled down-branches
    (5x5 and 3x3 convolutions), and a globally pooled "top" branch, upsampled
    by a fixed factor of 16, is added to the product. Shapes of the
    intermediate tensors are printed while the graph is built.

    Args:
        inp: input feature map.
        is_training: flag forwarded to the batch-normalisation layers.
        filter_count: number of filters used by every branch.

    Returns:
        The combined attention feature map.
    """
    def _valid_conv(tensor, size, name):
        # Unpadded conv -> ELU -> BN shared by all pyramid branches.
        return conv_bn_elu(tensor, is_training, size, name, filter_count, False, 'valid')

    one = _valid_conv(inp, 1, 'forward1')
    five = _valid_conv(inp, 5, 'down1')
    five_f = _valid_conv(five, 5, 'down1_f')
    three = _valid_conv(five, 3, 'down2')
    three_f = _valid_conv(three, 3, 'down2_f')

    three_up = get_deconv2d(three_f, filter_count, filter_count, "fpa1", is_training)
    five_up = get_deconv2d(five_f, filter_count, filter_count, "fpa2", is_training)

    print("One: {}".format(one.shape))
    print("Five: {}".format(five.shape))
    print("Five_F: {}".format(five_f.shape))
    print("Three: {}".format(three.shape))
    print("Three_f: {}".format(three_f.shape))
    print("Three_up: {}".format(three_up.shape))
    print("Five_up: {}".format(five_up.shape))

    # Top block: global average -> 1x1 conv -> upsample back by 16.
    pooled = tf.keras.layers.GlobalAveragePooling2D()(inp)
    pooled_map = tf.reshape(pooled, (-1, 1, 1, pooled.shape[-1]))
    one_top = conv_bn_elu(pooled_map, is_training, 1, 'top1', filter_count)
    four_top = tf.keras.layers.UpSampling2D((16, 16))(one_top)

    attended = tf.multiply(one, tf.add(three_up, five_up))
    combined = tf.add(attended, four_top)
    print("Feature pyramid attention shape {}".format(combined.shape))
    return combined

    
def create_deconv_init(filter_size, num_channels):
    """Return a constant initializer holding bilinear-upsampling weights.

    Args:
        filter_size: side length of the square transposed-convolution kernel.
        num_channels: number of channels; each channel maps only to itself.

    Returns:
        A ``tf.constant_initializer`` whose value has shape
        ``(filter_size, filter_size, num_channels, num_channels)`` with the
        bilinear kernel on the channel diagonal and zeros elsewhere.
    """
    scale_factor = (filter_size + 1) // 2
    # Odd kernels are centred on a pixel, even kernels between two pixels.
    center = scale_factor - 1 if filter_size % 2 == 1 else scale_factor - 0.5

    # The 2-D bilinear kernel is the outer product of a 1-D triangle profile.
    profile = 1 - np.abs(np.arange(filter_size) - center) / scale_factor
    bilinear_kernel = np.outer(profile, profile).astype(np.float32)

    weights = np.zeros((filter_size, filter_size, num_channels, num_channels))
    channel = np.arange(num_channels)
    weights[:, :, channel, channel] = bilinear_kernel[:, :, np.newaxis]

    return tf.constant_initializer(value=weights, dtype=tf.float32)


def get_deconv2d(inp, filter_count, num_channels, scope, is_training):
    """Upsample ``inp`` by 2 with a bilinear-initialised transposed convolution.

    The transposed convolution (4x4 kernel, stride 2, 'same' padding) is
    followed by ELU and batch normalisation. ``num_channels`` is accepted but
    not used; the initializer is sized from ``filter_count``.

    Args:
        inp: input feature map.
        filter_count: number of output filters.
        num_channels: unused.
        scope: name prefix; the batch-norm layer is named ``scope + "bn"``.
        is_training: flag forwarded to batch normalisation.

    Returns:
        The upsampled, activated and normalised tensor.
    """
    deconv = tf.keras.layers.Conv2DTranspose(
        filters = filter_count,
        kernel_size = (4, 4),
        strides = (2, 2),
        padding = 'same',
        kernel_initializer = create_deconv_init(4, filter_count),
    )
    upsampled = ELU()(deconv(inp))
    return Batch_Normalization(upsampled, training=is_training, scope = scope + "bn")


def Batch_Normalization(x, training, scope):
    """Batch-renormalise ``x`` with the notebook-wide ``BN_MOMENTUM``.

    Args:
        x: tensor to normalise.
        training: flag selecting training or inference behaviour.
        scope: name given to the normalisation layer.

    Returns:
        The normalised tensor.
    """
    bn_options = dict(
        momentum = BN_MOMENTUM,
        training = training,
        renorm = True,
        reuse = None,
        name = scope,
    )
    return batch_normalization(inputs = x, **bn_options)


def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Collapse the time axis of a 5-D sequence with learned attention weights.

    The input is averaged over axes 2 and 3, scored per timestep by a small
    tanh layer, and the softmax of those scores weights the original input
    before it is summed over axis 1.

    Args:
        inputs: sequence tensor; assumed (batch, time, height, width,
            channels) -- TODO confirm against callers.
        attention_size: width of the hidden scoring layer.
        time_major: if True the pooled tensor is transposed from (T, B, D) to
            (B, T, D) before scoring.
            NOTE(review): the un-pooled input is not transposed, so the final
            weighting looks correct only for batch-major input -- confirm.
        return_alphas: if True, also return the attention weights.

    Returns:
        The attended tensor, or ``(attended, alphas)`` when ``return_alphas``
        is set.
    """
    inputs_orig = inputs
    inputs = tf.reduce_mean(inputs, axis = [2, 3])
    # NOTE(review): this check runs on the result of reduce_mean, so it is not
    # expected to trigger; kept to preserve the original control flow.
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D). `tf.transpose` is the public API; `tf.array_ops`
        # is not an attribute of the public TensorFlow namespace.
        inputs = tf.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - size of the pooled feature axis

    # Trainable parameters
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('v'):
        # Fully connected layer with tanh applied to each of the B*T timesteps;
        # (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # Each timestep's A-vector is reduced to a scalar score with `u_omega`
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')         # (B,T) shape
    print("Alphas: {}".format(alphas.shape))

    # Broadcast the (B,T) weights over the three trailing axes instead of
    # reshaping to a hard-coded sequence length of 24.
    weights = alphas[:, :, tf.newaxis, tf.newaxis, tf.newaxis]
    output = tf.reduce_sum(inputs_orig * weights, axis = 1)
    print(output.shape)
    if return_alphas:
        return output, alphas
    return output

def cse_block(prevlayer, prefix):
    """Channel squeeze-and-excitation: gate channels by a learned sigmoid.

    Axes 1 and 2 are averaged, passed through a bottleneck of half the channel
    count (ReLU) and expanded back (sigmoid); the result multiplies the input.

    Args:
        prevlayer: 4-D input tensor; channels are read from axis 3.
        prefix: name prefix for the two dense layers.

    Returns:
        The channel-gated tensor.
    """
    channels = K.int_shape(prevlayer)[3]
    squeezed = Lambda(lambda xin: K.mean(xin, axis=[1, 2]))(prevlayer)
    bottleneck = Dense(channels // 2, name=prefix + 'cse_lin1', activation='relu')(squeezed)
    gates = Dense(channels, name=prefix + 'cse_lin2', activation='sigmoid')(bottleneck)
    return Multiply()([prevlayer, gates])

def temporal_attention(inp, scope):
    """Weight every timestep per pixel and sum the sequence over time.

    Two time-distributed 1x1 convolutions (50 tanh filters, then one sigmoid
    filter without bias) produce a score per timestep. Scores are normalised
    to sum to one along axis 1 and used as weights for a sum of ``inp`` over
    that axis. ``scope`` is accepted but not used.

    Args:
        inp: sequence tensor; the original comment gives the layout as
            (B, N, H, W, C).
        scope: unused.

    Returns:
        The attention-weighted sum over axis 1.
    """
    hidden = TimeDistributed(Conv2D(50, (1, 1), padding = 'same', kernel_initializer = 'he_normal',
                                    activation = 'tanh', strides = (1, 1)))(inp)
    scores = TimeDistributed(Conv2D(1, (1, 1), padding = 'same', kernel_initializer = 'he_normal',
                                    activation = 'sigmoid', use_bias = False, strides = (1, 1)))(hidden)

    # Normalise so the weights of all timesteps sum to one.
    weights = scores / tf.reduce_sum(scores, axis = 1, keep_dims=True)
    print("Attention weight shapes {}".format(weights.shape))

    # Weighted sum of the inputs over the time axis.
    return tf.reduce_sum(weights * inp, axis = 1)
    


def sse_block(prevlayer, prefix):
    """Spatial squeeze-and-excitation: gate the input by a 1x1 sigmoid map.

    Args:
        prevlayer: input feature map.
        prefix: name prefix for the convolution and multiply layers.

    Returns:
        ``prevlayer`` multiplied by the single-channel sigmoid map.
    """
    spatial_gate = Conv2D(1, (1, 1),
                          padding="same",
                          kernel_initializer="he_normal",
                          activation='sigmoid',
                          strides=(1, 1),
                          name=prefix + "_conv")(prevlayer)
    return Multiply(name=prefix + "_mul")([prevlayer, spatial_gate])


def csse_block(x, prefix):
    '''
    Concurrent spatial and channel squeeze-and-excitation
    (https://arxiv.org/abs/1803.02579).

    Adds the channel-gated and spatially-gated versions of ``x``; ``prefix``
    names the layers that are created.
    '''
    channel_branch = cse_block(x, prefix)
    spatial_branch = sse_block(x, prefix)
    return Add(name=prefix + "_csse_mul")([channel_branch, spatial_branch])

class ReflectionPadding2D(Layer):
    """Keras layer that reflection-pads the two middle axes of a 4-D tensor.

    ``padding`` is ``(height_pad, width_pad)``: the first entry pads axis 1 and
    the second pads axis 2, on both sides. ``call`` and
    ``compute_output_shape`` previously disagreed on which entry belonged to
    which axis; both now use this order. Symmetric paddings such as ``(1, 1)``
    behave exactly as before.
    """
    def __init__(self, padding=(1, 1), **kwargs):
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim=4)]
        super(ReflectionPadding2D, self).__init__(**kwargs)

    def compute_output_shape(self, s):
        """Return the padded shape, assuming the "channels_last" layout."""
        h_pad, w_pad = self.padding
        # Unknown (None) dimensions stay unknown instead of raising on None + int.
        height = None if s[1] is None else s[1] + 2 * h_pad
        width = None if s[2] is None else s[2] + 2 * w_pad
        return (s[0], height, width, s[3])

    def call(self, x, mask=None):
        """Reflection-pad axes 1 and 2 of ``x``; ``mask`` is ignored."""
        h_pad, w_pad = self.padding
        return tf.pad(x, [[0, 0], [h_pad, h_pad], [w_pad, w_pad], [0, 0]], 'REFLECT')
    
    
def gru_block(inp, length, size, flt, scope, train, normalize = True):
    """Bidirectional recurrent encoder with zoneout and temporal attention.

    Two ``ConvLSTMCell`` cells (forward and backward), each wrapped in
    ``ZoneoutWrapper`` with the notebook-wide ``ZONE_OUT_PROB``, are run by
    ``convGRU``. The concatenated outputs are reduced over time by
    ``temporal_attention`` and concatenated with the mean of ``inp`` over
    axis 1.

    Args:
        inp: input sequence tensor.
        length: per-sample sequence lengths forwarded to ``convGRU``.
        size: spatial shape handed to the cells.
        flt: number of filters per cell.
        scope: variable scope wrapping the block.
        train: training flag forwarded to the zoneout wrappers.
        normalize: forwarded to the cells.

    Returns:
        Tensor combining the attended recurrent features and the input mean.
    """
    with tf.variable_scope(scope):
        print("GRU input shape {}, zoneout: {}".format(inp.shape, ZONE_OUT_PROB))
        cell_fw = ConvLSTMCell(shape = size, filters = flt,
                           kernel = [3, 3], normalize = normalize)
        cell_bw = ConvLSTMCell(shape = size, filters = flt,
                           kernel = [3, 3], normalize = normalize)
        cell_fw = ZoneoutWrapper(
           cell_fw, zoneout_drop_prob = ZONE_OUT_PROB, is_training = train)
        cell_bw = ZoneoutWrapper(
            cell_bw, zoneout_drop_prob = ZONE_OUT_PROB, is_training = train)
        output, final_state = convGRU(inp, cell_fw, cell_bw, length)
        # Not consumed below; kept so the ops created in this scope are unchanged.
        final_state = tf.concat(final_state, axis = -1)
        output = tf.concat(output, axis = -1)
        # Print the shape, as the label says, rather than the tensor object.
        print("Hidden output shape: {}".format(output.shape))

        attended = temporal_attention(output, "attention")
        means = tf.reduce_mean(inp, axis = 1)
        out = tf.concat([attended, means], axis = -1)
        print("GRU Output shape: {}".format(out.shape))
    return out

# Model definition

In [None]:
# Graph inputs. `keep_rate` and `is_training` default to inference values.
reg = keras.regularizers.l2(L2_REG)
inp = tf.placeholder(tf.float32, shape=(None, 24, IMAGE_SIZE, IMAGE_SIZE, INP_FILTERS))
length = tf.placeholder(tf.int32, shape = (None, 1))
labels = tf.placeholder(tf.float32, shape=(None, 14, 14))#, 1))
keep_rate = tf.placeholder_with_default(1.0, ())
length2 = tf.reshape(length, (-1,))
is_training = tf.placeholder_with_default(False, (), 'is_training')


# GRU -> Conv -> Conv
#feature_engineering = td_conv_bn_elu(inp, is_training, 'feats', filter_count = 16, pad = False, padding = 'valid')
#feats = tf.concat([inp, feature_engineering], axis = -1)
gru_output = gru_block(inp = inp, length = length2, size = [16, 16],
                      flt = gru_flt, scope = 'down_16', train = is_training)
gru_conv = conv_bn_elu(gru_output, is_training, 3, 'gru_conv', out_conv_flt, pad = True)
gru_csse1 = csse_block(gru_conv, "csse_conv1")

# Skip -> FPA (the FPA branch is currently disabled)
drop2 = DropBlock2D(keep_prob = keep_rate, block_size = 3)
drop = drop2(gru_csse1, is_training)

#fpa = fpa(drop, is_training, fpa_flt)
#fpa_csse = csse_block(fpa, 'csse_fpa')
#fpa_skip = tf.concat([fpa_csse, gru_csse1, gru_output], axis = -1)
#drop3 = DropBlock2D(keep_prob = keep_rate, block_size = 3)
#out_drop1 = drop3(fpa_skip, is_training)

# Conv -> Hyperpyramid -> Conv
# NOTE(review): `out_skip` is built but never consumed -- `out_conv1` reads
# `drop` directly. Confirm whether the skip concatenation was meant to feed it.
out_skip = tf.concat([gru_csse1, drop], axis = -1)
# The unpadded 3x3 convolution trims one pixel from every border.
out_conv1 = conv_bn_elu(drop, is_training, 3, "out_conv1", out_conv_flt, pad = False)

# Bias the final sigmoid towards the negative class (for focal loss). The
# message is derived from the value so the log cannot drift from the code.
bias_init = -np.log(0.9/0.1)
print("Initializing last sigmoid bias with {:.2f} constant".format(bias_init))
init = tf.constant_initializer([bias_init]) # For focal loss
fm = Conv2D(filters = 1,
            kernel_size = (1, 1),
            padding = 'valid',
            activation = 'sigmoid',
            bias_initializer = init,
           )(out_conv1) # For focal loss
print("Output shape: {}".format(fm.shape))

In [None]:
# Count trainable parameters: the product of each variable's dimensions,
# summed over every trainable variable in the graph.
total_parameters = sum(
    int(np.prod([dim.value for dim in variable.get_shape()]))
    for variable in tf.trainable_variables()
)
print("This model has {} parameters".format(total_parameters))

# Data loading

In [None]:
# Load the four label CSVs, dropping the imagery title column and any
# incomplete rows from each.
subplot_paths = [
    "../data/subplot.csv",
    "../data/subplot2.csv",
    "../data/subplot3.csv",
    "../data/subplot4.csv",
]
df, df1, df2, df3 = [
    pd.read_csv(path).drop('IMAGERY_TITLE', axis = 1).dropna(axis = 0)
    for path in subplot_paths
]

lens = [len(frame) for frame in (df, df1, df2, df3)]

# One combined label table; df1..df3 are kept for the per-region counts.
df = pd.concat([df, df1, df2, df3], ignore_index = True).dropna(axis = 0)

# Ids of the plots that have an imagery file on disk.
existing = [int(name[:-4]) for name in os.listdir('../data/correct_dem/') if ".DS" not in name]
N_SAMPLES = len(existing)

In [None]:
# Keep only label rows whose plot has imagery on disk.
has_imagery = df['PLOT_ID'].isin(existing)
df = df[has_imagery]
# Each plot contributes 196 label rows (a 14 x 14 grid).
N_SAMPLES = int(len(df) / 196)
N_YEARS = 1

plot_ids = sorted(df['PLOT_ID'].unique())


def reconstruct_images(plot_id):
    """Rebuild the label grid of one plot from the module-level ``df``.

    Rows are ordered by descending latitude; within a row, cells are ordered
    by ascending longitude.

    Args:
        plot_id: value matched against the ``PLOT_ID`` column.

    Returns:
        A list of rows, each a list of ``TREE`` values.
    """
    plot_rows = df[df['PLOT_ID'] == plot_id]
    descending_lats = sorted(plot_rows['LAT'].unique(), reverse = True)
    return [
        list(plot_rows[plot_rows['LAT'] == lat].sort_values('LON', axis = 0)['TREE'])
        for lat in descending_lats
    ]

# Label grids for every plot, in `plot_ids` order.
data = [reconstruct_images(x) for x in plot_ids]


# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []

# Iterate over each plot
for i in tnrange(len(plot_ids)):
    # Load the sentinel imagery
    for year in ["correct_dem"]:
        x = np.load("../data/" + year + "/" + str(plot_ids[i]) + ".npy")
        # Append the derived index channels.
        x = ndvi(x, image_size = 16)
        x = evi(x, image_size = 16)
        x = savi(x, image_size = 16)
        x = bi(x)
        x = msavi2(x)
        x = si(x)
        #x = ndmi(x)
        x = remove_blank_steps(x)
        # Reuse the grid built above instead of re-filtering the dataframe.
        y = data[i]
        # Channel 10 is rescaled by 1/90 -- presumably a slope in degrees; confirm.
        x[:, :, :, 10] /= 90
        # Record the true number of timesteps before padding.
        lengths.append(x.shape[0])
        if x.shape[0] < 24:
            # Zero-pad short series to 24 steps. The channel count is taken
            # from `x` itself: a hard-coded 14 no longer matches the feature
            # axis once the index channels are appended, and np.concatenate
            # would fail on the mismatch.
            padding = np.zeros((24 - x.shape[0], IMAGE_SIZE, IMAGE_SIZE, x.shape[-1]))
            x = np.concatenate((x, padding), axis = 0)
        data_x.append(x)
        data_y.append(y)
print("Finished data loading")

data_x = np.stack(data_x)
data_y = np.stack(data_y)
lengths = np.stack(lengths)

In [None]:
# Inspect the largest value in feature channel 10, the channel that was
# divided by 90 while the imagery was loaded.
np.max(data_x[:, :, :, :, 10])

# Data preprocessing

In [None]:
# --- Outlier removal ----------------------------------------------------------
# A sample is an outlier when any value falls outside [-1.5, 1.5]. Per-sample
# extrema are computed once and reused for both the indices and the report.
sample_mins = [np.min(val) for val in data_x]
sample_maxs = [np.max(val) for val in data_x]
below_1 = [i for i, low in enumerate(sample_mins) if low < -1.5]
above_1 = [i for i, high in enumerate(sample_maxs) if high > 1.5]
min_vals = [sample_mins[i] for i in below_1]
max_vals = [sample_maxs[i] for i in above_1]
# A sample that violates both bounds is listed twice, as before.
outliers = below_1 + above_1
print("The outliers are: {}, totalling {}".format(outliers, len(outliers)))
print("\n")
print(min_vals, max_vals)

# A boolean mask replaces the `x not in outliers` scan over every index.
keep = np.ones(len(data_x), dtype = bool)
keep[np.asarray(outliers, dtype = int)] = False
data_x = data_x[keep]
data_y = data_y[keep]
lengths = lengths[keep]

# --- Per-channel min-max scaling ---------------------------------------------
# The extrema are retained in `min_all` / `max_all`.
min_all = []
max_all = []
for x in range(0, data_x.shape[-1]):
    mins, maxs = (np.min(data_x[:, :, :, :, x]), np.max(data_x[:, :, :, :, x]))
    min_all.append(mins)
    max_all.append(maxs)

    span = maxs - mins
    if span == 0:
        # A constant channel carries no information; avoid dividing by zero.
        data_x[:, :, :, :, x] = 0.
    else:
        data_x[:, :, :, :, x] = (data_x[:, :, :, :, x] - mins) / span

print("The data has been scaled to [{}, {}]".format(np.min(data_x), np.max(data_x)))

In [None]:
# Plot ids that survived outlier removal (set lookup instead of rebuilding a
# list for every element).
outlier_set = set(outliers)
plot_ids2 = [val for x, val in enumerate(plot_ids) if x not in outlier_set]
N_SAMPLES = len(data_x)

# Number of surviving plots per region; the first region is whatever is left
# after the three later CSVs are counted.
kept_plots = set(plot_ids2)
region_lengths = [len(set(frame['PLOT_ID']) & kept_plots) for frame in [df1, df2, df3]]
region_lengths = [N_SAMPLES - sum(region_lengths)] + region_lengths

print("The region sample distribution is {}".format(region_lengths))
print(sum(region_lengths))

# Split every region's contiguous index range into a leading train part and a
# trailing test part. One split point per region is derived from TRAIN_RATIO,
# so the two parts always tile the range exactly and the logic no longer
# depends on the ratios being literally 0.8 / 0.2.
# NOTE(review): the samples were loaded in sorted plot-id order, so contiguous
# ranges map to regions only if plot ids sort by region -- confirm.
train_ordering = []
test_ordering = []
total_samples = 0
start = 0
for val in region_lengths:
    end = start + val
    split = int(end - (end - start) * (1 - TRAIN_RATIO))
    train_ordering += list(range(start, split))
    test_ordering += list(range(split, end))
    total_samples += end - start
    start = end

# Reorder the data: every train sample first, then every test sample.
ordering = train_ordering + test_ordering

data_x = data_x[ordering]
data_y = data_y[ordering]
lengths = lengths[ordering]

# K Means clustering

In [None]:
from sklearn.cluster import KMeans

NONZERO_CLUSTERS = 10
ZERO_CLUSTERS = 6

kmeans = KMeans(n_clusters=NONZERO_CLUSTERS, random_state = 50)
kmeans_zero = KMeans(n_clusters = ZERO_CLUSTERS, random_state = 50)

# Partition the sample indices by whether the label grid contains any tree.
unaugmented = list(range(len(data_y)))
zeros, nonzero = [], []
for sample in unaugmented:
    (zeros if np.sum(data_y[sample]) == 0 else nonzero).append(sample)

# Samples with trees are clustered on their flattened label grid; samples
# without trees are clustered on their imagery averaged over axis 1.
kmeans.fit(data_y[nonzero, :, :].reshape((len(nonzero), 14*14)))
kmeans_zero.fit(np.mean(data_x[zeros, :, :], axis = 1).reshape((len(zeros), 16*16*INP_FILTERS)))

In [None]:
def multiplot(matrices):
    '''Draw the given matrices as heatmaps on a single row of four axes.

    Every heatmap shares the colour range [0, 0.9]; axis labels and ticks are
    removed. The figure is shown before returning.
    '''
    fig, axs = plt.subplots(ncols=4)
    fig.set_size_inches(20, 4)
    for position, matrix in enumerate(matrices):
        panel = axs[position]
        sns.heatmap(data = matrix, ax = panel, vmin = 0, vmax = 0.9)
        panel.set_xlabel("")
        panel.set_ylabel("")
        panel.set_yticks([])
        panel.set_xticks([])
    plt.show()

In [None]:
import random
# Plot four random label grids from non-zero cluster 1 as a visual check.
samples_x = [val for x, val in enumerate(nonzero) if kmeans.labels_[x] == 1]
print(samples_x)
randoms = [data_y[x].reshape((14, 14)) for x in random.sample(samples_x, 4)]
multiplot(randoms)

In [None]:
# Number of positive label cells per sample.
percs = np.sum(data_y.reshape((-1, 14*14)), axis = 1)
idx = list(range(len(data_y)))
# Print every (n // 15)-th value of the sorted counts as rough quantiles.
stride = len(data_y)//15
print([i for x, i in enumerate(sorted(percs)) if x % stride == 0 ])

# Bin the samples with at least one positive cell by their count. Bin k holds
# the counts in (bin_edges[k], bin_edges[k + 1]]; the last bin is open-ended.
bin_edges = [0, 5, 9, 14, 19, 27, 33, 41, 56, 93, 120, np.inf]
ids = {
    key: [x for x, z in zip(idx, percs) if lower < z <= upper]
    for key, (lower, upper) in enumerate(zip(bin_edges[:-1], bin_edges[1:]))
}

In [None]:
def balance_data(data_y, ids, labels, labels2, unaugmented = unaugmented):
    """Add the zero-tree clusters to ``ids`` under keys 11 and upwards.

    For every cluster ``c`` in ``range(ZERO_CLUSTERS)`` the entries of the
    module-level ``zeros`` list whose cluster label is ``c`` are stored at
    ``ids[c + 11]``. ``ids`` is modified in place and also returned.
    ``data_y``, ``labels`` and ``unaugmented`` are accepted but not used.

    Args:
        data_y: unused.
        ids: dict mapping bin key to a list of sample indices.
        labels: unused.
        labels2: cluster label for every entry of ``zeros``.
        unaugmented: unused.

    Returns:
        The updated ``ids`` dict.
    """
    for cluster in range(ZERO_CLUSTERS):
        ids[cluster + 11] = [
            sample for position, sample in enumerate(zeros)
            if labels2[position] == cluster
        ]
    return ids



In [None]:
# Extend the bins with the zero-tree clusters and report how many samples are
# covered in total.
ids = balance_data(data_y, ids, kmeans.labels_, kmeans_zero.labels_ )
items = [item for bin_members in ids.values() for item in bin_members]
print("The {} samples have been balanced between the sampling sites".format(len(items)))

In [None]:
# Stratified split: the first TRAIN_RATIO share of every bin goes to train,
# the remainder to test.
train_ids = []
test_ids = []
for i in ids:
    ln = len(ids[i])
    # Floor a scalar; wrapping the product in a list relied on converting a
    # 1-element array to int, which NumPy has deprecated.
    train_len = int(np.floor(ln * TRAIN_RATIO))
    test_len = ln - train_len
    print(train_len, test_len, ln)
    train_ids += ids[i][:train_len]
    test_ids += ids[i][train_len:]

# Bin key of every training sample. A single reverse index replaces a scan of
# every bin per sample; `setdefault` keeps the first bin that lists a sample,
# matching the previous first-match lookup.
bin_of_sample = {}
for k, v in ids.items():
    for sample in v:
        bin_of_sample.setdefault(sample, k)
train_labels = [bin_of_sample[i] for i in train_ids]

In [None]:
# Compare the label statistics of the two splits.
train_totals = [np.sum(x) for x in data_y[train_ids]]
test_totals = [np.sum(x) for x in data_y[test_ids]]
train_zero_count = len([total for total in train_totals if total == 0])
test_zero_count = len([total for total in test_totals if total == 0])
test_id_set = set(test_ids)
overlap_count = len([x for x in train_ids if x in test_id_set])

print("Train and test characteristics:")
print("Train mean Y {}".format(np.mean(train_totals)))
print("Test mean Y {}".format(np.mean(test_totals)))
print("Train STD Y {}".format(np.std(train_totals)))
print("Test STD Y {}".format(np.std(test_totals)))
# The zero-tree counts are scaled by 0.2 (train) and 0.8 (test) before printing.
print("Train number with zero trees {}".format(0.2*train_zero_count))
print("Test number with zero trees {}".format(0.8*test_zero_count))
print("Train mean NDVI")
print("Test mean NDVI")
print("There are {} train and {} test samples".format(len(train_ids), len(test_ids)))
print("There is {} overlap between train and test".format(overlap_count))


# Augment training data

In [None]:
# Every training sample is emitted four times: unchanged, flipped along the
# first spatial axis, flipped along both, and flipped along the second.
# Imagery is (time, rows, cols, channels) and labels are (rows, cols), so the
# imagery flip axes are the label flip axes shifted by one.
flip_axes = [
    (None, None),
    (1, 0),
    ((2, 1), (1, 0)),
    (2, 1),
]

data_x_augmented = []
data_y_augmented = []
lengths_augmented = []
labels_augmented = []
for i, val in enumerate(train_ids):
    for x_axis, y_axis in flip_axes:
        if x_axis is None:
            x1, y1 = data_x[val], data_y[val]
        else:
            x1, y1 = np.flip(data_x[val], x_axis), np.flip(data_y[val], y_axis)
        data_x_augmented.append(x1)
        data_y_augmented.append(y1)
        # Use the true sequence length. `data_x[val].shape[0]` is the padded
        # time dimension, which would make training lengths disagree with the
        # test lengths taken from `lengths`.
        lengths_augmented.append(lengths[val])
        labels_augmented.append(train_labels[i])

train_x = np.stack(data_x_augmented)
train_y = np.stack(data_y_augmented)
train_y = np.reshape(train_y, (train_y.shape[0], 14, 14, 1))
train_l = np.stack(lengths_augmented)
train_l = np.reshape(train_l, (train_y.shape[0], 1))

In [None]:
# Materialise the (un-augmented) test split. After this, row k of each array
# corresponds to test_ids[k], not to the original dataset index.
test_x = data_x[test_ids]
test_y = data_y[test_ids]
test_lengths = lengths[test_ids]

In [None]:
# Visual check: rows 25-28 of the training labels; these rows come from the
# augmented training set.
print("RANDOM TRAIN SAMPLES - SHOULD BE AUGMENTED")
multiplot([x.reshape(14, 14) for x in train_y[25:29]])

In [None]:
# Visual check: rows 25-28 of the test labels, which are not augmented.
print("RANDOM TEST SAMPLES - SHOULD BE NOT AUGMENTED")
multiplot([x.reshape(14, 14) for x in test_y[25:29]])

# Loss definition

In [None]:
from keras.losses import binary_crossentropy
from tensorflow.python.ops import array_ops
import math

def smooth_jaccard(y_true, y_pred, smooth=1):
    """Smoothed Jaccard (IoU) loss computed per sample over 14x14 maps.

    Args:
        y_true: ground-truth tensor, reshaped to (-1, 196).
        y_pred: prediction tensor, reshaped to (-1, 196).
        smooth: smoothing term added to numerator and denominator; also
            scales the returned loss.

    Returns:
        One loss value per sample: ``(1 - jaccard) * smooth``.
    """
    flat_true = tf.reshape(y_true, (-1, 14*14))
    flat_pred = tf.reshape(y_pred, (-1, 14*14))
    overlap = K.sum(K.abs(flat_true * flat_pred), axis=-1)
    total = K.sum(K.abs(flat_true) + K.abs(flat_pred), axis=-1)
    union = total - overlap
    jaccard = (overlap + smooth) / (union + smooth)
    return (1 - jaccard) * smooth

def focal_loss_fixed(y_true, y_pred, gamma = 2., alpha = 0.25):
    """Focal-style loss over 14x14 maps with an unmodulated positive term.

    The positive term is ``alpha * log(p)`` (its modulating factor is fixed at
    1); the negative term is ``(1 - alpha) * p**gamma * log(1 - p)``. Each
    term is averaged over all elements and the negated sum is doubled.

    Args:
        y_true: ground-truth tensor, reshaped to (-1, 196, 1).
        y_pred: predicted probabilities, reshaped to (-1, 196, 1).
        gamma: exponent of the negative-class modulating factor.
        alpha: weight of the positive term.

    Returns:
        A scalar loss tensor.
    """
    shape = (-1, 14*14, 1)
    y_true = tf.reshape(y_true, shape)
    y_pred = K.clip(tf.reshape(y_pred, shape), 1e-8, 1-1e-8)

    # Probabilities at positive / negative positions; neutral values elsewhere.
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

    # Clip to prevent NaNs and Infs.
    epsilon = K.epsilon()
    pt_1 = K.clip(pt_1, epsilon, 1. - epsilon)
    pt_0 = K.clip(pt_0, epsilon, 1. - epsilon)

    positive_term = K.mean(alpha * 1 * K.log(K.epsilon()+pt_1))
    negative_term = K.mean((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0 + K.epsilon()))
    loss = -positive_term - negative_term
    return 2 * tf.reduce_mean(loss)

def bce_jaccard(y_true, y_pred):
    """Binary cross-entropy plus 0.15 times the smoothed Jaccard loss.

    The per-sample Jaccard term is reshaped to (-1, 1, 1, 1) before it is
    added to the cross-entropy.
    """
    jaccard_term = tf.reshape(smooth_jaccard(y_true, y_pred), (-1, 1, 1, 1))
    cross_entropy = binary_crossentropy(y_true, y_pred)
    return cross_entropy + 0.15*jaccard_term

def weighted_bce_loss(y_true, y_pred, weight, smooth = 0.01):
    """Mean weighted cross-entropy computed from predicted probabilities.

    Probabilities are clipped to (1e-7, 1 - 1e-7) and converted to logits
    before being handed to ``tf.nn.weighted_cross_entropy_with_logits``.
    ``smooth`` is accepted but not used.

    Args:
        y_true: ground-truth tensor.
        y_pred: predicted probabilities.
        weight: weight forwarded as the third positional argument.
        smooth: unused.

    Returns:
        A scalar loss tensor.
    """
    epsilon = 1e-7
    clipped = K.clip(y_pred, epsilon, 1. - epsilon)
    logits = K.log(clipped / (1. - clipped))
    per_element = tf.nn.weighted_cross_entropy_with_logits(y_true, logits, weight)
    return tf.reduce_mean(per_element)

def lovasz_foc(y_true, y_pred):
    """Focal loss (gamma 1.3) plus half of the per-image Lovasz loss."""
    focal_term = focal_loss_fixed(y_true, y_pred, gamma = 1.3)
    label_maps = tf.reshape(y_true, (-1, 14, 14))
    lovasz_term = lovasz_softmax(y_pred, label_maps, classes=[1], per_image=True)
    return focal_term + 0.5 * lovasz_term

def lovasz(y_true, y_pred):
    """Per-image Lovasz loss for class 1 on labels reshaped to (-1, 14, 14)."""
    label_maps = tf.reshape(y_true, (-1, 14, 14))
    return lovasz_softmax(y_pred, label_maps, classes=[1], per_image=True)

def calculate_metrics():
    """Evaluate the model on the test split and report the best-F1 threshold.

    Reads the module-level session, graph tensors (``fm``, ``test_loss``,
    ``inp``, ``length``, ``is_training``, ``labels``) and test arrays
    (``test_x``, ``test_y``, ``test_lengths``). Inference runs once per test
    sample; the raw maps are then binarised at thresholds 0.35 to 0.55 in
    steps of 0.05 and scored with ``thirty_meter``.

    Changes from the previous version:
      * samples are addressed by position in ``test_x``. The test arrays are
        built as ``data_x[test_ids]``, so indexing them with the original
        dataset ids in ``test_ids`` picked the wrong rows or ran out of range;
      * precision and recall are printed under their own labels (they were
        swapped);
      * inference is no longer repeated for every threshold;
      * precision, recall and F1 fall back to 0 on a zero denominator;
      * a prediction exactly equal to the threshold now counts as negative
        rather than being left un-binarised.

    Returns:
        The best F1 score over the evaluated thresholds.
    """
    # --- One inference pass over the test split -------------------------------
    raw_preds, truths, val_loss = [], [], []
    for m in range(len(test_x)):
        y, vl = sess.run([fm, test_loss], feed_dict={inp: test_x[m].reshape(1, 24, 16, 16, INP_FILTERS),
                                  length: test_lengths[m].reshape(1, 1),
                                  is_training: False,
                                  labels: test_y[m, :, :].reshape(1, 14, 14),
                                  })
        raw_preds.append(y.reshape((14, 14)))
        truths.append(test_y[m].reshape((14, 14)))
        val_loss.append(np.mean(vl))

    # --- Threshold sweep ------------------------------------------------------
    best_f1 = 0
    best_thresh = 0
    p = 0
    r = 0
    error = 0
    for j in range(7, 12):
        thresh = j*0.05
        tps, fps, fns, trues, preds = [], [], [], [], []
        for true, raw in zip(truths, raw_preds):
            pred = (raw > thresh).astype(raw.dtype)
            tp, fp, fn = thirty_meter(true, pred)
            tps.append(tp)
            fps.append(fp)
            fns.append(fn)
            trues.append(np.sum(true))
            preds.append(np.sum(pred))

        tp_sum, fp_sum, fn_sum = np.sum(tps), np.sum(fps), np.sum(fns)
        oa_error = abs(np.sum(preds) - np.sum(trues)) / np.sum(trues)
        precision = tp_sum / (tp_sum + fp_sum) if (tp_sum + fp_sum) else 0.
        recall = tp_sum / (tp_sum + fn_sum) if (tp_sum + fn_sum) else 0.
        f1 = 2*((precision * recall) / (precision + recall)) if (precision + recall) else 0.
        if f1 > best_f1:
            best_f1 = f1
            p = precision
            r = recall
            error = oa_error
            best_thresh = thresh

    print("Val loss: {} Thresh: {} F1: {} Recall: {} Precision: {} Error: {}".format(np.around(np.mean(val_loss), 3), np.around(best_thresh, 2),
                                                                                     np.around(best_f1, 3), np.around(r, 3), np.around(p, 3),
                                                                                     np.around(error, 3)))
    return best_f1

In [None]:
# From here on `train_ids` indexes the augmented training arrays.
train_ids = list(range(len(train_y)))
print(len(train_ids))

# Equibatch creation

In [None]:

def equibatch(train_ids, lovasz = False):
    """Build an epoch of 16-sample batches balanced over tree-count bins.

    Samples are binned by the number of positive cells in the module-level
    ``train_y``: one bin for zero and eleven bins for increasing counts. Each
    batch takes two samples from the zero bin, one from every other bin and
    three more from randomly chosen bins, then shuffles them.

    Changes from the previous version:
      * the bin covering counts in (19, 27] now includes 27; samples with
        exactly 27 positives were previously left out of every bin;
      * the third random pick offsets the *index* into the bin by one, like
        the other two picks; the ``+ 1`` used to be added to the sample id;
      * the inner loop no longer reuses the outer loop variable.

    Args:
        train_ids: list of indices into ``train_y``; shuffled IN PLACE.
        lovasz: accepted but not used.

    Returns:
        A flat list of sample ids, 16 per batch, ``len(train_ids) // 16``
        batches long.

    Raises:
        IndexError: may be raised when a bin holds fewer than two samples.
    """
    np.random.shuffle(train_ids)
    ix = train_ids
    percs = [np.sum(x) for x in train_y[ix]]

    # Bin 0 holds the all-zero samples; bin k (k >= 1) holds counts in
    # (edges[k - 1], edges[k]], with the last bin open-ended.
    edges = [0, 5, 9, 14, 19, 27, 33, 41, 56, 80, 120, np.inf]
    all_ids = [[x for x, z in zip(ix, percs) if z == 0]]
    for lower, upper in zip(edges[:-1], edges[1:]):
        all_ids.append([x for x, z in zip(ix, percs) if lower < z <= upper])

    n_bins = len(all_ids)
    maxes = [len(members) for members in all_ids]
    cur_ids = [0] * n_bins
    new_batches = []
    iter_len = len(train_ids)// 16
    for _ in range(iter_len):
        random_ids = np.random.randint(0, n_bins, 3)
        # Wrap a bin's cursor before it can run past the end of the bin.
        for bin_no, cursor in enumerate(cur_ids):
            if cursor > maxes[bin_no] - 5:
                cur_ids[bin_no] = 0
        if cur_ids[0] >= (maxes[0] - 2):
            cur_ids[0] = 0

        # Two zero-bin samples, one per non-zero bin, three random extras.
        to_append = [all_ids[0][cur_ids[0]], all_ids[0][cur_ids[0] + 1]]
        to_append += [all_ids[bin_no][cur_ids[bin_no]] for bin_no in range(1, n_bins)]
        to_append += [all_ids[bin_no][cur_ids[bin_no] + 1] for bin_no in random_ids]
        np.random.shuffle(to_append)
        new_batches.append(to_append)

        # Advance every cursor; bins that supplied extra samples advance more.
        cur_ids = [cursor + 1 for cursor in cur_ids]
        cur_ids[0] += 1
        for bin_no in random_ids:
            cur_ids[bin_no] += 1

    return [item for sublist in new_batches for item in sublist]

In [None]:
# Build one epoch of balanced batches for the visual checks in the next cells.
batch = equibatch(train_ids, True)

In [None]:
# Label grids of the first four samples of the first batch.
multiplot([x.reshape((14, 14)) for x in train_y[batch[:4]]])

In [None]:
# Label grids of samples 4-7 of the first batch.
multiplot([x.reshape((14, 14)) for x in train_y[batch[4:8]]])

In [None]:
# Label grids of samples 8-11 of the first batch.
multiplot([x.reshape((14, 14)) for x in train_y[batch[8:12]]])

In [None]:
# Mean number of positive label cells per sample across the balanced epoch.
np.mean([np.sum(x) for x in train_y[batch]])

In [None]:
#new_saver = tf.train.import_meta_graph('../models/f1_auc/model.meta')
#new_saver.restore(sess, tf.train.latest_checkpoint('../models/f1_auc/'))

# Model training

In [None]:
# Training configuration and (optionally) graph construction for the training loop.
# NOTE(review): when FRESH_START is False nothing in this cell defines train_op,
# train_loss or saver, which the training loop relies on -- presumably they persist
# from an earlier run of this cell in the same session or from a restored graph; confirm.
FRESH_START = False
# NOTE(review): FINE_TUNE is not read anywhere in this cell -- confirm it is still used.
FINE_TUNE = False
from tensorflow.python.keras.optimizers import SGD
# Scalar float placeholder.
# NOTE(review): both optimizers below are built with constant rates rather than this
# placeholder -- TODO confirm whether it is still needed.
learning_rate = tf.placeholder(tf.float32, shape=[])


BATCH_SIZE = 16
print("Starting model with: \n {} zone out \n {} l2 \n {} initial LR \n {} final LR \n {} parameters"
     .format(ZONE_OUT_PROB, L2_REG, INITIAL_LR, FINAL_LR, total_parameters))
# Score that calculate_metrics() has to exceed in the training loop before a checkpoint is saved.
best_val = 0.620
if not FRESH_START:
    print("Resuming training with a best validation score of {}".format(best_val))
if FRESH_START:
    print("Restarting training from scratch on {} train and {} test samples, total {}".format(len(train_ids), len(test_ids), N_SAMPLES))

    # Optimizer behind train_op: Adam wrapped so its gradients are clipped by norm.
    optimizer = tf.train.AdamOptimizer(2e-4, epsilon = 1e-7)
    optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, clip_norm=1e-1)

    # Second Adam optimizer behind ft_op (presumably "fine-tune"), with a lower rate
    # and a larger epsilon, clipped the same way.
    ft_optimizer = tf.train.AdamOptimizer(1e-4, epsilon = 1e-4)
    ft_optimizer = tf.contrib.estimator.clip_gradients_by_norm(ft_optimizer, clip_norm=0.1)
    
    # Every loss compares the labels, reshaped to (-1, 14, 14, 1), against the model output fm.
    train_loss = lovasz_foc(tf.reshape(labels, (-1, 14, 14, 1)), fm)
    #train_loss = focal_loss_fixed(labels, fm, gamma = 1.1, alpha = 0.5)
    # NOTE(review): test_loss is built here but not used elsewhere in this cell.
    test_loss = weighted_bce_loss(tf.reshape(labels, (-1, 14, 14, 1)), fm, weight = 1.)
    ft_loss = lovasz(tf.reshape(labels, (-1, 14, 14, 1)), fm)
    # The regularization term is added to train_loss only; ft_loss is left without it.
    l2_loss = tf.losses.get_regularization_loss()
    train_loss += l2_loss
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    
    # Make both training ops depend on the ops collected in UPDATE_OPS
    # (typically batch-norm moving-average updates -- confirm for this model).
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(train_loss)   
        #wu_op = wu_optimizer.minimize(wu_loss)
        ft_op = ft_optimizer.minimize(ft_loss)
    # Initialise every variable, then create a saver that keeps the two most recent checkpoints.
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    saver = tf.train.Saver(max_to_keep = 2)

# Run training loop
for i in range(1, 200):
    # New sample ordering every epoch, clamped so no id points past the end of train_y
    last_train_idx = len(train_y) - 1
    randomize = [np.min([x, last_train_idx]) for x in equibatch(train_ids, lovasz = False)]
        #print("Loss: BCE + 0.30 Lovasz")
    # Op / loss pair optimised this epoch (ft_op / ft_loss is the alternative pair)
    op, loss = train_op, train_loss
    #op = ft_op
    #loss = ft_loss
    BATCH_SIZE = 16
    test_ids = list(range(len(test_x)))
    losses, bce_losses = [], []

    # keep_rate drops by 0.005 per epoch and never goes below 0.85
    epoch_keep_rate = np.max((1 - (i*0.005), 0.85))
    n_batches = int(len(train_ids) // BATCH_SIZE)

    for k in tnrange(n_batches):
        batch_start = k * BATCH_SIZE
        batch_ids = randomize[batch_start:batch_start + BATCH_SIZE]
        batch_y = train_y[batch_ids, :, :].reshape(len(batch_ids), 14, 14)
        feed = {
            inp: train_x[batch_ids, :, :, :],
            length: train_l[batch_ids].reshape((-1, 1)),
            labels: batch_y,
            is_training: True,
            keep_rate: epoch_keep_rate,
        }
        opt, tr = sess.run([op, loss], feed_dict=feed)
        losses.append(tr)

    # The reported epoch loss leaves out the final batch (losses[:-1])
    epoch_loss = np.around(np.mean(losses[:-1]), 3)
    print("Epoch {}: Loss {}".format(i, epoch_loss))

    # Checkpoint only when the validation score beats the best one so far
    f1 = calculate_metrics()
    if f1 > best_val:
        best_val = f1
        print("Saving model with {}".format(f1))
        save_path = saver.save(sess, "../models/f1_auc/model")

# Model validation and sanity checks





In [None]:
# Reset the plotting cursor and index every test sample by position
start = 0
test_ids = list(range(len(test_x)))

def multiplot(matrices, nrows = 2, ncols = 4):
    '''Plot multiple heatmaps on an nrows x ncols grid of subplots.

    Matrices fill the grid row by row: row r shows
    matrices[r*ncols:(r+1)*ncols]. If fewer than nrows*ncols matrices are
    given, the remaining axes are left empty; extra matrices are ignored.

    Parameters:
        matrices: sequence of 2-D arrays, each drawn with sns.heatmap
        nrows: number of subplot rows
        ncols: number of subplot columns
    '''
    # squeeze=False keeps axs two-dimensional, so axs[row, col] also works
    # when the grid has a single row or a single column.
    fig, axs = plt.subplots(nrows = nrows, ncols = ncols, squeeze = False)
    # 5 inches per column keeps the 20-inch width of the default 4-column grid.
    fig.set_size_inches(5*ncols, 4*nrows)
    for row in range(nrows):
        first = row * ncols
        for col, matrix in enumerate(matrices[first:first + ncols]):
            ax = axs[row, col]
            # Fixed colour scale so heatmaps are comparable across subplots
            sns.heatmap(data = matrix, ax = ax, vmin = 0, vmax = 0.9)
            ax.set_xlabel("")
            ax.set_ylabel("")
            ax.set_yticks([])
            ax.set_xticks([])
    plt.show()
# Begin the validation plots at position 15 of the test ids
# (this overrides the 0 assigned at the top of this cell).
start = 15




In [None]:
import random 

    
# Predict on eight consecutive test samples and plot four of them:
# true labels on the top row, model output on the bottom row.
test_losses = []
print(start/len(test_ids))
test_ids = sorted(test_ids)
matrix_ids = [test_ids[start + offset] for offset in range(8)]
#matrix_ids = random.sample(test_ids, 4)

preds = []
trues = []
for i in matrix_ids:
    print(i)
    # Feed one sample at a time (leading batch dimension of 1)
    feed = {
        inp: test_x[i].reshape(1, 24, IMAGE_SIZE, IMAGE_SIZE, INP_FILTERS),
        length: test_lengths[i].reshape(1, 1),
        is_training: False,
    }
    y = sess.run([fm], feed_dict=feed)
    preds.append(np.array(y).reshape(14, 14))
    trues.append(test_y[i].reshape(LABEL_SIZE, LABEL_SIZE))

# Only the first four samples are drawn
to_plot = trues[:4] + preds[:4]# + trues[5:] + preds[5:]
multiplot(to_plot, nrows = 2, ncols = 4)
#plot_ids[ordering[976]//4] 

# Advance the cursor so re-running this cell shows the next four samples
start += 4


In [None]:
import random 

    
# Predict on four randomly drawn training samples and plot them:
# true labels on the top row, model output on the bottom row.
test_losses = []
print(start/len(test_ids))
test_ids = sorted(test_ids)
#matrix_ids = [test_ids[start + offset] for offset in range(8)]
matrix_ids = random.sample(train_ids, 4)

preds = []
trues = []
for i in matrix_ids:
    print(i)
    # Feed one sample at a time (leading batch dimension of 1)
    feed = {
        inp: train_x[i].reshape(1, 24, IMAGE_SIZE, IMAGE_SIZE, INP_FILTERS),
        length: train_l[i].reshape(1, 1),
        is_training: False,
    }
    y = sess.run([fm], feed_dict=feed)
    preds.append(np.array(y).reshape(14, 14))
    trues.append(train_y[i].reshape(LABEL_SIZE, LABEL_SIZE))


to_plot = trues[:4] + preds[:4]# + trues[5:] + preds[5:]
multiplot(to_plot, nrows = 2, ncols = 4)
#plot_ids[ordering[976]//4] 
start += 4

In [None]:
# Positions in plot_ids to drop: each entry of `outliers` maps to position entry // 4.
# The set is built once so the filter below is a single pass with O(1) lookups,
# instead of rebuilding the list of positions for every element of plot_ids.
outlier_positions = {outlier // 4 for outlier in outliers}
plot_ids2 = [plot_id for pos, plot_id in enumerate(plot_ids) if pos not in outlier_positions]
plot_ids2[ordering[460]//4] 

# Calculate ROC for best threshold selection

In [None]:
# Recompute the validation metrics; the training loop treats this helper's
# return value as the F1 score it compares against best_val.
calculate_metrics()

## TODO @jombrandt: list the top 10 worst training and test samples by IoU

These should be written to a .txt file under tmp/ and indexed by validate-data.ipynb, both to verify that the original classifications were correct and to identify regions that need more training data.