In [1]:
import os
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn

import tensorflow as tf
import tensorflow.keras as keras

from matplotlib import pyplot as plt

# Report the interpreter and library versions this notebook was executed with.
print('### Python version: ' + sys.version)
print('### NumPy version: ' + np.__version__)
print('### Scikit-learn version: ' + sklearn.__version__)
print('### Tensorflow version: ' + tf.__version__)
print('### TF Keras version: ' + keras.__version__)
print('------------')

# Single seed shared by the train/val/test split and the global RNGs below.
random_state=123

# Seed the global NumPy and TensorFlow generators as well, so weight
# initialisation, dropout masks and batch shuffling repeat across
# Restart-Kernel -> Run-All executions (the split alone was seeded before).
np.random.seed(random_state)
tf.random.set_seed(random_state)

### Python version: 3.8.5 (default, Jul 28 2020, 12:59:40) 
[GCC 9.3.0]
### NumPy version: 1.19.5
### Scikit-learn version: 0.24.0
### Tensorflow version: 2.4.0
### TF Keras version: 2.4.0
------------


In [2]:
# Load the measurements and drop the rows whose `name` is 'ref'.
data = pd.read_csv('minispect_dataset.csv').loc[lambda frame: frame['name'] != 'ref']
# Show which sample names remain after filtering.
print(data['name'].unique())

['1a' '1b' '1c' '2a' '2b' '3a' '3b' '4' '5a']


In [64]:
from sklearn.model_selection import train_test_split
# Thanks to stackoverflowuser2010 https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def _add_squared_features(frame):
    """Keep the columns of `frame` whose names consist only of digits and append their squares.

    The squared copy of column ``c`` is named ``'poly<c>'`` and placed after
    all of the original digit-named columns.
    """
    spectral = frame.filter(regex=r'^\d+$')
    squared = (spectral ** 2).add_prefix('poly')
    # One concat instead of inserting the squared columns one at a time.
    return pd.concat([spectral, squared], axis=1)


def minispect_test_val_train_split(data, y='gitelson', stratify_col='name', train=.7, val=.15, test=.15):
    """Split `data` into stratified train / validation / test feature and target sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column `y`, the `stratify_col` column and
        the feature columns (those whose names consist only of digits).
    y : str
        Name of the target column.
    stratify_col : str
        Name of the column whose values are used to stratify both splits.
    train, val, test : float
        Fractions of the rows assigned to each set; they must sum to 1.

    Returns
    -------
    tuple
        ``(train_x, train_y, val_x, val_y, test_x, test_y)``. Each ``*_x``
        holds the digit-named columns plus their squares (``'poly<col>'``);
        each ``*_y`` is the matching slice of the target column.

    Raises
    ------
    ValueError
        If the three fractions do not sum to 1 (within floating-point tolerance).

    Notes
    -----
    Both splits use the module-level ``random_state`` as their seed.
    """
    # Tolerant comparison: exact float equality rejects valid inputs such as
    # train=.7, val=.2, test=.1, whose sum is 0.9999999999999999.
    if not np.isclose(train + val + test, 1.0):
        raise ValueError('Train, test, and validation do not add to 1 ({},{},{})'.format(train,test,val))

    targets = data[y]

    # First carve off the training rows; the remainder is shared by val and test.
    train_x, temp_x, train_y, temp_y = train_test_split(
        data, targets,
        stratify=data[stratify_col],
        test_size=(1.0 - train),
        random_state=random_state)

    # `test_size` is the share given to the SECOND returned pair (the test
    # set), so it must be test/(val+test); val/(val+test) swapped the two
    # sizes whenever val != test.
    val_x, test_x, val_y, test_y = train_test_split(
        temp_x, temp_y,
        stratify=temp_x[stratify_col],
        test_size=(test / (val + test)),
        random_state=random_state)

    train_x = _add_squared_features(train_x)
    val_x = _add_squared_features(val_x)
    test_x = _add_squared_features(test_x)

    return train_x, train_y, val_x, val_y, test_x, test_y
    


In [65]:
# Build the six train/validation/test pieces with the default 70/15/15 fractions.
train_x, train_y, val_x, val_y, test_x, test_y = minispect_test_val_train_split(data)

# Feature-matrix shapes, one per line, followed by the three target shapes.
print(*(part.shape for part in (train_x, val_x, test_x)), sep='\n')
print(train_y.shape, val_y.shape, test_y.shape)

(629, 576)
(135, 576)
(136, 576)
(629,) (135,) (136,)


[-2.2 -1.2 -0.2  0.8  1.8]
[4.84 1.44 0.04 0.64 3.24]
2.04


In [67]:
### Mean Baseline -- just guess the mean Gitelson chlorophyll content of the training dataset.
# Module-level copy of the training-target mean; mse_baseline() below computes
# its own local value of the same name and does not read this one.
avg_chl_train = np.mean(train_y)

def mse_baseline(train_y, val_y, test_y):
    """Print the MSE obtained by always predicting the mean of `train_y`.

    The mean of `train_y` is printed first, then the mean squared error of
    that constant prediction against `train_y`, `val_y` and `test_y`.
    Nothing is returned.
    """
    constant_guess = np.mean(train_y)
    print('Average Chl Content -- Training Set', constant_guess)

    def _mse_against_guess(targets):
        # Mean of the squared residuals around the constant prediction.
        return np.mean(np.square(targets - constant_guess))

    # Evaluate all three scores before reporting any of them.
    scores = [
        ('train_mse', _mse_against_guess(train_y)),
        ('val_mse', _mse_against_guess(val_y)),
        ('test_mse', _mse_against_guess(test_y)),
    ]

    for label, score in scores:
        print(label, score)
    
# Print the mean-guess baseline MSE for the train, validation and test targets.
mse_baseline(train_y, val_y, test_y)




Average Chl Content -- Training Set 0.9114403261675911
train_mse 0.13552426565223893
val_mse 0.12525641692086517
test_mse 0.13469139669315097


In [68]:
### Linear Regression Baseline -- fit an ordinary least-squares linear model on the training features.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit the linear baseline on the training split (fit() returns the estimator itself).
lin_reg_baseline = LinearRegression().fit(train_x, train_y)

# Predictions for each split, kept as separate names for later inspection.
train_pred = lin_reg_baseline.predict(train_x)
val_pred = lin_reg_baseline.predict(val_x)
test_pred = lin_reg_baseline.predict(test_x)

# MSE on train, validation and test, one value per line.
print(*(mean_squared_error(actual, predicted)
        for actual, predicted in ((train_y, train_pred),
                                  (val_y, val_pred),
                                  (test_y, test_pred))),
      sep='\n')

# Positions and values of the three largest (signed) coefficients.
coefficients = np.asarray(lin_reg_baseline.coef_)
max_weight_indices = np.argsort(coefficients)[-3:]
print(max_weight_indices)
print(coefficients[max_weight_indices])



0.000318739149077513
0.23137948577510678
0.17216720423491735
[112  71 131]
[642194.63948634 652873.10948099 849140.43216505]


In [69]:
class PerfEvalCustomCallback(keras.callbacks.Callback):
    """Keras callback that copies per-epoch loss and MSE values into a caller-owned array.

    Row ``epoch`` of ``perf_data`` is filled at the end of each epoch with
    ``[loss, mse, val_loss, val_mse]``, so ``perf_data`` must be indexable
    as ``perf_data[epoch, 0..3]`` for every epoch that runs.
    """

    def __init__(self, perf_data):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.perf_data = perf_data

    @staticmethod
    def _first_present(logs, *names):
        """Return ``logs[name]`` for the first of `names` found in `logs`; raise KeyError if none is."""
        for name in names:
            if name in logs:
                return logs[name]
        raise KeyError('none of {} found in epoch logs (available: {})'.format(names, sorted(logs)))

    # we define the on_epoch_end callback and save the loss and MSE in perf_data
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        # The key of the MSE entry follows the name of the compiled metric.
        # 'mse' is tried first (original behaviour); 'mean_squared_error' is
        # presumably what a model compiled with the 'MeanSquaredError' metric
        # reports -- TODO confirm against the keys of History.history.
        self.perf_data[epoch,0] = logs['loss']
        self.perf_data[epoch,1] = self._first_present(logs, 'mse', 'mean_squared_error')
        self.perf_data[epoch,2] = logs['val_loss']
        self.perf_data[epoch,3] = self._first_present(logs, 'val_mse', 'val_mean_squared_error')

    def get_perf_data(self):
        """Return the array this callback writes into (was uncallable: `self` was missing)."""
        return self.perf_data

In [70]:
def train_model(model, max_epochs=25, batch_size=100, verbose=0,
                   dataset=None):
    """Fit `model` with early stopping and per-epoch performance recording.

    Parameters
    ----------
    model : compiled Keras model
        Model to train; it is fitted in place.
    max_epochs : int
        Upper bound on the number of epochs.
    batch_size : int
        Mini-batch size passed to ``model.fit``.
    verbose : int
        Verbosity passed to ``model.fit``.
    dataset : 6-tuple or None
        ``(train_x, train_y, val_x, val_y, test_x, test_y)``. When omitted,
        the notebook-level variables of those names are read at call time.

    Returns
    -------
    tuple
        ``(perf_data, dataset)`` where ``perf_data`` has one row per epoch
        actually run, as filled in by ``PerfEvalCustomCallback``.
    """
    # Resolve the default when called, not when defined: a tuple default is
    # evaluated once at `def` time and would keep pointing at an old split
    # after the split cell is re-run.
    if dataset is None:
        dataset = (train_x, train_y, val_x, val_y, test_x, test_y)

    # unpack dataset (the test pair is only carried through to the result)
    fit_x, fit_y, holdout_x, holdout_y, _test_x, _test_y = dataset

    # this is the callback we'll use for early stopping; note it watches the
    # training loss, not the validation loss
    early_stop_cb = keras.callbacks.EarlyStopping(monitor='loss', mode='min', patience=4)

    # setup the performance data callback
    perf_data = np.zeros((max_epochs, 4))
    perf_eval_cb = PerfEvalCustomCallback(perf_data)

    hobj = model.fit(fit_x, fit_y, validation_data=(holdout_x, holdout_y), epochs=max_epochs, batch_size=batch_size,
                     shuffle=True, callbacks=[perf_eval_cb, early_stop_cb], verbose=verbose)

    # Early stopping may end training before max_epochs; keep only the rows written.
    eff_epochs = len(hobj.history['loss'])
    eval_data = (perf_data[0:eff_epochs,:], dataset) # tuple of evaluation data

    return eval_data



In [71]:
def create_compile_model(name='DNN-Minispect', hidden_activation='elu', input_shape=576, num_outputs=1, hidden_widths=[288,72,24]):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    name : str
        Name given to the Sequential model.
    hidden_activation : str
        Activation used by every hidden Dense layer.
    input_shape : int
        Number of input features.
    num_outputs : int
        Width of the linear output layer.
    hidden_widths : sequence of int
        Width of each hidden layer, in order. The default list is only
        iterated, never mutated.

    Returns
    -------
    keras.models.Sequential
        Model compiled with MSE loss, the Adam optimizer (learning rate
        1e-4) and the 'MeanSquaredError' metric.
    """
    model = keras.models.Sequential(name=name)

    model.add(keras.layers.Input(shape=(input_shape,), sparse=False))
    # The Input layer above already fixes the input shape, so Dropout needs no input_shape.
    model.add(keras.layers.Dropout(0.2))
    for i, hw in enumerate(hidden_widths):
        # Hidden layers carry no bias term, so no bias initializer is passed.
        model.add(keras.layers.Dense(hw, activation=hidden_activation, name='hidden_{}'.format(i),
                                     kernel_initializer=keras.initializers.RandomNormal(stddev=np.sqrt(1/hw)),
                                     use_bias=False))

    model.add(keras.layers.Dense(num_outputs, activation='linear', name='output',
                                 kernel_initializer=keras.initializers.RandomNormal(stddev=np.sqrt(0.1)),
                                 bias_initializer=keras.initializers.Zeros(), use_bias=True))

    # `lr` is the deprecated spelling of this argument; `learning_rate` is the supported one.
    opt = keras.optimizers.Adam(learning_rate=0.0001)

    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=opt,metrics=['MeanSquaredError'])
    return model

In [93]:
# Build the ReLU variant of the network and train it directly,
# validating against the validation split after every epoch.
model = create_compile_model(hidden_activation='relu')
model.fit(
    x=train_x,
    y=train_y,
    validation_data=(val_x, val_y),
    batch_size=5,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7ffa304d5940>

In [94]:
# Loss and compiled metric of the trained model on the held-out test split.
model.evaluate(test_x, test_y)



[0.03733113408088684, 0.03733113408088684]

In [74]:
# Shape of the model's first weight tensor.
np.asarray(model.weights[0]).shape

(576, 288)

## Export the trained model to the TensorFlow Lite format.

In [92]:
import tensorflow.lite as tflite
# Convert the trained Keras model into TensorFlow Lite form.
converter = tflite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
# write_bytes opens, writes and closes the file in one call (the previous
# open(...).write(...) never closed its handle) and still returns the number
# of bytes written, so the cell keeps displaying the file size.
Path('chl_nn_minispect.tflite').write_bytes(tflite_model)



INFO:tensorflow:Assets written to: /tmp/tmp39fla71b/assets


INFO:tensorflow:Assets written to: /tmp/tmp39fla71b/assets


755120