<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/split_experiments/dnn/variational_split_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DNN regressors

A variational model to find the mean/variance of O18 ratios at a particular lat/lon in the Brazilian Amazon. At the bottom of the colab, there are utilities to generate isoscapes from this model.

In [62]:
from collections import defaultdict
import math
import numpy as np
import pandas as pd
import datetime
from typing import List, Tuple, Dict

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from matplotlib import pyplot as plt
from tensorflow.python.ops import math_ops

#@title Debugging
# Notebook-wide settings, exposed as a Colab form through the `#@param` markers.
# See https://zohaib.me/debugging-in-google-collab-notebook/ for tips,
# as well as docs for pdb and ipdb.
# When True, the setup cell installs ipdb and turns on automatic post-mortem
# debugging.
DEBUG = False #@param {type:"boolean"}
# Google Drive mount point. It is also prepended to every Drive-relative path
# below by format_dataframe_path() / get_model_save_location(). Leave empty to
# skip mounting and use the paths as-is.
GDRIVE_BASE = "/content/drive" #@param
# Drive-relative CSV locations of the train / validation / test splits.
# NOTE(review): the code visible in this notebook reads the split paths from an
# explicit config dict passed to load_and_scale(), not from these constants --
# confirm whether they are still needed.
TRAIN = "/MyDrive/amazon_rainforest_files/amazon_sample_data/uc_davis_2023_08_12_train_random_ungrouped.csv" #@param
VALIDATION = "/MyDrive/amazon_rainforest_files/amazon_sample_data/uc_davis_2023_08_12_validation_random_ungrouped.csv" #@param
TEST = "/MyDrive/amazon_rainforest_files/amazon_sample_data/uc_davis_2023_08_12_test_random_ungrouped.csv" #@param
# Drive-relative directory of input rasters (presumably used by the isoscape
# utilities further down the notebook -- not referenced in this section).
RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
# Drive-relative directory where trained models and checkpoints are written.
MODEL_SAVE_LOCATION = "/MyDrive/amazon_rainforest_files/variational/model/" #@param
# Drive-relative directory for generated rasters (not referenced in this
# section).
OUTPUT_RASTER_BASE = "/MyDrive/amazon_rainforest_files/variational/rasters/" #@param
# Experiment label; becomes one component of RUN_ID.
EXP_ID = "ungrouped_random" #@param

def format_dataframe_path(param) -> str:
  """Prefix a Drive-relative path with the Google Drive mount point.

  Args:
    param: Path relative to the Drive root (it is appended verbatim, so it
      should start with a "/").

  Returns:
    GDRIVE_BASE immediately followed by `param`, or just `param` when
    GDRIVE_BASE is empty/None.
  """
  return f"{GDRIVE_BASE or ''}{param}"

def get_model_save_location(filename) -> str:
  """Return the full path at which a model file named `filename` is stored.

  Args:
    filename: File name (e.g. "<run_id>.h5") inside MODEL_SAVE_LOCATION.

  Returns:
    GDRIVE_BASE + MODEL_SAVE_LOCATION + filename; the GDRIVE_BASE prefix is
    omitted when it is empty/None.
  """
  drive_prefix = GDRIVE_BASE or ""
  return f"{drive_prefix}{MODEL_SAVE_LOCATION}{filename}"

# Identifier for this run, built as
#   "variational_ucd_yearly.<family or 'all'>.<EXP_ID>.<today's date>".
# TAXONOMIC_FAMILY is not assigned anywhere in this cell, so it is looked up
# defensively: when it is undefined, None or empty the run is labelled "all"
# instead of failing with a NameError on a fresh kernel.
RUN_ID = ".".join([
    "variational_ucd_yearly",
    globals().get("TAXONOMIC_FAMILY") or "all",
    EXP_ID,
    str(datetime.date.today()),
])


In [3]:
# Access data stored on Google Drive
# Mounting is skipped when GDRIVE_BASE is empty, so the notebook can also run
# against paths that do not live on Drive.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Optional debugging aids: install/import ipdb and enable the IPython `%pdb`
# magic, which drops into the debugger automatically on an uncaught exception.
# The `%...` lines are IPython magics and only work inside a notebook.
if DEBUG:
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

Mounted at /content/drive


# Data preparation


In [41]:
def load_dataset(path: str):
  """Read one sample CSV and split it into feature and label frames.

  Rows without a `d18O_cel_variance` value are discarded.

  Args:
    path: Location of a comma-separated, ISO-8859-1 encoded CSV file.

  Returns:
    A `(X, Y)` tuple: `Y` holds the `d18O_cel_mean` and `d18O_cel_variance`
    columns; `X` holds every other column except `Code`, `Family` and the
    `Unnamed: 0` index column.
  """
  label_columns = ["d18O_cel_mean", "d18O_cel_variance"]
  # Family is too sparse: many families in validation/test never appear in
  # train, so it is excluded from the features along with the identifiers.
  excluded_from_features = label_columns + ["Code", "Family", "Unnamed: 0"]

  frame = pd.read_csv(path, encoding="ISO-8859-1", sep=',')
  frame = frame.loc[frame['d18O_cel_variance'].notna()]

  features = frame.drop(columns=excluded_from_features)
  labels = frame[label_columns]
  return features, labels

Standardization

In [43]:
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.compose import ColumnTransformer

def create_feature_scaler(X: pd.DataFrame):
  """Build the feature transformer and fit it on `X`.

  Args:
    X: Feature frame to fit on (the training split). It must contain every
      column named in `columns_to_normalize`.

  Returns:
    A fitted ColumnTransformer that applies `Normalizer` to the listed columns
    and passes any other column through unchanged.
  """
  # NOTE(review): sklearn's Normalizer rescales each *row* (sample) to unit
  # norm; it does not standardize each column. If per-feature standardization
  # was intended, StandardScaler is the usual choice -- confirm before changing,
  # since previously saved models were trained with this transform.
  columns_to_normalize = ['lat', 'long', 'VPD', 'RH', 'PET', 'DEM', 'PA',
       'Mean Annual Temperature', 'Mean Annual Precipitation',
       'Iso_Oxi_Stack_mean_TERZER', 'predkrig_br_lat_ISORG',
       'isoscape_fullmodel_d18O_prec_REGRESSION']
  feature_scaler = ColumnTransformer([
      ('feature_normalizer', Normalizer(), columns_to_normalize)],
      remainder='passthrough')
  # Fit on the argument. The previous code fitted on a global `X_train`, which
  # either raised NameError or silently used stale notebook state.
  feature_scaler.fit(X)
  return feature_scaler

def create_label_scaler(Y: pd.DataFrame):
  """Build the label transformer and fit it on `Y`.

  Only `d18O_cel_mean` is standardized; every other column (the variance) is
  passed through untouched.

  Args:
    Y: Label frame to fit on; must contain a `d18O_cel_mean` column.

  Returns:
    The fitted ColumnTransformer.
  """
  # CODE REVIEW QUESTION (from the original author): Standardization of
  # variances will produce negative variances. Any workarounds or should I just
  # not try it?
  # NOTE(review): because the mean is divided by the scaler's standard deviation
  # while the variance is left in original units, the two label columns end up
  # on different scales -- confirm this is intended for the KL loss.
  mean_step = ('label_std_scaler', StandardScaler(), ['d18O_cel_mean'])
  scaler = ColumnTransformer([mean_step], remainder='passthrough')
  scaler.fit(Y)
  return scaler

def scale(X: pd.DataFrame, Y: pd.DataFrame, feature_scaler, label_scaler):
  """Apply already-fitted scalers to a feature/label pair.

  Args:
    X: Feature frame.
    Y: Label frame.
    feature_scaler: Object whose `transform(X)` returns an array-like with one
      column per column of `X`.
    label_scaler: Same, for `Y`.

  Returns:
    `(X_scaled, Y_scaled)` as DataFrames carrying the index and column labels
    of the corresponding input.
  """
  def _transform_to_frame(scaler, frame):
    # transform() hands back a bare array, so re-attach the original labels.
    # NOTE(review): the input's column labels are reused positionally; this
    # assumes the scaler emits columns in the same order as the input.
    return pd.DataFrame(scaler.transform(frame),
                        index=frame.index, columns=frame.columns)

  scaled_features = _transform_to_frame(feature_scaler, X)
  scaled_labels = _transform_to_frame(label_scaler, Y)
  return scaled_features, scaled_labels

In [65]:
# Plain container: bundles the scaled train/validation/test splits with the
# scalers that produced them, so predictions can be un-scaled later.
class ScaledDataset:
  """Holds scaled feature/label splits and the scalers used to create them."""

  def __init__(self, feature_scaler, label_scaler,
               X_train = None, X_val = None, X_test = None,
               Y_train = None, Y_val = None, Y_test = None):
    # Scalers fitted on the training split.
    self.feature_scaler, self.label_scaler = feature_scaler, label_scaler
    # Scaled features per split.
    self.X_train, self.X_val, self.X_test = X_train, X_val, X_test
    # Scaled labels per split.
    self.Y_train, self.Y_val, self.Y_test = Y_train, Y_val, Y_test


def load_and_scale(config: Dict) -> ScaledDataset:
  """Load the three splits named in `config` and scale them.

  Both scalers are fitted on the training split only and then applied to all
  three splits.

  Args:
    config: Mapping with 'TRAIN', 'VALIDATION' and 'TEST' keys, each a
      Drive-relative CSV path.

  Returns:
    A ScaledDataset holding the scaled splits and the fitted scalers.
  """
  raw_splits = {
      split: load_dataset(format_dataframe_path(config[split]))
      for split in ('TRAIN', 'VALIDATION', 'TEST')
  }

  train_features, train_labels = raw_splits['TRAIN']
  feature_scaler = create_feature_scaler(train_features)
  label_scaler = create_label_scaler(train_labels)

  scaled_splits = {
      split: scale(features, labels, feature_scaler, label_scaler)
      for split, (features, labels) in raw_splits.items()
  }
  X_train_scaled, Y_train_scaled = scaled_splits['TRAIN']
  X_val_scaled, Y_val_scaled = scaled_splits['VALIDATION']
  X_test_scaled, Y_test_scaled = scaled_splits['TEST']

  return ScaledDataset(
      feature_scaler, label_scaler,
      X_train=X_train_scaled, X_val=X_val_scaled, X_test=X_test_scaled,
      Y_train=Y_train_scaled, Y_val=Y_val_scaled, Y_test=Y_test_scaled)


# Model Definition



The KL Loss function:

In [45]:
# KL divergence between two univariate Gaussians, with subscript 1 = real and
# subscript 2 = predicted:
#   log(σ2/σ1) + (σ1^2 + (μ1 − μ2)^2) / (2·σ2^2) − 1/2
def kl_divergence(real, predicted):
    """Mean KL divergence of `predicted` from `real`, row by row.

    Args:
        real: Tensor whose axis-1 entry 0 is the mean and entry 1 the variance
            of the reference distribution, one row per sample.
        predicted: Tensor with the same layout for the predicted distribution.

    Returns:
        A scalar tensor: the per-row divergence averaged over all rows. The
        result is NaN when a variance is negative (its square root is NaN).
    """
    def _mean_and_std(distribution):
        # Column 0 -> mean, column 1 -> variance (converted to std here).
        mean = tf.gather(distribution, [0], axis=1)
        std = tf.math.sqrt(tf.gather(distribution, [1], axis=1))
        return mean, std

    real_mean, real_std = _mean_and_std(real)
    pred_mean, pred_std = _mean_and_std(predicted)

    log_std_ratio = tf.math.log(pred_std / real_std)
    numerator = tf.square(real_std) + tf.square(real_mean - pred_mean)
    denominator = 2 * tf.square(pred_std)

    per_row_loss = -0.5 + log_std_ratio + numerator / denominator
    return tf.math.reduce_mean(per_row_loss)

def symmetric_kl(real, predicted):
  """Loss function the model is compiled with; delegates to `kl_divergence`.

  NOTE(review): despite the name, only the one-directional divergence
  `kl_divergence(real, predicted)` is computed -- no reverse term is added.
  The name is also the key used in `custom_objects` when a saved model is
  loaded, so it is kept as-is.
  """
  loss = kl_divergence(real, predicted)
  return loss

Test the loss function:

In [None]:
import pytest

# --- Single row -------------------------------------------------------------
test_real = tf.convert_to_tensor(np.array([[1, 0.02]]))
test_pred = tf.convert_to_tensor(np.array([[0.98, 0.021]]))

# Expected value checked by hand against the closed-form KL formula
# (original reference: https://screenshot.googleplex.com/5WM9dinAbhR26ZS).
assert float(kl_divergence(test_real, test_pred)) == pytest.approx(
    0.0101094, rel=1e-5)

# --- Negative predicted variance -------------------------------------------
test_neg_real = tf.convert_to_tensor(np.array([[32.32, 0.0344]]))
test_neg_pred = tf.convert_to_tensor(np.array([[32.01, -0.322]]))

# The square root of a negative variance is NaN, and it propagates to the loss.
assert tf.math.is_nan(kl_divergence(test_neg_real, test_neg_pred))

# --- Several rows -----------------------------------------------------------
test_real_2d = tf.convert_to_tensor(np.array([
    [1.00, 0.020],
    [1.01, 0.042],
]))
test_pred_2d = tf.convert_to_tensor(np.array([
    [0.98, 0.021],
    [0.99, 0.012],
]))

# The result is the average of the individual row losses.
assert float(kl_divergence(test_real_2d, test_pred_2d)) == pytest.approx(
    (0.0101094 + 0.6402851) / 2, rel=1e-5)

Model definition

In [73]:
from keras.callbacks import ModelCheckpoint

# Early-stopping callback shared by every train_vars() call: it watches
# `val_loss`, counts an epoch as an improvement only if the loss changes by at
# least `min_delta`, gives up after `patience` epochs without one, and restores
# the best weights seen. `start_from_epoch=0` means monitoring begins
# immediately.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=500, min_delta=0.001, verbose=1,
    restore_best_weights=True, start_from_epoch=0)

# Optional helper, introduced while experimenting with slow-to-train models:
# it periodically persists the model during training.
def get_checkpoint_callback(model_file):
  """Create a checkpoint callback that keeps only the best model so far.

  Args:
    model_file: File name of the checkpoint inside the model save directory.

  Returns:
    A ModelCheckpoint that writes to `get_model_save_location(model_file)`
    whenever `val_loss` reaches a new minimum.
  """
  checkpoint_path = get_model_save_location(model_file)
  return ModelCheckpoint(
      checkpoint_path,
      monitor='val_loss',
      mode='min',
      save_best_only=True,
      verbose=0)

def train_vars(X: pd.DataFrame,
        Y: pd.DataFrame,
        hidden_layers: List[int],
        epochs: int,
        batch_size: int,
        lr: float,
        validation_data: Tuple[pd.DataFrame, pd.DataFrame],
        model_file=None,
        use_checkpoint=False):
  """Train, or resume training of, the mean/variance regression network.

  Args:
    X: Scaled training features.
    Y: Training labels as (mean, variance) columns.
    hidden_layers: Width of each hidden ReLU layer, in order.
    epochs: Maximum number of epochs (early stopping may end sooner).
    batch_size: Mini-batch size.
    lr: Adam learning rate. Only used when a new model is built.
    validation_data: (features, labels) pair monitored for early stopping and
      checkpointing.
    model_file: File name inside the model save directory. When given, the
      best model so far is checkpointed there during training.
    use_checkpoint: If True, load the model stored at `model_file` and keep
      training it instead of building a new one.

  Returns:
    A `(history, model)` tuple: the Keras History of the fit and the model.

  Raises:
    ValueError: if `use_checkpoint` is True but `model_file` is not given.
  """
  if use_checkpoint and not model_file:
    raise ValueError("use_checkpoint=True requires model_file to be set.")

  callbacks_list = [early_stop]
  # Only checkpoint when there is a file to write to. Previously the callback
  # was always created, so a missing model_file produced a checkpoint path
  # ending in the literal text "None".
  if model_file:
    callbacks_list.append(get_checkpoint_callback(model_file))

  if not use_checkpoint:
    inputs = keras.Input(shape=(X.shape[1],))
    x = inputs
    for layer_size in hidden_layers:
      x = keras.layers.Dense(
          layer_size, activation='relu')(x)
    mean_output = keras.layers.Dense(1, name='mean_output')(x)

    # A variance can not be negative, so take the absolute value of the raw
    # output. NOTE(review): an output of exactly 0 still makes the KL loss
    # non-finite (log(0) / division by zero); a strictly positive activation
    # would avoid that but would change previously trained models.
    var_output = keras.layers.Dense(1, name='var_output')(x)
    abs_var = keras.layers.Lambda(lambda t: tf.abs(t))(var_output)

    # Output mean, |variance| tuples.
    outputs = keras.layers.concatenate([mean_output, abs_var])
    model = keras.Model(inputs=inputs, outputs=outputs)

    # A constant learning rate is used. An ExponentialDecay schedule was built
    # here before but never passed to the optimizer, so it had no effect and
    # has been removed.
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss=symmetric_kl)
    model.summary()
  else:
    # The custom loss must be supplied by name for deserialization.
    model = keras.models.load_model(get_model_save_location(model_file),
                                    custom_objects={"symmetric_kl": symmetric_kl})
  history = model.fit(X, Y, verbose=0, epochs=epochs, batch_size=batch_size,
                      validation_data=validation_data, shuffle=True, callbacks=callbacks_list)
  return history, model

In [69]:
from sklearn.metrics import mean_squared_error

def render_plot_loss(history, name):
  """Plot training and validation loss per epoch on a log-scaled y axis.

  Args:
    history: Object whose `.history` mapping has 'loss' and 'val_loss'
      sequences (as returned by `model.fit`).
    name: Prefix for the plot title.
  """
  curve_names = ['loss', 'val_loss']
  for curve in curve_names:
    plt.plot(history.history[curve])

  plt.title(name + ' model loss')
  plt.xlabel('epoch')
  plt.ylabel('loss')
  plt.yscale("log")
  plt.legend(curve_names, loc='upper left')
  plt.show()

def destandardize_predictions(df: pd.DataFrame, sd: ScaledDataset):
  """Convert standardized means back to their original scale.

  Args:
    df: Frame with `d18O_cel_mean` (standardized) and `d18O_cel_variance`
      columns.
    sd: Dataset whose label scaler holds the fitted 'label_std_scaler'.

  Returns:
    A frame with the inverse-transformed `d18O_cel_mean` joined to the
    unchanged `d18O_cel_variance` column, sharing the index of `df`.
  """
  mean_scaler = sd.label_scaler.named_transformers_['label_std_scaler']
  restored_means = pd.DataFrame(
      mean_scaler.inverse_transform(df[['d18O_cel_mean']]),
      index=df.index, columns=['d18O_cel_mean'])
  # The variance column was never scaled, so it is carried over as-is.
  variances = df['d18O_cel_variance']
  return restored_means.join(variances)

def train_and_evaluate(sd: ScaledDataset, run_id: str):
  """Train a model on `sd`, save it, and report its test-set performance.

  Prints the expected and predicted (mean, variance) rows and the RMSE of the
  mean, all with means converted back to their original scale.

  Args:
    sd: Scaled train/validation/test splits plus the scalers used.
    run_id: Label for the run; also the stem of the saved "<run_id>.h5" file.
  """
  model_file = run_id + ".h5"
  label_columns = ['d18O_cel_mean', 'd18O_cel_variance']

  print("==================")
  print(run_id)
  history, vars_model = train_vars(sd.X_train, sd.Y_train, hidden_layers=[20, 20],
                                 epochs=5000, batch_size=5, lr=0.0001,
                                 validation_data=(sd.X_val, sd.Y_val),
                                 model_file=model_file, use_checkpoint=False)
  render_plot_loss(history, run_id+" kl_loss")
  vars_model.save(get_model_save_location(model_file), save_format="h5")

  vars_model.evaluate(x=sd.X_test, y=sd.Y_test)
  raw_predictions = vars_model.predict_on_batch(sd.X_test)

  # sd.Y_test and the raw predictions both carry standardized means. Convert
  # both back so the printed tables and the RMSE are in the same, original
  # units. (The earlier code compared standardized expectations against
  # de-standardized predictions, and called a misspelled helper without `sd`.)
  expected = destandardize_predictions(sd.Y_test, sd)
  predictions = destandardize_predictions(
      pd.DataFrame(raw_predictions, columns=label_columns), sd)

  print("EXPECTED:")
  print(expected.to_string())
  print()
  print("PREDICTED:")
  print(predictions.to_string())

  rmse = np.sqrt(mean_squared_error(expected['d18O_cel_mean'], predictions['d18O_cel_mean']))
  print("RMSE: "+ str(rmse))

# Load and evaluate the model with each set of data.

1

In [None]:
# Experiment: randomly split, ungrouped samples.
# Drive-relative CSV path of each split; load_and_scale() prepends GDRIVE_BASE
# through format_dataframe_path().
ungrouped_random = {
    'TRAIN' : "/MyDrive/amazon_rainforest_files/amazon_sample_data/uc_davis_2023_08_12_train_random_ungrouped.csv",
    'TEST' : "/MyDrive/amazon_rainforest_files/amazon_sample_data/uc_davis_2023_08_12_test_random_ungrouped.csv",
    'VALIDATION' : "/MyDrive/amazon_rainforest_files/amazon_sample_data/uc_davis_2023_08_12_validation_random_ungrouped.csv",
}

# Load and scale the three splits, then train, save and evaluate a model. The
# second argument is the run id, which also names the saved model file.
ungrouped_random_scaled = load_and_scale(ungrouped_random)
train_and_evaluate(ungrouped_random_scaled, "ungrouped_random")

ungrouped_random
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 12)]         0           []                               
                                                                                                  
 dense_14 (Dense)               (None, 20)           260         ['input_8[0][0]']                
                                                                                                  
 dense_15 (Dense)               (None, 20)           420         ['dense_14[0][0]']               
                                                                                                  
 var_output (Dense)             (None, 1)            21          ['dense_15[0][0]']               
                                                                           