<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/symmetrical_kl/dnn/variational.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DNN regressors

This file trains 3 different NN Regressors and a Gaussian Process Regressor.

1) A DNN to predict mean O18 ratios.

2) A hybrid output DNN to predict mean and variances.

3) A DNN to predict just variances of O18 ratios.

4) A GPR regressor to predict mean O18 ratios and variances.

At the end of the colab, there's driver code to translate the model to isoscapes, and some additional code to render GIFs. Every model seems to predict means very well, but with mixed results when predicting variance. The hybrid model seems to underestimate variance. The variance-only model produces gibberish. As for GPR, *I can't tell*: it looks good, but intuitively it seems to underestimate variances as well.

In [1]:
from collections import defaultdict
import math
import numpy as np
import pandas as pd
from typing import List

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from matplotlib import pyplot as plt
from tensorflow.python.ops import math_ops

#@title Debugging
# See https://zohaib.me/debugging-in-google-collab-notebook/ for tips,
# as well as docs for pdb and ipdb.
#
# Notebook-wide configuration. The trailing `#@param` markers are Colab form
# annotations and must stay on the same line as the assignment.

# When True, a later cell installs ipdb and turns on %pdb.
DEBUG = False #@param {type:"boolean"}
# Google Drive mount point. When empty, the Drive mount is skipped and the
# paths below are used without any prefix.
GDRIVE_BASE = "/content/drive" #@param
# The remaining values are appended to GDRIVE_BASE by the get_*_path helpers.
# CSV of training samples.
DATAFRAME_PATH = "/MyDrive/amazon_rainforest_files/monthly_large.csv" #@param
# Directory holding the input feature rasters.
RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
# Directory where trained models are saved and loaded.
MODEL_SAVE_LOCATION = "/MyDrive/amazon_rainforest_files/" #@param
# Directory where predicted rasters are written.
OUTPUT_RASTER_BASE = "/MyDrive/amazon_rainforest_files/" #@param

def get_dataframe_path_from_params() -> str:
  """Return the training CSV path: GDRIVE_BASE (if set) followed by DATAFRAME_PATH."""
  prefix = GDRIVE_BASE or ""
  return f"{prefix}{DATAFRAME_PATH}"

def get_model_save_location(filename) -> str:
  """Return the path of a model file: GDRIVE_BASE (if set) + MODEL_SAVE_LOCATION + filename."""
  prefix = GDRIVE_BASE or ""
  return f"{prefix}{MODEL_SAVE_LOCATION}{filename}"

def get_raster_path_from_params(filename) -> str:
  """Return the path of an input raster: GDRIVE_BASE (if set) + RASTER_BASE + filename."""
  prefix = GDRIVE_BASE or ""
  return f"{prefix}{RASTER_BASE}{filename}"

def get_output_raster_path_from_params(filename) -> str:
  """Return the path of an output raster: GDRIVE_BASE (if set) + OUTPUT_RASTER_BASE + filename."""
  prefix = GDRIVE_BASE or ""
  return f"{prefix}{OUTPUT_RASTER_BASE}{filename}"


In [2]:
# Access data stored on Google Drive
# The mount only happens when GDRIVE_BASE is non-empty; the import is kept
# inside the branch so the notebook does not require google.colab otherwise.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Optional debugging aids, enabled by the DEBUG form parameter.
# NOTE(review): the ipdb install is unpinned (-U upgrades to the latest release).
if DEBUG:
    %pip install -Uqq ipdb
    import ipdb
    # Drop into the debugger automatically on any uncaught exception.
    %pdb on

Mounted at /content/drive


In [3]:
def render_plot_loss(history, name):
  """Plot the training and validation loss curves of a fitted model.

  Args:
    history: object whose `.history` mapping holds the per-epoch 'loss' and
      'val_loss' series.
    name: label prepended to the plot title.
  """
  curve_names = ['loss', 'val_loss']
  for curve_name in curve_names:
    plt.plot(history.history[curve_name])
  plt.title(name + ' model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(curve_names, loc='upper left')
  plt.show()

# Data preparation


In [4]:
from sklearn.model_selection import train_test_split

# Load the raw samples and give the target column a short name.
df = pd.read_csv(get_dataframe_path_from_params())
df = df.rename(columns={
    "cellulose_oxygen_ratio": "O18",
})
# Drop any column whose name contains "unnamed" (case-insensitive).
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

# Reformat the "month_of_year" column into 12 separate binary columns.
def categorize_months(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of `df` with 'month_of_year' one-hot encoded.

  The new columns are named 'month_of_year_<value>' by pandas, one per distinct
  value present in the column.
  """
  return pd.get_dummies(df, columns=['month_of_year'])
# NOTE(review): df_expanded is not referenced again in the visible notebook.
df_expanded = categorize_months(df)

# One group per (sample site, month).
group_on = ['sample_site_lon', 'sample_site_lat', 'month_of_year']
grouped = df.groupby(group_on)

# ASSUMPTION: Taking the mean and variance of a sample site doesn't lower quality of the data.
# We need to do this to use KL-divergence loss.
means = grouped.mean()
# NOTE(review): pandas' default sample variance is NaN for a group with a single
# row; if any site/month has only one sample, its O18_var target will be NaN —
# TODO confirm and filter if needed.
O18_var = grouped.var()['O18']

# Merging results in some unreadable column names. Rename the oxygen columns.
# Both frames contain an 'O18' column, so the merge suffixes them _x (mean) and _y (variance).
merged = pd.merge(means, O18_var, on=group_on, how='inner').reset_index()
merged = merged.rename(columns={
    'O18_x': 'O18_mean',
    'O18_y': 'O18_var'})

# ...and drop sample_site_lon/sample_site_lat. These were keys used to identify
# sample sites. They are basically duplicates of the 'lat' 'lon' columns.
merged.drop(merged.columns[merged.columns.str.contains('unnamed',case = False)], axis = 1, inplace = True)
merged.drop('sample_site_lon', axis = 1, inplace = True)
merged.drop('sample_site_lat', axis = 1, inplace = True)

# One-hot encode the month on the aggregated frame that is actually used for training.
merged = categorize_months(merged)

# 75/25 split with a fixed seed so the split is reproducible.
train, test = train_test_split(merged, test_size=0.25, random_state=25)
# Targets: [mean, variance] pairs, in the column order the loss function reads them.
Y_train_vars = train[["O18_mean", "O18_var"]]
Y_test_vars = test[["O18_mean", "O18_var"]]

# Features: Everything besides mean and variance
# NOTE(review): only `train` is filtered for "unnamed" columns here, not `test`;
# `train` is also a slice of `merged`, so this in-place drop may trigger a
# SettingWithCopyWarning — TODO confirm.
train.drop(train.columns[train.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
X_train_vars = train.drop(["O18_mean", "O18_var"], axis=1)
X_test_vars = test.drop(["O18_mean", "O18_var"], axis=1)

# Normalization
# Currently disabled: the features are fed to the model unscaled.
# for col in ["rh", "vpd", "temp", "atmosphere_oxygen_ratio", "lat", "lon"]:
#   X_train_vars[col] = X_train_vars[col]/X_train_vars[col].std()
#   X_test_vars[col] = X_test_vars[col]/X_test_vars[col].std()

# Model Definition



The KL Loss function:

In [5]:
def kl_divergence(real, predicted):
    """KL divergence KL(real || predicted) between univariate Gaussians.

    Both arguments are indexed along axis 1: column 0 is read as the mean and
    column 1 as the variance (its square root is used as the std). Per row:

        log(σ2/σ1) + (σ1^2 + (μ1 − μ2)^2) / (2·σ2^2) − 1/2

    where (μ1, σ1) come from `real` and (μ2, σ2) from `predicted`.

    Returns:
      The mean of the per-row divergences. A negative variance in either
      argument produces NaN through the square root.
    """
    mu_real = tf.gather(real, [0], axis=1)
    sigma_real = tf.math.sqrt(tf.gather(real, [1], axis=1))

    mu_pred = tf.gather(predicted, [0], axis=1)
    sigma_pred = tf.math.sqrt(tf.gather(predicted, [1], axis=1))

    log_std_ratio = tf.math.log(sigma_pred/sigma_real)
    numerator = tf.square(sigma_real) + tf.square(mu_real - mu_pred)
    denominator = 2*tf.square(sigma_pred)

    per_row_kl = -0.5 + log_std_ratio + numerator/denominator
    return tf.math.reduce_mean(per_row_kl)

def symmetric_kl(real, predicted):
  """Symmetrised KL loss: KL(real || predicted) + KL(predicted || real)."""
  forward = kl_divergence(real, predicted)
  reverse = kl_divergence(predicted, real)
  return forward + reverse

Test the loss function:

In [6]:
import pytest

# Sanity checks for kl_divergence. Each tensor row is [mean, variance].

# Single-row case with nearly identical distributions.
test_real = tf.convert_to_tensor(np.array([[1, 0.02]]))
test_pred = tf.convert_to_tensor(np.array([[0.98, 0.021]]))

# Expected value comes from the worked example linked below.
# NOTE(review): this looks like an internal-only link; readers may not be able to open it.
# https://screenshot.googleplex.com/5WM9dinAbhR26ZS
assert float(kl_divergence(test_real, test_pred)) == pytest.approx(0.0101094, 1e-5)

test_neg_real = tf.convert_to_tensor(np.array([[32.32, 0.0344]]))
test_neg_pred = tf.convert_to_tensor(np.array([[32.01, -0.322]]))

# Negative variance causes NaN
# (the square root of the negative predicted variance is NaN).
assert tf.math.is_nan(kl_divergence(test_neg_real, test_neg_pred))

# Two-row batch.
test_real_2d = tf.convert_to_tensor(np.array(
    [[1.00, 0.020],
     [1.01, 0.042]]))
test_pred_2d = tf.convert_to_tensor(np.array(
    [[0.98, 0.021],
     [0.99, 0.012]]))

# Should reduce to the average loss of all rows.
assert float(kl_divergence(test_real_2d, test_pred_2d)) == pytest.approx(
    sum([0.0101094, 0.6402851])/2, 1e-5)

Model definition

In [12]:
from keras.callbacks import ModelCheckpoint

# Shared early-stopping callback used by train_vars. It monitors the training
# 'loss' (not 'val_loss'), does not start monitoring until epoch 1000, then
# stops after 1000 epochs without an improvement of at least 0.001 and
# restores the best weights seen.
early_stop = keras.callbacks.EarlyStopping(
    monitor='loss', patience=1000, min_delta=0.001, verbose=1,
    restore_best_weights=True, start_from_epoch=1000)

# I was experimenting with models that took longer to train, and used this
# checkpointing callback to periodically save the model. It's optional.
def get_checkpoint_callback(model_file):
  """Build a ModelCheckpoint callback writing to the model save location.

  The model is saved to `get_model_save_location(model_file)` each time the
  training 'loss' reaches a new minimum.
  """
  checkpoint_path = get_model_save_location(model_file)
  return ModelCheckpoint(
      checkpoint_path,
      monitor='loss',
      mode='min',
      save_best_only=True,
      verbose=1)

def train_vars(X: pd.DataFrame,
        Y: pd.DataFrame,
        hidden_layers: List[int],
        epochs: int,
        batch_size: int,
        lr: float,
        model_file=None,
        use_checkpoint=False):
  """Train (or resume training of) the mean/variance regression model.

  Args:
    X: feature frame; its column count sets the model's input width.
    Y: targets as [mean, variance] columns, in the order `symmetric_kl` reads them.
    hidden_layers: sizes of the ReLU hidden layers. Ignored when resuming.
    epochs: maximum number of epochs (early stopping may end training sooner).
    batch_size: mini-batch size.
    lr: constant Adam learning rate. Ignored when resuming.
    model_file: file name under the model save location. When given, the best
      model (by training loss) is checkpointed there during training.
    use_checkpoint: when True, load the model from `model_file` and continue
      training it instead of building a new one.

  Returns:
    (history, model): the Keras History object and the trained model.

  Raises:
    ValueError: if `use_checkpoint` is True but no `model_file` was given.
  """
  if use_checkpoint and model_file is None:
    raise ValueError("use_checkpoint=True requires model_file to be set")

  # Only checkpoint when there is a real file name to write to; previously a
  # checkpoint callback was created even for model_file=None, which produced
  # a save path ending in "None".
  callbacks_list = [early_stop]
  if model_file is not None:
    callbacks_list.append(get_checkpoint_callback(model_file))

  if not use_checkpoint:
    inputs = keras.Input(shape=(X.shape[1],))
    x = inputs
    for layer_size in hidden_layers:
      x = keras.layers.Dense(
          layer_size, activation='relu',
          kernel_regularizer=keras.regularizers.L2(0.80))(x)
    mean_output = keras.layers.Dense(1, name='mean_output')(x)

    # We can not have negative variance. Apply very little regularization to
    # this head, then take the absolute value of its output.
    var_output = keras.layers.Dense(1, name='var_output',
                                    kernel_regularizer=keras.regularizers.L2(0.000001))(x)
    abs_var = keras.layers.Lambda(lambda t: tf.abs(t))(var_output)

    # Output mean, |variance| tuples.
    outputs = keras.layers.concatenate([mean_output, abs_var])
    model = keras.Model(inputs=inputs, outputs=outputs)

    # NOTE: an ExponentialDecay schedule used to be constructed here but was
    # never passed to the optimizer, so training has always used the constant
    # `lr`. The dead schedule was removed; pass a schedule as `learning_rate`
    # if decay is actually wanted.
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss=symmetric_kl)
    model.summary()
  else:
    # Resume from the saved model; the custom loss must be supplied by name.
    model = keras.models.load_model(get_model_save_location(model_file),
                                    custom_objects={"symmetric_kl": symmetric_kl})
  history = model.fit(X, Y, verbose=1, epochs=epochs, batch_size=batch_size,
            validation_split=0.2, shuffle=True, callbacks=callbacks_list)
  return history, model

In [None]:
# Train the mean/variance model.
# With use_checkpoint=True, train_vars loads the existing model from
# "variational.h5" and continues training it, so that file must already exist;
# hidden_layers and lr are not used on that path.
history, vars_model = train_vars(X_train_vars, Y_train_vars, hidden_layers=[200, 200],
                                 epochs=10000, batch_size=10, lr=0.000001,
                                 model_file="variational.h5", use_checkpoint=True)
render_plot_loss(history, "variance_output")

In [16]:
# Reload the saved model (the custom loss has to be registered by name) and
# compare its predictions with the held-out targets.
vars_model = keras.models.load_model(get_model_save_location("variational.h5"),
                                    custom_objects={"symmetric_kl": symmetric_kl})

vars_model.evaluate(x=X_test_vars, y=Y_test_vars)
predictions = vars_model.predict_on_batch(X_test_vars)
# Show the first 12 rows of each. The two tables are aligned by position:
# the expected frame keeps its original row labels, while the predictions get
# a fresh 0..n index.
print("EXPECTED:")
print(Y_test_vars[:12])
print()
print("PREDICTED:")
print(pd.DataFrame(predictions, columns=['O18_mean', 'O18_var'])[:12])

EXPECTED:
      O18_mean   O18_var
85   29.465034  0.013972
42   28.948918  0.020012
43   28.968163  0.035914
40   28.997964  0.026182
75   30.045125  0.017737
93   29.462594  0.013638
21   28.608574  0.018675
133  29.576783  0.041875
103  30.822340  0.018667
141  29.548870  0.035508
125  30.289680  0.006326
112  29.884639  0.017029

PREDICTED:
     O18_mean   O18_var
0   29.010254  0.040785
1   28.995157  0.047017
2   29.301374  0.032878
3   29.042843  0.025283
4   29.809370  0.034440
5   29.830790  0.027089
6   28.497194  0.048038
7   28.512482  0.033802
8   30.049419  0.051782
9   29.463865  0.020826
10  30.249601  0.038203
11  30.072147  0.057806


In [21]:
# Persist the current model in HDF5 format, at the same path the training
# checkpoint and the load_model calls use.
vars_model.save(get_model_save_location("variational.h5"), save_format="h5")

## Generating GeoTIFFs from the model

All of the code in the following block is (temporarily) copied and pasted from the library files.

In [17]:
from dataclasses import dataclass
from osgeo import gdal, gdal_array
from tqdm import tqdm
import math
import matplotlib.animation as animation

@dataclass
class AmazonGeoTiff:
  """Represents a geotiff from our dataset.

  Instances are built by `load_raster`, which fills the arrays below with a
  12-slot last axis (one slot per month).
  """
  # GDAL dataset handle the arrays were read from.
  gdal_dataset: gdal.Dataset
  # Band values, shape (RasterYSize, RasterXSize, 12).
  image_value_array: np.ndarray # ndarray of floats
  # GDAL mask-band values with the same shape; 0 marks an invalid pixel.
  image_mask_array: np.ndarray # ndarray of uint8
  # image_value_array with pixels masked wherever the mask value is 0.
  masked_image: np.ma.masked_array
  # Mean of masked_image over the month axis.
  yearly_masked_image: np.ma.masked_array


@dataclass
class Bounds:
  """Represents geographic bounds and size information of a raster.

  `minx`/`maxx`/`miny`/`maxy` are the edges of the raster in map coordinates.
  `pixel_size_x`/`pixel_size_y` are the per-pixel steps taken from the GDAL
  geotransform; callers in this file use `abs(pixel_size_y)` and
  `-pixel_size_y`, i.e. they expect the y step to be negative (north-up).
  `raster_size_x`/`raster_size_y` are pixel counts. They are used as array
  dimensions and GDAL raster sizes, so they are annotated as `int` rather
  than `float`.
  """
  minx: float
  maxx: float
  miny: float
  maxy: float
  pixel_size_x: float
  pixel_size_y: float
  raster_size_x: int
  raster_size_y: int

  def to_matplotlib(self) -> List[float]:
    """Return [minx, maxx, miny, maxy], the order matplotlib's `extent` uses."""
    return [self.minx, self.maxx, self.miny, self.maxy]

def load_raster(path: str, use_only_band_index: int = -1) -> AmazonGeoTiff:
  """Load a GeoTIFF into a 12-slot (one per month) stack of values and masks.

  TODO: Refactor (is_single_band, etc., should be a better design)
  --> Find a way to simplify this logic. Maybe it needs to be more abstract.

  Args:
    path: location of the raster file.
    use_only_band_index: 0-based index of a single band to replicate into all
      12 slots. With the default of -1 the raster must have either 12 bands
      (read one per slot) or exactly 1 band (replicated into every slot).

  Returns:
    An AmazonGeoTiff holding the dataset handle, the value and mask stacks,
    the masked stack (masked wherever the mask is 0) and its mean over the
    month axis.

  Raises:
    RuntimeError: if GDAL returns no dataset for `path`.
    ValueError: if no band index is given and the raster has neither 1 nor 12 bands.
    IndexError: if the requested band index is not present in the raster.
  """
  months = 12

  dataset = gdal.Open(path, gdal.GA_ReadOnly)
  if dataset is None:
    # Without GDAL exceptions enabled, Open() signals failure by returning None.
    raise RuntimeError(f"GDAL could not open raster: {path}")

  first_band = dataset.GetRasterBand(1)
  image_dtype = gdal_array.GDALTypeCodeToNumericTypeCode(first_band.DataType)
  # The mask stack uses the mask band's own type. It was previously allocated
  # with the image type even though the mask type had been looked up.
  mask_dtype = gdal_array.GDALTypeCodeToNumericTypeCode(
      first_band.GetMaskBand().DataType)

  stack_shape = (dataset.RasterYSize, dataset.RasterXSize, months)
  image = np.zeros(stack_shape, dtype=image_dtype)
  mask = np.zeros(stack_shape, dtype=mask_dtype)

  if use_only_band_index == -1:
    if dataset.RasterCount != 12 and dataset.RasterCount != 1:
      raise ValueError(f"Expected 12 raster bands (one for each month) or one annual average, but found {dataset.RasterCount}")
    if dataset.RasterCount == 1:
      use_only_band_index = 0

  is_single_band = use_only_band_index != -1

  if is_single_band and use_only_band_index >= dataset.RasterCount:
    raise IndexError(f"Specified raster band index {use_only_band_index}"
                     f" but there are only {dataset.RasterCount} rasters")

  if is_single_band:
    # Read the band once and replicate it into every monthly slot.
    band = dataset.GetRasterBand(use_only_band_index + 1)
    image[:] = band.ReadAsArray()[:, :, np.newaxis]
    mask[:] = band.GetMaskBand().ReadAsArray()[:, :, np.newaxis]
  else:
    for band_index in range(months):
      band = dataset.GetRasterBand(band_index + 1)  # GDAL bands are 1-based
      image[:, :, band_index] = band.ReadAsArray()
      mask[:, :, band_index] = band.GetMaskBand().ReadAsArray()

  masked_image = np.ma.masked_where(mask == 0, image)
  yearly_masked_image = masked_image.mean(axis=2)

  return AmazonGeoTiff(dataset, image, mask, masked_image, yearly_masked_image)

def get_extent(dataset):
  """Build the Bounds of a GDAL dataset from its geotransform and raster size.

  The origin (geotransform items 0 and 3) is taken as (minx, maxy); the
  opposite corner is the origin plus pixel step times raster size.
  """
  transform = dataset.GetGeoTransform()
  origin_x, step_x = transform[0], transform[1]
  origin_y, step_y = transform[3], transform[5]
  far_x = origin_x + step_x * dataset.RasterXSize
  far_y = origin_y + step_y * dataset.RasterYSize
  return Bounds(
      minx=origin_x,
      maxx=far_x,
      miny=far_y,
      maxy=origin_y,
      pixel_size_x=step_x,
      pixel_size_y=step_y,
      raster_size_x=dataset.RasterXSize,
      raster_size_y=dataset.RasterYSize)

def coords_to_indices(bounds: Bounds, x: float, y: float):
  """Convert map coordinates into a pair of raster array indices.

  Returns:
    (x_idx, y_idx). Despite the names, the first value is derived from `y`
    and counts down from the top of the raster; the second is derived from
    `x`. Callers index arrays as [x_idx, y_idx].

  Raises:
    ValueError: if (x, y) lies outside the bounds (the edges are accepted).

  NOTE: for y == bounds.miny the first index equals bounds.raster_size_y,
  i.e. one past the last valid position.
  """
  beyond_x = x < bounds.minx or x > bounds.maxx
  beyond_y = y < bounds.miny or y > bounds.maxy
  if beyond_x or beyond_y:
    raise ValueError("Coordinates out of bounds")

  steps_from_bottom = int(math.ceil((y - bounds.miny) / abs(bounds.pixel_size_y)))
  first_index = bounds.raster_size_y - steps_from_bottom
  second_index = int((x - bounds.minx) / abs(bounds.pixel_size_x))

  return first_index, second_index

def get_data_at_coords(dataset: AmazonGeoTiff, x: float, y: float, month: int) -> float:
  """Look up the raster value at map coordinates for a given month.

  Args:
    dataset: raster to read from.
    x: longitude.
    y: latitude.
    month: index into the month axis of `masked_image`, or -1 to read from
      the yearly image instead.

  Raises:
    ValueError: if the coordinates are out of bounds or the pixel is masked.
  """
  first_idx, second_idx = coords_to_indices(
      get_extent(dataset.gdal_dataset), x, y)
  pixel = (dataset.yearly_masked_image[first_idx, second_idx]
           if month == -1
           else dataset.masked_image[first_idx, second_idx, month])
  if np.ma.is_masked(pixel):
    raise ValueError("Coordinates are masked")
  return pixel

def animate(geotiff: AmazonGeoTiff, nSeconds, fps):
  """Build a matplotlib animation stepping through the 12 monthly slices.

  Args:
    geotiff: raster whose `masked_image[:, :, m]` slices are shown.
    nSeconds: length of the animation in seconds.
    fps: frames per second; the animation has `nSeconds * fps` frames.

  Returns:
    The FuncAnimation object. The figure is closed before returning, so only
    the animation is displayed and not an extra static figure.

  Frames beyond the twelfth wrap around to the first month again. Previously
  any `nSeconds * fps` greater than 12 raised IndexError.
  """
  months = [geotiff.masked_image[:, :, m] for m in range(12)]
  labels = [f"Month: {m+1}" for m in range(12)]
  extent = get_extent(geotiff.gdal_dataset).to_matplotlib()

  fig = plt.figure( figsize=(8,8) )
  ax = fig.add_subplot()
  # The colour scale is fixed by the first month's image.
  im = ax.imshow(months[0], interpolation='none', aspect='auto', extent = extent)
  txt = fig.text(0.3,0,"", fontsize=24)
  fig.colorbar(im)

  def animate_func(i):
    # Print one progress dot per second of animation.
    if i % fps == 0:
      print( '.', end ='' )

    month_index = i % len(months)
    im.set_array(months[month_index])
    txt.set_text(labels[month_index])
    return [im, txt]

  anim = animation.FuncAnimation(
      fig,
      animate_func,
      frames = nSeconds * fps,
      interval = 1000 / fps, # in ms
  )
  # Close this specific figure rather than whichever one happens to be current.
  plt.close(fig)

  return anim


The following code is new, and is used to generate a 12-band GeoTIFF (one band for each month) from the model.

In [18]:
def get_predictions_at_each_pixel(
    monthly: bool,
    geotiffs: dict[str, AmazonGeoTiff],
    bounds: Bounds,
    model: keras.Model):
  """Run the model over every grid position and collect means and variances.

  For each month and each x position, one feature row is built per y position
  from the feature rasters, the 12 `month_of_year_<i>` indicator columns and
  the `lon`/`lat` coordinates. Each column of rows is predicted in one batch.

  Args:
    monthly: when True predict 12 months, otherwise only month index 0.
    geotiffs: feature name -> raster supplying that feature.
    bounds: grid to iterate over.
    model: model whose output columns 0 and 1 are read as mean and variance.

  Returns:
    (means, variances): masked arrays of shape
    (raster_size_x, raster_size_y, 12 or 1). Positions with no prediction
    stay masked.

  NOTE(review): the indicator columns are named month_of_year_0..11 and the
  features are assembled in the order rasters, months, lon, lat. Confirm that
  this matches the column names and order used at training time.
  """
  n_months = 12 if monthly else 1
  plane_shape = [bounds.raster_size_x, bounds.raster_size_y, n_months]

  def blank_masked_plane():
    # All zeros and fully masked; entries are unmasked as predictions arrive.
    return np.ma.array(
        np.zeros(plane_shape, dtype=float),
        mask=np.ones(plane_shape, dtype=bool))

  predicted_means_np = blank_masked_plane()
  predicted_vars_np = blank_masked_plane()

  for month in range(n_months):
    x_values = np.arange(bounds.minx, bounds.maxx, bounds.pixel_size_x, dtype=float)
    for x_idx, x in enumerate(tqdm(x_values)):
      y_values = np.arange(bounds.miny, bounds.maxy, -bounds.pixel_size_y, dtype=float)
      rows = []
      row_indexes = []
      for y_idx, y in enumerate(y_values):
        # One feature row per position; it is dropped if any raster has no data there.
        row = {}
        try:
          for geotiff_label, geotiff in geotiffs.items():
            row[geotiff_label] = get_data_at_coords(geotiff, x, y, month)
        except (ValueError, IndexError):
          continue # masked and out-of-bounds coordinates

        # Month indicators: all 0 except the one matching `month`.
        for i in range(12):
          row["month_of_year_" + str(i)] = int(i == month)
        row["lon"] = x
        row["lat"] = y
        rows.append(row)
        row_indexes.append((y_idx, month))

      if not rows:
        continue

      X = pd.DataFrame.from_dict(rows)
      predictions = model.predict_on_batch(X)

      # Write both outputs back and unmask the positions that now have data.
      for mean_value, var_value, (y_idx, month_idx) in zip(
          predictions[:, 0], predictions[:, 1], row_indexes):
        predicted_means_np.mask[x_idx, y_idx, month_idx] = False
        predicted_means_np.data[x_idx, y_idx, month_idx] = mean_value
        predicted_vars_np.mask[x_idx, y_idx, month_idx] = False
        predicted_vars_np.data[x_idx, y_idx, month_idx] = var_value

  return predicted_means_np, predicted_vars_np

In [19]:
# Load the trained model; the custom loss must be registered by name.
model = keras.models.load_model(get_model_save_location("variational.h5"), custom_objects={"symmetric_kl": symmetric_kl})

# Feature rasters, one stack per input variable.
relative_humidity_geotiff = load_raster(get_raster_path_from_params("R.rh_Stack.tif"))
temperature_geotiff = load_raster(get_raster_path_from_params("Temperatura_Stack.tif"))
vapor_pressure_deficit_geotiff = load_raster(get_raster_path_from_params("R.vpd_Stack.tif"))
atmosphere_isoscape_geotiff = load_raster(get_raster_path_from_params("Iso_Oxi_Stack.tif"))

# Keys become the feature column names passed to the model.
# NOTE(review): presumably these must match the training column names — confirm.
name_to_geotiff = {
    "rh": relative_humidity_geotiff,
    "temp" : temperature_geotiff,
    "vpd" : vapor_pressure_deficit_geotiff,
    "atmosphere_oxygen_ratio" : atmosphere_isoscape_geotiff,
}

# We need the borders of the map. Pick one geotiff at random and use that as the extent.
# NOTE(review): the other rasters are sampled with their own extents, so they
# do not have to share this grid; positions they cannot serve are skipped.
bounds =  get_extent(atmosphere_isoscape_geotiff.gdal_dataset)

# Predict all 12 months.
means_np, vars_np = get_predictions_at_each_pixel(
    monthly=True,
    geotiffs=name_to_geotiff,
    bounds=bounds,
    model=model)

100%|██████████| 940/940 [00:49<00:00, 18.88it/s]
100%|██████████| 940/940 [00:48<00:00, 19.44it/s]
100%|██████████| 940/940 [00:47<00:00, 19.61it/s]
100%|██████████| 940/940 [00:49<00:00, 19.00it/s]
100%|██████████| 940/940 [00:48<00:00, 19.41it/s]
100%|██████████| 940/940 [00:47<00:00, 19.75it/s]
100%|██████████| 940/940 [00:48<00:00, 19.26it/s]
100%|██████████| 940/940 [00:48<00:00, 19.32it/s]
100%|██████████| 940/940 [00:49<00:00, 19.10it/s]
100%|██████████| 940/940 [00:50<00:00, 18.77it/s]
100%|██████████| 940/940 [00:51<00:00, 18.27it/s]
100%|██████████| 940/940 [00:48<00:00, 19.22it/s]


Driver code for saving the generated numpy values to GeoTIFFs, also copied straight from Nicholas's Colab.

In [21]:
from osgeo import gdal, gdal_array

def save_numpy_to_geotiff(bounds: Bounds, prediction: np.ma.MaskedArray, path: str):
  """Write a masked prediction array to a new Float64 GeoTIFF at `path`.

  Args:
    bounds: supplies the raster size and the geotransform (origin at
      minx/maxy, pixel steps pixel_size_x/pixel_size_y).
    prediction: array of shape (raster_size_x, raster_size_y, bands). Each
      slice along the last axis becomes one raster band. Masked entries are
      written as the band's nodata value (NaN if none is set) and flagged 0
      in the band's mask.
    path: output file path.

  Raises:
    ValueError: if `prediction` is not 3-D or does not match the raster size.
    RuntimeError: if the GTiff driver lacks Create()/CreateCopy() support, the
      dataset cannot be created, or a mask band cannot be created.
  """
  # Validate first, so a bad array neither raises IndexError on shape[2] nor
  # leaves a half-created file behind.
  if (prediction.ndim != 3
      or prediction.shape[0] != bounds.raster_size_x
      or prediction.shape[1] != bounds.raster_size_y):
    raise ValueError("Shape of prediction does not match base geotiff")

  driver = gdal.GetDriverByName("GTiff")
  metadata = driver.GetMetadata()
  if metadata.get(gdal.DCAP_CREATE) != "YES":
    raise RuntimeError("GTiff driver does not support required method Create().")
  # Kept for backward compatibility, although only Create() is used below.
  if metadata.get(gdal.DCAP_CREATECOPY) != "YES":
    raise RuntimeError("GTiff driver does not support required method CreateCopy().")

  dataset = driver.Create(path, bounds.raster_size_x, bounds.raster_size_y, prediction.shape[2], eType=gdal.GDT_Float64)
  if dataset is None:
    raise RuntimeError(f"Failed to create GeoTIFF at {path}")
  dataset.SetGeoTransform([bounds.minx, bounds.pixel_size_x, 0, bounds.maxy, 0, bounds.pixel_size_y])
  dataset.SetProjection('GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433],AUTHORITY["EPSG","4326"]]')

  def to_raster_layout(array):
    # Swap the first two axes, then reverse the new first axis.
    return np.flip(np.transpose(array, axes=[1, 0, 2]), axis=0)

  values = to_raster_layout(np.ma.getdata(prediction))
  # getmaskarray always returns a full boolean array, even for an unmasked input.
  invalid = to_raster_layout(np.ma.getmaskarray(prediction))

  for band_index in range(dataset.RasterCount):
    band = dataset.GetRasterBand(band_index+1)
    if band.CreateMaskBand(0) == gdal.CE_Failure:
      raise RuntimeError("Failed to create mask band")
    # GetNoDataValue() is None when the band has no nodata value; write NaN in
    # that case instead of relying on None being coerced through an object array.
    nodata = band.GetNoDataValue()
    fill_value = np.nan if nodata is None else nodata
    band.WriteArray(np.where(invalid[:, :, band_index], fill_value, values[:, :, band_index]))
    band.GetMaskBand().WriteArray(np.logical_not(invalid[:, :, band_index]))

  # Push everything to disk before the dataset handle goes out of scope.
  dataset.FlushCache()

# Write the predicted means and variances as two separate GeoTIFFs; each file
# gets one band per slice along the last axis of the array.
save_numpy_to_geotiff(bounds, means_np, get_output_raster_path_from_params("predicted_isoscape_variational_means.tiff"))
save_numpy_to_geotiff(bounds, vars_np, get_output_raster_path_from_params("predicted_isoscape_variational_vars.tiff"))

Animations! We only do this with the hybrid model right now.

In [22]:
from matplotlib import rc
# Render animations inline as interactive JavaScript/HTML players.
rc('animation', html='jshtml')

# Reload the predicted means from disk and animate them: 12 seconds at 1 fps,
# i.e. one frame per month.
dnn2_means = load_raster(get_output_raster_path_from_params("predicted_isoscape_variational_means.tiff"))
animate(dnn2_means, 12, 1)

.............

In [23]:
# Same animation for the predicted variances (12 seconds at 1 fps).
dnn2_vars = load_raster(get_output_raster_path_from_params("predicted_isoscape_variational_vars.tiff"))
animate(dnn2_vars, 12, 1)

.............