<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/validation_pipeline_rmse/validation_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation Pipeline

In [None]:
# Notebook-wide settings. Lines tagged with `#@param` are Colab form fields,
# so keep each assignment on a single line.

# When True, a later cell installs/imports ipdb and switches `%pdb` on.
DEBUG = False #@param {type:"boolean"}
# Mount point for Google Drive; an empty value skips the mount.
GDRIVE_BASE = "/content/gdrive" #@param

# Rasters holding the predicted means and variances that are evaluated below.
# Both names are resolved through raster.get_raster_path().
ISOSCAPE_MEANS_FILENAME = "uc_davis_d18O_cel_kriging_means.tiff" #@param
ISOSCAPE_VARS_FILENAME = "uc_davis_d18O_cel_kriging_vars.tiff" #@param
# Raster used by test_calculate_rmse() for BOTH the means and the variances
# (generated from Kriging). The expected values in that test depend on it.
TEST_ISOSCAPE_FILENAME = "uc_davis_d18O_cel_kriging_means.tiff" #@param

# CSV with the held-out samples, resolved through raster.get_sample_db_path().
TEST_SET_FILENAME = 'uc_davis_2023_08_12_test_random_grouped.csv' #@param
# Columns to read the ground-truth values from. The names 'truth' and
# 'prediction' are reserved: calculate_rmse() rejects them.
MEAN_TRUTH_NAME = 'd18O_cel_mean' #@param
VAR_TRUTH_NAME = 'd18O_cel_variance' #@param
# Columns that calculate_rmse() writes the sampled isoscape values into (for
# the RMSE calculation). The names 'truth' and 'prediction' are reserved here
# as well, and all four column names must be distinct.
MEAN_PREDICTED_NAME = 'd18O_predicted_mean' #@param
VAR_PREDICTED_NAME = 'd18O_predicted_variance' #@param

In [None]:
# Access data stored on Google Drive. The mount is skipped when GDRIVE_BASE is
# empty; the import is local because google.colab only exists inside Colab.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Debugging aids, only when DEBUG is set: install and import ipdb, then turn
# on IPython's `%pdb` switch (automatic debugger on uncaught exceptions).
if DEBUG:
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

# Import

In [None]:
import sys
# Clone the `test` branch of ddf_common_stub unless /content/ddf_common_stub
# already exists.
# NOTE(review): the clone has no explicit target directory, so it lands in the
# current working directory; the existence check only matches when that is
# /content — confirm.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the cloned stub importable.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# NOTE(review): presumably this is what makes the project modules (`raster`,
# `dataset`) importable in the cells below — confirm against ddfimport.
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()

In [None]:
import importlib
import raster
# Re-execute the raster module so that a changed copy is picked up without
# restarting the runtime.
importlib.reload(raster)

# Isoscape: Calculate RMSE

In [None]:
from sklearn.metrics import mean_squared_error
import raster
import pandas as pd
import dataset

In [None]:
# Required to both import raster and read GDrive files.
# These overwrite module-level settings of `raster`, so this cell has to run
# before any raster.get_raster_path() / raster.get_sample_db_path() call.
# NOTE(review): the *_BASE folders look like paths below raster.GDRIVE_BASE —
# confirm how the raster module joins them.
raster.RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
raster.SAMPLE_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_sample_data/" #@param
raster.TEST_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_test_data/" #@param
raster.ANIMATIONS_BASE = "/MyDrive/amazon_rainforest_files/amazon_animations/" #@param
# NOTE(review): same literal as the notebook-level GDRIVE_BASE form field but
# set independently of it; the two can drift apart if only one is edited.
raster.GDRIVE_BASE = "/content/gdrive" #@param

In [None]:
def calculate_rmse(df, means_isoscape, vars_isoscape, mean_true_name, var_true_name, mean_pred_name, var_pred_name):
  '''
  Scores a pair of isoscapes (means and variances) against ground-truth samples.

  Every row of df is looked up in both isoscapes at its ('long', 'lat')
  position. The sampled values are written into df IN PLACE, in the columns
  mean_pred_name and var_pred_name, and are then compared with the
  ground-truth columns mean_true_name and var_true_name.

  Args:
    df: pandas DataFrame with 'long' and 'lat' columns plus the two
      ground-truth columns. Gains (or has overwritten) the two prediction
      columns.
    means_isoscape: raster object accepted by raster.get_data_at_coords,
      holding the predicted means.
    vars_isoscape: raster object accepted by raster.get_data_at_coords,
      holding the predicted variances.
    mean_true_name: column with the true means.
    var_true_name: column with the true variances.
    mean_pred_name: column to write the sampled means to.
    var_pred_name: column to write the sampled variances to.

  The four column names must be distinct and may not be 'truth' or
  'prediction'.

  Returns:
    Tuple (mean_rmse, var_rmse, overall_rmse). overall_rmse is the unweighted
    average of the other two, which is what a two-output
    mean_squared_error(..., squared=False) used to return.

  Raises:
    ValueError: if the column names collide or df has no rows.
  '''
  # Make sure names do not collide with each other or with the reserved ones.
  column_names = [mean_true_name, var_true_name, mean_pred_name, var_pred_name,
                  'truth', 'prediction']
  if len(column_names) != len(set(column_names)):
    raise ValueError(
        "Column names must be distinct and must not be 'truth' or "
        "'prediction': %r" % (column_names[:4],))
  if len(df) == 0:
    raise ValueError("Cannot calculate an RMSE for a DataFrame without rows")

  def _sample(isoscape):
    # One raster lookup per row. The trailing -1 is forwarded unchanged.
    # NOTE(review): its meaning is defined by raster.get_data_at_coords.
    return df.apply(
        lambda row: raster.get_data_at_coords(isoscape, row['long'], row['lat'], -1),
        axis=1)

  df[mean_pred_name] = _sample(means_isoscape)
  df[var_pred_name] = _sample(vars_isoscape)

  # Take the square root ourselves instead of passing squared=False: that
  # keyword was deprecated in scikit-learn 1.4 and removed in later releases.
  mean_rmse = mean_squared_error(
      df[mean_true_name].values, df[mean_pred_name].values) ** 0.5
  var_rmse = mean_squared_error(
      df[var_true_name].values, df[var_pred_name].values) ** 0.5
  # Uniform average over the two outputs, as the multi-output metric did.
  overall_rmse = (mean_rmse + var_rmse) / 2

  return (mean_rmse, var_rmse, overall_rmse)

In [None]:
import pytest

def test_calculate_rmse():
  '''
  Regression check for calculate_rmse against the Kriging test raster.

  Two hand-written samples are scored and the three RMSE values are compared
  with previously recorded results, so the check is only meaningful for the
  raster named by TEST_ISOSCAPE_FILENAME.
  '''
  # The same raster serves as both the means and the variances isoscape; the
  # expected values below were recorded with that setup.
  test_means_isoscape = raster.load_raster(raster.get_raster_path(TEST_ISOSCAPE_FILENAME), use_only_band_index=0)
  test_vars_isoscape = raster.load_raster(raster.get_raster_path(TEST_ISOSCAPE_FILENAME), use_only_band_index=0)
  # Printed for reference when checking the sample coordinates below.
  bounds = raster.get_extent(test_means_isoscape.gdal_dataset)
  print(bounds)

  mean_true_name = 'd18O_cel_mean'
  var_true_name = 'd18O_cel_var'
  mean_pred_name = 'd18O_cel_mean_pred'
  var_pred_name = 'd18O_cel_var_pred'

  # Build the frame from the name variables so the columns cannot drift away
  # from the names handed to calculate_rmse.
  df = pd.DataFrame({
      'long': [-70, -68],
      'lat': [-4, -3],
      mean_true_name: [0, 5],
      var_true_name: [1, 0.5],
  })

  mean_rmse, var_rmse, overall_rmse = calculate_rmse(
      df, test_means_isoscape, test_vars_isoscape,
      mean_true_name, var_true_name, mean_pred_name, var_pred_name)
  print(mean_rmse, var_rmse, overall_rmse)

  assert mean_rmse == pytest.approx(22.221530876037058)
  assert var_rmse == pytest.approx(23.85633857749663)
  assert overall_rmse == pytest.approx(23.038934726766843)

# Run the check right away: a failing assertion raises in this cell, before
# the real evaluation cells below are reached in a top-to-bottom run.
test_calculate_rmse()

In [None]:
# Load the first band of the means raster, then of the variances raster.
means_isoscape, vars_isoscape = (
    raster.load_raster(raster.get_raster_path(isoscape_filename), use_only_band_index=0)
    for isoscape_filename in (ISOSCAPE_MEANS_FILENAME, ISOSCAPE_VARS_FILENAME)
)

In [None]:
# Held-out samples to score the isoscapes against; the first CSV column
# becomes the index.
eval_dataset = pd.read_csv(
    raster.get_sample_db_path(TEST_SET_FILENAME),
    index_col=0,
)
# Last expression of the cell: shows a preview of the first rows.
eval_dataset.head()

In [None]:
# Score the loaded isoscapes against the evaluation set. As a side effect the
# sampled values are stored in eval_dataset under the two *_PREDICTED_NAME
# columns.
mean_rmse, var_rmse, overall_rmse = calculate_rmse(
    df=eval_dataset,
    means_isoscape=means_isoscape,
    vars_isoscape=vars_isoscape,
    mean_true_name=MEAN_TRUTH_NAME,
    var_true_name=VAR_TRUTH_NAME,
    mean_pred_name=MEAN_PREDICTED_NAME,
    var_pred_name=VAR_PREDICTED_NAME,
)

In [None]:
# Report the three scores, one per line.
for _label, _score in (
    ("RMSE of Means:", mean_rmse),
    ("RMSE of Vars:", var_rmse),
    ("Overall RMSE:", overall_rmse),
):
    print(_label, _score)

# TODO: Fraud Detection Hypothesis Test