<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/validation_pipeline_rmse/validation_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation Pipeline

In [None]:
# Notebook configuration. Lines tagged with #@param are rendered as Colab
# form fields.
DEBUG = False #@param{type:"boolean"}

# Isoscape raster to validate.
ISOSCAPE_FILENAME = "uc_davis_d18O_cel_kriging.tiff" #@param
# Isoscape raster used by the unit tests (generated from Kriging).
TEST_ISOSCAPE_FILENAME = "test_isoscape.tiff" #@param

# CSV with the ground-truth samples to evaluate against.
TEST_SET_FILENAME = 'uc_davis_2023_08_12_test_random_grouped.csv' #@param
# Columns the ground-truth mean and variance are read from.
# Must not be 'truth' or 'prediction'.
MEAN_TRUTH_NAME = 'd18O_cel_mean' #@param
VAR_TRUTH_NAME = 'd18O_cel_variance' #@param
# Columns the predicted mean and variance are written to during the RMSE
# calculation. Must not be 'truth' or 'prediction'.
MEAN_PREDICTED_NAME = 'd18O_predicted_mean' #@param
VAR_PREDICTED_NAME = 'd18O_predicted_variance' #@param

# The RMSE calculation adds two helper columns to its dataframe: 'truth'
# (the true [mean, variance] pair of each row) and 'prediction' (the predicted
# [mean, variance] pair). None of the names configured above may collide with
# them.
_CONFIGURED_NAMES = [TEST_SET_FILENAME, MEAN_TRUTH_NAME, VAR_TRUTH_NAME,
                     MEAN_PREDICTED_NAME, VAR_PREDICTED_NAME]
for _reserved_name in ('truth', 'prediction'):
  assert _reserved_name not in _CONFIGURED_NAMES

In [None]:
# Access data stored on Google Drive
# NOTE(review): GDRIVE_BASE is not assigned in any cell visible in this
# notebook. Unless it is defined elsewhere before this cell runs, the check
# below raises NameError - TODO confirm where it is supposed to come from.
if GDRIVE_BASE:
    # Imported lazily so the Colab-only package is required only when a
    # Drive mount point is configured.
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Optional debugging aids, enabled through the DEBUG form field.
if DEBUG:
    # Install/upgrade ipdb quietly, then turn on automatic post-mortem
    # debugging for uncaught exceptions.
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

# Import

In [None]:
import sys
# Clone the `test` branch of ddf_common_stub unless /content/ddf_common_stub
# already exists.
# NOTE(review): `git clone` writes into the current working directory; this
# assumes the cell runs from /content - confirm.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the cloned stub importable.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# NOTE(review): presumably these two calls show the source-control options
# and bring the shared helpers used below (load_raster, get_raster_path,
# get_sample_db_path, get_data_at_coords, pd) into scope - verify against
# ddfimport.
ddfimport.ddf_source_control_pane()
ddfimport.ddf_import_common()

# Isoscape: Calculate RMSE

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def calculate_rmse(df, means_isoscape, vars_isoscape, mean_true_name, var_true_name, mean_pred_name, var_pred_name):
  '''
  Calculates the RMSE of isoscape predictions against the ground truth in df.

  For every row of df a predicted mean and a predicted variance are looked up
  with get_data_at_coords at the row's ('long', 'lat') position, and compared
  with the ground truth stored in the mean_true_name and var_true_name columns.

  NOTE: df is modified in place. The columns mean_pred_name, var_pred_name,
  'prediction' and 'truth' are added (or overwritten).

  Args:
    df: DataFrame with 'long' and 'lat' columns plus the two ground truth
      columns named by mean_true_name and var_true_name.
    means_isoscape: raster handed to get_data_at_coords to look up means.
    vars_isoscape: raster handed to get_data_at_coords to look up variances.
    mean_true_name: column holding the ground truth means.
    var_true_name: column holding the ground truth variances.
    mean_pred_name: column the predicted means are written to.
    var_pred_name: column the predicted variances are written to.

  The four column names must be distinct and can take any value except
  'truth' and 'prediction', which are used for helper columns.

  Returns:
    A tuple (mean_rmse, var_rmse, overall_rmse). overall_rmse is the
    unweighted average of the mean RMSE and the variance RMSE.
  '''
  # Make sure names do not collide.
  column_names = [mean_true_name, var_true_name, mean_pred_name, var_pred_name, 'truth', 'prediction']
  assert len(column_names) == len(set(column_names))

  # NOTE(review): the meaning of the trailing -1 argument is defined by
  # get_data_at_coords, which is not visible here - confirm.
  df[mean_pred_name] = df.apply(
      lambda row: get_data_at_coords(means_isoscape, row['long'], row['lat'], -1), axis=1)
  df[var_pred_name] = df.apply(
      lambda row: get_data_at_coords(vars_isoscape, row['long'], row['lat'], -1), axis=1)

  print(df.columns)

  # Group each row's (mean, variance) pair so both outputs can be scored in a
  # single multi-output call.
  df['prediction'] = df.apply(lambda row: [row[mean_pred_name], row[var_pred_name]], axis=1)
  df['truth'] = df.apply(lambda row: [row[mean_true_name], row[var_true_name]], axis=1)

  y_pred = list(df['prediction'].values)
  y_true = list(df['truth'].values)

  # The `squared=False` argument of mean_squared_error was deprecated in
  # scikit-learn 1.4 and removed in 1.6, so the square root is taken here
  # instead. This works on old and new releases alike.
  mean_rmse = mean_squared_error(df[mean_true_name].values, df[mean_pred_name].values) ** 0.5
  var_rmse = mean_squared_error(df[var_true_name].values, df[var_pred_name].values) ** 0.5
  # Per-output RMSE first, then the uniform average: this is what
  # `squared=False` computed for multi-output targets.
  per_output_rmse = mean_squared_error(y_true, y_pred, multioutput='raw_values') ** 0.5
  overall_rmse = per_output_rmse.mean()

  return (mean_rmse, var_rmse, overall_rmse)

In [None]:
import pytest

def test_calculate_rmse():
  '''
  Checks calculate_rmse against the test isoscape on a two-sample dataframe.

  The expected values are tied to the contents of TEST_ISOSCAPE_FILENAME
  (band 0 is loaded as the means, band 1 as the variances).
  '''
  mean_true_name = 'd18O_cel_mean'
  var_true_name = 'd18O_cel_var'
  mean_pred_name = 'd18O_cel_mean_pred'
  var_pred_name = 'd18O_cel_var_pred'

  test_means_isoscape = load_raster(get_raster_path(TEST_ISOSCAPE_FILENAME), use_only_band_index=0)
  test_vars_isoscape = load_raster(get_raster_path(TEST_ISOSCAPE_FILENAME), use_only_band_index=1)
  df = pd.DataFrame({
      'lat': [-3, -4],
      'long': [-55, -54],
      mean_true_name: [0, 5],
      var_true_name: [1, 0.5],
  })

  mean_rmse, var_rmse, overall_rmse = calculate_rmse(
      df, test_means_isoscape, test_vars_isoscape,
      mean_true_name, var_true_name, mean_pred_name, var_pred_name)

  assert mean_rmse == pytest.approx(22.524833655412866)
  assert var_rmse == pytest.approx(11.00233730921582)
  # The overall RMSE is the average of the two values above.
  assert overall_rmse == pytest.approx(16.763585482314344)

# Run the test immediately so a regression fails the notebook run.
test_calculate_rmse()

In [None]:
# Show which raster file is being validated.
print(get_raster_path(ISOSCAPE_FILENAME))
# Band index 0 is loaded as the means isoscape, band index 1 as the
# variances isoscape.
means_isoscape, vars_isoscape = (
    load_raster(get_raster_path(ISOSCAPE_FILENAME), use_only_band_index=band_index)
    for band_index in (0, 1)
)

In [None]:
# Read the test set, using the first CSV column as the index, and preview it.
test_set_path = get_sample_db_path(TEST_SET_FILENAME)
eval_dataset = pd.read_csv(test_set_path, index_col=0)
eval_dataset.head()

In [None]:
# Score the isoscape against the test set using the configured column names.
mean_rmse, var_rmse, overall_rmse = calculate_rmse(
    eval_dataset,
    means_isoscape,
    vars_isoscape,
    mean_true_name=MEAN_TRUTH_NAME,
    var_true_name=VAR_TRUTH_NAME,
    mean_pred_name=MEAN_PREDICTED_NAME,
    var_pred_name=VAR_PREDICTED_NAME,
)

In [None]:
# Report the three RMSE figures, one per line.
for label, rmse in (
    ("RMSE of Means:", mean_rmse),
    ("RMSE of Vars:", var_rmse),
    ("Overall RMSE:", overall_rmse),
):
  print(label, rmse)

# TODO: Fraud Detection Hypothesis Test