<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/new_fields/validation_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation Pipeline

In [None]:
# Notebook configuration. Lines tagged with #@param are Colab form fields.
DEBUG = False #@param {type:"boolean"}
# Google Drive mount point; an empty value skips mounting.
GDRIVE_BASE = "/content/gdrive" #@param

# Oxygen isoscape being validated. When both names refer to the same file,
# band 0 is loaded as the means and band 1 as the variances; otherwise band 0
# of each file is loaded.
ISOSCAPE_OXYGEN_MEANS_FILENAME = "variational/ensemble_with_carbon_brisoisorix/fixed_isorix_carbon_ensemble.tiff" #@param
ISOSCAPE_OXYGEN_VARS_FILENAME = "variational/ensemble_with_carbon_brisoisorix/fixed_isorix_carbon_ensemble.tiff" #@param

# Second oxygen isoscape to compare against (band 0 = means, band 1 = variances).
ISOSCAPE_OXYGEN_FOR_COMPARISON = "canonical/d18O_uc_davis_new_train_fixed_ungrouped_zscore_Brazil_stack.tiff" #@param

# Carbon and nitrogen isoscapes. In this notebook they are only referenced by
# the commented-out multi-element configuration further down.
ISOSCAPE_CARBON_FILENAME = "iso_d13C_map_wood_stack.tiff" #@param
ISOSCAPE_NITROGEN_MEANS_FILENAME = "Raster_Brasil_krig_d15N.tiff" #@param
ISOSCAPE_NITROGEN_VARS_FILENAME = "Brasil_Raster_Krig_SD_d15N.tiff" #@param

# Evaluation samples, and the original sample set they are later joined with
# on ('Code', 'lat', 'long').
TEST_SET_FILENAME = '2023_07_27____2023_09_14_2_test_fixed_grouped.csv' #@param
ORIGINAL_SET_FILENAME = "canonical/2023_07_27_Results_google_relabeled.csv" #@param
# Columns of values to read ground truths from. Invalid values are 'truth'
# and 'prediction'.
MEAN_TRUTH_NAME = 'd18O_cel_mean' #@param
VAR_TRUTH_NAME = 'd18O_cel_variance' #@param
# Columns of values to write temporary predictions to (for RMSE calculation).
# Invalid values are 'truth' and 'prediction'.
MEAN_PREDICTED_NAME = 'd18O_predicted_mean' #@param
VAR_PREDICTED_NAME = 'd18O_predicted_variance' #@param

In [None]:
# Access data stored on Google Drive.
# Mounting is skipped entirely when GDRIVE_BASE is empty.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Import

In [None]:
import sys
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# ddfimport.ddf_source_control_pane()
ddfimport.ddf_import_common()

In [None]:
import importlib
import raster
import hypothesis
import dataset
import evaluation
importlib.reload(raster)
importlib.reload(hypothesis)
importlib.reload(dataset)
importlib.reload(evaluation)

# Isoscape: Calculate RMSE for Oxygen



In [None]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
import numpy as np
from datetime import datetime
import os

In [None]:
# Required to both import raster and read GDrive files.
# The *_BASE directories below are expressed relative to raster.GDRIVE_BASE
# (the stamping cell builds full paths as GDRIVE_BASE + RASTER_BASE + filename).
raster.RASTER_BASE = "/Shared drives/TNC Fellowship 🌳/4. Isotope Research & Signals/code/amazon_rainforest_files/amazon_rasters/" #@param
raster.SAMPLE_DATA_BASE = "/Shared drives/TNC Fellowship 🌳/4. Isotope Research & Signals/code/amazon_rainforest_files/amazon_sample_data/" #@param
raster.TEST_DATA_BASE = "/Shared drives/TNC Fellowship 🌳/4. Isotope Research & Signals/code/amazon_rainforest_files/amazon_test_data/" #@param
# NOTE(review): duplicates the GDRIVE_BASE form field in the first cell — keep both in sync.
raster.GDRIVE_BASE = "/content/gdrive" #@param

In [None]:
# Load the oxygen isoscape under validation. Means always come from band 0.
# Variances come from band 1 when they share a file with the means, and from
# band 0 when they live in a dedicated file.
oxygen_means_isoscape = raster.load_raster(
    raster.get_raster_path(ISOSCAPE_OXYGEN_MEANS_FILENAME),
    use_only_band_index=0)
oxygen_vars_isoscape = raster.load_raster(
    raster.get_raster_path(ISOSCAPE_OXYGEN_VARS_FILENAME),
    use_only_band_index=(
        1 if ISOSCAPE_OXYGEN_MEANS_FILENAME == ISOSCAPE_OXYGEN_VARS_FILENAME
        else 0))

In [None]:
# Load the comparison isoscape from a single file:
# band 0 holds the means, band 1 the variances.
oxygen_means_isoscape_comparison = raster.load_raster(
  raster.get_raster_path(ISOSCAPE_OXYGEN_FOR_COMPARISON), use_only_band_index=0)
oxygen_vars_isoscape_comparison = raster.load_raster(
  raster.get_raster_path(ISOSCAPE_OXYGEN_FOR_COMPARISON), use_only_band_index=1)

In [None]:
# Read the evaluation samples; the first CSV column is used as the index.
eval_dataset = pd.read_csv(raster.get_sample_db_path(TEST_SET_FILENAME), index_col=0)
# Preview the first rows.
eval_dataset.head()

In [None]:
import evaluation

In [None]:
def print_rmse(means_isoscape, vars_isoscape, filename_means, filename_vars):
  """Print the RMSE of an isoscape pair evaluated against `eval_dataset`.

  Delegates to `evaluation.calculate_rmse`, using the truth and prediction
  column names configured at the top of the notebook, and prints the mean,
  variance and overall RMSE followed by a separator line.

  Args:
    means_isoscape: Raster holding the predicted means.
    vars_isoscape: Raster holding the predicted variances.
    filename_means: Label printed (upper-cased) next to the means RMSE.
    filename_vars: Label printed (upper-cased) next to the variances RMSE.
  """
  rmse_of_means, rmse_of_vars, rmse_overall = evaluation.calculate_rmse(
      eval_dataset,
      means_isoscape,
      vars_isoscape,
      MEAN_TRUTH_NAME,
      VAR_TRUTH_NAME,
      MEAN_PREDICTED_NAME,
      VAR_PREDICTED_NAME)

  means_label = filename_means.upper()
  vars_label = filename_vars.upper()
  print("RMSE of Means for ", means_label, ":", rmse_of_means)
  print("RMSE of Vars for ", vars_label, ":", rmse_of_vars)
  print("Overall RMSE:", rmse_overall)
  print("=================================================================")

In [None]:
# Report RMSE for the isoscape under validation, then for the comparison isoscape.
print_rmse(oxygen_means_isoscape,oxygen_vars_isoscape,ISOSCAPE_OXYGEN_MEANS_FILENAME,ISOSCAPE_OXYGEN_VARS_FILENAME)
print_rmse(oxygen_means_isoscape_comparison,oxygen_vars_isoscape_comparison,ISOSCAPE_OXYGEN_FOR_COMPARISON,ISOSCAPE_OXYGEN_FOR_COMPARISON)

# Get Predictions

In [None]:
# Label every evaluation row as not fraudulent.
eval_dataset['fraud'] = False

In [None]:
# Constant sample count for every row.
# NOTE(review): presumably mirrors the sample size of 5 passed to the
# prediction helpers below — confirm.
eval_dataset['d18O_cel_count'] = 5

In [None]:
# Run the grouped prediction helper for oxygen only: one mean/variance/count
# column triple paired with the oxygen means and variances isoscapes.
# NOTE(review): the column names are hard-coded here and happen to match
# MEAN_TRUTH_NAME / VAR_TRUTH_NAME; the trailing 5 is presumably the sample
# size per location — confirm against hypothesis.get_predictions_grouped.
inferences_df = hypothesis.get_predictions_grouped(
    eval_dataset,
    ['d18O_cel_mean'],
    ['d18O_cel_variance'],
    ['d18O_cel_count'],
    [oxygen_means_isoscape],
    [oxygen_vars_isoscape], 5)

In [None]:
# Summary statistics of the predicted variance: mean, standard deviation,
# maximum and minimum, in that order.
print(inferences_df['d18O_predicted_variance'].mean())
print(inferences_df['d18O_predicted_variance'].std())
print(inferences_df['d18O_predicted_variance'].max())
print(inferences_df['d18O_predicted_variance'].min())

In [None]:
# Rows missing either the measured or the predicted variance cannot be scored.
inferences_df.dropna(subset=['d18O_cel_variance', 'd18O_predicted_variance'], inplace=True)

# RMSE between measured and predicted variances. The square root is taken
# explicitly because mean_squared_error's `squared` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(
    inferences_df['d18O_cel_variance'],
    inferences_df['d18O_predicted_variance']))

In [None]:
# Persist the inferences next to the sample data.
inferences_df.to_csv(raster.get_sample_db_path('overall_low_res_inferences.csv'))

# Plot isoscapes

In [None]:
def make_subplot(index: int, isoscape: "raster.AmazonGeoTiff", title: str):
  """Draw one isoscape band into slot `index` of a 1x2 subplot grid.

  Args:
    index: 1-based position within the 1-row, 2-column grid.
    isoscape: Loaded raster; its `masked_image` shape decides which band
      index is handed to `raster.plot_band`.
    title: Title shown above the subplot.
  """
  plt.subplot(1, 2, index)
  plt.title(title, fontsize=7)
  plt.xlabel("Longitude", fontsize=7)
  plt.ylabel("Latitude", fontsize=7)

  # A masked image with more than two dimensions is plotted from band 0;
  # otherwise -1 is passed as the band index.
  band_index = 0 if len(isoscape.masked_image.shape) > 2 else -1
  raster.plot_band(isoscape, band_index)

In [None]:
#Ploting means
make_subplot(1, oxygen_means_isoscape,ISOSCAPE_OXYGEN_MEANS_FILENAME)
make_subplot(2, oxygen_means_isoscape_comparison,ISOSCAPE_OXYGEN_FOR_COMPARISON)

plt.show()

In [None]:
#Ploting variances
make_subplot(1, oxygen_vars_isoscape,ISOSCAPE_OXYGEN_VARS_FILENAME)
make_subplot(2, oxygen_vars_isoscape_comparison,ISOSCAPE_OXYGEN_FOR_COMPARISON)

plt.show()

# Fraud Detection Hypothesis Test
Combines the p-values of each element specified below and classifies a sample as fraudulent based on the resulting combined p-value.

Creating fraudulent samples

In [None]:
# (rows, columns) of the evaluation set.
eval_dataset.shape

In [None]:
# Isotope columns used by the fraud test. The commented-out lines are the
# multi-element variant; only oxygen is currently enabled.
# elements = ['d18O_cel', 'd15N_wood', 'd13C_wood']
# isotope_column_names = ['d18O_cel', 'd15N_wood', 'd13C_wood']
elements = ['d18O_cel']
isotope_column_names = ['d18O_cel']

In [None]:
# Model "a": the isoscape under validation. The two lists are parallel (one
# means raster and one variances raster per element); the nitrogen and carbon
# entries are disabled together with the corresponding elements.
mean_isoscapes_a = [
    oxygen_means_isoscape,
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_NITROGEN_MEANS_FILENAME), use_only_band_index=0),
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_CARBON_FILENAME), use_only_band_index=0),
]
vars_isoscapes_a = [
    oxygen_vars_isoscape,
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_NITROGEN_VARS_FILENAME), use_only_band_index=0),
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_CARBON_FILENAME), use_only_band_index=1),
]

In [None]:
# Model "b": the comparison isoscape (oxygen only).
mean_isoscapes_b = [oxygen_means_isoscape_comparison]
vars_isoscapes_b = [oxygen_vars_isoscape_comparison]

In [None]:
# Inner-join the evaluation rows (identifiers plus truth columns) with the
# original sample set on ('Code', 'lat', 'long'), so only samples present in
# both are kept.
real_samples_data = pd.merge(eval_dataset[['Code','lat','long',MEAN_TRUTH_NAME, VAR_TRUTH_NAME]],
         pd.read_csv(raster.get_sample_db_path(ORIGINAL_SET_FILENAME), index_col=0), how="inner",
                    left_on=['Code', 'lat', 'long'], right_on=['Code', 'lat', 'long'])

Combine fraudulent and real samples in a DataFrame, identified by the 'fraud' column.

In [None]:
# Real samples: identifier columns plus the selected isotope columns, all
# labelled as not fraudulent.
real = real_samples_data[['Code', 'lat', 'long', *elements]].assign(fraud=False)

In [None]:
# Precision or recall target used to pick the p-value threshold. The first
# point on the precision-recall curve whose precision (or recall) is at or
# above the target is selected; if no point reaches the target, the point with
# the highest precision (or recall) is used instead.
# Only one target is applied: when both are set, precision takes precedence.
precision_target = 0.95 #@param
recall_target = None #@param

In [None]:
def isoscape_precision_recall_thresholds(
    test_dataset: pd.DataFrame,
    isotope_column_names: list[str],
    means_isoscapes: list[raster.AmazonGeoTiff],
    vars_isoscapes: list[raster.AmazonGeoTiff]) -> list[list[float]]:
  """Score `test_dataset` against the isoscapes and build its PR curve.

  Args:
    test_dataset: Samples carrying a boolean 'fraud' label.
    isotope_column_names: Isotope columns to test.
    means_isoscapes: One means raster per isotope column.
    vars_isoscapes: One variances raster per isotope column.

  Returns:
    The result of `sklearn.metrics.precision_recall_curve`, which callers
    unpack as (precision, recall, thresholds). Thresholds are expressed as
    `1 - fraud_p_value`.
  """
  scored = hypothesis.get_predictions(
      sample_data=test_dataset,
      isotope_column_names=isotope_column_names,
      means_isoscapes=means_isoscapes,
      variances_isoscapes=vars_isoscapes,
      sample_size_per_location=5)

  # Rows without a label or a p-value cannot contribute to the curve.
  scored.dropna(subset=['fraud', 'fraud_p_value'], inplace=True)

  # A low fraud p-value means "more likely fraud", so invert it to obtain a
  # score that grows with the positive (fraud) class.
  fraud_score = 1 - scored['fraud_p_value']
  return precision_recall_curve(scored['fraud'], fraud_score)

In [None]:
def plot_isoscape_precision_recall(
    precision: list[float],
    recall: list[float],
    label: str,
    radius: int):
  """Add one labelled precision-recall curve to the current matplotlib axes.

  Args:
    precision: Precision values (y axis).
    recall: Recall values (x axis).
    label: Legend entry for this curve.
    radius: Maximum fraud radius in km, shown in the plot title.
  """
  plt.plot(recall, precision, label=label)

  # Axis decorations are rewritten on every call, so the last curve drawn
  # determines the title.
  plot_title = f"Precision-Recall curve with Max Fraud Radius of {radius} km"
  plt.xlabel("Recall")
  plt.ylabel("Precision")
  plt.title(plot_title)
  plt.legend()

In [None]:
def find_p_value(
    precision: list[float],
    recall: list[float],
    thresholds: list[float],
    precision_target: float,
    recall_target: float) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Pick the p-value threshold that meets a precision or recall target.

  The inputs are expected in the layout returned by
  `sklearn.metrics.precision_recall_curve`: `precision` and `recall` carry one
  more entry than `thresholds`, so their last entry is ignored. Thresholds are
  scores of the form `1 - p_value` and are converted back to p-values.

  Args:
    precision: Precision values of the curve (list or array).
    recall: Recall values of the curve (list or array).
    thresholds: Score thresholds matching `precision[:-1]` / `recall[:-1]`.
    precision_target: Desired minimum precision; takes precedence when set.
    recall_target: Desired minimum recall; used when no precision target is
      set.

  Returns:
    A tuple `(precision_found, recall_found, p_value_found)` of one-element
    arrays describing the first curve point at or above the target. If no
    point reaches the target, the point with the highest precision (or
    recall) is returned instead.

  Raises:
    ValueError: If neither target is set.
  """
  if not (precision_target or recall_target):
    raise ValueError("Either precision_target or recall_target must be set.")

  # Accept plain Python lists as advertised by the signature: the comparisons
  # and the `1 - thresholds` arithmetic below need array semantics.
  precision = np.asarray(precision)
  recall = np.asarray(recall)
  thresholds = np.asarray(thresholds)

  # Drop the final curve point, which has no matching threshold.
  candidate_precision = precision[:-1]
  candidate_recall = recall[:-1]

  if precision_target:
    metric, target = candidate_precision, precision_target
  else:
    metric, target = candidate_recall, recall_target

  reaching_target = np.flatnonzero(metric >= target)
  if reaching_target.size:
    position = reaching_target[0]
  else:
    # No precision/recall is greater than or equal to the target.
    position = np.argmax(metric)

  # Index with a one-element list so each result stays a one-element array,
  # which callers read with `[0]`.
  chosen = [position]
  precision_target_found = candidate_precision[chosen]
  recall_target_found = candidate_recall[chosen]
  p_value_found = (1 - thresholds)[chosen]

  return precision_target_found, recall_target_found, p_value_found

In [None]:
# Parameters for creation of fraudulent samples.
# The fraud radii are reported in km by the plots and log lines below.
# NOTE(review): MAX_TRUSTED_RADIUS is presumably in the same unit — confirm
# against dataset.create_fraudulent_samples.
MAX_TRUSTED_RADIUS = 0.1
MIN_FRAUD_RADIUS = 5 #@param
# The maximum fraud radius is swept from START to END in steps of RADIUS_PACE.
START_MAX_FRAUD_RADIUS = 100 #@param
END_MAX_FRAUD_RADIUS = 3000 #@param
RADIUS_PACE = 100 #@param

In [None]:
# Build one batch of fraudulent samples per maximum fraud radius, keyed by that
# radius. The sweep reaches END_MAX_FRAUD_RADIUS itself whenever RADIUS_PACE
# lands on it.
fake_samples = {}
for max_radius in range(START_MAX_FRAUD_RADIUS, END_MAX_FRAUD_RADIUS+1, RADIUS_PACE):
  fake_samples[max_radius] = dataset.create_fraudulent_samples(
      real_samples_data,
      # Every raster used by either model is passed in.
      mean_isoscapes_a+ mean_isoscapes_b + vars_isoscapes_a + vars_isoscapes_b,
      elements,
      MAX_TRUSTED_RADIUS,
      max_radius,
      MIN_FRAUD_RADIUS)

In [None]:
# Per-radius results. AUC is tracked for both models; the p-value threshold and
# the precision/recall found at the target are recorded for model a only.
auc_scores_a = {}
auc_scores_b = {}

p_values_found = {}
precisions_target_found = {}
recalls_target_found ={}

for radius, fake_sample in fake_samples.items():
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
  # equivalent and also works on older pandas versions.
  test_dataset = pd.concat([real, fake_sample], ignore_index=True)
  test_dataset = dataset.nudge_invalid_coords(
      df=test_dataset,
      rasters=mean_isoscapes_a + mean_isoscapes_b + vars_isoscapes_a + vars_isoscapes_b
  )

  # Model a: isoscape under validation.
  precision_a, recall_a, thresholds_a = isoscape_precision_recall_thresholds(
      test_dataset=test_dataset,
      isotope_column_names=isotope_column_names,
      means_isoscapes=mean_isoscapes_a,
      vars_isoscapes=vars_isoscapes_a
  )
  plot_isoscape_precision_recall(precision=precision_a, recall=recall_a,
                                 label='model1', radius=radius)

  auc_score_a = auc(recall_a, precision_a)
  print("AUC score:", auc_score_a)
  auc_scores_a[radius] = auc_score_a

  # Model b: comparison isoscape, drawn on the same figure as model a.
  precision_b, recall_b, thresholds_b = isoscape_precision_recall_thresholds(
      test_dataset=test_dataset,
      isotope_column_names=isotope_column_names,
      means_isoscapes=mean_isoscapes_b,
      vars_isoscapes=vars_isoscapes_b
  )
  plot_isoscape_precision_recall(precision=precision_b, recall=recall_b,
                                 label='model2', radius=radius)

  plt.show()

  auc_score_b = auc(recall_b, precision_b)
  print("AUC score:", auc_score_b)
  auc_scores_b[radius] = auc_score_b

  # Threshold for model a at the configured precision/recall target.
  precision_target_found_a, recall_target_found_a, p_value_found_a = find_p_value(
      precision=precision_a,
      recall=recall_a,
      thresholds=thresholds_a,
      precision_target=precision_target,
      recall_target=recall_target
  )
  print("=============== RADIUS", radius, "km ===============")
  print("Considering precision =", precision_target_found_a,"p_value is",p_value_found_a)
  print("Considering recall =", recall_target_found_a,"p_value is", p_value_found_a)

  # find_p_value returns one-element arrays; keep the scalar values.
  p_values_found[radius] = p_value_found_a[0]
  precisions_target_found[radius] = precision_target_found_a[0]
  recalls_target_found[radius] = recall_target_found_a[0]

  # Threshold for model b: printed for comparison only, not stored.
  precision_target_found_b, recall_target_found_b, p_value_found_b = find_p_value(
      precision=precision_b,
      recall=recall_b,
      thresholds=thresholds_b,
      precision_target=precision_target,
      recall_target=recall_target
  )
  print("=============== RADIUS", radius, "km ===============")
  print("Considering precision =", precision_target_found_b,"p_value is",p_value_found_b)
  print("Considering recall =", recall_target_found_b,"p_value is", p_value_found_b)


In [None]:
# PR-curve AUC of each model as a function of the maximum fraud radius.
plt.plot(auc_scores_a.keys(), auc_scores_a.values(), label='model1')
plt.plot(auc_scores_b.keys(), auc_scores_b.values(), label='model2')
plt.xlabel("Max radius of fraudulent samples")
plt.ylabel("AUC of PR")
plt.legend()
plt.show()

# Find canonical P-value threshold

Generate fake samples using min_radius of 5km and max_radius of 3000km. Find the p-value threshold using this configuration.

# Stamping isoscape

In [None]:
def stamp(filename:str, auc_scores, p_values_found, precisions_target_found, recalls_target_found):
  """Write the validation results into an isoscape's metadata.

  For every radius in `auc_scores` the following keys are stamped, each
  suffixed with `_<radius>`: P_VALUE_THRESHOLD, PRECISION, RECALL and AUC.
  The values for `END_MAX_FRAUD_RADIUS` are additionally stamped under the
  unsuffixed keys. Finally DATE_TIME (local time of validation, formatted as
  MM/DD/YYYY HH:MM:SS) and REFERENCE_ISOSCAPE_NAME (the file name without its
  extension) are stamped.

  Args:
    filename: Full path of the GeoTIFF isoscape to stamp.
    auc_scores: Mapping of radius -> AUC of the precision-recall curve.
    p_values_found: Mapping of radius -> p-value threshold at the target.
    precisions_target_found: Mapping of radius -> precision at that threshold.
    recalls_target_found: Mapping of radius -> recall at that threshold.
  """
  for radius in auc_scores:
    suffix = str(radius)
    raster.stamp_isoscape(filename, "P_VALUE_THRESHOLD_" + suffix, p_values_found[radius])
    raster.stamp_isoscape(filename, "PRECISION_" + suffix, precisions_target_found[radius])
    raster.stamp_isoscape(filename, "RECALL_" + suffix, recalls_target_found[radius])
    raster.stamp_isoscape(filename, "AUC_" + suffix, auc_scores[radius])

    # The largest radius of the sweep provides the canonical (unsuffixed) values.
    if radius == END_MAX_FRAUD_RADIUS:
      raster.stamp_isoscape(filename, "P_VALUE_THRESHOLD", p_values_found[radius])
      raster.stamp_isoscape(filename, "PRECISION", precisions_target_found[radius])
      raster.stamp_isoscape(filename, "RECALL", recalls_target_found[radius])
      raster.stamp_isoscape(filename, "AUC", auc_scores[radius])

  # The date/time of validation.
  dt_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
  raster.stamp_isoscape(filename, "DATE_TIME", dt_string)

  # splitext removes only the extension. str.strip(".tiff") treats its argument
  # as a character set and would also eat leading/trailing '.', 't', 'i' and
  # 'f' characters belonging to the name itself (e.g. "fixed_..." -> "xed_...").
  isoscape_filename = os.path.splitext(os.path.basename(filename))[0]
  raster.stamp_isoscape(filename, "REFERENCE_ISOSCAPE_NAME", isoscape_filename)


In [None]:
#Stamping Oxygen Isoscape
filename = raster.GDRIVE_BASE+raster.RASTER_BASE+ISOSCAPE_OXYGEN_MEANS_FILENAME
stamp(filename, auc_scores_a, p_values_found, precisions_target_found, recalls_target_found)
raster.show_stamps(filename)

In [None]:
# When the variances live in their own file, stamp that file with the same
# results as the means file.
if ISOSCAPE_OXYGEN_MEANS_FILENAME != ISOSCAPE_OXYGEN_VARS_FILENAME:
  filename_var = raster.GDRIVE_BASE+raster.RASTER_BASE+ISOSCAPE_OXYGEN_VARS_FILENAME
  stamp(filename_var, auc_scores_a, p_values_found, precisions_target_found, recalls_target_found)
  raster.show_stamps(filename_var)

# Upload to Google Earth Engine

In [None]:
import eeraster

# Opt-in switch: nothing is uploaded unless this is enabled.
UPLOAD_TO_EARTH_ENGINE = False #@param {type: 'boolean'}

if UPLOAD_TO_EARTH_ENGINE:
  # `filename` is the oxygen means isoscape path set in the stamping cell.
  # NOTE(review): allow_overwrite=True presumably replaces an existing asset at
  # the destination — confirm before enabling.
  eeraster.ingest_isoscape(
      filename,
      ee_dst_path="projects/river-sky-386919/assets/isoscapes/d18O_isoscape",
      allow_overwrite=True)