<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/validation_pipeline_plot_isoscape/validation_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation Pipeline

In [None]:
# Notebook parameters. The trailing `#@param` markers expose each value as a
# Colab form field.
# DEBUG is not read by any cell visible in this notebook.
DEBUG = False #@param {type:"boolean"}
# Google Drive mount point; an empty value skips the mount step.
GDRIVE_BASE = "/content/gdrive" #@param

# Isoscape raster file names, resolved with raster.get_raster_path().
# The carbon and nitrogen files are only referenced by the commented-out
# multi-isotope configuration in the fraud-detection section.
ISOSCAPE_OXYGEN_MEANS_FILENAME = "variational/overall_low_res_means.tiff" #@param
ISOSCAPE_OXYGEN_VARS_FILENAME = "variational/overall_low_res_vars.tiff" #@param
ISOSCAPE_CARBON_FILENAME = "iso_d13C_map_wood_stack.tiff" #@param
ISOSCAPE_NITROGEN_MEANS_FILENAME = "Raster_Brasil_krig_d15N.tiff" #@param
ISOSCAPE_NITROGEN_VARS_FILENAME = "Brasil_Raster_Krig_SD_d15N.tiff" #@param

# Sample CSV file names, resolved with raster.get_sample_db_path().
TEST_SET_FILENAME = 'canonical/uc_davis_no_partition_test_random_grouped.csv' #@param
ORIGINAL_SET_FILENAME = '2023_06_23_Results_Google.csv' #@param
# Columns of the evaluation set that hold the ground-truth mean and variance.
# The names 'truth' and 'prediction' are not valid here.
MEAN_TRUTH_NAME = 'd18O_cel_mean' #@param
VAR_TRUTH_NAME = 'd18O_cel_variance' #@param
# Columns that temporary predictions are written to (for the RMSE
# calculation). The names 'truth' and 'prediction' are not valid here.
MEAN_PREDICTED_NAME = 'd18O_predicted_mean' #@param
VAR_PREDICTED_NAME = 'd18O_predicted_variance' #@param

In [None]:
# Access data stored on Google Drive.
# The mount is skipped when GDRIVE_BASE is empty. The import sits inside the
# branch, so google.colab is only needed when a mount is requested.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Import

In [None]:
import sys
# Clone the `test` branch of ddf_common_stub unless /content/ddf_common_stub
# already exists. The clone lands in the current working directory, so this
# assumes the notebook runs from /content.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the cloned stub importable.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# NOTE(review): presumably lets the user pick which revision of the shared ddf
# modules gets loaded - confirm against ddfimport.
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()

In [None]:
import importlib
import raster
import hypothesis
import dataset
import evaluation
# Re-execute the project modules so that source changes made after their first
# import in this session are picked up without restarting the runtime.
importlib.reload(raster)
importlib.reload(hypothesis)
importlib.reload(dataset)
importlib.reload(evaluation)

# Isoscape: Calculate RMSE for Oxygen



In [None]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import dataset

In [None]:
# Configure the base paths on the raster module before any raster or sample
# file is resolved through it.
# NOTE(review): presumably each *_BASE is appended to raster.GDRIVE_BASE by the
# raster path helpers - confirm in raster.py.
raster.RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
raster.SAMPLE_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_sample_data/" #@param
raster.TEST_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_test_data/" #@param
raster.ANIMATIONS_BASE = "/MyDrive/amazon_rainforest_files/amazon_animations/" #@param
# Same literal as the GDRIVE_BASE notebook parameter; keep the two in sync.
raster.GDRIVE_BASE = "/content/gdrive" #@param

In [None]:
# Load the oxygen mean and variance isoscapes. When both parameters name the
# same file, the means come from band 0 and the variances from band 1 of that
# file; otherwise band 0 is read from each file.
oxygen_shares_one_file = (
    ISOSCAPE_OXYGEN_MEANS_FILENAME == ISOSCAPE_OXYGEN_VARS_FILENAME)
oxygen_vars_band = 1 if oxygen_shares_one_file else 0

oxygen_means_isoscape = raster.load_raster(
    raster.get_raster_path(ISOSCAPE_OXYGEN_MEANS_FILENAME),
    use_only_band_index=0)
oxygen_vars_isoscape = raster.load_raster(
    raster.get_raster_path(ISOSCAPE_OXYGEN_VARS_FILENAME),
    use_only_band_index=oxygen_vars_band)

In [None]:
# Read the evaluation samples, using the first CSV column as the index, then
# preview the first rows.
test_set_path = raster.get_sample_db_path(TEST_SET_FILENAME)
eval_dataset = pd.read_csv(test_set_path, index_col=0)
eval_dataset.head()

In [None]:
import evaluation

In [None]:
# Score the oxygen isoscapes against the evaluation set. The four column names
# are passed positionally in the order: truth mean, truth variance, predicted
# mean, predicted variance.
rmse_column_names = (
    MEAN_TRUTH_NAME,
    VAR_TRUTH_NAME,
    MEAN_PREDICTED_NAME,
    VAR_PREDICTED_NAME,
)
mean_rmse, var_rmse, overall_rmse = evaluation.calculate_rmse(
    eval_dataset,
    oxygen_means_isoscape,
    oxygen_vars_isoscape,
    *rmse_column_names)

In [None]:
# Report the three RMSE figures, one per line.
for rmse_label, rmse_value in (
    ("RMSE of Means:", mean_rmse),
    ("RMSE of Vars:", var_rmse),
    ("Overall RMSE:", overall_rmse),
):
  print(rmse_label, rmse_value)

# Get Predictions

In [None]:
# Label every row of the evaluation set as not fraudulent.
eval_dataset['fraud'] = False

In [None]:
# Give every row the same sample count; this column's name is later passed to
# hypothesis.get_predictions_grouped.
# NOTE(review): 5 is hard-coded - confirm it matches the real number of
# measurements behind each grouped row.
eval_dataset['d18O_cel_count'] = 5

In [None]:
# Compute grouped predictions for the evaluation set against the oxygen
# isoscapes. Each list holds one entry per isotope; only oxygen is used here.
# NOTE(review): the meaning of the trailing 5 is defined by
# hypothesis.get_predictions_grouped and is not visible in this notebook.
grouped_mean_columns = ['d18O_cel_mean']
grouped_variance_columns = ['d18O_cel_variance']
grouped_count_columns = ['d18O_cel_count']

inferences_df = hypothesis.get_predictions_grouped(
    eval_dataset,
    grouped_mean_columns,
    grouped_variance_columns,
    grouped_count_columns,
    [oxygen_means_isoscape],
    [oxygen_vars_isoscape],
    5)

In [None]:
# Print the mean, standard deviation, maximum and minimum of the predicted
# variance, one value per line.
predicted_variance = inferences_df['d18O_predicted_variance']
for summary_stat in (
    predicted_variance.mean,
    predicted_variance.std,
    predicted_variance.max,
    predicted_variance.min,
):
  print(summary_stat())

In [None]:
# Rows missing either the measured or the predicted variance cannot be scored,
# so they are dropped from inferences_df in place.
inferences_df.dropna(subset=['d18O_cel_variance', 'd18O_predicted_variance'], inplace=True)

# RMSE between the measured and the predicted variance. The square root is
# taken explicitly instead of passing `squared=False`, because that argument
# was deprecated in scikit-learn 1.4 and is slated for removal; this form works
# on both old and new releases.
mean_squared_error(
    inferences_df['d18O_cel_variance'],
    inferences_df['d18O_predicted_variance'],
) ** 0.5

In [None]:
# Write the inferences to a CSV at the location resolved by
# raster.get_sample_db_path.
inferences_csv_path = raster.get_sample_db_path('overall_low_res_inferences.csv')
inferences_df.to_csv(inferences_csv_path)

# Plot isoscapes

In [None]:
import matplotlib.pyplot as plt

# Plot the oxygen means isoscape. Band 0 is used when the masked image has more
# than two dimensions; otherwise band index -1 is passed to raster.plot_band.
means_band_index = (
    0 if len(oxygen_means_isoscape.masked_image.shape) > 2 else -1)
raster.plot_band(oxygen_means_isoscape, means_band_index)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

In [None]:
# Plot the oxygen variance isoscape. The band index is derived from the
# variance raster's own shape, so the choice stays correct even when the means
# and variance rasters differ in dimensionality. Band 0 is used when the masked
# image has more than two dimensions; otherwise band index -1 is passed to
# raster.plot_band.
vars_band_index = (
    0 if len(oxygen_vars_isoscape.masked_image.shape) > 2 else -1)
raster.plot_band(oxygen_vars_isoscape, vars_band_index)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

# Fraud Detection Hypothesis Test
Combines the p-values of each element specified below and classifies a sample as fraudulent based on the resulting combined p-value.

Creating fraudulent samples

In [None]:
# Display the (rows, columns) size of the evaluation set.
eval_dataset.shape

In [None]:
# Isotope configuration for the fraud-detection test. Only oxygen is active;
# the commented-out entries show the three-isotope variant (oxygen, nitrogen,
# carbon).
# NOTE(review): the lists below are presumably parallel (entry i of each
# belongs to the same isotope) - keep them the same length and order.
elements = ['d18O_cel']
isotope_column_names = ['d18O_cel']
# elements = ['d18O_cel', 'd15N_wood', 'd13C_wood']
# isotope_column_names = ['d18O_cel', 'd15N_wood', 'd13C_wood']
# Mean isoscape per isotope.
mean_isoscapes = [
    oxygen_means_isoscape,
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_NITROGEN_MEANS_FILENAME), use_only_band_index=0),
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_CARBON_FILENAME), use_only_band_index=0),
]
# Variance isoscape per isotope. In the commented-out variant the carbon
# variances come from band 1 of the same file that supplies the carbon means.
vars_isoscapes = [
    oxygen_vars_isoscape,
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_NITROGEN_VARS_FILENAME), use_only_band_index=0),
    # raster.load_raster(
    #     raster.get_raster_path(ISOSCAPE_CARBON_FILENAME), use_only_band_index=1),
]

In [None]:
# Parameters for creation of fraudulent samples, passed to
# dataset.create_fraudulent_samples.
# NOTE(review): the plot titles label the maximum fraud radius in km; the other
# radii presumably use the same unit - confirm in dataset.py.
max_trusted_radius = 0.1
max_fraud_radius = 30
min_fraud_radius = 5

In [None]:
# Join the evaluation rows (sample code, coordinates and the two ground-truth
# columns) with the original results table on code and coordinates. The inner
# join keeps only rows present in both tables.
sample_join_keys = ['Code', 'lat', 'long']
eval_truth_columns = eval_dataset[
    sample_join_keys + [MEAN_TRUTH_NAME, VAR_TRUTH_NAME]]
original_results = pd.read_csv(
    raster.get_sample_db_path(ORIGINAL_SET_FILENAME), index_col=0)

real_samples_data = pd.merge(
    eval_truth_columns,
    original_results,
    how="inner",
    left_on=sample_join_keys,
    right_on=sample_join_keys)

In [None]:
import matplotlib.pyplot as plt

In [None]:
import random

# Seed Python's built-in `random` module with a fixed value so that anything
# drawing from it is repeatable across runs.
# NOTE(review): this only affects the fraudulent-sample generation if
# dataset.create_fraudulent_samples draws from `random`; NumPy's generator is
# not seeded here - confirm.
random.seed(42)

In [None]:
# Generate one set of fraudulent samples per maximum fraud radius (6, 1006 and
# 2006) and plot each set against the real sample locations.
fake_samples = {}
# The loop variable is deliberately not named `max_fraud_radius`, so the sweep
# does not overwrite the parameter of that name defined with the other
# fraud-sample settings.
for radius in range(6, 3006, 1000):
  radius_fakes = dataset.create_fraudulent_samples(
      real_samples_data, mean_isoscapes, elements, max_trusted_radius, radius, min_fraud_radius
  )
  fake_samples[radius] = radius_fakes

  # Low alpha keeps overlapping points distinguishable.
  plt.scatter(real_samples_data['long'], real_samples_data['lat'], alpha=0.1, label="real")
  plt.scatter(radius_fakes['long'], radius_fakes['lat'], alpha=0.1, label="fake")
  plt.xlabel("Longitude")
  plt.ylabel("Latitude")
  plt.legend()
  plt.title(f"Fake coordinates generated at {radius}km max fraud radius with real samples")
  plt.show()

In [None]:
# Display the maximum fraud radii for which fake samples were generated.
fake_samples.keys()

Combine the fraudulent and real samples in a single DataFrame, distinguished by the 'fraud' column.

In [None]:
# Keep the sample code, the coordinates and the isotope measurement columns of
# the real samples, and label every row as not fraudulent.
real_sample_columns = ['Code', 'lat', 'long', *elements]
real = real_samples_data[real_sample_columns].assign(fraud=False)

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

import numpy as np

In [None]:
# For every maximum fraud radius: score the real and fake samples with the
# hypothesis test, plot the precision-recall curve and record its AUC.
auc_scores = {}

for radius, fake_sample in fake_samples.items():
  # Stack the real and fake rows into one table with a fresh index.
  # pd.concat is used because DataFrame.append was removed in pandas 2.0.
  test_dataset = pd.concat([real, fake_sample], ignore_index=True)

  predictions = hypothesis.get_predictions(test_dataset,
    isotope_column_names,
    mean_isoscapes,
    vars_isoscapes,
    5)

  # Rows without a label or without a p-value cannot be scored.
  predictions.dropna(subset=['fraud', 'fraud_p_value'], inplace=True)

  y_true = predictions['fraud']
  # Fraud p value is lower the more positive a prediction/label is.
  # Inverting it gives us the probability of positive label class (fraud).
  y_pred = 1 - predictions['fraud_p_value']

  precision, recall, thresholds = precision_recall_curve(y_true, y_pred)

  plt.plot(recall, precision, label="model")
  # NOTE(review): 0.5 is the no-skill precision only when real and fake rows
  # are equally numerous after the dropna above - confirm.
  plt.plot(np.linspace(0, 1, 100), np.ones(100) * 0.5, "--", label="random")
  plt.xlabel("Recall")
  plt.ylabel("Precision")
  plt.title(f"Precision-Recall curve with Max Fraud Radius of {radius} km")
  plt.legend()
  plt.show()

  auc_score = auc(recall, precision)
  print("AUC score:", auc_score)

  auc_scores[radius] = auc_score

In [None]:
# Plot the precision-recall AUC against the maximum fraud radius, in the order
# the radii were evaluated.
swept_radii = list(auc_scores)
swept_auc_values = [auc_scores[swept_radius] for swept_radius in swept_radii]
plt.plot(swept_radii, swept_auc_values)
plt.xlabel("Max radius of fraudulent samples")
plt.ylabel("AUC of PR")