<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/validation_pipeline_separate_rmses/validation_pipeline_plot_isoscape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation Pipeline

In [None]:
# Colab form parameters for the validation pipeline.
# DEBUG is only referenced by the commented-out debugger setup in the mount cell.
DEBUG = False #@param {type:"boolean"}
# Google Drive mount point; the mount step is skipped when this is empty.
GDRIVE_BASE = "/content/gdrive" #@param

# Isoscape raster filenames, resolved later through raster.get_raster_path().
# The carbon file is read twice in this notebook: band index 0 for the means
# and band index 1 for the variances.
# NOTE(review): the nitrogen "vars" filename contains "SD"; confirm whether the
# raster holds standard deviations rather than variances.
ISOSCAPE_OXYGEN_MEANS_FILENAME = "canonical/kriging_overall_means.tiff" #@param
ISOSCAPE_OXYGEN_VARS_FILENAME = "canonical/kriging_overall_vars.tiff" #@param
ISOSCAPE_CARBON_FILENAME = "iso_d13C_map_wood_stack.tiff" #@param
ISOSCAPE_NITROGEN_MEANS_FILENAME = "Raster_Brasil_krig_d15N.tiff" #@param
ISOSCAPE_NITROGEN_VARS_FILENAME = "Brasil_Raster_Krig_SD_d15N.tiff" #@param

# Sample CSVs, resolved later through raster.get_sample_db_path().
TEST_SET_FILENAME = 'canonical/uc_davis_no_partition_test_random_grouped.csv' #@param
ORIGINAL_SET_FILENAME = '2023_06_23_Results_Google.csv' #@param
# Columns to read ground-truth values from. The names 'truth' and 'prediction'
# must not be used here.
MEAN_TRUTH_NAME = 'd18O_cel_mean' #@param
VAR_TRUTH_NAME = 'd18O_cel_variance' #@param
# Columns to write temporary predictions to (for the RMSE calculation).
# The names 'truth' and 'prediction' must not be used here either.
MEAN_PREDICTED_NAME = 'd18O_predicted_mean' #@param
VAR_PREDICTED_NAME = 'd18O_predicted_variance' #@param

In [None]:
# Access data stored on Google Drive.
# The mount is skipped entirely when GDRIVE_BASE is empty.
if GDRIVE_BASE:
    # google.colab is imported only when a mount is actually requested.
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Optional interactive-debugger setup, currently disabled:
# if DEBUG:
#     %pip install -Uqq ipdb
#     import ipdb
#     %pdb on

# Import

In [None]:
import sys
# Clone the `test` branch of ddf_common_stub unless /content/ddf_common_stub
# already exists.
# NOTE(review): `git clone` writes into the current working directory, so this
# assumes the notebook runs from /content — confirm.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the cloned stub importable before importing from it.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# NOTE(review): presumably shows a widget for choosing how the shared ddf code
# is imported — confirm against ddfimport.
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()

In [None]:
import importlib
import raster
import hypothesis
import dataset
import evaluation
# Re-execute each project module so that source changes made after the first
# import are picked up without restarting the kernel.
importlib.reload(raster)
importlib.reload(hypothesis)
importlib.reload(dataset)
importlib.reload(evaluation)

# Isoscape: Calculate RMSE for Oxygen



In [None]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import dataset

In [None]:
# Required to both import raster and read GDrive files.
# These assignments set module-level attributes on `raster`; the path helpers
# used below (raster.get_raster_path, raster.get_sample_db_path) presumably
# build on them — confirm against raster.
raster.RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
raster.SAMPLE_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_sample_data/" #@param
raster.TEST_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_test_data/" #@param
raster.ANIMATIONS_BASE = "/MyDrive/amazon_rainforest_files/amazon_animations/" #@param
# NOTE(review): duplicates the GDRIVE_BASE form parameter defined in the first
# cell; the two values must be kept in sync by hand.
raster.GDRIVE_BASE = "/content/gdrive" #@param

In [None]:
# Load band index 0 of the oxygen means raster and of the oxygen variances
# raster, in that order.
oxygen_means_isoscape, oxygen_vars_isoscape = (
    raster.load_raster(raster.get_raster_path(filename), use_only_band_index=0)
    for filename in (ISOSCAPE_OXYGEN_MEANS_FILENAME, ISOSCAPE_OXYGEN_VARS_FILENAME)
)

In [None]:
# Read the test-set CSV, using its first column as the DataFrame index.
test_set_path = raster.get_sample_db_path(TEST_SET_FILENAME)
eval_dataset = pd.read_csv(test_set_path, index_col=0)
# Show the first rows as the cell output.
eval_dataset.head()

In [None]:
# Display the (rows, columns) shape of the test set as the cell output.
eval_dataset.shape

In [None]:
# Display the resolved path of the test-set CSV, to check which file was read.
raster.get_sample_db_path(TEST_SET_FILENAME)

In [None]:
import evaluation

In [None]:
# Score the oxygen isoscape against the test set. The truth column names are
# passed first, followed by the column names used for the predictions.
rmse_column_names = (
    MEAN_TRUTH_NAME,
    VAR_TRUTH_NAME,
    MEAN_PREDICTED_NAME,
    VAR_PREDICTED_NAME,
)
mean_rmse, var_rmse, overall_rmse = evaluation.calculate_rmse(
    eval_dataset, oxygen_means_isoscape, oxygen_vars_isoscape, *rmse_column_names)

In [None]:
# Report the three RMSE figures, one per line.
for rmse_label, rmse_value in (
    ("RMSE of Means:", mean_rmse),
    ("RMSE of Vars:", var_rmse),
    ("Overall RMSE:", overall_rmse),
):
  print(rmse_label, rmse_value)

In [None]:
# Quietly install the geospatial and plotting packages imported by the next cell.
!pip install earthengine-api geemap geopandas geobr rtree pyproj rasterio contextily descartes -q

In [None]:
## Import packages
import rasterio as rio
import rasterio.mask
from rasterio.plot import show
from rasterio.transform import Affine
import contextily as cx
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import geemap
import json
import seaborn as sns
from shapely.geometry import Point
import os

In [None]:
# NOTE(review): raster.GDRIVE_BASE is assigned the same literal in an earlier
# cell; this repeat only matters if the cells are run out of order.
raster.GDRIVE_BASE = "/content/gdrive" #@param
# Root folder of the project files on the mounted Drive.
PROJECT_BASE = os.path.join(raster.GDRIVE_BASE, "MyDrive/amazon_rainforest_files")
# Biome shapefile path, relative to PROJECT_BASE.
BIOME_PATH = "christian_files/lm_bioma_250.shp" #@param

In [None]:
# Read the biome shapefile into a GeoDataFrame; its geometries are what
# display_isoscape_single_band uses to crop the rasters.
biome_path = os.path.join(PROJECT_BASE, BIOME_PATH)
gdf_biome = gpd.read_file(biome_path)

In [None]:
def display_isoscape_two_bands(means_filename, gdf_biome):
  '''
  Display the first two bands of an isoscape raster side by side.

  Band 1 is titled as the mean band and band 2 as the variance band. No
  biome masking is applied by this function.

  Args:
    means_filename: Path to a raster readable by rasterio. Bands 1 and 2 are
      both read, so the file must contain at least two bands.
    gdf_biome: Unused. Kept so existing callers, which pass the biome
      GeoDataFrame, keep working.
  '''
  # Read both bands inside a context manager so the dataset handle is always
  # closed, even when a band read fails.
  with rasterio.open(means_filename) as src:
    means_band = src.read(1)
    vars_band = src.read(2)

  fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 20))
  ax1.imshow(means_band)
  ax1.set_title("Mean Isotope Isoscape Band")

  ax2.imshow(vars_band)
  ax2.set_title("Variance Isotope Isoscape Band")
  plt.show()

In [None]:
def _read_masked_raster(filename, geometries):
  '''Open the raster at `filename`, crop it to `geometries`, and close it.'''
  # The context manager releases the dataset handle once the masked pixels
  # have been read into memory.
  with rasterio.open(filename) as src:
    return rasterio.mask.mask(src, geometries, crop = True)


def _show_band_with_colorbar(fig, ax, image, transform, title):
  '''Draw one masked raster image on `ax` with a colorbar and a title.'''
  shown_axes = show(image, ax = ax, transform = transform, cmap = "RdYlGn")
  # rasterio's show() returns the axes; the image it just drew is the first
  # image on those axes and is what the colorbar is attached to.
  fig.colorbar(shown_axes.get_images()[0], ax=ax, shrink=0.3)
  ax.set_title(title)


def display_isoscape_single_band(means_filename, vars_filename, gdf_biome):
  '''
  Display a mean isoscape raster and a variance isoscape raster side by
  side, each cropped to the gdf_biome geometry.

  Args:
    means_filename: Path to the raster holding the mean isoscape.
    vars_filename: Path to the raster holding the variance isoscape.
    gdf_biome: GeoDataFrame whose geometries are used to mask and crop both
      rasters.
  '''
  geometries = gdf_biome.geometry.values

  # Read (and close) both rasters before creating the figure, so a bad path
  # does not leave an empty figure behind.
  means_image, means_transform = _read_masked_raster(means_filename, geometries)
  vars_image, vars_transform = _read_masked_raster(vars_filename, geometries)

  fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 20))
  _show_band_with_colorbar(
      fig, ax1, means_image, means_transform, "Mean Isotope Isoscape Band")
  _show_band_with_colorbar(
      fig, ax2, vars_image, vars_transform, "Variance Isotope Isoscape Band")
  plt.show()

In [None]:
# display_isoscape_single_band(raster.get_raster_path(ISOSCAPE_OXYGEN_MEANS_FILENAME),
#                              raster.get_raster_path(ISOSCAPE_OXYGEN_VARS_FILENAME),
#                              gdf_biome)

In [None]:
# Plot bands 1 and 2 of the oxygen means raster side by side.
# NOTE(review): the oxygen variances are loaded from a separate file elsewhere
# in this notebook; confirm the means file really has a second band, otherwise
# the commented-out display_isoscape_single_band call is the one to use.
display_isoscape_two_bands(raster.get_raster_path(ISOSCAPE_OXYGEN_MEANS_FILENAME),
                           gdf_biome)

# Fraud Detection Hypothesis Test
Combines the p-values of each element specified below and classifies a sample as fraudulent based on the resulting combined p-value.

Creating fraudulent samples

In [None]:
# Display the test-set shape again before building the fraud dataset.
eval_dataset.shape

In [None]:
# Isotope columns under test. `isotope_column_names` holds the same names as
# `elements`, as a separate list object.
elements = ['d18O_cel', 'd15N_wood', 'd13C_wood']
isotope_column_names = list(elements)


def _load_isoscape_band(filename, band_index):
  '''Load one band of the named raster file.'''
  return raster.load_raster(
      raster.get_raster_path(filename), use_only_band_index=band_index)


# Mean rasters, ordered to match `elements` (oxygen, nitrogen, carbon).
mean_isoscapes = [
    oxygen_means_isoscape,
    _load_isoscape_band(ISOSCAPE_NITROGEN_MEANS_FILENAME, 0),
    _load_isoscape_band(ISOSCAPE_CARBON_FILENAME, 0),
]
# Variance rasters in the same order.
# NOTE(review): the nitrogen filename contains "SD"; confirm it holds
# variances rather than standard deviations.
vars_isoscapes = [
    oxygen_vars_isoscape,
    _load_isoscape_band(ISOSCAPE_NITROGEN_VARS_FILENAME, 0),
    # Carbon variance comes from band index 1 of the file used for its mean.
    _load_isoscape_band(ISOSCAPE_CARBON_FILENAME, 1),
]

In [None]:
# Parameters for creation of fraudulent samples.
max_trusted_radius = 0.1
max_fraud_radius = 30
min_fraud_radius = 5

In [None]:
# Inner-join the test rows with the original results file on sample code and
# coordinates, so only samples present in both are kept.
join_keys = ['Code', 'lat', 'long']
test_truth_columns = eval_dataset[join_keys + [MEAN_TRUTH_NAME, VAR_TRUTH_NAME]]
original_results = pd.read_csv(
    raster.get_sample_db_path(ORIGINAL_SET_FILENAME), index_col=0)
real_samples_data = pd.merge(
    test_truth_columns,
    original_results,
    how="inner",
    left_on=join_keys,
    right_on=join_keys)

In [None]:
# Generate fraudulent counterparts of the real samples. The three radius
# parameters are passed in the order trusted, max fraud, min fraud.
fraud_radii = (max_trusted_radius, max_fraud_radius, min_fraud_radius)
fake_sample = dataset.create_fraudulent_samples(
    real_samples_data,
    mean_isoscapes,
    elements,
    *fraud_radii,
)

Combine the fraudulent and real samples into a single DataFrame, distinguished by the 'fraud' column

In [None]:
# Keep the identifying columns plus the isotope columns of the real samples,
# and label every row as not fraudulent.
real_columns = ['Code', 'lat', 'long'] + elements
real = real_samples_data[real_columns].assign(fraud=False)

In [None]:
# Stack the real and fraudulent samples row-wise into one DataFrame with a
# fresh 0..n-1 index. pd.concat is used because DataFrame.append was removed
# in pandas 2.0; a column-wise concat is not wanted here, since both frames
# describe samples with the same columns.
test_dataset = pd.concat([real, fake_sample], ignore_index=True)

Call the t-test function, which expects a DataFrame containing the 'fraud' column

In [None]:
# Passed to hypothesis.fraud_metrics and hypothesis.get_predictions below.
# NOTE(review): presumably the number of samples assumed per location by the
# hypothesis test — confirm against hypothesis.
sample_size_per_location = 5
# Default p-value threshold for classifying a sample as fraudulent.
p_value_target = 0.05

In [None]:
# Sanity check using the oxygen isoscape alone: at both extreme p-value
# targets (0.0 and 1.0) the reported accuracy must be exactly 0.5.
for extreme_p_value in (0.0, 1.0):
  extreme_metrics = hypothesis.fraud_metrics(
      test_dataset,
      [isotope_column_names[0]],
      [mean_isoscapes[0]],
      [vars_isoscapes[0]],
      sample_size_per_location,
      extreme_p_value)
  assert extreme_metrics.accuracy == 0.5

In [None]:
# The combined dataset must be balanced: as many fraudulent rows as real rows.
fraud_flags = test_dataset['fraud']
fraud_row_count = len(test_dataset[fraud_flags == True])
real_row_count = len(test_dataset[fraud_flags == False])
assert fraud_row_count == real_row_count

In [None]:
import numpy as np

# Sweep 1000 evenly spaced p-value thresholds in [0, 1] and record the
# classification accuracy obtained at each one, using all isotopes.
p_values = np.linspace(0, 1, 1000)

# The sweep variable is deliberately not called `p_value_target`, so the
# notebook-level parameter of that name keeps its configured value.
accuracies = [
    hypothesis.fraud_metrics(test_dataset,
                             isotope_column_names,
                             mean_isoscapes,
                             vars_isoscapes,
                             sample_size_per_location,
                             candidate_p_value).accuracy
    for candidate_p_value in p_values
]

In [None]:
# Accuracy Graph: accuracy as a function of the p-value threshold.
import matplotlib.pyplot as plt

plt.plot(p_values, accuracies)
# Label the current axes in a single call.
plt.gca().set(xlabel="p-value", ylabel="Accuracy", title="Accuracy Graph")
plt.show()

In [None]:
import numpy as np

# Get predictions for the combined dataset across all isotopes; the result is
# indexed by column name below ('fraud', 'fraud_p_value').
prediction_inputs = (isotope_column_names, mean_isoscapes, vars_isoscapes)
predictions = hypothesis.get_predictions(
    test_dataset, *prediction_inputs, sample_size_per_location)

In [None]:
# Ground-truth labels: True marks a fraudulent sample.
y_true = predictions['fraud']
# A lower fraud p-value means a more confident fraud call, so the score for
# the positive (fraud) class is taken as one minus the p-value.
fraud_p_values = predictions['fraud_p_value']
y_pred = 1 - fraud_p_values

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

precision, recall, thresholds = precision_recall_curve(y_true, y_pred)

# Model curve, plus a flat 0.5-precision line drawn as the "random" baseline.
baseline_recall = np.linspace(0, 1, 100)
baseline_precision = np.full(100, 0.5)
plt.plot(recall, precision, label="model")
plt.plot(baseline_recall, baseline_precision, "--", label="random")
plt.gca().set(xlabel="Recall", ylabel="Precision", title="Precision-Recall curve")
plt.legend()
plt.show()