<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/validation_pipeline/validation_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation Pipeline

In [None]:
# Notebook configuration. A trailing `#@param` tag marks a value that Colab
# exposes as an editable form field.

# When True, the setup cell installs ipdb and switches on `%pdb`.
DEBUG = False #@param {type:"boolean"}
# Google Drive mount point; Drive is only mounted when this is non-empty.
GDRIVE_BASE = "/content/gdrive" #@param

# Raster files holding the isoscape means and variances under evaluation.
ISOSCAPE_MEANS_FILENAME = "xgb_means_oxygen_isoscape_ucd_42.tiff" #@param
ISOSCAPE_VARS_FILENAME = "xgb_variances_oxygen_isoscape_ucd_42.tiff" #@param
# Used in unit tests (generated from Kriging)
TEST_ISOSCAPE_FILENAME = "uc_davis_d18O_cel_kriging_means_2.tiff" #@param

# CSV of samples that the isoscape predictions are compared against.
TEST_SET_FILENAME = '2023_06_23_Results_Google.csv' #@param
# Columns of values to read ground truths from. Invalid values are 'truth'
# and 'prediction'.
MEAN_TRUTH_NAME = 'd18O_cel_mean' #@param
VAR_TRUTH_NAME = 'd18O_cel_variance' #@param
# Columns of values to write temporary predictions to (for RMSE calculation).
# Invalid values are 'truth' and 'prediction'.
MEAN_PREDICTED_NAME = 'd18O_predicted_mean' #@param
VAR_PREDICTED_NAME = 'd18O_predicted_variance' #@param

In [None]:
# Access data stored on Google Drive. The mount is skipped when GDRIVE_BASE is
# empty.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Debug mode: install ipdb and turn on IPython's `%pdb` so that an uncaught
# exception drops into the debugger.
if DEBUG:
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

# Import

In [None]:
import sys
# Clone the "test" branch of ddf_common_stub unless /content/ddf_common_stub
# already exists. The clone lands in the current working directory, so this
# assumes the notebook runs from /content.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the cloned helper importable.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# NOTE(review): presumably this displays a control for choosing which version
# of the shared ddf code gets imported — confirm against ddfimport.
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()

In [None]:
import importlib
import raster
import hypothesis
# Re-execute both modules so that a freshly selected or edited version is used
# without restarting the runtime.
importlib.reload(raster)
importlib.reload(hypothesis)

# Isoscape: Calculate RMSE

In [None]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import dataset

In [None]:
# Required to both import raster and read GDrive files
# These module-level settings are read by the raster module's path helpers
# (e.g. raster.get_raster_path / raster.get_sample_db_path used below).
# NOTE(review): the *_BASE folders look like paths relative to
# raster.GDRIVE_BASE — confirm against raster.py.
raster.RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
raster.SAMPLE_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_sample_data/" #@param
raster.TEST_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_test_data/" #@param
raster.ANIMATIONS_BASE = "/MyDrive/amazon_rainforest_files/amazon_animations/" #@param
# NOTE(review): duplicates the GDRIVE_BASE form field defined at the top of the
# notebook; the two values must be kept in sync by hand.
raster.GDRIVE_BASE = "/content/gdrive" #@param

In [None]:
def calculate_rmse(df, means_isoscape, vars_isoscape, mean_true_name, var_true_name, mean_pred_name, var_pred_name):
  '''
  Calculates the RMSE of isoscape predictions against the ground truth in df.

  For every row of df the isoscapes are sampled at (row['long'], row['lat'])
  and the results are written back into df as the columns mean_pred_name and
  var_pred_name (df is modified in place).

  Input:
  - df: pd.DataFrame with 'long' and 'lat' columns plus the two truth columns
  - means_isoscape: raster sampled for the predicted means
  - vars_isoscape: raster sampled for the predicted variances
  - mean_true_name, var_true_name: columns holding the ground truth
  - mean_pred_name, var_pred_name: columns the predictions are written to
  The four column names must be distinct from each other and must not be
  'truth' or 'prediction'.

  Output:
  - (mean_rmse, var_rmse, overall_rmse), where overall_rmse is the average of
    the other two values.

  Raises:
  - ValueError if the column names collide.
  '''
  # Make sure names do not collide.
  all_names = [mean_true_name, var_true_name, mean_pred_name, var_pred_name, 'truth', 'prediction']
  if len(all_names) != len(set(all_names)):
    raise ValueError(
        "Column names must be unique and differ from 'truth' and "
        "'prediction', got: {}".format(all_names[:4]))

  def sample_isoscape(isoscape):
    # month == -1 selects the yearly image of the raster.
    return df.apply(
        lambda row: raster.get_data_at_coords(isoscape, row['long'], row['lat'], -1),
        axis=1)

  df[mean_pred_name] = sample_isoscape(means_isoscape)
  df[var_pred_name] = sample_isoscape(vars_isoscape)

  # Take the square root ourselves: mean_squared_error's `squared=False`
  # argument was deprecated in scikit-learn 1.4 and removed in 1.6.
  mean_rmse = mean_squared_error(df[mean_true_name].values, df[mean_pred_name].values) ** 0.5
  var_rmse = mean_squared_error(df[var_true_name].values, df[var_pred_name].values) ** 0.5

  # The former two-output call (uniform average of the per-column RMSEs) is
  # exactly the mean of the two values above.
  overall_rmse = (mean_rmse + var_rmse) / 2

  return (mean_rmse, var_rmse, overall_rmse)

In [None]:
import pytest

def test_calculate_rmse():
  '''
  Checks calculate_rmse against fixed expected values.

  The Kriging test isoscape is loaded twice and serves as both the mean and the
  variance raster.
  '''
  mean_true_name = 'd18O_cel_mean'
  var_true_name = 'd18O_cel_var'
  mean_pred_name = 'd18O_cel_mean_pred'
  var_pred_name = 'd18O_cel_var_pred'
  # Not consumed by calculate_rmse; kept from the original fixture.
  truth_name = 'd18O_cel_truth'
  pred_name = 'd18O_cel_pred'

  test_means_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_FILENAME), use_only_band_index=0)
  test_vars_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_FILENAME), use_only_band_index=0)

  # Print the raster extent for reference when reading the test output.
  bounds = raster.get_extent(test_means_isoscape.gdal_dataset)
  print(bounds)

  df = pd.DataFrame({
      'long': [-70, -68],
      'lat': [-4, -3],
      mean_true_name: [0, 5],
      var_true_name: [1, 0.5],
  })

  mean_rmse, var_rmse, overall_rmse = calculate_rmse(
      df,
      test_means_isoscape,
      test_vars_isoscape,
      mean_true_name,
      var_true_name,
      mean_pred_name,
      var_pred_name)
  print(mean_rmse, var_rmse, overall_rmse)

  assert mean_rmse == pytest.approx(22.221530876037058)
  assert var_rmse == pytest.approx(23.85633857749663)
  assert overall_rmse == pytest.approx(23.038934726766843)

# Run the check immediately when the cell executes.
test_calculate_rmse()

In [None]:
# Load the isoscape rasters under evaluation (use_only_band_index=0).
means_isoscape = raster.load_raster(
    raster.get_raster_path(ISOSCAPE_MEANS_FILENAME),
    use_only_band_index=0,
)
vars_isoscape = raster.load_raster(
    raster.get_raster_path(ISOSCAPE_VARS_FILENAME),
    use_only_band_index=0,
)

In [None]:
# Read the evaluation samples; the first CSV column becomes the index.
eval_dataset = pd.read_csv(
    raster.get_sample_db_path(TEST_SET_FILENAME),
    index_col=0,
)
# Show the first rows as the cell output.
eval_dataset.head()

In [None]:
# Score the isoscapes against the evaluation set; this also adds the two
# prediction columns to eval_dataset.
mean_rmse, var_rmse, overall_rmse = calculate_rmse(
    eval_dataset,
    means_isoscape,
    vars_isoscape,
    MEAN_TRUTH_NAME,
    VAR_TRUTH_NAME,
    MEAN_PREDICTED_NAME,
    VAR_PREDICTED_NAME,
)

In [None]:
# Report the three RMSE figures, one per line.
for rmse_label, rmse_value in (
    ("RMSE of Means:", mean_rmse),
    ("RMSE of Vars:", var_rmse),
    ("Overall RMSE:", overall_rmse),
):
  print(rmse_label, rmse_value)

# TODO: Fraud Detection Hypothesis Test

Utility functions for looking up isoscape values at coordinates and for randomly sampling a point around a sample site

In [None]:
# TODO: verify whether coords_to_indices and get_data_at_coords in raster.py
# need the same changes as these local copies.
def coords_to_indices(bounds: "raster.Bounds", x: float, y: float):
  '''
  Converts a coordinate into (row, column) indices of the raster described by
  bounds.

  Input:
  - bounds: extent, pixel size and raster size of the raster
  - x: longitude
  - y: latitude

  Output:
  - (lat_idx, lon_idx): row counted from the top edge (maxy) and column counted
    from the left edge (minx), or (None, None) when (x, y) lies outside bounds.
  '''
  if x < bounds.minx or x > bounds.maxx or y < bounds.miny or y > bounds.maxy:
    return None, None

  lat_idx = bounds.raster_size_y - int(math.ceil((y - bounds.miny) / abs(bounds.pixel_size_y)))
  lon_idx = int((x - bounds.minx) / abs(bounds.pixel_size_x))

  # Points exactly on the bottom edge (y == miny) or right edge (x == maxx)
  # pass the bounds check above but would map one past the last row/column, so
  # clamp both indices into the valid range. The lower clamp guards against
  # float rounding producing -1, which would silently wrap around.
  # NOTE(review): assumes bounds exposes raster_size_x alongside raster_size_y
  # — confirm against raster.Bounds.
  lat_idx = min(max(lat_idx, 0), int(bounds.raster_size_y) - 1)
  lon_idx = min(max(lon_idx, 0), int(bounds.raster_size_x) - 1)

  return lat_idx, lon_idx

def get_data_at_coords(dataset: raster.AmazonGeoTiff, x: float, y: float, month: int) -> float:
  '''
  Reads the raster value at a coordinate.

  Input:
  - dataset: raster to read from
  - x: longitude
  - y: latitude
  - month: -1 reads from dataset.yearly_masked_image; any other value is used
    as the third index into dataset.masked_image

  Output:
  - the value at the pixel containing (x, y), or None when the coordinate lies
    outside the raster's extent.

  Raises:
  - ValueError if the pixel is masked.
  '''
  bounds = raster.get_extent(dataset.gdal_dataset)
  row_idx, col_idx = coords_to_indices(bounds, x, y)
  # Compare against None explicitly: index 0 is a valid row/column, so a
  # truthiness test would wrongly reject the pixel at (0, 0).
  if row_idx is None or col_idx is None:
    return None
  if month == -1:
    value = dataset.yearly_masked_image[row_idx, col_idx]
  else:
    value = dataset.masked_image[row_idx, col_idx, month]
  if np.ma.is_masked(value):
    raise ValueError("Coordinates are masked")
  return value

from geopy import distance
import numpy as np
import math
import random
import pytest
import matplotlib.pyplot as plt

def is_valid_point(lat: float, lon: float, reference_isocape: raster.AmazonGeoTiff):
  '''
  Returns True when reference_isocape holds a value at (lat, lon).

  The lookup uses month index 0. A point is invalid when it lies outside the
  raster (lookup returns None) or on a masked pixel (lookup raises ValueError).
  '''
  try:
    value = get_data_at_coords(reference_isocape, lon, lat, 0)
  except ValueError:
    # Masked pixel: there is no data here, so the point is not usable.
    return False
  # Test for None rather than truthiness so that a value of 0 still counts.
  return value is not None

# Returns the (latitude, longitude) of a random point at most max_distance_km
# away from (lat, lon). With edge_only=True the point is placed exactly
# max_distance_km away.
def random_nearby_point(lat, lon, max_distance_km, edge_only=False):
  # Random bearing out of the origin, in degrees:
  # 0 == North, 90 == East, 180 == South, 270 == West
  bearing_deg = 360 * random.random()

  if edge_only:
    radius_km = max_distance_km
  else:
    # Scaling by sqrt() spreads samples evenly over the disc; without it they
    # would cluster around the origin.
    radius_km = max_distance_km * math.sqrt(random.random())

  # WGS-84 is the most accurate ellipsoidal model of Earth, but we should double
  # check to make sure this matches the model used by our sample collectors.
  travel = distance.geodesic(ellipsoid='WGS-84', kilometers=radius_km)
  destination = travel.destination((lat, lon), bearing=bearing_deg)
  return destination.latitude, destination.longitude

# Returns True when (lat, lon) is closer than threshold_km (geodesic distance)
# to at least one entry of real_points. Every entry is unpacked as a
# (point, _) pair and only the point is used.
def is_nearby_real_point(lat, lon, real_points, threshold_km):
  candidate = (lat, lon)
  return any(
      distance.geodesic(candidate, known_point).km < threshold_km
      for known_point, _ in real_points)

Creating fraudulent samples

In [None]:
def create_fraudulent_samples(real_samples_data, mean_iso, element, max_trusted_radius, max_fraud_radius, min_fraud_radius):
    '''
    Creates fraudulent samples by pairing real isotope values with randomly
    chosen nearby coordinates. In the returned rows the 'fraud' column is always
    True; real samples are expected to be labelled fraud=False by the caller.

    For every real (lat, long) location with more than one sample, up to
    MAX_RANDOM_SAMPLE_ATTEMPTS random points within max_fraud_radius are tried.
    The first one that has a value in mean_iso and is not within
    min_fraud_radius of any real location is kept. A location for which no such
    point is found is skipped with a warning.

    Input:
    - real_samples_data: real samples. Rows with a missing `element` value are
      dropped from this DataFrame in place.
    - mean_iso: isoscape averages, used to check that a point has data
    - element: element e.g d18O_cel
    - max_trusted_radius: In km, the maximum distance from a real point where
      its value is still considered legitimate. Currently unused.
    - max_fraud_radius: In km, the maximum distance from a real point to
      randomly sample a fraudulent coordinate.
    - min_fraud_radius: In km, the minimum distance from any real point that a
      fraudulent coordinate must keep.
    Output:
    - pd.DataFrame with the columns Code, lat, long, <element> and fraud, one
      row per location that received a fraudulent coordinate. <element> holds
      the mean of the real samples at that location.
    Raises:
    - ValueError if max_fraud_radius <= min_fraud_radius.
    '''
    import warnings

    # Validate before touching the caller's DataFrame.
    if max_fraud_radius <= min_fraud_radius:
        raise ValueError("max_fraud_radius {} <= min_fraud_radius {}".format(
            max_fraud_radius, min_fraud_radius))

    # Max number of times to attempt to generate random coordinates.
    MAX_RANDOM_SAMPLE_ATTEMPTS = 1000

    # In place on purpose: later cells reuse real_samples_data as the set of
    # real samples and rely on the incomplete rows being gone.
    real_samples_data.dropna(subset=[element], how='all', inplace=True)

    real_samples = real_samples_data.groupby(['lat', 'long'])[element]

    fake_rows = []
    for coord, lab_samp in real_samples:
        # Locations with a single sample are not used.
        if lab_samp.size <= 1:
            continue

        for _ in range(MAX_RANDOM_SAMPLE_ATTEMPTS):
            lat, lon = random_nearby_point(coord[0], coord[1], max_fraud_radius)
            if (is_valid_point(lat, lon, mean_iso) and
                    not is_nearby_real_point(lat, lon, real_samples, min_fraud_radius)):
                # Only the accepted candidate is recorded; rejected attempts
                # must not end up in the output.
                fake_rows.append({
                    # Hardcoded the code as mad5000 for fake coordinates
                    'Code': 'mad5000',
                    'lat': lat,
                    'long': lon,
                    element: lab_samp.mean(),
                    'fraud': True,
                })
                break
        else:
            warnings.warn(
                "No valid fraudulent coordinate found near {} after {} attempts; "
                "location skipped".format(coord, MAX_RANDOM_SAMPLE_ATTEMPTS),
                stacklevel=2)

    return pd.DataFrame(fake_rows, columns=['Code', 'lat', 'long', element, 'fraud'])

In [None]:
# Load the real samples and derive one fraudulent coordinate set from them.
real_samples_data = pd.read_csv(
    raster.get_sample_db_path(TEST_SET_FILENAME),
    encoding="ISO-8859-1",
    sep=',',
)
mean_iso = means_isoscape
element = 'd18O_cel'


# Radii are in km.
fake_sample = create_fraudulent_samples(
    real_samples_data,
    mean_iso,
    element,
    max_trusted_radius=0.1,
    max_fraud_radius=30,
    min_fraud_radius=5,
)

Combine the fraudulent and real samples into a single DataFrame; the 'fraud' column tells them apart

In [None]:
# Keep only the columns shared with fake_sample and label every real row.
real = real_samples_data[['Code','lat','long',element]]

real = real.assign(fraud=False)

# Stack real and fraudulent rows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
# previous column-wise concat (axis=1) was overwritten immediately and has been
# dropped.
test_dataset = pd.concat([real, fake_sample], ignore_index=True)

Call the t-test function, which expects the DataFrame containing the 'fraud' column

In [None]:
# Inputs for the fraud-detection evaluation.
isotope_column_name = 'd18O_cel'
test_means_isoscape = means_isoscape
test_vars_isoscape = vars_isoscape
sample_size_per_location = 5
p_value_target = 0.05

accuracy, precision, recall = hypothesis.fraud_metrics(
    test_dataset,
    isotope_column_name,
    test_means_isoscape,
    test_vars_isoscape,
    sample_size_per_location,
    p_value_target,
)

# Report each metric on its own line.
for metric_label, metric_value in (
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
):
  print(metric_label, metric_value)
