<a href="https://colab.research.google.com/github/tnc-br/ddf_common/blob/less-fake/dataset_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Imports and modules.
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

rc('animation', html='jshtml')

import sys
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
sys.path.append("/content/ddf_common_stub/")
import ddfimport
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m T

In [2]:
import dataset
import raster
import importlib
importlib.reload(dataset)

<module 'dataset' from '/content/gdrive/MyDrive/nudge/ddf_common/dataset.py'>

# preprocess_sample_data Tests

In [3]:
# pandas.util.testing is deprecated and was removed in pandas 2.0;
# pandas.testing is the supported public location of assert_frame_equal.
from pandas.testing import assert_frame_equal

# Average and variance test
def average_variance_test():
  """Checks grouped count/mean/variance when a label value is NaN.

  Two of the three input rows share (lat, long) = (0.0, 3.0) and one of their
  `y` values is NaN. With keep_grouping=True the expected output has one row
  per (lat, long) group, and the NaN is expected to be left out of `y_count`
  and `y_mean` for that group.
  """
  raw_samples = pd.DataFrame({
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x": [3.0, 3.0, 3.0],
      "y": [8.0, np.nan, -3.0],
  })

  actual = dataset.preprocess_sample_data(
      raw_samples,
      ["lat", "long"],  # feature_columns
      ["x", "y"],       # label_columns
      ["lat", "long"],  # aggregate_columns
      True,             # keep_grouping
  )

  # Groups with a single usable value are expected to have NaN variance.
  expected = pd.DataFrame({
      "lat": [0.0, 1.0],
      "long": [3.0, 6.0],
      "x_count": [2, 1],
      "x_mean": [3.0, 3.0],
      "x_variance": [0.0, np.nan],
      "y_count": [1, 1],
      "y_mean": [8.0, -3.0],
      "y_variance": [np.nan, np.nan],
  })
  assert_frame_equal(expected, actual)

def average_variance_test_no_grouping():
  """Checks the per-row output produced when keep_grouping is False.

  The expected frame keeps all three input rows; rows that belong to the same
  (lat, long) group carry identical count/mean/variance values.
  """
  raw_samples = pd.DataFrame({
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x": [3.0, 3.0, 3.0],
      "y": [8.0, 0.0, -3.0],
  })

  actual = dataset.preprocess_sample_data(
      raw_samples,
      ["lat", "long"],  # feature_columns
      ["x", "y"],       # label_columns
      ["lat", "long"],  # aggregate_columns
      False,            # keep_grouping
  )

  # The first two rows share a group: y has mean 4.0 and sample variance 32.0.
  expected = pd.DataFrame({
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x_count": [2, 2, 1],
      "x_mean": [3.0, 3.0, 3.0],
      "x_variance": [0.0, 0.0, np.nan],
      "y_count": [2, 2, 1],
      "y_mean": [4.0, 4.0, -3.0],
      "y_variance": [32.0, 32.0, np.nan],
  })
  assert_frame_equal(expected, actual)

def average_variance_test_keep_nonnumerical_columns():
  """Checks that a non-numeric feature column survives the aggregation.

  `code` is a string feature that is not part of the aggregate columns. The
  expected frame keeps it after `lat`/`long`, holding the value of the first
  row of each group ("a" for the two-row group, "c" for the single row).
  """
  raw_samples = pd.DataFrame({
      "code": ["a", "b", "c"],
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x": [3.0, 3.0, 3.0],
      "y": [8.0, 0.0, -3.0],
  })

  actual = dataset.preprocess_sample_data(
      raw_samples,
      ["code", "lat", "long"],  # feature_columns
      ["x", "y"],               # label_columns
      ["lat", "long"],          # aggregate_columns
      True,                     # keep_grouping
  )

  expected = pd.DataFrame({
      "lat": [0.0, 1.0],
      "long": [3.0, 6.0],
      "code": ["a", "c"],
      "x_count": [2, 1],
      "x_mean": [3.0, 3.0],
      "x_variance": [0.0, np.nan],
      "y_count": [2, 1],
      "y_mean": [4.0, -3.0],
      "y_variance": [32.0, np.nan],
  })
  assert_frame_equal(expected, actual)

# Run the preprocess_sample_data tests defined in this cell. Each one calls
# assert_frame_equal on the expected and actual frames, so a mismatch raises
# and stops the cell.
average_variance_test()
average_variance_test_no_grouping()
average_variance_test_keep_nonnumerical_columns()

  from pandas.util.testing import assert_frame_equal
  means = grouped.mean().reset_index()
  variances = grouped.var().reset_index()
  means = grouped.mean().reset_index()
  variances = grouped.var().reset_index()


In [4]:
# @title Load example CSV
# Inline CSV text used as a reference-data fixture: xgb_equivalence() writes it
# to a scratch file and loads it back. The first line after the opening quotes
# is the header row. In these rows every isotope measurement column
# (d15N_wood through d18O_cel) is 0.
jun23_reference_data = """
ID,Cod Lab,Code,Species,Scientific_name,Genus,Family,Point,Origin,State,date sample havest,lat,long,d15N_wood,%N_wood,d13C_wood,%C_wood,d13C_cel,%C_cel,d18O_cel_CENA,d18O_cel_Davis,d18O_cel_analysis,d18O_cel,VPD,RH,PET,DEM,PA,Mean Annual Temperature,Mean Annual Precipitation,Iso_Oxi_Stack_mean_TERZER
1,ZAC 730,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,0,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
2,ZAC 731,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,25,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
3,ZAC 732,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,50,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
4,ZAC 733,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,75,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
5,ZAC 734,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,100,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
6,ZAC 735,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,0,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
7,ZAC 736,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,25,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
8,ZAC 737,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,50,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
9,ZAC 738,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,75,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
10,ZAC 739,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,100,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
11,ZAC 740,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,0,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
12,ZAC 741,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,25,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
13,ZAC 742,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,50,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
14,ZAC 743,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,75,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
15,ZAC 744,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,100,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
1094,ZAC 623,madsd10,ucuuba_puna,Iryanthera  laevis Markgr,iryanthera,myristicaceae,75,manicoré,amazonas,Mar-14,-6.009706576,-61.8686565,0,0,0,0,0,0,0,0,0,0,0.77083,0.79509,93.975,71,1004.47662,27.2,1996,-4.05694
1095,ZAC 624,madsd10,ucuuba_puna,Iryanthera  laevis Markgr,iryanthera,myristicaceae,100,manicoré,amazonas,Mar-14,-6.009706576,-61.8686565,0,0,0,0,0,0,0,0,0,0,0.77083,0.79509,93.975,71,1004.47662,27.2,1996,-4.05694
"""

In [5]:
def xgb_equivalence():
  """Checks `dataset.partitioned_reference_data` against a manual pipeline.

  Writes the inline `jun23_reference_data` CSV to a scratch file, loads it
  through `dataset.partitioned_reference_data`, then rebuilds the training
  partition by hand (rename, partition, select columns, drop missing labels,
  average per location, add raster features) and asserts that both training
  frames are equal once columns are sorted and the index is reset.

  Raises:
    AssertionError: if the two training frames differ.
  """
  with open(raster.get_sample_db_path("scratch_file.txt"), "w") as f:
    f.write(jun23_reference_data)

  pds1 = dataset.partitioned_reference_data("scratch_file.txt")

  # The equivalent...
  df = pd.read_csv(raster.get_sample_db_path("scratch_file.txt"),
    encoding="ISO-8859-1", sep=',')
  df = dataset.partition(df.rename(columns={'long': 'lon'})).train
  df = df[['Code', 'lat', 'lon', 'd18O_cel']]
  df = df[df['d18O_cel'].notna()]
  # numeric_only=True leaves the string 'Code' column out of the mean, which
  # is what older pandas did implicitly; pandas 2.x raises on it otherwise.
  # A single reset_index() already yields a fresh positional index.
  df = (df.groupby(['lat', 'lon'])
        .mean(numeric_only=True)
        .reset_index()
        .rename(columns={'d18O_cel': 'cellulose_oxygen_ratio'}))

  fdf = dataset.add_features_from_rasters(df, [raster.relative_humidity_geotiff(),
      raster.temperature_geotiff(),
      raster.vapor_pressure_deficit_geotiff(),
      raster.atmosphere_isoscape_geotiff()])

  # Compare independently of column order and index labels.
  fdf1 = fdf.sort_index(axis=1).reset_index(drop=True)
  fdf2 = pds1.train.sort_index(axis=1).reset_index(drop=True)

  assert_frame_equal(fdf1, fdf2)

# Invoke at cell level. Inside the function body this call would never be
# reached from the cell and would recurse without end if the function ran.
xgb_equivalence()

In [7]:
# Raster file names for the test isoscapes (Colab form parameters). The tests
# in this notebook pass TEST_ISOSCAPE_MEANS_FILENAME to
# raster.get_raster_path() before loading it with raster.load_raster().
TEST_ISOSCAPE_MEANS_FILENAME = "canonical/kriging_fixed_means.tiff" #@param
TEST_ISOSCAPE_VARS_FILENAME = "canonical/kriging_fixed_vars.tiff" #@param

In [8]:
import raster

In [9]:
def create_fraudulent_samples_test():
  """Smoke-tests create_fraudulent_samples with a single element.

  Asserts that the generated frame exposes the 'Code', 'lat', 'long' and
  'd18O_cel' columns and has exactly as many rows as the real sample frame.
  """
  # The range of expected values is around 25 for most locations here.
  # Sample "c" deliberately has fewer than 5 measurements (but more than 1):
  # such samples should still be considered.
  real_samples_data = pd.DataFrame({
      'Code': ["a"] * 5 + ["b"] * 5 + ["c"] * 3,
      'lat': [-2.499] * 5 + [-6.009706576] * 5 + [-5] * 3,
      'long': [-59.121] * 5 + [-61.8686565] * 5 + [-60] * 3,
      'd18O_cel': [-20.0, 5.0, 100.0, 5.0, 8.0,
                   5.0, 0.0, 3.0, -20.0, 25.0,
                   0.5, 0.5, 0.5],
  })

  isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)

  fake_sample = dataset.create_fraudulent_samples(
      real_samples_data,
      [isoscape],
      ['d18O_cel'],
      0.1,  # max_trusted_radius
      30,   # max_fraud_radius
      5,    # min_fraud_radius
  )

  for column in ('Code', 'lat', 'long', 'd18O_cel'):
    assert column in fake_sample.columns
  assert fake_sample.shape[0] == real_samples_data.shape[0]

create_fraudulent_samples_test()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [None]:
def create_fraudulent_samples_elements_fake_samples_per_sample_test():
  """Checks the row count when several fakes are requested per real sample.

  Calls create_fraudulent_samples with two elements and
  fake_samples_per_sample=3, then asserts the output has three times as many
  rows as the input and still carries the identifying and element columns.
  """
  # The range of expected values is around 25 for most locations here.
  # Sample "c" deliberately has fewer than 5 measurements (but more than 1):
  # such samples should still be considered.
  real_samples_data = pd.DataFrame({
      'Code': ["a"] * 5 + ["b"] * 5 + ["c"] * 3,
      'lat': [-2.499] * 5 + [-6.009706576] * 5 + [-5] * 3,
      'long': [-59.121] * 5 + [-61.8686565] * 5 + [-60] * 3,
      'd18O_cel': [-20.0, 5.0, 100.0, 5.0, 8.0,
                   5.0, 0.0, 3.0, -20.0, 25.0,
                   0.5, 0.5, 0.5],
      'd15N_wood': [8.0, 5.0, 0.0, 3.0, -20.0,
                    25.0, 0.5, 0.5, 0.5, -20.0,
                    5.0, 100.0, 5.0],
  })

  isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)
  fakes_per_real = 3

  fake_sample = dataset.create_fraudulent_samples(
      real_samples_data=real_samples_data,
      mean_isoscapes=[isoscape],
      elements=['d18O_cel', 'd15N_wood'],
      max_fraud_radius=30,
      trusted_buffer_radius=0.1,
      fake_samples_per_sample=fakes_per_real)

  assert fake_sample.shape[0] == real_samples_data.shape[0] * fakes_per_real
  for column in ('Code', 'lat', 'long', 'd18O_cel', 'd15N_wood'):
    assert column in fake_sample.columns


create_fraudulent_samples_elements_fake_samples_per_sample_test()

In [None]:
def create_fraudulent_samples_dropout_rate_test():
  """Checks the number of fake rows produced with sample_drop_rate=0.5.

  Calls create_fraudulent_samples with one element and a drop rate of 0.5,
  then asserts the output keeps the identifying and element columns and has
  exactly 8 rows.
  """
  # The range of expected values is around 25 for most locations here.
  # Sample "c" deliberately has fewer than 5 measurements (but more than 1):
  # such samples should still be considered.
  real_samples_data = pd.DataFrame({
      'Code': ["a"] * 5 + ["b"] * 5 + ["c"] * 3,
      'lat': [-2.499] * 5 + [-6.009706576] * 5 + [-5] * 3,
      'long': [-59.121] * 5 + [-61.8686565] * 5 + [-60] * 3,
      'd18O_cel': [-20.0, 5.0, 100.0, 5.0, 8.0,
                   5.0, 0.0, 3.0, -20.0, 25.0,
                   0.5, 0.5, 0.5],
  })

  isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)

  # NOTE(review): the trusted buffer receives the 5-degree value here, while
  # create_fraudulent_samples_elements_fake_samples_per_sample_test passes
  # 0.1 for the same argument -- confirm which one is intended.
  buffer_radius = 5

  fake_sample = dataset.create_fraudulent_samples(
      real_samples_data=real_samples_data,
      mean_isoscapes=[isoscape],
      elements=['d18O_cel'],
      max_fraud_radius=30,
      trusted_buffer_radius=buffer_radius,
      sample_drop_rate=0.5)

  for column in ('Code', 'lat', 'long', 'd18O_cel'):
    assert column in fake_sample.columns
  assert fake_sample.shape[0] == 8  # 1 sample with 5 data points, 1 with 3.

create_fraudulent_samples_dropout_rate_test()

In [None]:
def create_fraudulent_samples_elements_test():
  """Smoke-tests create_fraudulent_samples with two elements.

  Asserts that the generated frame has one row per real row and exposes the
  'Code', 'lat', 'long', 'd18O_cel' and 'd15N_wood' columns.
  """
  # The range of expected values is around 25 for most locations here.
  # Sample "c" deliberately has fewer than 5 measurements (but more than 1):
  # such samples should still be considered.
  real_samples_data = pd.DataFrame({
      'Code': ["a"] * 5 + ["b"] * 5 + ["c"] * 3,
      'lat': [-2.499] * 5 + [-6.009706576] * 5 + [-5] * 3,
      'long': [-59.121] * 5 + [-61.8686565] * 5 + [-60] * 3,
      'd18O_cel': [-20.0, 5.0, 100.0, 5.0, 8.0,
                   5.0, 0.0, 3.0, -20.0, 25.0,
                   0.5, 0.5, 0.5],
      'd15N_wood': [8.0, 5.0, 0.0, 3.0, -20.0,
                    25.0, 0.5, 0.5, 0.5, -20.0,
                    5.0, 100.0, 5.0],
  })

  isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)

  fake_sample = dataset.create_fraudulent_samples(
      real_samples_data,
      [isoscape],
      ['d18O_cel', 'd15N_wood'],
      0.1,  # max_trusted_radius
      30,   # max_fraud_radius
      5,    # min_fraud_radius
  )

  assert fake_sample.shape[0] == real_samples_data.shape[0]
  for column in ('Code', 'lat', 'long', 'd18O_cel', 'd15N_wood'):
    assert column in fake_sample.columns


create_fraudulent_samples_elements_test()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [11]:
def nudge_test():
  """Verifies that nudge_invalid_coords leaves every sample on a valid point.

  Before nudging, the first three coordinates are asserted to be invalid for
  the test isoscape and the last two valid. After
  `dataset.nudge_invalid_coords` runs, all five must be valid. The return
  value of the nudge call is ignored, so the check relies on the frame being
  updated in place.
  """
  samples = pd.DataFrame({
    'lat':  [-5,    -34,   -34.4, -5.499,  -6.499],
    'long': [-74.2, -74.6, -50,   -59.121, -59.121],
  })
  isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)

  def on_valid_point(row_label):
    # Looks up the row's current coordinates and asks the raster about them.
    return raster.is_valid_point(
        samples.loc[row_label, "lat"], samples.loc[row_label, "long"], isoscape)

  # Preconditions: rows 0-2 start out invalid, rows 3-4 start out valid.
  for row_label, starts_valid in enumerate([False, False, False, True, True]):
    assert bool(on_valid_point(row_label)) == starts_valid

  dataset.nudge_invalid_coords(samples, [isoscape])

  for row_label in samples.index:
    assert on_valid_point(row_label)

nudge_test()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [14]:
import unittest

def nudge_test_fails(max_degrees_deviation: int):
  """Nudges two invalid coordinates using the given deviation limit.

  Both points are first asserted to be invalid for the test isoscape, then
  `dataset.nudge_invalid_coords` is called with `max_degrees_deviation`
  forwarded as-is, so the caller controls whether the nudge can succeed.

  Args:
    max_degrees_deviation: limit forwarded to `dataset.nudge_invalid_coords`.

  Raises:
    Whatever `dataset.nudge_invalid_coords` raises when it cannot nudge a
    point within the limit.
  """
  real_samples_data = pd.DataFrame({
    'lat':  [-5,    -38],
    'long': [-78.2, -74.6],
  })

  mean_iso = raster.load_raster(raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME), use_only_band_index=0)
  assert(not raster.is_valid_point(real_samples_data.loc[0, "lat"], real_samples_data.loc[0, "long"], mean_iso))
  assert(not raster.is_valid_point(real_samples_data.loc[1, "lat"], real_samples_data.loc[1, "long"], mean_iso))

  # Forward the argument; a hard-coded limit would make every call behave
  # the same regardless of what the caller asked for.
  dataset.nudge_invalid_coords(
      real_samples_data, [mean_iso],
      max_degrees_deviation=max_degrees_deviation)

class NudgeTestFails(unittest.TestCase):
  """Checks the failure and success paths of nudge_invalid_coords."""

  def test(self, max_degrees_deviation: int, expect_exception: bool):
    """Runs nudge_test_fails and checks whether it raises as expected.

    Args:
      max_degrees_deviation: limit passed through to nudge_test_fails.
      expect_exception: when True, the nudge must raise an exception whose
        message mentions the failed nudge; when False, it must complete
        without raising.
    """
    if not expect_exception:
      # Success path: any exception propagates and fails the cell.
      nudge_test_fails(max_degrees_deviation)
      return

    with self.assertRaises(Exception) as context:
      nudge_test_fails(max_degrees_deviation)
    self.assertIn("Failed to nudge coordinates into valid space",
                  str(context.exception))

NudgeTestFails().test(max_degrees_deviation=1, expect_exception=True)
NudgeTestFails().test(max_degrees_deviation=10, expect_exception=False)


Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
