<a href="https://colab.research.google.com/github/tnc-br/ddf_common/blob/grouped_support/hypothesis_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Isoscape raster filenames used as fixtures by the tests in this notebook.
# Each name is resolved with raster.get_raster_path() before being loaded.
# NOTE(review): every *_VARS_FILENAME currently points at the same file as its
# *_MEANS_FILENAME counterpart. The expected values asserted further down were
# recorded with this configuration, so confirm whether this is intentional
# before switching to dedicated variance rasters.
TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME = "canonical/kriging_random_means.tiff" #@param
TEST_OXYGEN_ISOSCAPE_VARS_FILENAME = "canonical/kriging_random_means.tiff" #@param

TEST_NITROGEN_ISOSCAPE_MEANS_FILENAME = "plant_nitrogen_isoscape.tiff" #@param
TEST_NITROGEN_ISOSCAPE_VARS_FILENAME = "plant_nitrogen_isoscape.tiff" #@param

In [2]:
#@title Imports and modules.
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

# Set matplotlib's `animation.html` rc parameter to 'jshtml'.
rc('animation', html='jshtml')

import sys
# Clone the `test` branch of ddf_common_stub unless /content/ddf_common_stub
# already exists, then add that directory to the import path.
# NOTE(review): the clone lands in the current working directory, so this
# assumes the notebook is executed from /content — confirm.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# Presumably renders the interactive source-control form shown in this cell's
# output; the modules imported later (hypothesis, raster) appear to depend on
# it having been filled in — TODO confirm against ddfimport.
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()

Cloning into 'ddf_common_stub'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 11 (delta 4), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (11/11), 5.50 KiB | 1.38 MiB/s, done.
Resolving deltas: 100% (4/4), done.


interactive(children=(Text(value='', description='Email', placeholder='Enter email'), Text(value='', descripti…

# Tests

In [4]:
import pytest
import hypothesis
import raster

In [5]:
import importlib
import hypothesis
# Re-import the module so the kernel uses its current on-disk contents
# (presumably to pick up edits to hypothesis.py without restarting the kernel).
importlib.reload(hypothesis)

<module 'hypothesis' from '/content/gdrive/MyDrive/grouped_support/ddf_common/hypothesis.py'>

In [6]:
def sample_ttest_test():
  """Checks hypothesis.sample_ttest for a single sample location.

  Runs the t-test at (-71, -5.5) against the oxygen test isoscapes, prints the
  resulting p-value, and asserts the coordinates, p-value and p-value
  threshold carried by the returned object.
  """
  p_value_target = 0.05
  sample_size_per_location = 2
  longitude, latitude = -71, -5.5

  # One isotope, described by its mean / variance / measurement count.
  isotope_means = [-20.0]
  isotope_variances = [0.5]
  isotope_counts = [2]

  means_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)
  vars_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME),
      use_only_band_index=0)

  result = hypothesis.sample_ttest(
      longitude,
      latitude,
      isotope_means,
      isotope_variances,
      isotope_counts,
      [means_isoscape],
      [vars_isoscape],
      sample_size_per_location,
      p_value_target)

  print(result.p_value)
  assert result.longitude == pytest.approx(-71)
  assert result.latitude == pytest.approx(-5.5)
  assert result.p_value == pytest.approx(0.0458605831575194)
  assert result.p_value_threshold == pytest.approx(0.05)

sample_ttest_test()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
0.0458605831575194


In [7]:
def fraud_metrics_test():
  """Checks accuracy, precision and recall from hypothesis.fraud_metrics.

  Uses eight labelled locations with a single isotope column (d18O_cel),
  each supplied as count / mean / variance, and the oxygen test isoscapes.
  """
  columns = {
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [2, 2, 2, 2, 2, 2, 2, 2],
      'd18O_cel_mean': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
      'd18O_cel_variance': [0.5, 0.4, 0.3, 0.2, 0.1, 0.8, 0.7, 0.6],
  }
  grouped_samples = pd.DataFrame(columns)

  means_rasters = [
      raster.load_raster(
          raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME),
          use_only_band_index=0)]
  vars_rasters = [
      raster.load_raster(
          raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME),
          use_only_band_index=0)]

  metrics = hypothesis.fraud_metrics(
      sample_data=grouped_samples,
      isotope_column_names=['d18O_cel'],
      means_isoscapes=means_rasters,
      variances_isoscapes=vars_rasters,
      sample_size_per_location=2,
      p_value_target=0.05)

  assert metrics.accuracy == pytest.approx(0.75)
  assert metrics.precision == pytest.approx(1.0)
  assert metrics.recall == pytest.approx(0.5)

fraud_metrics_test()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [11]:
def fraud_metrics_combine_p_values_test():
  """Checks hypothesis.fraud_metrics when two isotopes are supplied.

  Each location carries count / mean / variance columns for both d18O_cel and
  d15N_wood; the oxygen and nitrogen test isoscapes are passed in that same
  order.
  """
  grouped_samples = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71, -69, -68, -67, -66, -65, -64, -63],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [2, 2, 2, 2, 2, 2, 2, 2],
      'd18O_cel_mean': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
      'd18O_cel_variance': [1.0, 0.5, 0.2, 0.3, 1.0, 0.5, 0.2, 0.3],
      'd15N_wood_count': [2, 2, 2, 2, 2, 2, 2, 2],
      'd15N_wood_mean': [-20.0, 25.0, 105.0, 25.0, 20.0, 2.0, 0.0, 25.0],
      'd15N_wood_variance': [0.2, 0.3, 0.2, 0.1, 0.2, 0.3, 0.2, 0.1],
  })
  isotopes = ['d18O_cel', 'd15N_wood']

  # Rasters are loaded in isotope order: oxygen first, then nitrogen.
  means_rasters = [
      raster.load_raster(raster.get_raster_path(filename),
                         use_only_band_index=0)
      for filename in (TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME,
                       TEST_NITROGEN_ISOSCAPE_MEANS_FILENAME)]
  vars_rasters = [
      raster.load_raster(raster.get_raster_path(filename),
                         use_only_band_index=0)
      for filename in (TEST_OXYGEN_ISOSCAPE_VARS_FILENAME,
                       TEST_NITROGEN_ISOSCAPE_VARS_FILENAME)]

  samples_per_location = 2
  target = 0.05
  metrics = hypothesis.fraud_metrics(
      grouped_samples,
      isotopes,
      means_rasters,
      vars_rasters,
      samples_per_location,
      target)

  assert metrics.accuracy == pytest.approx(0.625)
  assert metrics.precision == pytest.approx(0.5714285714285714)
  assert metrics.recall == pytest.approx(1.0)

fraud_metrics_combine_p_values_test()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 3671 x 2631 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-73.9872799892171, 5.26666984632233)
Pixel Size = (0.008333333767853947, -0.008333333767876788)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223

In [13]:
# Locations backed by a single measurement are expected to be skipped.
def fraud_metrics_some_tail_locations():
  """Checks hypothesis.fraud_metrics when one location has a single sample.

  Location "d" (-67, -4.3), labelled non-fraud, is given a count of 1 and a
  NaN variance. The asserted accuracy is 0.625 while precision and recall are
  1.0 and 0.5.
  """
  columns = {
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [2, 2, 2, 1, 2, 2, 2, 2],
      'd18O_cel_mean': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
      'd18O_cel_variance': [0.5, 0.4, 0.3, np.nan, 0.1, 0.8, 0.7, 0.6],
  }
  grouped_samples = pd.DataFrame(columns)
  isotopes = ['d18O_cel']

  means_path = raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME)
  means_rasters = [raster.load_raster(means_path, use_only_band_index=0)]
  vars_path = raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME)
  vars_rasters = [raster.load_raster(vars_path, use_only_band_index=0)]

  samples_per_location = 2
  target = 0.05
  metrics = hypothesis.fraud_metrics(
      grouped_samples,
      isotopes,
      means_rasters,
      vars_rasters,
      samples_per_location,
      target)

  assert metrics.accuracy == pytest.approx(0.625)
  assert metrics.precision == pytest.approx(1.0)
  assert metrics.recall == pytest.approx(0.5)

fraud_metrics_some_tail_locations()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [14]:
# Every location is backed by exactly one measurement.
def fraud_metrics_unique_locations():
  """Checks hypothesis.fraud_metrics when every location has a count of 1.

  Accuracy, precision and recall are all asserted to be 0 for this input.
  """
  columns = {
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [1, 1, 1, 1, 1, 1, 1, 1],
      'd18O_cel_mean': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
      'd18O_cel_variance': [0.5, 0.4, 0.3, 0.2, 0.1, 0.8, 0.7, 0.6],
  }
  grouped_samples = pd.DataFrame(columns)
  isotopes = ['d18O_cel']

  means_path = raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME)
  means_rasters = [raster.load_raster(means_path, use_only_band_index=0)]
  vars_path = raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME)
  vars_rasters = [raster.load_raster(vars_path, use_only_band_index=0)]

  samples_per_location = 2
  target = 0.05
  metrics = hypothesis.fraud_metrics(
      grouped_samples,
      isotopes,
      means_rasters,
      vars_rasters,
      samples_per_location,
      target)

  assert metrics.accuracy == pytest.approx(0)
  assert metrics.precision == pytest.approx(0)
  assert metrics.recall == pytest.approx(0)

fraud_metrics_unique_locations()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [15]:
# `pandas.util.testing` is deprecated (and removed in pandas 2.0); the public
# home of assert_frame_equal is `pandas.testing`.
from pandas.testing import assert_frame_equal

def get_predictions_test():
  """Checks hypothesis.get_predictions on raw, ungrouped measurements.

  The input holds two d18O_cel measurements for each of eight locations. The
  expected frame has one row per location with the derived count / mean /
  variance columns plus a `fraud_p_value` column, compared with an absolute
  tolerance of 1e-4.
  """
  sample_data = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h",
               "a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0,
               -71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3,
              -5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True, False],
      'd18O_cel': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0,
                   -20.0, 25.0, 105.0, 25.0, 20.0, 2.0, 0.0, 25.0],
  })
  isotope_column_names = ['d18O_cel']
  test_means_isoscapes = [raster.load_raster(raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME), use_only_band_index=0)]
  test_vars_isoscapes = [raster.load_raster(raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME), use_only_band_index=0)]
  sample_size_per_location = 2

  predictions = (
    hypothesis.get_predictions(sample_data,
                  isotope_column_names,
                  test_means_isoscapes,
                  test_vars_isoscapes,
                  sample_size_per_location)
  )

  expected_data = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [2, 2, 2, 2, 2, 2, 2, 2],
      'd18O_cel_mean': [-20, 15, 102.5, 15, 14, 3.5, 0, 14],
      'd18O_cel_variance': [0, 200, 12.5, 200, 72, 4.5, 0, 242],
      'fraud_p_value': [0.050020, 0.520953, 0.004640, 0.503469, 0.284436, 0.067667,
                          0.088811, 0.474167],
  })
  # Indexes are reset on both sides so only the row contents are compared.
  assert_frame_equal(predictions.reset_index(drop=True),
                     expected_data.reset_index(drop=True), check_dtype=False,
                     check_exact=False, atol=1e-4)

get_predictions_test()

  from pandas.util.testing import assert_frame_equal


Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [16]:
# `pandas.util.testing` is deprecated (and removed in pandas 2.0); the public
# home of assert_frame_equal is `pandas.testing`.
from pandas.testing import assert_frame_equal

def get_predictions_grouped_test():
  """Checks hypothesis.get_predictions_grouped on pre-aggregated samples.

  The input already carries d18O_cel count / mean / variance per location. The
  expected frame is the same data plus a `fraud_p_value` column, compared with
  an absolute tolerance of 1e-4.
  """
  sample_data = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h",],
      'long': [-71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [2, 2, 2, 2, 2, 2, 2, 2,],
      'd18O_cel_mean': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
      'd18O_cel_variance': [0.5, 0.4, 0.3, 0.2, 0.1, 0.8, 0.7, 0.6],
  })
  isotope_means_column_names = ['d18O_cel_mean']
  isotope_variances_column_names = ['d18O_cel_variance']
  isotope_counts_column_names = ['d18O_cel_count']
  test_means_isoscapes = [raster.load_raster(raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME), use_only_band_index=0)]
  test_vars_isoscapes = [raster.load_raster(raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME), use_only_band_index=0)]
  sample_size_per_location = 2

  predictions = (
    hypothesis.get_predictions_grouped(
                  sample_data=sample_data,
                  isotope_means_column_names=isotope_means_column_names,
                  isotope_variances_column_names=isotope_variances_column_names,
                  isotope_counts_column_names=isotope_counts_column_names,
                  means_isoscapes=test_means_isoscapes,
                  variances_isoscapes=test_vars_isoscapes,
                  sample_size_per_location=sample_size_per_location)
  )

  expected_data = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71.0, -69.0, -68.0, -67.0, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [2, 2, 2, 2, 2, 2, 2, 2,],
      'd18O_cel_mean': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
      'd18O_cel_variance': [0.5, 0.4, 0.3, 0.2, 0.1, 0.8, 0.7, 0.6],
      'fraud_p_value': [0.0458605831575194, 0.10940399189053307,
                        0.027583568820964112, 0.11010751535199495,
                        0.12965915860886087, 0.10221112248373174,
                        0.0817158292215802, 0.0935581737612622],
  })
  # Indexes are reset on both sides so only the row contents are compared.
  assert_frame_equal(predictions.reset_index(drop=True),
                     expected_data.reset_index(drop=True), check_dtype=False,
                     check_exact=False, atol=1e-4)

get_predictions_grouped_test()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [17]:
# Test that locations with a single measurement get no p-value.
def get_predictions_some_tail_locations():
  """Checks hypothesis.get_predictions when one location has one measurement.

  Location "d" (-67, -4.3) appears only once in the input while every other
  location appears twice. The expected frame keeps "d" with a count of 1 and
  NaN for both its variance and its `fraud_p_value`; the comparison uses an
  absolute tolerance of 1e-4.
  """
  sample_data = pd.DataFrame({
      'Code': ["a", "b", "c", "e", "f", "g", "h",
               "a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71, -69, -68, -66, -65, -64, -63,
               -71, -69, -68, -67, -66, -65, -64, -63],
      'lat': [-5.5, -5, -4.5, -4, -3.9, -3.5, -3,
              -5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, True, False, True, False,
                True, False, True, False, True, False, True, False],
      'd18O_cel': [-20.0, 5.0, 100.0, 8.0, 5.0, 0.0, 3.0,
                   -20.0, 25.0, 105.0, 25.0, 20.0, 2.0, 0.0, 25.0],
  })
  isotope_column_name = ['d18O_cel']
  test_means_isoscape = [raster.load_raster(
      raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)]
  test_vars_isoscape = [raster.load_raster(
      raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME),
      use_only_band_index=0)]
  sample_size_per_location = 2

  predictions = (
    hypothesis.get_predictions(sample_data,
                  isotope_column_name,
                  test_means_isoscape,
                  test_vars_isoscape,
                  sample_size_per_location)
  )

  expected_data = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71.0, -69.0, -68.0, -67, -66.0, -65.0, -64.0, -63.0],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [2, 2, 2, 1, 2, 2, 2, 2],
      'd18O_cel_mean': [-20.0, 15.0, 102.5, 25.0, 14.0, 3.5, 0.0, 14.0],
      'd18O_cel_variance': [0.0, 200.0, 12.5, np.nan, 72.0, 4.5, 0.0, 242.0],
      'fraud_p_value': [0.050020, 0.520953, 0.004640, np.nan, 0.284436, 0.067667,
                        0.088811, 0.474167],
  })
  # Use the public pandas.testing API through `pd` so this cell does not depend
  # on an assert_frame_equal name imported by some other cell.
  pd.testing.assert_frame_equal(
      predictions.reset_index(drop=True),
      expected_data.reset_index(drop=True), check_dtype=False,
      check_exact=False, atol=1e-4)

get_predictions_some_tail_locations()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [18]:
# Test where all locations are unique, so none of them has more than one
# sample and no p-value is produced for any row.
def get_predictions_all_unique_locations():
  """Checks hypothesis.get_predictions when every location appears once.

  The expected frame keeps all eight locations with a count of 1, a NaN
  variance and `None` as `fraud_p_value`. The comparison uses the default
  (strict) settings of assert_frame_equal.
  """
  sample_data = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71, -69, -68, -67, -66, -65, -64, -63],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
  })
  isotope_column_name = ['d18O_cel']
  test_means_isoscape = [raster.load_raster(
      raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)]
  test_vars_isoscape = [raster.load_raster(
      raster.get_raster_path(TEST_OXYGEN_ISOSCAPE_VARS_FILENAME),
      use_only_band_index=0)]
  sample_size_per_location = 2

  predictions = (
    hypothesis.get_predictions(sample_data,
                  isotope_column_name,
                  test_means_isoscape,
                  test_vars_isoscape,
                  sample_size_per_location)
  )

  expected_data = pd.DataFrame({
      'Code': ["a", "b", "c", "d", "e", "f", "g", "h"],
      'long': [-71, -69, -68, -67, -66, -65, -64, -63],
      'lat': [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3],
      'fraud': [True, False, True, False, True, False, True, False],
      'd18O_cel_count': [1, 1, 1, 1, 1, 1, 1, 1],
      'd18O_cel_mean': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0],
      'd18O_cel_variance': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
      'fraud_p_value': [None, None, None, None, None, None, None, None],
  })
  # Use the public pandas.testing API through `pd` so this cell does not depend
  # on an assert_frame_equal name imported by some other cell.
  pd.testing.assert_frame_equal(predictions.reset_index(drop=True),
                                expected_data.reset_index(drop=True))

get_predictions_all_unique_locations()

Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 235 x 234 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
