<a href="https://colab.research.google.com/github/tnc-br/ddf_common/blob/ttest/hypothesis_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Filenames of the means and variances isoscape rasters used by every test in
# this notebook; the tests resolve them with raster.get_raster_path().
# The trailing "#@param" markers expose both values as Colab form fields.
TEST_ISOSCAPE_MEANS_FILENAME = "uc_davis_d18O_cel_kriging_means_2.tiff" #@param
TEST_ISOSCAPE_VARS_FILENAME = "uc_davis_d18O_cel_kriging_vars_2.tiff" #@param

In [2]:
#@title Imports and modules.
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

rc('animation', html='jshtml')

import sys
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
sys.path.append("/content/ddf_common_stub/")
import ddfimport
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()



interactive(children=(Text(value='', description='Email', placeholder='Enter email'), Text(value='', descripti…

# Tests

In [3]:
import pytest
import hypothesis
import raster

In [4]:
import importlib
# NOTE(review): `hypothesis` is expected to be the project module under test
# (it must provide sample_ttest and fraud_metrics), not the PyPI
# property-testing library of the same name -- confirm the import path.
import hypothesis
# Re-execute the already-imported module; presumably so that edits made to
# hypothesis.py are picked up without restarting the kernel.
importlib.reload(hypothesis)

<module 'hypothesis' from '/content/gdrive/MyDrive/ttest/ddf_common/hypothesis.py'>

In [9]:
def sample_ttest_test():
  """Checks hypothesis.sample_ttest for two measurements at one location.

  Loads the test means and variances isoscapes (use_only_band_index=0), calls
  hypothesis.sample_ttest for the point (-71, -5.5) with two isotope values,
  and asserts the longitude, latitude, p-value and p-value threshold exposed
  on the returned object.
  """
  expected_longitude = -71
  expected_latitude = -5.5
  expected_threshold = 0.05
  measurements_per_location = 2
  # The range of expected values is around 25 for most locations here.
  measured_values = [-20.0, -15.0]

  means_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)
  vars_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_VARS_FILENAME),
      use_only_band_index=0)

  result = hypothesis.sample_ttest(
      expected_longitude,
      expected_latitude,
      measured_values,
      means_isoscape,
      vars_isoscape,
      measurements_per_location,
      expected_threshold)

  assert result.longitude == pytest.approx(expected_longitude)
  assert result.latitude == pytest.approx(expected_latitude)
  assert result.p_value == pytest.approx(0.009341288347548294)
  assert result.p_value_threshold == pytest.approx(expected_threshold)

sample_ttest_test()

Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [10]:
import unittest

class SampleTTestCase(unittest.TestCase):
  """Error-path check for hypothesis.sample_ttest."""

  def sample_ttest_test_tail_location(self):
    """Expects ValueError when a location has a single isotope value.

    Calls hypothesis.sample_ttest for the point (-71, -5.5) with one isotope
    value while sample_size_per_location is 2, and asserts that the call
    raises ValueError (presumably because one measurement is fewer than the
    requested sample size -- confirm against hypothesis.sample_ttest).
    """
    # The range of expected values is around 25 for most locations here.
    lone_measurement = [-20.0]
    means_isoscape = raster.load_raster(
        raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
        use_only_band_index=0)
    vars_isoscape = raster.load_raster(
        raster.get_raster_path(TEST_ISOSCAPE_VARS_FILENAME),
        use_only_band_index=0)

    # Positional order: longitude, latitude, isotope values, means isoscape,
    # variances isoscape, sample size per location, p-value target.
    ttest_args = (-71, -5.5, lone_measurement,
                  means_isoscape, vars_isoscape, 2, 0.05)

    self.assertRaises(ValueError, hypothesis.sample_ttest, *ttest_args)

SampleTTestCase().sample_ttest_test_tail_location()

Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [12]:
def fraud_metrics_test():
  """Checks hypothesis.fraud_metrics when every location is measured twice.

  Builds a 16-row frame (eight locations "a".."h", two d18O_cel measurements
  each), evaluates it against the test means and variances isoscapes with a
  sample size of 2 and a p-value target of 0.05, and asserts the returned
  accuracy, precision and recall.
  """
  # Per-location attributes; each location contributes two rows.
  codes = ["a", "b", "c", "d", "e", "f", "g", "h"]
  longitudes = [-71, -69, -68, -67, -66, -65, -64, -63]
  latitudes = [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3]
  fraud_flags = [True, False, True, False, True, False, True, False]
  # The range of expected values is around 25 for most locations here.
  first_pass_d18o = [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0]
  second_pass_d18o = [-20.0, 25.0, 105.0, 25.0, 20.0, 2.0, 0.0, 25.0]

  samples = pd.DataFrame({
      'Code': codes * 2,
      'long': longitudes * 2,
      'lat': latitudes * 2,
      'fraud': fraud_flags * 2,
      'd18O_cel': first_pass_d18o + second_pass_d18o,
  })

  means_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)
  vars_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_VARS_FILENAME),
      use_only_band_index=0)

  # Positional order: frame, isotope column, means isoscape, variances
  # isoscape, sample size per location, p-value target.
  metrics = hypothesis.fraud_metrics(
      samples, 'd18O_cel', means_isoscape, vars_isoscape, 2, 0.05)
  accuracy, precision, recall = metrics

  assert accuracy == pytest.approx(0.625)
  assert precision == pytest.approx(0.6666666666666666)
  assert recall == pytest.approx(0.5)

fraud_metrics_test()

Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)


In [14]:
# Test that we skip locations with one measurement.
def fraud_metrics_some_tail_locations():
  """Checks hypothesis.fraud_metrics when one location has a single row.

  Uses eight locations "a".."h" measured in two passes, except that the first
  pass omits location "d" (-67, -4.3), leaving it with only one measurement.
  Prints the three metrics and asserts their expected values.
  """
  codes = ["a", "b", "c", "d", "e", "f", "g", "h"]
  longitudes = [-71, -69, -68, -67, -66, -65, -64, -63]
  latitudes = [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3]
  fraud_flags = [True, False, True, False, True, False, True, False]
  # The range of expected values is around 25 for most locations here.
  first_pass_d18o = [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0]
  second_pass_d18o = [-20.0, 25.0, 105.0, 25.0, 20.0, 2.0, 0.0, 25.0]

  # One of the (-67, -4.3) data points was removed. This element was a true
  # negative (non-fraud) so accuracy is affected (decreases) but precision
  # and recall weren't affected.
  removed = codes.index("d")

  def first_pass(values):
    """Returns a copy of values without the entry for the removed location."""
    return values[:removed] + values[removed + 1:]

  samples = pd.DataFrame({
      'Code': first_pass(codes) + codes,
      'long': first_pass(longitudes) + longitudes,
      'lat': first_pass(latitudes) + latitudes,
      'fraud': first_pass(fraud_flags) + fraud_flags,
      'd18O_cel': first_pass(first_pass_d18o) + second_pass_d18o,
  })

  means_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)
  vars_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_VARS_FILENAME),
      use_only_band_index=0)

  # Positional order: frame, isotope column, means isoscape, variances
  # isoscape, sample size per location, p-value target.
  metrics = hypothesis.fraud_metrics(
      samples, 'd18O_cel', means_isoscape, vars_isoscape, 2, 0.05)
  accuracy, precision, recall = metrics

  print(accuracy, precision, recall)

  assert accuracy == pytest.approx(0.5714285714285714)
  assert precision == pytest.approx(0.6666666666666666)
  assert recall == pytest.approx(0.5)

fraud_metrics_some_tail_locations()

Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
0.5714285714285714 0.6666666666666666 0.5


In [15]:
# Test where all locations are unique.
def fraud_metrics_unique_locations():
  """Checks hypothesis.fraud_metrics when no location is measured twice.

  Every one of the eight locations appears in exactly one row while the
  sample size per location is 2; the test asserts that accuracy, precision
  and recall all come back as 0.
  """
  codes = ["a", "b", "c", "d", "e", "f", "g", "h"]
  longitudes = [-71, -69, -68, -67, -66, -65, -64, -63]
  latitudes = [-5.5, -5, -4.5, -4.3, -4, -3.9, -3.5, -3]
  fraud_flags = [True, False, True, False, True, False, True, False]
  # The range of expected values is around 25 for most locations here.
  d18o_values = [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0]

  samples = pd.DataFrame({
      'Code': codes,
      'long': longitudes,
      'lat': latitudes,
      'fraud': fraud_flags,
      'd18O_cel': d18o_values,
  })

  means_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME),
      use_only_band_index=0)
  vars_isoscape = raster.load_raster(
      raster.get_raster_path(TEST_ISOSCAPE_VARS_FILENAME),
      use_only_band_index=0)

  # Positional order: frame, isotope column, means isoscape, variances
  # isoscape, sample size per location, p-value target.
  metrics_args = (samples, 'd18O_cel', means_isoscape, vars_isoscape, 2, 0.05)
  accuracy, precision, recall = hypothesis.fraud_metrics(*metrics_args)

  assert accuracy == pytest.approx(0)
  assert precision == pytest.approx(0)
  assert recall == pytest.approx(0)

fraud_metrics_unique_locations()

Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
Driver: GTiff/GeoTIFF
Size is 256 x 256 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-74.0, 5.333333333999995)
Pixel Size = (0.16666666666808508, -0.16666666666808505)
