<a href="https://colab.research.google.com/github/tnc-br/ddf_common/blob/ttest-greta/dataset_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Imports and modules.
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

# Use matplotlib's 'jshtml' HTML representation for animations.
rc('animation', html='jshtml')

import sys
# Clone the `test` branch of ddf_common_stub unless /content/ddf_common_stub
# already exists.
# NOTE(review): the clone lands in the current working directory, which is
# presumably /content on Colab — confirm, otherwise the guard and the
# sys.path entry below point at a different location than the checkout.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the stub package importable so `ddfimport` resolves.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# Per the recorded output, this checks out a ddf_common branch and shows a
# widget with a commit-message field.
ddfimport.ddf_source_control_pane()
# Alternative entry point, currently disabled.
# ddfimport.ddf_import_common()

executing checkout_branch ttest-greta...
Branch ttest-greta already checked out.
Remember to reload your imports with `importlib.reload(module)`.
b"fatal: destination path 'ddf_common' already exists and is not an empty directory.\nRepository already exists.\n"
b'Already up to date.\n'
b''
ttest-greta branch checked out at "/content/gdrive/MyDrive/ttest-greta/ddf_common". You may now use ddf_common imports and change common files.


interactive(children=(Text(value='required', description='Commit Msg', placeholder='Enter commit message'), Bu…

In [18]:
import importlib

import dataset
import raster

# Re-import `dataset` so edits made to dataset.py after the first import are
# picked up without restarting the kernel; the reloaded module is the cell's
# displayed value.
importlib.reload(dataset)

<module 'dataset' from '/content/gdrive/MyDrive/ttest-greta/ddf_common/dataset.py'>

# preprocess_sample_data Tests

In [19]:
# `pandas.util.testing` was deprecated in pandas 1.0 and removed in 2.0;
# `pandas.testing` is the supported public location of assert_frame_equal.
from pandas.testing import assert_frame_equal

# Average and variance test
def average_variance_test():
  """Checks mean/variance aggregation when grouping is kept.

  Two of the three input rows share (lat, long) = (0.0, 3.0). The expected
  frame has one row per distinct location, with `<label>_mean` and
  `<label>_variance` columns; the location with a single row expects a NaN
  variance.
  """
  samples = pd.DataFrame(dict(
      lat=[0.0, 0.0, 1.0],
      long=[3.0, 3.0, 6.0],
      x=[3.0, 3.0, 3.0],
      y=[8.0, 0.0, -3.0],
  ))

  actual = dataset.preprocess_sample_data(
      samples,
      ["lat", "long"],  # feature_columns
      ["x", "y"],       # label_columns
      ["lat", "long"],  # aggregate_columns
      True,             # keep_grouping
  )

  expected = pd.DataFrame(dict(
      lat=[0.0, 1.0],
      long=[3.0, 6.0],
      x_mean=[3.0, 3.0],
      x_variance=[0.0, np.nan],
      y_mean=[4.0, -3.0],
      y_variance=[32.0, np.nan],
  ))
  assert_frame_equal(expected, actual)

def average_variance_test_no_grouping():
  """Checks mean/variance aggregation when grouping is not kept.

  Same input as the grouped test, but with keep_grouping=False the expected
  frame keeps all three rows; rows sharing a location carry that location's
  mean and variance, and the single-row location expects a NaN variance.
  """
  samples = pd.DataFrame(dict(
      lat=[0.0, 0.0, 1.0],
      long=[3.0, 3.0, 6.0],
      x=[3.0, 3.0, 3.0],
      y=[8.0, 0.0, -3.0],
  ))

  actual = dataset.preprocess_sample_data(
      samples,
      ["lat", "long"],  # feature_columns
      ["x", "y"],       # label_columns
      ["lat", "long"],  # aggregate_columns
      False,            # keep_grouping
  )

  expected = pd.DataFrame(dict(
      lat=[0.0, 0.0, 1.0],
      long=[3.0, 3.0, 6.0],
      x_mean=[3.0, 3.0, 3.0],
      x_variance=[0.0, 0.0, np.nan],
      y_mean=[4.0, 4.0, -3.0],
      y_variance=[32.0, 32.0, np.nan],
  ))
  assert_frame_equal(expected, actual)

def average_variance_test_keep_nonnumerical_columns():
  """Checks that a string feature column survives grouped aggregation.

  The input adds a "code" column of strings to the feature columns. The
  expected frame has one row per distinct (lat, long), ordered as lat, long,
  code, then the mean/variance columns; the shared location expects the code
  of its first row ("a").
  """
  samples = pd.DataFrame(dict(
      code=["a", "b", "c"],
      lat=[0.0, 0.0, 1.0],
      long=[3.0, 3.0, 6.0],
      x=[3.0, 3.0, 3.0],
      y=[8.0, 0.0, -3.0],
  ))

  actual = dataset.preprocess_sample_data(
      samples,
      ["code", "lat", "long"],  # feature_columns
      ["x", "y"],               # label_columns
      ["lat", "long"],          # aggregate_columns
      True,                     # keep_grouping
  )

  expected = pd.DataFrame(dict(
      lat=[0.0, 1.0],
      long=[3.0, 6.0],
      code=["a", "c"],
      x_mean=[3.0, 3.0],
      x_variance=[0.0, np.nan],
      y_mean=[4.0, -3.0],
      y_variance=[32.0, np.nan],
  ))
  assert_frame_equal(expected, actual)

# Run every preprocess_sample_data test in definition order; the first
# failing assertion aborts the cell.
for run_test in (
    average_variance_test,
    average_variance_test_no_grouping,
    average_variance_test_keep_nonnumerical_columns,
):
  run_test()

AttributeError: ignored

# create_fraudulent_samples Tests

In [20]:
# Isoscape GeoTIFF file names, exposed as Colab form fields. The means file is
# resolved via raster.get_raster_path() in create_fraudulent_samples_test; the
# variances file is not referenced by the cells visible in this notebook.
TEST_ISOSCAPE_MEANS_FILENAME = "xgb_means_oxygen_isoscape_ucd_42.tiff" #@param
TEST_ISOSCAPE_VARS_FILENAME = "xgb_variances_oxygen_isoscape_ucd_42.tiff" #@param

In [24]:
def create_fraudulent_samples_test():
  """Smoke-tests dataset.create_fraudulent_samples on two sampled locations.

  Builds ten real samples (five per location, codes "a" and "b"), loads band 0
  of the means isoscape named by TEST_ISOSCAPE_MEANS_FILENAME, and generates
  fraudulent samples from them. The checks are deliberately structural: the
  recorded run shows randomly placed coordinates, so exact values are not
  compared.

  Raises:
    AssertionError: if the result is not a non-empty DataFrame carrying the
      expected columns.
  """
  # The range of expected values is around 25 for most locations here.
  real_samples_data = pd.DataFrame({
      'Code': ["a", "a", "a", "a", "a", "b", "b", "b","b","b"],
      'lat': [-2.499,-2.499,-2.499,-2.499,-2.499, -6.009706576,-6.009706576,-6.009706576,-6.009706576,-6.009706576],
      'long': [-59.121, -59.121, -59.121, -59.121, -59.121, -61.8686565,-61.8686565,-61.8686565,-61.8686565,-61.8686565],
      'd18O_cel': [-20.0, 5.0, 100.0, 5.0, 8.0, 5.0, 0.0, 3.0,
                   -20.0, 25.0],
  })

  mean_iso = raster.load_raster(raster.get_raster_path(TEST_ISOSCAPE_MEANS_FILENAME), use_only_band_index=0)
  element = 'd18O_cel'
  max_trusted_radius = 0.1
  max_fraud_radius = 30
  min_fraud_radius = 5

  fake_sample = (
    dataset.create_fraudulent_samples(real_samples_data,
                                      mean_iso,
                                      element,
                                      max_trusted_radius,
                                      max_fraud_radius,
                                      min_fraud_radius))

  # Previously the result was never inspected, so this test could only fail
  # by raising. NOTE(review): the checks below assume the frame printed in the
  # recorded run (columns Code/lat/long/d18O_cel/fraud) is the return value —
  # confirm against dataset.create_fraudulent_samples.
  assert isinstance(fake_sample, pd.DataFrame), (
      "expected a DataFrame, got %s" % type(fake_sample).__name__)
  assert not fake_sample.empty, "no fraudulent samples were generated"
  missing_columns = {'Code', 'lat', 'long', element, 'fraud'} - set(fake_sample.columns)
  assert not missing_columns, "missing columns: %s" % sorted(missing_columns)

create_fraudulent_samples_test()

Driver: GTiff/GeoTIFF
Size is 940 x 936 x 1
Projection is GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Origin = (-73.975139313, 5.266527396)
Pixel Size = (0.0416666665, -0.041666666500000005)
        Code       lat       long  d18O_cel  fraud
0  fake_mad1 -5.857552 -61.930489       5.0   True
1  fake_mad1 -5.857552 -61.930489       0.0   True
2  fake_mad1 -5.857552 -61.930489       3.0   True
3  fake_mad1 -5.857552 -61.930489     -20.0   True
4  fake_mad1 -5.857552 -61.930489      25.0   True
5  fake_mad2 -2.319868 -59.250566     -20.0   True
6  fake_mad2 -2.319868 -59.250566       5.0   True
7  fake_mad2 -2.319868 -59.250566     100.0   True
8  fake_mad2 -2.319868 -59.250566       5.0   True
9  fake_mad2 -2.319868 -59.250566       8.0   True
