<a href="https://colab.research.google.com/github/tnc-br/ddf_common/blob/partition_data_random/dataset_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Imports and modules.
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

rc('animation', html='jshtml')

import sys
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
sys.path.append("/content/ddf_common_stub/")
import ddfimport
ddfimport.ddf_source_control_pane()
# ddfimport.ddf_import_common()



interactive(children=(Text(value='', description='Email', placeholder='Enter email'), Text(value='', descripti…

In [2]:
import dataset

# preprocess_sample_data Tests

In [3]:
from pandas.util.testing import assert_frame_equal

# Average and variance test
def average_variance_test():
  test_df = pd.DataFrame({
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x": [3.0, 3.0, 3.0],
      "y": [8.0, 0.0, -3.0]}
  )
  feature_columns = ["lat", "long"]
  label_columns = ["x", "y"]
  aggregate_columns = ["lat", "long"]
  keep_grouping=True

  test_sample = dataset.preprocess_sample_data(test_df, feature_columns, label_columns, aggregate_columns, keep_grouping)
  expected_df = pd.DataFrame({
      "lat": [0.0, 1.0],
      "long": [3.0, 6.0],
      "x_mean": [3.0, 3.0],
      "x_variance": [0.0, np.nan],
      "y_mean": [4.0, -3.0],
      "y_variance": [32.0, np.nan]
  })
  assert_frame_equal(expected_df, test_sample)

def average_variance_test_no_grouping():
  test_df = pd.DataFrame({
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x": [3.0, 3.0, 3.0],
      "y": [8.0, 0.0, -3.0]}
  )
  feature_columns = ["lat", "long"]
  label_columns = ["x", "y"]
  aggregate_columns = ["lat", "long"]
  keep_grouping=False

  test_sample = dataset.preprocess_sample_data(test_df, feature_columns, label_columns, aggregate_columns, keep_grouping)
  expected_df = pd.DataFrame({
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x_mean": [3.0, 3.0, 3.0],
      "x_variance": [0.0, 0.0, np.nan],
      "y_mean": [4.0, 4.0, -3.0],
      "y_variance": [32.0, 32.0, np.nan]
  })
  assert_frame_equal(expected_df, test_sample)

def average_variance_test_keep_nonnumerical_columns():
  test_df = pd.DataFrame({
      "code": ["a", "b", "c"],
      "lat": [0.0, 0.0, 1.0],
      "long": [3.0, 3.0, 6.0],
      "x": [3.0, 3.0, 3.0],
      "y": [8.0, 0.0, -3.0]}
  )
  feature_columns = ["code", "lat", "long"]
  label_columns = ["x", "y"]
  aggregate_columns = ["lat", "long"]
  keep_grouping=True

  test_sample = dataset.preprocess_sample_data(test_df, feature_columns, label_columns, aggregate_columns, keep_grouping)
  expected_df = pd.DataFrame({
      "lat": [0.0, 1.0],
      "long": [3.0, 6.0],
      "code": ["a", "c"],
      "x_mean": [3.0, 3.0],
      "x_variance": [0.0, np.nan],
      "y_mean": [4.0, -3.0],
      "y_variance": [32.0, np.nan]
  })
  assert_frame_equal(expected_df, test_sample)

average_variance_test()
average_variance_test_no_grouping()
average_variance_test_keep_nonnumerical_columns()

  from pandas.util.testing import assert_frame_equal


AttributeError: ignored

In [None]:
# @title Load example CSV
jun23_reference_data = """
ID,Cod Lab,Code,Species,Scientific_name,Genus,Family,Point,Origin,State,date sample havest,lat,long,d15N_wood,%N_wood,d13C_wood,%C_wood,d13C_cel,%C_cel,d18O_cel_CENA,d18O_cel_Davis,d18O_cel_analysis,d18O_cel,VPD,RH,PET,DEM,PA,Mean Annual Temperature,Mean Annual Precipitation,Iso_Oxi_Stack_mean_TERZER
1,ZAC 730,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,0,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
2,ZAC 731,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,25,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
3,ZAC 732,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,50,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
4,ZAC 733,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,75,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
5,ZAC 734,mad53,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,100,itacoatiara,amazonas,Jul-22,-2.499,-59.121,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
6,ZAC 735,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,0,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
7,ZAC 736,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,25,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
8,ZAC 737,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,50,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
9,ZAC 738,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,75,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
10,ZAC 739,mad54,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,100,itacoatiara,amazonas,Jul-22,-2.496,-59.126,0,0,0,0,0,0,0,0,0,0,0.76667,0.78997,98.28333,103,1000.65411,26.725,2264,-3.70363
11,ZAC 740,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,0,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
12,ZAC 741,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,25,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
13,ZAC 742,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,50,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
14,ZAC 743,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,75,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
15,ZAC 744,mad55,macaranduba,Manilkara huberi (Ducke) A.Chev.,manilkara,sapotaceae,100,itacoatiara,amazonas,Jul-22,-2.495,-59.12,0,0,0,0,0,0,0,0,0,0,0.775,0.78866,98.45,139,996.36792,26.79167,2253,-3.70363
1094,ZAC 623,madsd10,ucuuba_puna,Iryanthera  laevis Markgr,iryanthera,myristicaceae,75,manicoré,amazonas,Mar-14,-6.009706576,-61.8686565,0,0,0,0,0,0,0,0,0,0,0.77083,0.79509,93.975,71,1004.47662,27.2,1996,-4.05694
1095,ZAC 624,madsd10,ucuuba_puna,Iryanthera  laevis Markgr,iryanthera,myristicaceae,100,manicoré,amazonas,Mar-14,-6.009706576,-61.8686565,0,0,0,0,0,0,0,0,0,0,0.77083,0.79509,93.975,71,1004.47662,27.2,1996,-4.05694
"""

In [None]:
def xgb_equivalence():
  with open(raster.get_sample_db_path("scratch_file.txt"), "w") as f:
    f.write(jun23_reference_data)

  pds1 = dataset.partitioned_reference_data("scratch_file.txt")

  # The equivalent...
  df = pd.read_csv(raster.get_sample_db_path("scratch_file.txt"),
    encoding="ISO-8859-1", sep=',')
  df = dataset.partition(df.rename(
      columns={'long': 'lon' })).train
  df = df[['Code', 'lat', 'lon', 'd18O_cel']]
  df = df[df['d18O_cel'].notna()]
  df = df.groupby(['lat', 'lon'])
  df = df.mean().reset_index().rename(
      columns={'d18O_cel': 'cellulose_oxygen_ratio' }).reset_index()
  df.drop('index', inplace=True, axis=1)

  fdf = dataset.add_features_from_rasters(df, [raster.relative_humidity_geotiff(),
      raster.temperature_geotiff(),
      raster.vapor_pressure_deficit_geotiff(),
      raster.atmosphere_isoscape_geotiff()])

  fdf1 = fdf.sort_index(axis=1).reset_index(drop=True)
  fdf2 = pds1.train.sort_index(axis=1).reset_index(drop=True)

  assert_frame_equal(fdf1, fdf2)

  xgb_equivalence()

In [None]:
from pandas.testing import assert_frame_equal

def test_partition_data_fixed():
  '''
  For reference:
  _FIXED_PARTITION_STRATEGY = FixedPartitionStrategy(
    # Train
    DatasetGeographicPartitions(
        min_longitude=-62.5,
        max_longitude=float('inf'),
        min_latitude=-5,
        max_latitude=float('inf'),
    ),
    # Validation
    DatasetGeographicPartitions(
        min_longitude=float('-inf'),
        max_longitude=-62.5,
        min_latitude=-5,
        max_latitude=float('inf')
    ),
    # Test
    DatasetGeographicPartitions(
        min_longitude=float('-inf'),
        max_longitude=float('inf'),
        min_latitude=float('-inf'),
        max_latitude=-5
    )
  )
  '''
  train_long_values = [-62.5, -50.0, -20.0, 0.0, 100.0]
  train_lat_values = [-5.0, -3.0, -2.0, -1.0, 0.0]

  validation_long_values = [-80.0, -70.0, -63.0]
  validation_lat_values = [-5.0, -2.0, 5.0]

  test_long_values = [-50.0, -30.0, -40.0, 0.0, 50.0, 100.0]
  test_lat_values = [-50.0, -30.0, -40.0, -20.0, -10.0, -5.1]

  test_data = pd.DataFrame({
      "long": train_long_values + test_long_values + validation_long_values,
      "lat": train_lat_values + test_lat_values + validation_lat_values,
  })

  partitioned_dataset = dataset.partition(test_data, dataset.PartitionStrategy.FIXED)

  assert_frame_equal(
      partitioned_dataset.train.reset_index(drop=True),
      pd.DataFrame({
          "long": train_long_values,
          "lat": train_lat_values,
      }).reset_index(drop=True)
  )
  assert_frame_equal(
      partitioned_dataset.validation.reset_index(drop=True),
      pd.DataFrame({
          "long": validation_long_values,
          "lat": validation_lat_values,
      }).reset_index(drop=True)
  )
  assert_frame_equal(
      partitioned_dataset.test.reset_index(drop=True),
      pd.DataFrame({
          "long": test_long_values,
          "lat": test_lat_values,
      }).reset_index(drop=True)
  )

test_partition_data_fixed()

In [None]:
def test_partition_data_random():
  '''
  For reference:
  TRAIN_VALIDATION_TEST_RATIOS = [0.8, 0.1, 0.1]
  '''

  test_data = pd.DataFrame({
      "long": [-62.5, -50.0, -20.0, 0.0, 100.0, -62.5, -50.0, -20.0, 0.0, 100.0],
      "lat": [-5.0, -3.0, -2.0, -1.0, 0.0, -5.0, -3.0, -2.0, -1.0, 0.0],
  })

  partitioned_dataset = dataset.partition(test_data, dataset.PartitionStrategy.RANDOM)

  print(partitioned_dataset.train)
  print(partitioned_dataset.validation)
  print(partitioned_dataset.test)

  assert(partitioned_dataset.train.shape[0] == 8)
  assert(partitioned_dataset.validation.shape[0] == 1)
  assert(partitioned_dataset.test.shape[0] == 1)

test_partition_data_random()