<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/update_data_ingestion/data_ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---------------------------------------------------------------------------
# Notebook configuration. Lines tagged with `#@param` are rendered as Colab
# form fields, so each of them must stay on a single line.
# ---------------------------------------------------------------------------

# File name of the raw sample CSV; it is resolved with
# raster.get_sample_db_path() when the file is read.
SAMPLE_CSV_PATH = "2023_06_23_Results_Google.csv" #@param
# Columns handed to the preprocessing step as features.
FEATURE_COLUMNS = ['lat', 'long', 'VPD', 'RH', 'PET', 'DEM', 'PA', 'Mean Annual Temperature', 'Mean Annual Precipitation', 'Iso_Oxi_Stack_mean_TERZER', 'predkrig_br_lat_ISORG', 'isoscape_fullmodel_d18O_prec_REGRESSION', 'Code', 'Family', 'Origin'] #@param
# Columns handed to the preprocessing step as labels.
LABEL_COLUMNS = ["d18O_cel"] #@param

# The columns that will be used to group the dataset to calculate
# means and variance on LABEL_COLUMNS
GROUPING_COLUMNS = ["Code", "lat", "long"] #@param
# If True, the rows will remain unique GROUPING_COLUMNS values. Otherwise
# we merge the grouping columns aggregates with the original dataset.
KEEP_GROUPING = True #@param

# Supported values:
# - FIXED: samples are assigned to train/validation/test by fixed bounds.
# Any other value is rejected with a ValueError when the data is partitioned.
PARTITION_STRATEGY = "FIXED" #@param ["FIXED"] {allow-input:true}

# For FIXED only, the bounds of each partition for the split.
# Each entry is a pair of 2-tuples; presumably
# [(min_lat, min_long), (max_lat, max_long)] -- TODO confirm against the
# dataset module.
# NOTE(review): this notebook never passes these bounds to
# dataset.partition(); confirm that the dataset module applies matching ones.
TRAIN_FIXED_BOUNDS = [(-5, -62.5), (float('inf'), float('inf'))]
VALIDATION_FIXED_BOUNDS = [(-5, float('-inf')), (float('inf'), -62.5)]
TEST_FIXED_BOUNDS = [(float('-inf'), float('-inf')), (-5, float('inf'))]
# The original line read `TEST_FIXED = BOUNDS = [...]` (a typo for
# TEST_FIXED_BOUNDS). Both accidental names are kept as aliases of the same
# list so that anything relying on them keeps working.
TEST_FIXED = BOUNDS = TEST_FIXED_BOUNDS
TRAIN_VALIDATION_TEST_BOUNDS = [TRAIN_FIXED_BOUNDS, VALIDATION_FIXED_BOUNDS, TEST_FIXED_BOUNDS]

# Output naming: <root><dataset>_<split>_<strategy>_<grouped|ungrouped>.csv
OUTPUT_DATASET_NAME="uc_davis" #@param
OUTPUT_DATASET_ROOT = "/canonical/" #@param
GROUPING_STR = "grouped" if KEEP_GROUPING else "ungrouped"

OUTPUT_TRAIN_CSV_PATH = f"{OUTPUT_DATASET_ROOT}{OUTPUT_DATASET_NAME}_train_{PARTITION_STRATEGY.lower()}_{GROUPING_STR}.csv"
OUTPUT_VALIDATION_CSV_PATH = f"{OUTPUT_DATASET_ROOT}{OUTPUT_DATASET_NAME}_validation_{PARTITION_STRATEGY.lower()}_{GROUPING_STR}.csv"
OUTPUT_TEST_CSV_PATH = f"{OUTPUT_DATASET_ROOT}{OUTPUT_DATASET_NAME}_test_{PARTITION_STRATEGY.lower()}_{GROUPING_STR}.csv"

In [2]:
#@title Imports and modules.
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

# Set matplotlib's `animation.html` rc parameter to 'jshtml' for this session.
rc('animation', html='jshtml')

# Bootstrap the shared ddf_common_stub helpers:
#   1. if /content/ddf_common_stub does not exist, clone the `test` branch of
#      the repository (the clone lands in the current working directory --
#      assumes that is /content, TODO confirm);
#   2. add that directory to the module search path so `ddfimport` resolves.
import sys
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# Presumably renders the interactive form shown in this cell's output and
# makes the project modules imported by later cells (`dataset`, `raster`)
# available -- verify against ddfimport.
ddfimport.ddf_source_control_pane()
# Alternative entry point, currently disabled:
# ddfimport.ddf_import_common()



interactive(children=(Text(value='', description='Email', placeholder='Enter email'), Text(value='', descripti…

# Pre-Process Sample

In [3]:
import dataset

In [5]:
import raster

In [6]:
# Read the raw sample table. The configured file name is resolved by
# raster.get_sample_db_path(); the file is parsed as comma-separated
# ISO-8859-1 text.
df = pd.read_csv(
    raster.get_sample_db_path(SAMPLE_CSV_PATH),
    sep=',',
    encoding="ISO-8859-1",
)

# Delegate feature/label selection and grouping to the dataset module, driven
# entirely by the configuration constants of the first cell.
sample_data = dataset.preprocess_sample_data(
    df,
    FEATURE_COLUMNS,
    LABEL_COLUMNS,
    GROUPING_COLUMNS,
    KEEP_GROUPING,
)

  means = grouped.mean().reset_index()
  variances = grouped.var().reset_index()


# Partition Data

In [7]:
# Split the preprocessed samples according to PARTITION_STRATEGY.
# Only "FIXED" is supported; anything else aborts the cell with a ValueError.
# The name is reset to None first so a failed run never leaves a stale result.
partitioned_dataset = None
if PARTITION_STRATEGY != "FIXED":
  raise ValueError(f"Unknown partition strategy: {PARTITION_STRATEGY}")
partitioned_dataset = dataset.partition(sample_data, dataset.PartitionStrategy.FIXED)

In [8]:
# Expose the three splits of the partitioned dataset under their own names.
train_data, validation_data, test_data = (
    partitioned_dataset.train,
    partitioned_dataset.validation,
    partitioned_dataset.test,
)

In [9]:
# print(train_data.shape[0])
# print(validation_data.shape[0])
# print(test_data.shape[0])

In [12]:
# Optional (plot splits)
# import matplotlib
# import matplotlib.pyplot as plt

# plt.scatter(train_data["lat"], train_data["long"], label="train", alpha=0.2)
# plt.scatter(validation_data["lat"], validation_data["long"], label="validation", alpha=0.2)
# plt.scatter(test_data["lat"], test_data["long"], label="test", alpha=0.2)
# plt.xlabel('Latitude')
# plt.ylabel('Longitude')
# plt.legend()
# plt.show()

In [13]:
# Write each split to its configured CSV location (train, validation, test,
# in that order). Every output path is resolved by
# raster.get_sample_db_path() right before the write.
for _split_frame, _csv_path in (
    (train_data, OUTPUT_TRAIN_CSV_PATH),
    (validation_data, OUTPUT_VALIDATION_CSV_PATH),
    (test_data, OUTPUT_TEST_CSV_PATH),
):
  _split_frame.to_csv(raster.get_sample_db_path(_csv_path))