In [None]:
from collections import defaultdict
from dataclasses import dataclass
import math
import numpy as np
from numpy.random import MT19937, RandomState, SeedSequence
from osgeo import gdal, gdal_array
import pandas as pd
from typing import List
from tqdm import tqdm

#@title Debugging
# See https://zohaib.me/debugging-in-google-collab-notebook/ for tips,
# as well as docs for pdb and ipdb.
# When True, the setup cell below installs ipdb and switches the %pdb magic on.
DEBUG = True #@param {type:"boolean"}

# Folders for the input rasters and the generated CSVs. get_raster_path() and
# get_output_dir() build paths by plain string concatenation
# (GDRIVE_BASE + folder + filename), so each folder needs its leading and
# trailing "/".
RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
OUTPUT_DIR = "/MyDrive/amazon_rainforest_files/dataframes/" #@param
# Google Drive mount point. When empty, Drive is not mounted and the folders
# above are used without any prefix.
GDRIVE_BASE = "/content/drive" #@param

def get_raster_path(filename: str) -> str:
  """Return the full path of a raster file.

  The result is GDRIVE_BASE (or nothing when GDRIVE_BASE is empty), followed
  by RASTER_BASE and `filename`, joined by plain string concatenation.
  """
  drive_prefix = GDRIVE_BASE or ""
  return f"{drive_prefix}{RASTER_BASE}{filename}"

def get_output_dir(filename: str) -> str:
  """Return the full path of an output file (despite the name, not a directory).

  The result is GDRIVE_BASE (or nothing when GDRIVE_BASE is empty), followed
  by OUTPUT_DIR and `filename`, joined by plain string concatenation.
  """
  drive_prefix = GDRIVE_BASE or ""
  return f"{drive_prefix}{OUTPUT_DIR}{filename}"

In [None]:
# Access data stored on Google Drive
# Drive is only mounted when GDRIVE_BASE is non-empty; google.colab is imported
# inside the branch, so it is only needed in that case.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

# Debug mode: install ipdb into the kernel's environment (unpinned, upgraded to
# the latest version, quiet output) and turn on the %pdb magic so that an
# uncaught exception drops into the post-mortem debugger.
if DEBUG:
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

## Dependencies from xgboost.ipynb

**TODO:** Move these definitions into a shared module instead of copying them from `xgboost.ipynb`.

In [None]:
@dataclass
class AmazonGeoTiff:
  """Represents a geotiff from our dataset.

  load_raster() builds instances positionally, so the field order below is
  part of the interface.
  """
  # Open GDAL dataset; get_extent() reads the geotransform and raster size from it.
  gdal_dataset: gdal.Dataset
  # Band values stacked as (rows, cols, 12) by load_raster().
  image_value_array: np.ndarray # ndarray of floats
  # Values read from each band's GDAL mask band, same shape; 0 marks a masked pixel.
  image_mask_array: np.ndarray
  # image_value_array with every pixel whose mask value is 0 masked out.
  masked_image: np.ma.masked_array
  # Mean of masked_image over the last (12-slot) axis.
  yearly_masked_image: np.ma.masked_array

@dataclass
class Bounds:
  """Geographic extent of a raster plus its pixel and raster sizes.

  get_extent() builds instances positionally, so the field order below is part
  of the interface.
  """
  # Edges of the raster, in the units of the dataset's geotransform.
  minx: float
  maxx: float
  miny: float
  maxy: float
  # Pixel width and height taken from the geotransform. coords_to_indices()
  # uses their absolute values, so the sign does not matter there.
  pixel_size_x: float
  pixel_size_y: float
  # Raster width and height; get_extent() fills these from RasterXSize/RasterYSize.
  raster_size_x: float
  raster_size_y: float

  def to_matplotlib(self) -> List[float]:
    """Return the extent as [minx, maxx, miny, maxy].

    Presumably meant for matplotlib's `extent=` argument -- confirm at call sites.
    """
    extent = [self.minx, self.maxx]
    extent += [self.miny, self.maxy]
    return extent

def print_raster_info(raster):
  """Print a short summary of a GDAL dataset.

  Prints the driver, the raster size and band count, the projection and, when
  a geotransform is available, the origin and pixel size. For every band it
  then reports the number of overviews and the color table size, if any.

  Args:
    raster: An open GDAL dataset (or any object exposing the same accessors).
  """
  dataset = raster
  driver = dataset.GetDriver()
  print("Driver: {}/{}".format(driver.ShortName, driver.LongName))
  print("Size is {} x {} x {}".format(dataset.RasterXSize,
                                      dataset.RasterYSize,
                                      dataset.RasterCount))
  print("Projection is {}".format(dataset.GetProjection()))
  geotransform = dataset.GetGeoTransform()
  if geotransform:
    print("Origin = ({}, {})".format(geotransform[0], geotransform[3]))
    print("Pixel Size = ({}, {})".format(geotransform[1], geotransform[5]))

  # GDAL band numbers are 1-based.
  for band_number in range(1, dataset.RasterCount + 1):
    band = dataset.GetRasterBand(band_number)
    #print("Band Type={}".format(gdal.GetDataTypeName(band.DataType)))

    # GetMinimum()/GetMaximum() return None when the value is not known.
    # Compare against None explicitly: a truthiness test would treat a
    # legitimate 0.0 as "unknown" and trigger a needless full scan of the band.
    band_min = band.GetMinimum()
    band_max = band.GetMaximum()
    if band_min is None or band_max is None:
      (band_min, band_max) = band.ComputeRasterMinMax(False)
    #print("Min={:.3f}, Max={:.3f}".format(band_min, band_max))

    overview_count = band.GetOverviewCount()
    if overview_count > 0:
      print("Band has {} overviews".format(overview_count))

    color_table = band.GetRasterColorTable()
    if color_table:
      print("Band has a color table with {} entries".format(color_table.GetCount()))

def load_raster(path: str, use_only_band_index: int = -1) -> AmazonGeoTiff:
  """Load a GeoTIFF as a 12-slot (monthly) stack plus its mean over the slots.

  TODO: Refactor (is_single_band, etc., should be a better design)
  --> Find a way to simplify this logic. Maybe it needs to be more abstract.

  Args:
    path: Location of the raster file.
    use_only_band_index: 0-based index of a single band to replicate into all
      12 slots. With the default of -1, a 12-band file is read band by band
      and a 1-band file has its only band replicated.

  Returns:
    An AmazonGeoTiff holding the GDAL dataset, the (rows, cols, 12) value and
    mask arrays, the masked image (masked where the mask is 0) and the mean of
    the masked image over the last axis.

  Raises:
    OSError: If GDAL could not open `path`.
    ValueError: If no band index was given and the file has neither 1 nor 12
      bands.
    IndexError: If the requested band index does not exist in the file.
  """
  slot_count = 12  # one slot per month

  dataset = gdal.Open(path, gdal.GA_ReadOnly)
  # gdal.Open hands back None on failure when GDAL exceptions are disabled;
  # check for that directly instead of waiting for an AttributeError later.
  if dataset is None:
    raise OSError("Failed to open raster. This likely means it did not load properly from " + path)
  print_raster_info(dataset)

  first_band = dataset.GetRasterBand(1)
  image_dtype = gdal_array.GDALTypeCodeToNumericTypeCode(first_band.DataType)
  # The mask gets the mask band's own data type rather than the image's.
  mask_dtype = gdal_array.GDALTypeCodeToNumericTypeCode(
      first_band.GetMaskBand().DataType)
  stack_shape = (dataset.RasterYSize, dataset.RasterXSize, slot_count)
  image = np.zeros(stack_shape, dtype=image_dtype)
  mask = np.zeros(stack_shape, dtype=mask_dtype)

  if use_only_band_index == -1:
    if dataset.RasterCount != 12 and dataset.RasterCount != 1:
      raise ValueError(f"Expected 12 raster bands (one for each month) or one annual average, but found {dataset.RasterCount}")
    if dataset.RasterCount == 1:
      use_only_band_index = 0

  is_single_band = use_only_band_index != -1

  # Reject indices past the end as well as negative ones other than the -1 sentinel.
  if is_single_band and not 0 <= use_only_band_index < dataset.RasterCount:
    raise IndexError(f"Specified raster band index {use_only_band_index}"
    f" but there are only {dataset.RasterCount} rasters")

  if is_single_band:
    # Read the requested band once and replicate it across all slots.
    band = dataset.GetRasterBand(use_only_band_index + 1)
    image[...] = band.ReadAsArray()[:, :, np.newaxis]
    mask[...] = band.GetMaskBand().ReadAsArray()[:, :, np.newaxis]
  else:
    for band_index in range(slot_count):
      band = dataset.GetRasterBand(band_index + 1)  # GDAL bands are 1-based
      image[:, :, band_index] = band.ReadAsArray()
      mask[:, :, band_index] = band.GetMaskBand().ReadAsArray()

  # A mask value of 0 marks a pixel without valid data.
  masked_image = np.ma.masked_where(mask == 0, image)
  yearly_masked_image = masked_image.mean(axis=2)

  return AmazonGeoTiff(dataset, image, mask, masked_image, yearly_masked_image)

def get_extent(dataset):
  """Derive the geographic bounds of a GDAL dataset from its geotransform.

  Geotransform entries 0 and 3 are taken as the (minx, maxy) origin and
  entries 1 and 5 as the pixel width and height; the opposite edges follow by
  multiplying the pixel size with the raster size.
  NOTE(review): this presumably assumes a north-up raster (negative pixel
  height, no rotation terms) -- confirm for the rasters in use.

  Args:
    dataset: An open GDAL dataset.

  Returns:
    A Bounds with the extent, pixel sizes and raster sizes.
  """
  transform = dataset.GetGeoTransform()
  columns, rows = dataset.RasterXSize, dataset.RasterYSize
  left, top = transform[0], transform[3]
  pixel_width, pixel_height = transform[1], transform[5]
  right = left + pixel_width * columns
  bottom = top + pixel_height * rows
  return Bounds(left, right, bottom, top, pixel_width, pixel_height, columns, rows)

def coords_to_indices(bounds: Bounds, x: float, y: float):
  """Map a geographic coordinate to array indices of the raster.

  Args:
    bounds: Extent, pixel sizes and raster sizes, as built by get_extent().
    x: Horizontal coordinate (get_data_at_coords passes longitude).
    y: Vertical coordinate (get_data_at_coords passes latitude).

  Returns:
    A tuple (x_idx, y_idx). Despite the names, x_idx is the ROW index (derived
    from y, counted down from the top edge) and y_idx is the COLUMN index
    (derived from x); callers index arrays as [x_idx, y_idx]. Both are ints
    within the valid index range of the raster.

  Raises:
    ValueError: If (x, y) lies outside `bounds`.
  """
  if x < bounds.minx or x > bounds.maxx or y < bounds.miny or y > bounds.maxy:
    raise ValueError("Coordinates out of bounds")

  # The sizes are annotated as float on Bounds; indices must be ints.
  row_count = int(bounds.raster_size_y)
  column_count = int(bounds.raster_size_x)

  x_idx = row_count - int(math.ceil((y - bounds.miny) / abs(bounds.pixel_size_y)))
  y_idx = int((x - bounds.minx) / abs(bounds.pixel_size_x))

  # The bounds check above is inclusive, so a point exactly on the bottom edge
  # (y == miny) or right edge (x == maxx) would land one past the last
  # row/column, and float rounding on the top edge could give -1, which numpy
  # would silently wrap to the last row. Clamp onto the nearest edge pixel.
  x_idx = min(max(x_idx, 0), row_count - 1)
  y_idx = min(max(y_idx, 0), column_count - 1)

  return x_idx, y_idx

def get_data_at_coords(dataset: AmazonGeoTiff, x: float, y: float, month: int) -> float:
  """Look up the raster value at a geographic coordinate.

  Args:
    dataset: The loaded geotiff to read from.
    x: Longitude.
    y: Latitude.
    month: Index into the last axis of `masked_image`, or -1 to read from
      `yearly_masked_image` instead.

  Returns:
    The value of the pixel containing (x, y).

  Raises:
    ValueError: If the pixel is masked, or (from coords_to_indices) if the
      coordinate is outside the raster bounds.
  """
  extent = get_extent(dataset.gdal_dataset)
  row, column = coords_to_indices(extent, x, y)
  pixel = (dataset.yearly_masked_image[row, column] if month == -1
           else dataset.masked_image[row, column, month])
  if np.ma.is_masked(pixel):
    raise ValueError("Coordinates are masked")
  return pixel

# Randomly sample from geotiffs image files to generate DataFrames.
def generate_tabular_dataset(
    monthly: bool,
    geotiff_features: dict[str, AmazonGeoTiff],
    sample_site_coordinates: List[tuple[float, float]],
    sample_radius: float,
    random_samples_per_site: int) -> pd.DataFrame:
  """Build a DataFrame of raster values sampled at random points near each site.

  For every site (and, when `monthly`, every month 0..11) points are drawn
  uniformly from the square of half-width `sample_radius` centred on the site
  until `random_samples_per_site` of them yield a value in every geotiff.
  Points that are out of bounds or masked in any geotiff are discarded and
  redrawn. The random generator is seeded with a fixed value, so repeated
  calls with the same arguments return the same rows.

  NOTE: the redraw loop has no attempt limit; a site with no valid pixel
  inside the sampling square never finishes.

  Args:
    monthly: If True, sample each month separately and keep a 0-based
      "month_of_year" column. If False, sample the annual mean of each geotiff
      and omit that column.
    geotiff_features: Maps each feature (column) name to its loaded geotiff.
    sample_site_coordinates: (x, y) centre of every site; x is stored as
      "lon" and y as "lat".
    sample_radius: Half-width of the sampling square, in coordinate units.
    random_samples_per_site: Number of accepted samples per site (and per
      month when `monthly`).

  Returns:
    A DataFrame with one column per geotiff feature, then "month_of_year"
    (monthly only), "lon" and "lat". It is empty when there are no sites.
  """
  feature_values = defaultdict(list)

  # -1 makes get_data_at_coords read the annual mean; sampling month 0 for the
  # yearly dataset would return the first band only.
  months = range(12) if monthly else (-1,)

  rs = RandomState(MT19937(SeedSequence(42)))
  for x, y in tqdm(sample_site_coordinates):
    for month in months:
      random_samples_collected = 0
      while random_samples_collected < random_samples_per_site:
        # Uniform offset in [-sample_radius, sample_radius) on each axis.
        sample_x, sample_y = 2*(rs.rand(2) - 0.5) * sample_radius
        sample_x += x
        sample_y += y
        try:
          row = {
              geotiff_label: get_data_at_coords(
                  geotiff, sample_x, sample_y, month)
              for geotiff_label, geotiff in geotiff_features.items()
          }
        except ValueError:
          continue # masked and out-of-bounds coordinates
        row["month_of_year"] = month
        row["lon"] = sample_x
        row["lat"] = sample_y
        random_samples_collected += 1
        for feature_name, value in row.items():
          feature_values[feature_name].append(value)

  samples = pd.DataFrame(feature_values)

  if not monthly:
    # errors="ignore": with no collected rows the column does not exist.
    samples = samples.drop(columns="month_of_year", errors="ignore")

  return samples

In [None]:
# Feature name -> raster file name (inside RASTER_BASE), listed in load order.
_RASTER_FILENAMES = {
    "rh": "R.rh_Stack.tif",
    "temp": "Temperatura_Stack.tif",
    "vpd": "R.vpd_Stack.tif",
    "atmosphere_oxygen_ratio": "Iso_Oxi_Stack.tif",
    "cellulose_oxygen_ratio": "iso_O_cellulose.tif",
}

# Load every raster once; the keys become the feature columns of the datasets.
name_to_geotiff = {
    feature_name: load_raster(get_raster_path(raster_filename))
    for feature_name, raster_filename in _RASTER_FILENAMES.items()
}

# Individual handles to the same loaded rasters.
relative_humidity_geotiff = name_to_geotiff["rh"]
temperature_geotiff = name_to_geotiff["temp"]
vapor_pressure_deficit_geotiff = name_to_geotiff["vpd"]
atmosphere_isoscape_geotiff = name_to_geotiff["atmosphere_oxygen_ratio"]
cellulose_isoscape_geotiff = name_to_geotiff["cellulose_oxygen_ratio"]

Generate and save the dataframes to CSVs:

In [None]:
# (x, y) centre of every sample site; generate_tabular_dataset stores x as
# "lon" and y as "lat". The later dataset cells reuse this list.
sample_site_coordinates = [
    (-70, -5),
    (-67.5, 0),
    (-66, -4.5),
    (-63, -9.5),
    (-63, -9),
    (-62, -6),
    (-60, -2.5),
    (-60, 1),
    (-60, -12.5),
    (-59, -2.5),
    (-57.5, -4),
    (-55, -3.5),
    (-54, -1),
    (-52.5, -13),
    (-51.5, -2.5),
]

# Large monthly dataset: 30 accepted samples per site and month.
monthly_large_df = generate_tabular_dataset(
    geotiff_features=name_to_geotiff,
    sample_site_coordinates=sample_site_coordinates,
    sample_radius=0.5,
    random_samples_per_site=30,
    monthly=True)
monthly_large_df.to_csv(get_output_dir("monthly_large.csv"))

In [None]:
# Sparse monthly dataset: same sites and radius as monthly_large, but only 17
# accepted samples per site and month.
monthly_sparse_df = generate_tabular_dataset(
    geotiff_features=name_to_geotiff,
    sample_site_coordinates=sample_site_coordinates,
    sample_radius=0.5,
    random_samples_per_site=17,
    monthly=True)
monthly_sparse_df.to_csv(get_output_dir("monthly_sparse.csv"))

In [None]:
# Large yearly dataset: there is no month loop when monthly=False, so 30*12
# samples per site gives the same number of rows per site as monthly_large
# (30 samples x 12 months).
yearly_large_df = generate_tabular_dataset(
    geotiff_features=name_to_geotiff,
    sample_site_coordinates=sample_site_coordinates,
    sample_radius=0.5,
    random_samples_per_site=30*12,
    monthly=False)
yearly_large_df.to_csv(get_output_dir("yearly_large.csv"))

In [None]:
# Small yearly dataset.
# NOTE(review): 17*12 mirrors monthly_sparse (17 samples per site and month).
# With 30*12 the arguments match the yearly_large cell exactly, and because the
# sampler uses a fixed seed the two CSVs would be identical. Confirm the
# intended sample count.
yearly_small_df = generate_tabular_dataset(
    monthly=False,
    geotiff_features=name_to_geotiff,
    sample_site_coordinates=sample_site_coordinates,
    sample_radius=0.5,
    random_samples_per_site=17*12)
yearly_small_df.to_csv(get_output_dir("yearly_small.csv"))