<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/UCDavisXGB/xgboost/hosted_runtime/xgboost_hosted_runtime.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Copyright 2023 Google LLC.
SPDX-License-Identifier: Apache-2.0

# Imports

In [None]:
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

rc('animation', html='jshtml')

In [None]:
# Notebook configuration. Every *_BASE value below is a path fragment that the
# get_*_path() helpers append to GDRIVE_BASE.
#
# Raster directory. Contains:
# iso_O_cellulose.tif: Isoscape of 18O; <-- MODELING TARGET
#   NOTE(review): this line previously read "from Precipitation", same as the next
#   entry; the file is loaded as cellulose_isoscape_geotiff, so presumably it is the
#   cellulose isoscape -- confirm.
# Iso_Oxi_Stack.tif: Isoscape of 18O from Precipitation; <-- Model input
# R.rh_Stack.tif: Atmospheric Relative humidity <-- Model input
# R.vpd_Stack.tif: Vapor Pressure Deficit - VPD <-- Model input
# Temperatura_Stack.tif: Atmospheric Temperature <-- Model input
RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
SAMPLE_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_sample_data/" #@param
TEST_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_test_data/" #@param
ANIMATIONS_BASE = "/MyDrive/amazon_rainforest_files/amazon_animations/" #@param
# Mount point for Google Drive. When empty/falsy, Drive is not mounted and the
# *_BASE fragments above are used as-is.
GDRIVE_BASE = "/content/drive" #@param

# If true, models are trained and saved under MODEL_BASE; if false, previously
# saved models are loaded from MODEL_BASE instead.
REBUILD_MODEL = True #@param {type:"boolean"}
MODEL_BASE = "/MyDrive/amazon_rainforest_files/amazon_isoscape_models/" #@param

# How often should XGB log training metadata? 0 is the default, which indicates never.
# (Passed as the `verbose` argument of XGBRegressor.fit().)
XGB_VERBOSITY_LEVEL = 0 #@param

# Used to compute invalid terrain when making predictions. Leave disabled when memory is limited.
LOAD_WATER_MASK_GEOTIFF = False #@param {type:"boolean"}
LOAD_TREE_MASK_GEOTIFF = False #@param {type:"boolean"}

# If true, the simulated `yearly_data_255_trees` dataset is replaced by real
# samples read from REFERENCE_CSV_FILENAME (looked up under SAMPLE_DATA_BASE).
# If false, the dataset simulated by sampling the rasters is kept.
REFERENCE_CSV_FILENAME = "2023_06_23_Results_Google.csv" #@param
USE_REFERENCE_SAMPLES_FOR_TRAINING = True #@param {type:"boolean"}

# If true, the plant nitrogen geotiff is rebuilt from the soil geotiffs. Requires the following files:
# RASTER_BASE/raster_krig_d15N_soil_plant.tiff
# RASTER_BASE/raster_krig_d15N_soil.tiff
REGENERATE_PLANT_NITROGEN_GEOTIFF = False #@param {type:"boolean"}

# If false, requires XGB oxygen isoscape in MODEL_BASE/predicted_isoscape_xgboost.tiff
REGENERATE_OXYGEN_XGB_ISOSCAPE = True #@param {type:"boolean"}

# If false, requires MODEL_BASE/xgb_means_oxygen_isoscape.tiff and MODEL_BASE/xgb_variances_oxygen_isoscape.tiff
REGENERATE_OXYGEN_XGB_MEANS_VARIANCES = True #@param {type:"boolean"}

# If false, requires MODEL_BASE/plant_nitrogen_isoscape_pooled_samples.numpy.mask and MODEL_BASE/plant_nitrogen_isoscape_pooled_samples.numpy.data
REGENERATE_NITROGEN_ISOSCAPE = False #@param {type:"boolean"}

In [None]:
#@title Debugging
# See https://zohaib.me/debugging-in-google-collab-notebook/ for tips,
# as well as docs for pdb and ipdb.
# Set DEBUG to True to install ipdb and switch on automatic post-mortem debugging.
DEBUG = False #@param {type:"boolean"}
if DEBUG:
    # The `%` lines are IPython magics: they only work inside a notebook session.
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

# Data Types

In [None]:
@dataclass
class AmazonGeoTiff:
  """Represents a geotiff from our dataset.

  Instances are produced by load_raster(), which stores every raster as 12
  layers along the last axis (a single-band file is repeated 12 times).
  """
  # Open GDAL handle; used later to read the geotransform and raster size.
  gdal_dataset: gdal.Dataset
  # Pixel values, indexed [row, column, layer].
  image_value_array: np.ndarray # ndarray of floats
  # GDAL mask-band values with the same shape as the image; 0 marks an invalid pixel.
  image_mask_array: np.ndarray
  # image_value_array with every pixel whose mask value is 0 masked out.
  masked_image: np.ma.masked_array
  # Mean of masked_image over the layer axis, indexed [row, column].
  yearly_masked_image: np.ma.masked_array

@dataclass
class Bounds:
  """Represents geographic bounds and size information.

  minx/maxx/miny/maxy are the extent of the raster in map coordinates.
  pixel_size_x/pixel_size_y are the per-pixel steps taken from the GDAL
  geotransform; pixel_size_y may be negative, so consumers take abs() of it.
  raster_size_x/raster_size_y are pixel counts (columns and rows). They are
  integers because they are used as array shapes and range() bounds.
  """
  minx: float
  maxx: float
  miny: float
  maxy: float
  pixel_size_x: float
  pixel_size_y: float
  raster_size_x: int
  raster_size_y: int

  def to_matplotlib(self) -> List[float]:
    """Return [minx, maxx, miny, maxy], the order used for imshow's `extent`."""
    return [self.minx, self.maxx, self.miny, self.maxy]

@dataclass
class PartitionedDataset:
  """A dataset split into train, test and validation frames (see partition())."""
  # Rows used to fit a model.
  train: pd.DataFrame
  # Held-out rows; not used by the training helpers in this notebook.
  test: pd.DataFrame
  # Rows passed to XGBoost as the evaluation set during training.
  validation: pd.DataFrame

# Use Global Params to access files

In [None]:
def get_raster_path(filename: str) -> str:
  """Return the location of `filename` inside the raster directory.

  The result is GDRIVE_BASE (or nothing when it is unset) + RASTER_BASE + filename.
  """
  prefix = GDRIVE_BASE or ""
  return "{}{}{}".format(prefix, RASTER_BASE, filename)

def get_model_path(filename: str) -> str:
  """Return the location of `filename` inside the model directory.

  The result is GDRIVE_BASE (or nothing when it is unset) + MODEL_BASE + filename.
  """
  prefix = GDRIVE_BASE or ""
  return "{}{}{}".format(prefix, MODEL_BASE, filename)

def get_sample_db_path(filename: str) -> str:
  """Return the location of `filename` inside the sample-data directory.

  The result is GDRIVE_BASE (or nothing when it is unset) + SAMPLE_DATA_BASE + filename.
  """
  prefix = GDRIVE_BASE or ""
  return "{}{}{}".format(prefix, SAMPLE_DATA_BASE, filename)

def get_animations_path(filename: str) -> str:
  """Return the location of `filename` inside the animations directory.

  The result is GDRIVE_BASE (or nothing when it is unset) + ANIMATIONS_BASE + filename.
  """
  prefix = GDRIVE_BASE or ""
  return "{}{}{}".format(prefix, ANIMATIONS_BASE, filename)

## Utils for loading Rasters

In [None]:
def print_raster_info(raster):
  """Print a short summary of an open GDAL dataset.

  Prints the driver, raster size, projection and (when available) the origin
  and pixel size, followed by overview and color-table information per band.

  Args:
    raster: an open GDAL dataset. Passing None (e.g. the result of a failed
      gdal.Open) raises AttributeError, which load_raster() relies on.
  """
  dataset = raster
  driver = dataset.GetDriver()
  print("Driver: {}/{}".format(driver.ShortName, driver.LongName))
  print("Size is {} x {} x {}".format(dataset.RasterXSize,
                                      dataset.RasterYSize,
                                      dataset.RasterCount))
  print("Projection is {}".format(dataset.GetProjection()))
  geotransform = dataset.GetGeoTransform()
  if geotransform:
      print("Origin = ({}, {})".format(geotransform[0], geotransform[3]))
      print("Pixel Size = ({}, {})".format(geotransform[1], geotransform[5]))

  for band_number in range(1, dataset.RasterCount + 1):
    band = dataset.GetRasterBand(band_number)
    #print("Band Type={}".format(gdal.GetDataTypeName(band.DataType)))

    # Only fall back to scanning the raster when the statistics are really
    # missing (None). A stored minimum or maximum of 0.0 is a valid value and
    # must not trigger the expensive ComputeRasterMinMax() pass.
    band_min = band.GetMinimum()
    band_max = band.GetMaximum()
    if band_min is None or band_max is None:
        (band_min, band_max) = band.ComputeRasterMinMax(False)
    #print("Min={:.3f}, Max={:.3f}".format(band_min, band_max))

    overview_count = band.GetOverviewCount()
    if overview_count > 0:
        print("Band has {} overviews".format(overview_count))

    color_table = band.GetRasterColorTable()
    if color_table:
        print("Band has a color table with {} entries".format(color_table.GetCount()))

def load_raster(path: str, use_only_band_index: int = -1) -> AmazonGeoTiff:
  """Load a GeoTIFF into an AmazonGeoTiff holding 12 layers.

  A 12-band file is read band by band. A single band (either the only band of
  the file or the one selected with `use_only_band_index`) is read once and
  repeated across all 12 layers.

  TODO: Refactor (is_single_band, etc., should be a better design)
  --> Find a way to simplify this logic. Maybe it needs to be more abstract.

  Args:
    path: location of the GeoTIFF.
    use_only_band_index: zero-based index of the single band to use, or -1 to
      use all 12 bands (or the only band of a one-band file).

  Returns:
    The loaded raster with its mask, masked image and per-pixel mean over layers.

  Raises:
    OSError: the file could not be opened/inspected.
    ValueError: no band was selected and the file has neither 1 nor 12 bands.
    IndexError: `use_only_band_index` does not refer to an existing band.
  """
  dataset = gdal.Open(path, gdal.GA_ReadOnly)
  if dataset is None:
    # gdal.Open signals failure by returning None; report it directly instead
    # of waiting for an AttributeError further down.
    raise OSError("Failed to print raster. This likely means it did not load properly from "+ path)
  try:
    print_raster_info(dataset)
  except AttributeError as e:
    raise OSError("Failed to print raster. This likely means it did not load properly from "+ path) from e

  if use_only_band_index == -1:
    if dataset.RasterCount != 12 and dataset.RasterCount != 1:
      raise ValueError(f"Expected 12 raster bands (one for each month) or one annual average, but found {dataset.RasterCount}")
    if dataset.RasterCount == 1:
      use_only_band_index = 0

  is_single_band = use_only_band_index != -1

  if is_single_band and not 0 <= use_only_band_index < dataset.RasterCount:
    raise IndexError(f"Specified raster band index {use_only_band_index}"
    f" but there are only {dataset.RasterCount} rasters")

  first_band = dataset.GetRasterBand(1)
  image_dtype = gdal_array.GDALTypeCodeToNumericTypeCode(first_band.DataType)
  # The mask gets the mask band's own data type rather than the image's.
  mask_dtype = gdal_array.GDALTypeCodeToNumericTypeCode(first_band.GetMaskBand().DataType)
  shape = (dataset.RasterYSize, dataset.RasterXSize, 12)
  image = np.zeros(shape, dtype=image_dtype)
  mask = np.zeros(shape, dtype=mask_dtype)

  if is_single_band:
    # Read the band once and broadcast it over the 12 layers.
    band = dataset.GetRasterBand(use_only_band_index+1)
    image[:, :, :] = band.ReadAsArray()[:, :, np.newaxis]
    mask[:, :, :] = band.GetMaskBand().ReadAsArray()[:, :, np.newaxis]
  else:
    for band_index in range(12):
      band = dataset.GetRasterBand(band_index+1)
      image[:, :, band_index] = band.ReadAsArray()
      mask[:, :, band_index] = band.GetMaskBand().ReadAsArray()

  # A mask value of 0 marks an invalid pixel.
  masked_image = np.ma.masked_where(mask == 0, image)
  yearly_masked_image = masked_image.mean(axis=2)

  return AmazonGeoTiff(dataset, image, mask, masked_image, yearly_masked_image)

def get_extent(dataset):
  """Build the Bounds of a GDAL dataset from its geotransform and raster size.

  The geotransform supplies the origin (minx, maxy) and the per-pixel steps;
  the opposite corner is the origin plus step times pixel count.
  """
  origin_x, step_x, _, origin_y, _, step_y = dataset.GetGeoTransform()
  columns = dataset.RasterXSize
  rows = dataset.RasterYSize
  far_x = origin_x + step_x * columns
  far_y = origin_y + step_y * rows
  return Bounds(origin_x, far_x, far_y, origin_y, step_x, step_y, columns, rows)

def plot_band(geotiff: AmazonGeoTiff, month_index, figsize=None):
  """Show one layer of `geotiff` with a colorbar, using map coordinates as axes.

  A new figure is opened only when `figsize` is given.
  """
  if figsize:
    plt.figure(figsize=figsize)
  layer = geotiff.masked_image[:,:,month_index]
  map_extent = get_extent(geotiff.gdal_dataset).to_matplotlib()
  shown = plt.imshow(layer, extent=map_extent, interpolation='none')
  plt.colorbar(shown)

def animate(geotiff: AmazonGeoTiff, nSeconds, fps):
  """Build an animation that steps through the 12 layers of `geotiff`.

  Args:
    geotiff: raster whose masked_image layers are shown one per frame.
    nSeconds: length of the animation in seconds.
    fps: frames per second; the animation has nSeconds * fps frames.

  Returns:
    The matplotlib FuncAnimation. The figure is closed before returning so
    that only the animation is displayed.
  """
  fig = plt.figure( figsize=(8,8) )

  months = [geotiff.masked_image[:,:,m] for m in range(12)]
  labels = [f"Month: {m+1}" for m in range(12)]
  extent = get_extent(geotiff.gdal_dataset).to_matplotlib()
  ax = fig.add_subplot()
  im = ax.imshow(months[0], interpolation='none', aspect='auto', extent = extent)
  txt = fig.text(0.3,0,"", fontsize=24)
  fig.colorbar(im)

  def animate_func(i):
    # Progress indicator: one dot per second of animation rendered.
    if i % fps == 0:
      print( '.', end ='' )

    # Wrap around so animations longer than 12 frames loop over the layers
    # instead of indexing past the end of `months`.
    month = i % len(months)
    im.set_array(months[month])
    txt.set_text(labels[month])
    return [im, txt]

  anim = animation.FuncAnimation(
                                fig,
                                animate_func,
                                frames = nSeconds * fps,
                                interval = 1000 / fps, # in ms
                                )
  plt.close()

  return anim

def save_numpy_to_geotiff(bounds: Bounds, prediction: np.ma.MaskedArray, path: str):
  """Write an (x, y, band) array to a new Float64 GeoTIFF at `path`.

  The geotransform is built from `bounds` and the projection is set to WGS 84.
  Each band gets a mask band; pixels masked in `prediction` are written with
  the band's nodata value (NaN when the band has none) and flagged invalid.

  Args:
    bounds: extent, pixel size and raster size of the output file.
    prediction: array indexed [x, y, band] whose first two dimensions equal
      (bounds.raster_size_x, bounds.raster_size_y). An unmasked array is
      treated as fully valid.
    path: destination file.

  Raises:
    ValueError: `prediction` is not 3-D or does not match the raster size.
    RuntimeError: the GTiff driver lacks Create()/CreateCopy(), or a mask band
      could not be created.
  """
  # Validate first, so a bad array neither raises a confusing IndexError on
  # prediction.shape[2] nor leaves a half-initialised file behind.
  if len(prediction.shape) != 3 or prediction.shape[0] != bounds.raster_size_x or prediction.shape[1] != bounds.raster_size_y:
    raise ValueError("Shape of prediction does not match base geotiff")

  driver = gdal.GetDriverByName("GTiff")
  metadata = driver.GetMetadata()
  if metadata.get(gdal.DCAP_CREATE) != "YES":
      raise RuntimeError("GTiff driver does not support required method Create().")
  if metadata.get(gdal.DCAP_CREATECOPY) != "YES":
      raise RuntimeError("GTiff driver does not support required method CreateCopy().")

  dataset = driver.Create(path, bounds.raster_size_x, bounds.raster_size_y, prediction.shape[2], eType=gdal.GDT_Float64)
  dataset.SetGeoTransform([bounds.minx, bounds.pixel_size_x, 0, bounds.maxy, 0, bounds.pixel_size_y])
  dataset.SetProjection('GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433],AUTHORITY["EPSG","4326"]]')

  # Reorder [x, y, band] into [row, column, band] and reverse the row order to
  # get the layout that is written to the file.
  prediction_transformed = np.flip(np.transpose(prediction, axes=[1,0,2]), axis=0)
  for band_index in range(dataset.RasterCount):
    band = dataset.GetRasterBand(band_index+1)
    if band.CreateMaskBand(0) == gdal.CE_Failure:
      raise RuntimeError("Failed to create mask band")
    mask_band = band.GetMaskBand()

    layer = prediction_transformed[:, :, band_index]
    invalid = np.ma.getmaskarray(layer)
    nodata = band.GetNoDataValue()
    fill_value = np.nan if nodata is None else nodata
    band.WriteArray(np.where(invalid, fill_value, np.ma.getdata(layer)))
    # The mask band stores validity, i.e. the inverse of the numpy mask.
    mask_band.WriteArray(np.logical_not(invalid))

def coords_to_indices(bounds: Bounds, x: float, y: float):
  """Convert map coordinates into indices of a [row, column] raster array.

  Args:
    bounds: extent and size of the raster.
    x: horizontal coordinate, between bounds.minx and bounds.maxx.
    y: vertical coordinate, between bounds.miny and bounds.maxy.

  Returns:
    (x_idx, y_idx). Despite the names, x_idx is the ROW index (derived from y,
    counted down from the top edge at maxy) and y_idx is the COLUMN index
    (derived from x); use them as array[x_idx, y_idx].

  Raises:
    ValueError: the coordinates lie outside `bounds`, or sit so close to an
      edge that the computed index falls outside the raster.
  """
  if x < bounds.minx or x > bounds.maxx or y < bounds.miny or y > bounds.maxy:
    raise ValueError("Coordinates out of bounds")

  # X => lat, Y => lon
  x_idx = bounds.raster_size_y - int(math.ceil((y - bounds.miny) / abs(bounds.pixel_size_y)))
  y_idx = int((x - bounds.minx) / abs(bounds.pixel_size_x))

  # Coordinates exactly on an edge (or float rounding next to one) can produce
  # an index equal to the raster size, or -1, which numpy would silently wrap
  # to the opposite side. Report those as out of bounds instead.
  if not (0 <= x_idx < bounds.raster_size_y and 0 <= y_idx < bounds.raster_size_x):
    raise ValueError("Coordinates out of bounds")

  return x_idx, y_idx

def test_coords_to_indices():
  """Check coords_to_indices() against hand-computed (row, column) results."""
  cases = [
    # (bounds, (x, y), (expected x_idx, expected y_idx))
    (Bounds(50, 100, 50, 100, 1, 1, 50, 50), (55, 55), (45, 5)),
    (Bounds(-100, -50, -100, -50, 1, 1, 50, 50), (-55, -55), (5, 45)),
    (Bounds(-10, 50, -10, 50, 1, 1, 60, 60), (-1, 13), (37, 9)),
    # Non-integer pixel sizes with a negative pixel_size_y.
    # The expected x_idx was: 132
    (Bounds(minx=-73.97513931345594, maxx=-34.808472803053895, miny=-33.73347244751509, maxy=5.266527396029211, pixel_size_x=0.04166666650042771, pixel_size_y=-0.041666666499513144, raster_size_x=937, raster_size_y=941),
     (-67.14342073173958, -7.273271869467912e-05),
     (131, 163)),
  ]
  for bounds, (coord_x, coord_y), (expected_x, expected_y) in cases:
    x, y = coords_to_indices(bounds, coord_x, coord_y)
    assert x == expected_x
    assert y == expected_y

# Run the check as soon as the cell executes.
test_coords_to_indices()

def get_data_at_coords(dataset: AmazonGeoTiff, x: float, y: float, month: int) -> float:
  """Look up the raster value at map coordinates (x = longitude, y = latitude).

  Args:
    dataset: raster to read from.
    x: longitude.
    y: latitude.
    month: layer index into masked_image, or -1 for yearly_masked_image.

  Raises:
    ValueError: the pixel is masked (coords_to_indices() raises the same
      exception type for coordinates outside the raster).
  """
  row, column = coords_to_indices(get_extent(dataset.gdal_dataset), x, y)
  use_yearly = month == -1
  value = (dataset.yearly_masked_image[row, column] if use_yearly
           else dataset.masked_image[row, column, month])
  if np.ma.is_masked(value):
    raise ValueError("Coordinates are masked")
  return value

# Load Rasters

In [None]:
# Access data stored on Google Drive
# Mount Google Drive at GDRIVE_BASE; skipped when GDRIVE_BASE is empty.
# The import is local because google.colab only exists inside Colab.
if GDRIVE_BASE:
    from google.colab import drive
    drive.mount(GDRIVE_BASE)

In [None]:
# Every file below is looked up in the raster directory (see get_raster_path()).
brazil_map_geotiff = load_raster(get_raster_path("brasil_clim_raster.tiff")) # mean annual precipitation
# Will be used to compute isoscapes for carbon and nitrogen

# Model inputs, plus the cellulose isoscape that serves as the modeling target.
relative_humidity_geotiff = load_raster(get_raster_path("R.rh_Stack.tif"))
temperature_geotiff = load_raster(get_raster_path("Temperatura_Stack.tif"))
vapor_pressure_deficit_geotiff = load_raster(get_raster_path("R.vpd_Stack.tif"))
atmosphere_isoscape_geotiff = load_raster(get_raster_path("Iso_Oxi_Stack.tif"))
cellulose_isoscape_geotiff = load_raster(get_raster_path("iso_O_cellulose.tif"))

# Soil Geotiffs are not necessary to load, but required to build plant nitrogen geotiff.
soil_plant_nitrogen_difference_isoscape_geotiff = load_raster(get_raster_path("raster_krig_d15N_soil_plant.tiff"))
soil_nitrogen_isoscape_geotiff = load_raster(get_raster_path("raster_krig_d15N_soil.tiff"))
# NOTE(review): this load needs plant_nitrogen_isoscape.tiff to exist already, even
# when REGENERATE_PLANT_NITROGEN_GEOTIFF is set; the same file is loaded again
# right after the regeneration step.
plant_nitrogen_isoscape_geotiff = load_raster(get_raster_path("plant_nitrogen_isoscape.tiff"))

carbon_means_krig_isoscape_geotiff = load_raster(get_raster_path("Brasil_Raster_Krig_iso_d13C.tiff"))

# Optional masks; left as None unless the corresponding LOAD_* flag is set.
land_water_mask_geotiff = load_raster(get_raster_path("Land_Water_Brazil_MODIS.tif")) if LOAD_WATER_MASK_GEOTIFF else None
possible_tree_mask_geotiff = load_raster(get_raster_path("Possible_Trees_Brazil_MODIS.tif")) if LOAD_TREE_MASK_GEOTIFF else None

In [None]:
if REGENERATE_PLANT_NITROGEN_GEOTIFF:
  # Plant nitrogen raster = soil raster minus the soil/plant difference raster.
  plant_nitrogen_array = soil_nitrogen_isoscape_geotiff.yearly_masked_image - soil_plant_nitrogen_difference_isoscape_geotiff.yearly_masked_image
  # save_numpy_to_geotiff() expects Bounds (not an AmazonGeoTiff) and an
  # [x, y, band] array, hence the get_extent() call and the
  # transpose/flip/expand_dims below.
  save_numpy_to_geotiff(
    get_extent(soil_plant_nitrogen_difference_isoscape_geotiff.gdal_dataset),
    np.expand_dims(np.flip(plant_nitrogen_array.T, axis=1), axis=2),
    get_raster_path("plant_nitrogen_isoscape.tiff"))
# Load the plant nitrogen raster from disk, whether or not it was just rebuilt.
plant_nitrogen_isoscape_geotiff = load_raster(get_raster_path("plant_nitrogen_isoscape.tiff"))

# Train Isoscape Models

## Preprocess

Sample raster data around the sites on Martinelli's map of measurement sites, producing simulated ("fake") datasets for training isoscape models.

In [None]:
def gen_tabular_dataset(monthly: bool, samples_per_site: int) -> pd.DataFrame:
  """Simulate a dataset by sampling the rasters around 15 fixed sites.

  Each site is given as (x, y) = (longitude, latitude) and sampled within a
  radius of 0.5 by gen_tabular_dataset_with_coords().
  """
  measurement_sites = [
    (-70,-5,), (-67.5,0,), (-66,-4.5,), (-63,-9.5,),
    (-63,-9,), (-62,-6,), (-60,-2.5,), (-60,1,),
    (-60,-12.5,), (-59,-2.5,), (-57.5,-4,),
    (-55,-3.5,), (-54,-1,), (-52.5,-13,), (-51.5,-2.5,),
  ]
  site_radius = 0.5
  return gen_tabular_dataset_with_coords(
    monthly, samples_per_site, measurement_sites, site_radius)

def gen_tabular_dataset_with_coords(monthly: bool, samples_per_site: int, sample_site_coordinates: list, sample_radius: float) -> pd.DataFrame:
  """Build a table of raster values sampled around the given sites.

  For every site (and every month when `monthly` is set) points are drawn
  uniformly from a square of half-width `sample_radius` around the site, using
  a random generator with a fixed seed. Each valid point yields one row with
  lat, lon, the raster features and the cellulose target.

  Args:
    monthly: sample each of the 12 monthly layers; otherwise sample the yearly
      mean and omit the month_of_year column.
    samples_per_site: rows to collect per site (and per month).
    sample_site_coordinates: (x, y) = (longitude, latitude) pairs.
    sample_radius: half-width of the sampling square. With 0, a site that
      cannot be read is skipped instead of retried.

  Returns:
    DataFrame with columns lat, lon, [month_of_year,] rh, temp, vpd,
    atmosphere_oxygen_ratio, cellulose_oxygen_ratio.
  """
  rasters = [relative_humidity_geotiff, temperature_geotiff, vapor_pressure_deficit_geotiff, atmosphere_isoscape_geotiff, cellulose_isoscape_geotiff]
  raster_columns = ["rh", "temp", "vpd", "atmosphere_oxygen_ratio", "cellulose_oxygen_ratio"]
  all_columns = ["lat", "lon", "month_of_year"] + raster_columns
  rng = RandomState(MT19937(SeedSequence(42)))

  collected = {column: [] for column in all_columns}
  # -1 selects the yearly mean in get_data_at_coords().
  months = range(0, 12) if monthly else range(-1, 0)

  for site in tqdm(sample_site_coordinates):
    for month in months:
      n_collected = 0
      while n_collected < samples_per_site:
        offset_x, offset_y = 2*(rng.rand(2) - 0.5) * sample_radius
        sample_x = offset_x + site[0]
        sample_y = offset_y + site[1]

        try:
          record = {
            column: get_data_at_coords(raster, sample_x, sample_y, month)
            for raster, column in zip(rasters, raster_columns)
          }
        except ValueError as e:
          # masked and out-of-bounds coordinates
          print("!!!!! x={:f}, y={:f}".format(sample_x, sample_y))
          # With a zero radius a retry would hit the same point again, so the
          # attempt is counted to avoid looping forever.
          if sample_radius == 0:
            n_collected += 1
          continue

        record["month_of_year"] = month
        record["lon"] = sample_x
        record["lat"] = sample_y
        n_collected += 1
        for column, value in record.items():
          collected[column].append(value)

  samples = pd.DataFrame(collected)

  if not monthly:
    samples.drop("month_of_year", axis=1, inplace=True)

  return samples

# Simulated datasets. gen_tabular_dataset() samples 15 sites, so 17 samples per
# site gives 255 rows for the yearly "255 trees" set; the monthly variants
# collect that many rows per site for each of the 12 months.
monthly_data_large = gen_tabular_dataset(monthly=True, samples_per_site=30)
monthly_data_255_trees = gen_tabular_dataset(monthly=True, samples_per_site=17)
yearly_data_large = gen_tabular_dataset(monthly=False, samples_per_site=30*12)
yearly_data_255_trees = gen_tabular_dataset(monthly=False, samples_per_site=17)

In [None]:
def load_sample_data() -> pd.DataFrame:
  """Load the reference samples and attach raster features to each location.

  Reads REFERENCE_CSV_FILENAME from the sample-data directory, averages
  d18O_cel per (lat, long) location, looks up the raster features at each
  location and returns one row per location that could be read from the
  rasters, with the measured mean as the last column (cellulose_oxygen_ratio).
  """
  df = pd.read_csv(get_sample_db_path(REFERENCE_CSV_FILENAME), encoding="ISO-8859-1", sep=',')
  print(df.shape)
  df = df[['Code', 'lat', 'long', 'd18O_cel']]
  df = df[df['d18O_cel'].notna()]

  # Average only the measurement column. Averaging the whole frame would also
  # try to average 'Code', which raises on recent pandas versions when that
  # column is not numeric.
  means = df.groupby(['lat', 'long'])['d18O_cel'].mean().reset_index()
  locations = list(zip(means["long"], means["lat"]))

  # A radius of 0 samples each location exactly, so the coordinates in the
  # result can be matched back to `means` with an equality join.
  sample_data = gen_tabular_dataset_with_coords(monthly=False, samples_per_site=1, sample_site_coordinates=locations, sample_radius = 0)
  # Replace the raster-derived target with the measured one.
  sample_data = sample_data.drop('cellulose_oxygen_ratio', axis = 1)
  sample_data = pd.merge(sample_data, means, how="inner", left_on=['lat', 'lon'], right_on=['lat', 'long'])
  sample_data = sample_data.drop('long', axis=1).rename(columns={'d18O_cel': 'cellulose_oxygen_ratio' }).reset_index(drop=True)
  print()
  print(sample_data)

  return sample_data

# If real reference samples are requested, they replace the simulated data.
if USE_REFERENCE_SAMPLES_FOR_TRAINING:
  # Override the yearly data with the reference sample locations (UC Davis).
  yearly_data_255_trees = load_sample_data()

In [None]:
# Preview the first rows of the leaf sample table.
leaf_data = pd.read_csv(get_sample_db_path("pontos-vasp-cluster.csv"))
leaf_data.head()

In [None]:
def load_leaf_dataframe(db_path: str, isotope_col: str):
  """Read the leaf CSV and keep the feature columns plus one isotope column.

  The latitude/longitude columns are renamed to lat/lon, and `isotope_col` is
  placed last so it can serve as the training target.
  """
  feature_columns = ["lon", "lat", "MAP", "MAT", "vap", "d15N_soil", "dem", "pa", "pet", "ph"]
  raw = pd.read_csv(db_path)
  renamed = raw.rename(columns={"latitude": "lat", "longitude": "lon"})
  return renamed[feature_columns + [isotope_col]]

# Same source file for both; only the isotope (target) column differs.
carbon_df = load_leaf_dataframe(get_sample_db_path("pontos-vasp-cluster.csv"), "d13C")
nitrogen_df = load_leaf_dataframe(get_sample_db_path("pontos-vasp-cluster.csv"), "d15N")

### Partition

In [None]:
def partition(df) -> PartitionedDataset:
  """Split `df` geographically using its lon and lat columns.

  Rows with lon < -55 are the training set. The remaining rows are split at
  lat -2.85: above it is the test set, at or below it the validation set.
  """
  lon = df["lon"]
  lat = df["lat"]
  east = lon >= -55
  return PartitionedDataset(
    train=df[lon < -55],
    test=df[east & (lat > -2.85)],
    validation=df[east & (lat <= -2.85)])

def print_split(dataset: PartitionedDataset) -> None:
  """Print each partition's share of all rows, with its row count."""
  sizes = (
    ("Train", len(dataset.train)),
    ("Test", len(dataset.test)),
    ("Validation", len(dataset.validation)),
  )
  total_len = sum(size for _, size in sizes)
  for label, size in sizes:
    print(f"{label}: {100*size/total_len:.2f}% ({size})")

In [None]:
# Split the large simulated yearly dataset and report the partition sizes.
yearly_large_partitioned = partition(yearly_data_large)
print_split(yearly_large_partitioned)

In [None]:
# Split the yearly "255 trees" dataset (the reference samples, when
# USE_REFERENCE_SAMPLES_FOR_TRAINING is set) and report the partition sizes.
yearly_255_trees_partitioned = partition(yearly_data_255_trees)
print_split(yearly_255_trees_partitioned)

In [None]:
# Split the large simulated monthly dataset and report the partition sizes.
monthly_large_partitioned = partition(monthly_data_large)
print_split(monthly_large_partitioned)

In [None]:
# Split the monthly "255 trees" dataset and report the partition sizes.
monthly_255_trees_partitioned = partition(monthly_data_255_trees)
print_split(monthly_255_trees_partitioned)

In [None]:
# Split the leaf nitrogen (d15N) samples and report the partition sizes.
nitrogen_df_partitioned = partition(nitrogen_df)
print_split(nitrogen_df_partitioned)

In [None]:
# Split the leaf carbon (d13C) samples and report the partition sizes.
carbon_df_partitioned = partition(carbon_df)
print_split(carbon_df_partitioned)

## XGBoost: Train XGBoost Models

In [None]:
def train_xgb(data: PartitionedDataset, booster: str, rounds: int) -> xgb.XGBRegressor:
  """Fit an XGBoost regressor on `data.train`, evaluated on `data.validation`.

  The last column of each frame is the target; all preceding columns are the
  features.

  Args:
    data: partitioned dataset; only train and validation are used.
    booster: XGBoost booster name.
    rounds: number of boosting rounds (n_estimators).
  """
  regressor = xgb.XGBRegressor(n_estimators=rounds, eta=0.1, max_depth=2, objective='reg:squarederror', booster=booster)
  # Features are every column but the last; the last column is the target.
  train_features = data.train.iloc[:, :-1]
  train_target = data.train.iloc[:, -1]
  validation_features = data.validation.iloc[:, :-1]
  validation_target = data.validation.iloc[:, -1]
  print(f"Predicting: {data.train.columns[-1]}")
  regressor.fit(
    train_features,
    train_target,
    eval_set=[(validation_features, validation_target)],
    verbose=XGB_VERBOSITY_LEVEL)
  return regressor

def train_or_load_xgboost(basename: str, data: PartitionedDataset, rounds: int=100000):
  """Train and save a model, or load a saved one, depending on REBUILD_MODEL.

  The model is stored in `{basename}_xgb.json` and its booster configuration
  in `{basename}_config_xgb.json`.

  Args:
    basename: path prefix of the model files.
    data: dataset used for training; ignored when loading.
    rounds: boosting rounds used for training; ignored when loading.

  Returns:
    The trained or loaded XGBRegressor.
  """
  model_path = f"{basename}_xgb.json"
  config_path = f"{basename}_config_xgb.json"
  if REBUILD_MODEL:
    print("Training model")
    model = train_xgb(data, booster='gblinear', rounds=rounds)
    with open(config_path, "w") as f:
      f.write(model.get_booster().save_config())
    model.save_model(model_path)
  else:
    print("Loading model")
    model = xgb.XGBRegressor()
    model.load_model(model_path)
    with open(config_path, "r") as f:
      model.get_booster().load_config(f.read())

  # The evaluation history is produced by fit(). A model loaded from disk may
  # not carry it, and reporting the RMSE must not make loading fail.
  try:
    validation_rmse = model.evals_result()['validation_0']['rmse'][-1]
  except (AttributeError, KeyError, xgb.core.XGBoostError):
    print("RMSE (validation): unavailable (no evaluation history stored with this model)")
  else:
    print(f"RMSE (validation): {validation_rmse}")
  return model


In [None]:
# Oxygen isoscape model, trained on (or loaded for) the yearly "255 trees" data.
# Results recorded from earlier runs:
# Validation RMSE xgboost: 0.306059 w/ 100,000 rounds
# Validation RMSE google internal tooling: 0.39386
yearly_255_trees_xgb_model = train_or_load_xgboost(
  get_model_path("oxygen_isoscape_model"),
  yearly_255_trees_partitioned,
  rounds=100000)

In [None]:
# Carbon isoscape model; uses the default number of rounds of train_or_load_xgboost().
# HMM, post-bugfix, Carbon might diverge too.
carbon_isoscape_model = train_or_load_xgboost(get_model_path("carbon_isoscape_model"), carbon_df_partitioned)

In [None]:
# Nitrogen isoscape model, trained with 10,000 rounds.
# Validation loss seems to diverge
nitrogen_isoscape_model = train_or_load_xgboost(get_model_path("nitrogen_isoscape_model"), nitrogen_df_partitioned, rounds=10000)

### Test XGBoost Model Code

Test data created as follows:
```python
# Create data for unit tests
from io import StringIO

train_text = StringIO()
yearly_255_trees_partitioned.validation.iloc[:10].to_csv(train_text, index=False)
print(train_text.getvalue())
```

In [None]:
import os

def create_test_data():
  """Return a small, fixed PartitionedDataset for the model unit test.

  Train and validation each hold 10 rows parsed from the embedded CSV text;
  the test partition is an empty DataFrame.
  """
  # NOTE(review): the data rows inside the literals are indented by two spaces,
  # so the first field of each row starts with whitespace -- this relies on
  # pandas tolerating leading whitespace in numeric fields; confirm.
  train_txt = """lat,lon,rh,temp,vpd,atmosphere_oxygen_ratio,cellulose_oxygen_ratio
  -4.880332787307218,-69.95800610699372,0.8044400215148926,26.225001017252605,0.6966667175292969,-4.451689084370931,37.122222900390625
  -4.688096349322666,-70.44263021829333,0.80293075243632,26.35833485921224,0.7074999809265137,-4.41741943359375,37.17825063069662
  -4.872397683046066,-69.63990597562567,0.8054693539937338,26.308329264322918,0.6958333651224772,-4.417543411254883,37.1220448811849
  -4.8247274690858735,-69.81806665464333,0.8040264447530111,26.308331807454426,0.7016665935516357,-4.424846013387044,37.14974721272787
  -4.765274838163909,-70.01923594475969,0.8022874991099039,26.337496439615887,0.709166685740153,-4.418321291605632,37.20004526774088
  -4.771462642125715,-70.34365888186157,0.8011360963185629,26.508333841959637,0.7199999491373698,-4.385458946228027,37.247047424316406
  -4.798305195940103,-70.28306090761369,0.802595059076945,26.366666158040363,0.709166685740153,-4.411936124165853,37.20568339029948
  -5.223217462581197,-69.53591146126529,0.8077573776245117,26.10833231608073,0.6808333396911621,-4.4716800053914385,37.02557881673177
  -4.613341938104102,-69.7943386465883,0.8022151788075765,26.400001525878906,0.7116666634877523,-4.423205057779948,37.17969512939453
  -4.527212807226629,-69.8817482523093,0.8018482526143392,26.4499994913737,0.7149999936421713,-4.395961443583171,37.22857411702474"""
  train_df = pd.read_csv(StringIO(train_txt))

  val_txt = """lat,lon,rh,temp,vpd,atmosphere_oxygen_ratio,cellulose_oxygen_ratio
  -3.6696957825007046,-54.87948135669049,0.8142155011494955,25.770833333333332,0.6483333110809326,-3.3461217880249023,38.01644388834635
  -3.8993546066489224,-54.86859101648585,0.805713415145874,26.149998982747395,0.6933333079020182,-3.261778195699056,38.26288604736328
  -3.159593531700783,-54.93297940204632,0.819889227549235,25.912501017252605,0.6333333253860474,-3.287173271179199,37.95276896158854
  -3.9960354096696906,-54.87203413927777,0.8026766777038574,26.149998982747395,0.7041666507720947,-3.2518555323282876,38.3813222249349
  -3.80255400058822,-54.751942535999156,0.8073338667551676,26.054166158040363,0.6833333174387614,-3.302551587422689,38.1731923421224
  -12.828852720078402,-52.607319143523036,0.7082154750823975,25.258333841959637,1.0200000603993733,-3.547501564025879,39.927050272623696
  -12.532258968752565,-52.097391445126696,0.7110532919565836,25.520833333333332,1.019166628519694,-3.43365478515625,40.047627766927086
  -13.427351375947753,-52.060761037543834,0.7014106909434,24.8249994913737,1.005833387374878,-3.589900334676107,40.11137390136719
  -13.349866138079692,-52.65445230256682,0.7073808511098226,24.958333333333332,1.00083327293396,-3.5771010716756186,39.98369598388672
  -12.730453380778542,-52.375592581693155,0.7120146751403809,25.2375005086263,1.0024999777475994,-3.494396209716797,40.02317810058594"""
  val_df = pd.read_csv(StringIO(val_txt))

  # The model unit test does not use a test partition.
  test_df = pd.DataFrame()

  return PartitionedDataset(train=train_df, test=test_df, validation=val_df)

# This function overrides REBUILD_MODEL for testing.
def test_train_or_load_xgboost__load_succeeds():
  """Check that a trained model survives a save/load round trip.

  Trains a small model under /tmp, verifies that the validation loss went
  down, then reloads the model from disk and verifies that it produces the
  same prediction. REBUILD_MODEL is temporarily overridden and always restored.
  """
  # Start from a clean slate so the load step can only see this run's files.
  for f in glob.glob("/tmp/foobar_model*"):
    os.remove(f)

  probe = pd.DataFrame(np.array([[1,2,3,4,5,6]], dtype=float), columns=['lat', 'lon', 'rh', 'temp', 'vpd', 'atmosphere_oxygen_ratio'])

  # TODO: Probably better to have a function to load xgboost models instead of train and load sharing a function.
  global REBUILD_MODEL
  REBUILD_MODEL_tmp = REBUILD_MODEL
  try:
    REBUILD_MODEL = True
    model_under_test = train_or_load_xgboost("/tmp/foobar_model", create_test_data(), rounds=100)

    final_loss = model_under_test.evals_result()['validation_0']['rmse'][-1]
    initial_loss = model_under_test.evals_result()['validation_0']['rmse'][0]
    assert final_loss < initial_loss

    original_prediction = model_under_test.predict(probe)[0]

    REBUILD_MODEL = False
    model_under_test = train_or_load_xgboost("/tmp/foobar_model", create_test_data(), rounds=100)
  finally:
    # Restore the notebook-wide setting even when training or an assertion fails.
    REBUILD_MODEL = REBUILD_MODEL_tmp

  loaded_prediction = model_under_test.predict(probe)[0]
  assert original_prediction == loaded_prediction


test_train_or_load_xgboost__load_succeeds()

**We also trained a model assuming 255 trees sampled monthly.**

Preserving this as text only because it is not realistic as of 2023.

Validation RMSE xgboost: 0.29072 \
Validation RMSE Google internal tooling: 0.29183 \
`monthly_255_trees_xgb_model = train_xgb(monthly_255_trees_partitioned, booster='gbtree', rounds=15000)`

For the best results here, add `max_depth=2` to XGBRegressor params.

# Data Validation

Do we use coordinates correctly?
Ideally, we should create a sample image and make this a unit test.


In [None]:
# Spot check: relative humidity at longitude -65, latitude -5, layer 0.
# Raises ValueError when the pixel is masked or out of bounds.
get_data_at_coords(relative_humidity_geotiff, -65, -5, 0)

In [None]:
# Spot check: relative humidity at longitude -43, latitude -10, layer 0.
# Raises ValueError when the pixel is masked or out of bounds.
get_data_at_coords(relative_humidity_geotiff, -43, -10, 0)

# Compute Isoscapes

## XGBoost: Compute AI-Predicted Isoscape

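Requires `REGENERATE_OXYGEN_XGB_ISOSCAPE == True`; otherwise the isoscape is not recomputed and the previously saved `predicted_isoscape_xgboost.tiff` is used.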

In [None]:
def get_xgb_isoscape_prediction():
  """Predict the oxygen isoscape on the grid of the cellulose raster.

  Walks the grid column by column, collects the raster features of every
  readable pixel, and runs yearly_255_trees_xgb_model on each column as one
  batch. Pixels whose features are masked or out of bounds stay masked.

  Returns:
    Masked array of shape (raster_size_x, raster_size_y, 1), indexed
    [x, y, 0], where the y index counts upwards from bounds.miny.
  """
  bounds = get_extent(cellulose_isoscape_geotiff.gdal_dataset)
  rasters = [relative_humidity_geotiff, temperature_geotiff, vapor_pressure_deficit_geotiff, atmosphere_isoscape_geotiff]
  raster_columns = ["rh", "temp", "vpd", "atmosphere_oxygen_ratio"]
  grid_shape = [bounds.raster_size_x, bounds.raster_size_y, 1]
  predicted_isoscape = np.ma.array(np.zeros(grid_shape, dtype=float), mask=np.ones(grid_shape, dtype=bool))

  # Only layer 0 is read from the input rasters.
  month = 0
  x_coords = np.arange(bounds.minx, bounds.maxx, bounds.pixel_size_x, dtype=float)
  y_coords = np.arange(bounds.miny, bounds.maxy, -bounds.pixel_size_y, dtype=float)

  for x_idx, x in enumerate(tqdm(x_coords)):
    batch = []
    batch_cells = []
    for y_idx, y in enumerate(y_coords):
      try:
        sample = {
          column: get_data_at_coords(raster, x, y, month)
          for raster, column in zip(rasters, raster_columns)
        }
      except (ValueError, IndexError):
        # masked and out-of-bounds coordinates
        continue
      sample["lon"] = x
      sample["lat"] = y
      batch.append(sample)
      batch_cells.append((y_idx, month,))

    if len(batch) > 0:
      # Present the columns in the order the model was trained with.
      ordered = pd.DataFrame(batch)[yearly_255_trees_xgb_model.get_booster().feature_names]
      column_predictions = yearly_255_trees_xgb_model.predict(ordered)
      for value, (y_idx, month_idx) in zip(column_predictions, batch_cells):
        predicted_isoscape.mask[x_idx,y_idx,month_idx] = False # unmask since we have data
        predicted_isoscape.data[x_idx,y_idx,month_idx] = value

  return predicted_isoscape

# Recompute the predicted isoscape, save it to the model directory as a GeoTIFF
# on the cellulose raster's grid, and show it.
if REGENERATE_OXYGEN_XGB_ISOSCAPE:
  xgb_isoscape_prediction = get_xgb_isoscape_prediction()
  save_numpy_to_geotiff(get_extent(cellulose_isoscape_geotiff.gdal_dataset), xgb_isoscape_prediction, get_model_path("predicted_isoscape_xgboost.tiff"))
  # The array is indexed [x, y, 0], so this preview is not in map orientation.
  plt.imshow(xgb_isoscape_prediction)

# TODO: TESTME!

## Turn XGBoost isoscape into a Gaussian distribution

In [None]:
# Load the predicted isoscape from the model directory; the file must exist,
# either from the regeneration step or from an earlier run.
predicted_cellulose_isoscape_geotiff = load_raster(get_model_path("predicted_isoscape_xgboost.tiff"))

In [None]:
# Show the per-pixel mean over the layers of the predicted isoscape.
plt.imshow(predicted_cellulose_isoscape_geotiff.yearly_masked_image)

In [None]:
from scipy.stats import multivariate_normal

def get_2d_gaussian(center_lon: float, center_lat: float, stdev: float):
  """Quick-and-dirty function to get a PDF for sampling from an image
  to turn it into a distribution. Intended for use with isoscapes.

  Major room for improvement! This framing assumes no distortion from the
  projection, i.e. that 1 deg latitude == 1 deg longitude == 111 km everywhere.
  This should probably be fine for Brazil, for now, since it's near the Equator.

  Args:
    center_lon: longitude of the center of the distribution.
    center_lat: latitude of the center of the distribution.
    stdev: standard deviation, in degrees, along both axes.

  Returns:
    A frozen scipy multivariate normal over (lat, lon) -- note the order --
    with independent axes.
  """
  # The covariance matrix holds variances, so the standard deviation has to be
  # squared; passing it unchanged is only correct for stdev == 1.
  variance = stdev ** 2
  rv = multivariate_normal([center_lat, center_lon], [[variance, 0], [0, variance]])

  return rv

# x = longitude
# y = latitude
def plot_gaussian(rv, shape: Bounds):
  """Informative, for debugging and visualizing get_2d_gaussian().

  Evaluates the PDF of `rv` at (lat, lon) points on a grid spanning `shape`
  and shows the result as an image with a colorbar.
  """
  lons = np.linspace(shape.minx, shape.maxx, shape.raster_size_x)
  lats = np.linspace(shape.maxy, shape.miny, shape.raster_size_y) # inverted y axis
  lon_grid, lat_grid = np.meshgrid(lons, lats)
  # One (lat, lon) pair per grid cell, matching the axis order of `rv`.
  grid_points = np.stack((lat_grid, lon_grid), axis=-1)
  density = rv.pdf(grid_points)
  plt.imshow(density)
  plt.colorbar()

# TODO: For each pixel in the predicted isoscape
# Yeah, this is basically Gaussian blur, huh...
# BUT, it does give us a distribution.
def gaussian_kernel(input: AmazonGeoTiff, stdev_in_degrees: float=1):
  """Estimate a per-pixel mean and sample variance by Gaussian sampling.

  For every pixel, up to 1000 random coordinates are drawn from a 2-D Gaussian
  centered on the pixel, and layer 0 of `input` is read at each of them until
  30 valid values are found. Pixels that do not reach 30 values stay masked.

  Args:
    input: raster to sample from.
    stdev_in_degrees: spread handed to get_2d_gaussian().

  Returns:
    (means, variances): two masked arrays of shape
    (raster_size_x, raster_size_y), indexed [x, y] with y counted up from miny.
  """
  values_needed = 30
  bounds = get_extent(input.gdal_dataset)
  grid_shape = (bounds.raster_size_x, bounds.raster_size_y)
  means = np.ma.zeros(grid_shape, dtype=float)
  means.mask = np.ones(grid_shape, dtype=bool)
  variances = np.ma.zeros(grid_shape, dtype=float)
  variances.mask = np.ones(grid_shape, dtype=bool)

  for map_y in tqdm(range(bounds.raster_size_y)):
    y_coord = map_y * abs(bounds.pixel_size_y) + bounds.miny
    for map_x in range(bounds.raster_size_x):
      x_coord = map_x * abs(bounds.pixel_size_x) + bounds.minx
      # get_2d_gaussian() takes (lon, lat) and samples in (lat, lon) order.
      # Passing (y, x) therefore yields samples ordered (x, y), which is the
      # order get_data_at_coords() expects below.
      distribution = get_2d_gaussian(y_coord, x_coord, stdev_in_degrees)
      values = []
      for candidate in distribution.rvs(1000):
        try:
          values.append(get_data_at_coords(input, candidate[0], candidate[1], 0))
        except ValueError:
          # masked or out-of-bounds sample; try the next one
          pass
        if len(values) == values_needed:
          break
      if len(values) != values_needed:
        continue

      # Set the mean and variance pixels
      means[map_x, map_y] = np.mean(values)
      variances[map_x, map_y] = np.var(values)
      # Apply sample corrective factor to variance
      variances[map_x, map_y] *= len(values) / (len(values)-1)
      means.mask[map_x, map_y] = False
      variances.mask[map_x, map_y] = False
  return means, variances

# Visual check of the sampling distribution, centered at lon -60.16, lat 4.11.
rv = get_2d_gaussian(-60.16, 4.11, 1)
plot_gaussian(rv, get_extent(predicted_cellulose_isoscape_geotiff.gdal_dataset))

# Recompute the per-pixel means and variances of the predicted isoscape and
# save each of them as a single-band GeoTIFF in the model directory.
if REGENERATE_OXYGEN_XGB_MEANS_VARIANCES:
  xgb_means, xgb_variances = gaussian_kernel(predicted_cellulose_isoscape_geotiff, stdev_in_degrees=0.1)
  bds = get_extent(predicted_cellulose_isoscape_geotiff.gdal_dataset)
  # expand_dims adds the band axis that save_numpy_to_geotiff() requires.
  save_numpy_to_geotiff(bds, np.expand_dims(xgb_means, axis=2), get_model_path("xgb_means_oxygen_isoscape.tiff"))
  save_numpy_to_geotiff(bds, np.expand_dims(xgb_variances, axis=2), get_model_path("xgb_variances_oxygen_isoscape.tiff"))

In [None]:
# Load the saved means and variances from the model directory.
xgb_means_oxygen_geotiff = load_raster(get_model_path("xgb_means_oxygen_isoscape.tiff"))
xgb_variances_oxygen_geotiff = load_raster(get_model_path("xgb_variances_oxygen_isoscape.tiff"))

# Until we re-generate the map
# Temporary in-memory rescaling of the variances by 5/4; the file on disk is
# left unchanged.
# NOTE(review): the origin of the 5/4 factor is not visible here -- presumably it
# corrects a map generated with different sampling settings; confirm, and
# remove once the map has been regenerated.
xgb_variances_oxygen_geotiff.yearly_masked_image *= (5 / 4)
xgb_variances_oxygen_geotiff.masked_image *= (5 / 4)

In [None]:
# Show layer 0 of the per-pixel means.
plot_band(xgb_means_oxygen_geotiff, 0)

In [None]:
# Show layer 0 of the per-pixel variances.
plot_band(xgb_variances_oxygen_geotiff, 0)

In [None]:
# Spot check: mean value at longitude -60, latitude 4, layer 0.
# Raises ValueError when the pixel is masked or out of bounds.
get_data_at_coords(xgb_means_oxygen_geotiff, -60, 4, 0)

# A probability that a coordinate matches a sample given a predicted isoscape

1. Compute t-intervals for a coordinate pixel on the paperwork based on the output of the AI isoscape model
2. For a given sample, compute a z-score from the estimated t-distribution, and turn that into a p-value based on two-sided area under the curve: p($\in$ isotope distribution | possible coordinates, isoscape). *This can be combined with other knowledge in the future to get, for example, p($\in$ isoscape distribution $\land$ tree rings look right for the area | possible coordinates, isoscape, tree ring knowledge).*
3. Depending on the p-value, reject the null hypothesis that the paperwork is correct

Key challenge with this approach: The statistical bound on false positives is per-pixel, and only guarantees that it falls in a range of isoscape values associated with that location rather than the location itself. We will likely have multiple possible points of origin for each sample.

## Compute Sample Origin Given AI-Predicted Isoscapes

In [None]:
# Load the predicted isoscape from the model directory again, so this section
# can be run on its own.
predicted_cellulose_isoscape_geotiff = load_raster(get_model_path("predicted_isoscape_xgboost.tiff"))

In [None]:
# Sanity check: the mask must be a full array. Indexing it fails with
# IndexError when the masked image carries no per-pixel mask.
try:
    np.sum(predicted_cellulose_isoscape_geotiff.masked_image.mask[0,:,:])
except IndexError as err:
    # Build the message with an f-string: adding a str to the exception object
    # raises TypeError and hides the hint below.
    raise IndexError(f"{err} If you're seeing this error, the image mask is unexpectedly missing. You might need to rerun the Oxygen xgboost trainer.") from err


In [None]:
# Show layer index 1 of the predicted isoscape on a 12x12 figure.
plot_band(predicted_cellulose_isoscape_geotiff, 1, figsize=(12,12))

In [None]:
# Show the raw data of layer index 3; `.data` ignores the mask.
plt.imshow(predicted_cellulose_isoscape_geotiff.masked_image.data[:,:,3])