<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/xgbminimal/minimal_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Imports and modules.
%pip install opencv-python
%pip install matplotlib
%pip install pandas

from osgeo import gdal, gdal_array
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
import matplotlib.animation as animation
from matplotlib import rc
from typing import List
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
from io import StringIO
import xgboost as xgb
import os
import math
import glob

# Render matplotlib animations in notebook output as interactive JS/HTML
# widgets (sets the `animation.html` rcParam to 'jshtml').
rc('animation', html='jshtml')

import sys
# Fetch the ddf_common_stub helper repository (branch "test") unless a checkout
# already exists at /content/ddf_common_stub.
# NOTE(review): `git clone` writes into the current working directory, while the
# guard tests the absolute /content path — this assumes the notebook's working
# directory is /content (the Colab default). Confirm for other runtimes.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the freshly cloned stub importable.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
#ddfimport.ddf_source_control_pane()
# Presumably makes the shared project modules imported by later cells
# (raster, dataset, gaussian) available on the import path — confirm in ddfimport.
ddfimport.ddf_import_common()


In [34]:
#@title Options.
# Project-local helper modules used throughout the notebook.
import raster
import dataset
import importlib
import gaussian

# Re-execute the project modules so that re-running this cell picks up any
# changes made to them since they were first imported, without restarting the
# runtime. Note that reloading also resets any module-level attributes, which
# is why the raster.* options below are assigned after the reload.
importlib.reload(raster)
importlib.reload(dataset)
importlib.reload(gaussian)

# Location options. Each value is written onto the `raster` module so that its
# path helpers (e.g. raster.get_model_path) can use it.
# NOTE(review): the "/MyDrive/..." values look like paths relative to
# raster.GDRIVE_BASE — confirm in the raster module.
#
# Raster directory. Contains:
# iso_O_cellulose.tif: Isoscape of 18O from Precipitation; <-- MODELING TARGET
# Iso_Oxi_Stack.tif: Isoscape of 18O from Precipitation; <-- Model input
# R.rh_Stack.tif: Atmospheric Relative humidity <-- Model input
# R.vpd_Stack.tif: Vapor Pressure Deficit - VPD <-- Model input
# Temperature_Stack.tif: Atmospheric Temperature <-- Model input
# NOTE(review): the description of iso_O_cellulose.tif above is identical to the
# one for Iso_Oxi_Stack.tif and may be a copy/paste slip (the file name suggests
# a cellulose isoscape) — confirm and correct.
raster.RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
raster.SAMPLE_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_sample_data/" #@param
raster.TEST_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_test_data/" #@param
raster.ANIMATIONS_BASE = "/MyDrive/amazon_rainforest_files/amazon_animations/" #@param
raster.GDRIVE_BASE = "/content/gdrive" #@param

# If True, train_or_load_xgboost() trains a new model and saves it; if False,
# it loads the previously saved model files instead.
REBUILD_MODEL = True #@param {type:"boolean"}
# Directory for model artifacts; the generated GeoTIFFs are also stored here
# (see the REGENERATE_* options).
raster.MODEL_BASE = "/MyDrive/amazon_rainforest_files/amazon_isoscape_models/" #@param

# How often should XGB log training metadata? 0 is the default, which indicates never.
# The value is forwarded as the `verbose` argument of XGBRegressor.fit() in train_xgb().
XGB_VERBOSITY_LEVEL = 0 #@param

# @markdown If True (the default), then we train using a CSV of reference samples.
# @markdown If False, we simulate the training set by sampling training data
# @markdown using points from a tiff isoscape.
USE_REFERENCE_SAMPLES_FOR_TRAINING = True #@param {type:"boolean"}
# @markdown The filename that contains the reference samples
REFERENCE_CSV_FILENAME = "2023_06_23_Results_Google.csv" #@param

# Controls whether the prediction cell recomputes and saves the XGBoost isoscape.
# If false, requires XGB oxygen isoscape in MODEL_BASE/predicted_isoscape_xgboost.tiff
REGENERATE_OXYGEN_XGB_ISOSCAPE = True #@param {type:"boolean"}

# Controls whether the variance cell recomputes and saves the means/variances rasters.
# If false, requires MODEL_BASE/xgb_means_oxygen_isoscape.tiff and MODEL_BASE/xgb_variances_oxygen_isoscape.tiff
REGENERATE_OXYGEN_XGB_MEANS_VARIANCES = True #@param {type:"boolean"}

# Optional interactive debugging support.
# See https://zohaib.me/debugging-in-google-collab-notebook/ for tips,
# as well as docs for pdb and ipdb.
DEBUG = False #@param {type:"boolean"}
if DEBUG:
    # Install ipdb quietly and turn on IPython's automatic post-mortem
    # debugger, so an uncaught exception drops into the debugger.
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

# When reference samples are disabled, blank out the CSV filename before it is
# passed to dataset.yearly_255_trees_partitioned().
# NOTE(review): presumably an empty filename tells the dataset module to
# simulate samples from the isoscape raster instead — confirm in dataset.
if not USE_REFERENCE_SAMPLES_FOR_TRAINING:
  REFERENCE_CSV_FILENAME = ""

In [None]:
#@title Train Model.
import raster
import dataset

def train_xgb(data: dataset.PartitionedDataset, booster: str, rounds: int) -> xgb.XGBRegressor:
  """Fit an XGBoost regressor on the training partition of `data`.

  In both partitions the last column is treated as the regression target and
  every preceding column as an input feature. The validation partition is
  passed as the only evaluation set, so its metrics are recorded under the
  'validation_0' key of the fitted model's evals_result().

  Args:
    data: Partitioned dataset exposing `train` and `validation` tables that
      support `.iloc` and `.columns`.
    booster: Name of the XGBoost booster to use.
    rounds: Number of boosting rounds (`n_estimators`).

  Returns:
    The fitted regressor.
  """
  regressor = xgb.XGBRegressor(
      booster=booster,
      n_estimators=rounds,
      eta=0.1,
      max_depth=2,
      objective='reg:squarederror')

  # Features are all columns but the last; the target is the last column.
  train_features = data.train.iloc[:, :-1]
  train_target = data.train.iloc[:, -1]
  validation_features = data.validation.iloc[:, :-1]
  validation_target = data.validation.iloc[:, -1]

  target_name = data.train.columns[-1]
  print(f"Predicting: {target_name}")

  regressor.fit(
      train_features,
      train_target,
      eval_set=[(validation_features, validation_target)],
      verbose=XGB_VERBOSITY_LEVEL)
  return regressor

def train_or_load_xgboost(basename: str, data: dataset.PartitionedDataset, rounds: int=100000):
  """Train a new XGBoost model or load a previously saved one, then report its validation RMSE.

  Which path is taken is controlled by the notebook-level REBUILD_MODEL flag:

  * True: train with train_xgb() (gblinear booster) and persist the model to
    "{basename}_xgb.json" and its booster config to "{basename}_config_xgb.json".
  * False: load the model and booster config from those same two files.

  Args:
    basename: Path prefix for the two model files.
    data: Partitioned dataset. `train`/`validation` are used for training; when
      loading, only `validation` is used (to recompute the reported RMSE).
    rounds: Number of boosting rounds; only used when training.

  Returns:
    The trained or loaded xgb.XGBRegressor.
  """
  if REBUILD_MODEL:
    print("Training model")
    model = train_xgb(data, booster='gblinear', rounds=rounds)
    with open(f"{basename}_config_xgb.json", "w") as f:
      f.write(model.get_booster().save_config())
    model.save_model(f"{basename}_xgb.json")
    # Last-round RMSE on the validation eval set recorded during fit().
    validation_rmse = model.evals_result()['validation_0']['rmse'][-1]
  else:
    print("Loading model")
    model = xgb.XGBRegressor()
    model.load_model(f"{basename}_xgb.json")
    with open(f"{basename}_config_xgb.json", "r") as f:
      model.get_booster().load_config(f.read())
    # evals_result() is only populated by fit(); on a model restored with
    # load_model() it raises instead of returning metrics. Recompute the
    # validation RMSE from predictions so this branch reports the same figure.
    X_val, y_val = data.validation.iloc[:, :-1], data.validation.iloc[:, -1]
    residuals = np.asarray(model.predict(X_val), dtype=float) - y_val.to_numpy(dtype=float)
    validation_rmse = float(np.sqrt(np.mean(np.square(residuals))))
  print(f"RMSE (validation): {validation_rmse}")
  return model

# Reference validation RMSEs for this setup:
#   xgboost, 100,000 rounds:  0.306059
#   google internal tooling:  0.39386
yearly_255_trees_xgb_model = train_or_load_xgboost(
  basename=raster.get_model_path("oxygen_isoscape_model"),
  data=dataset.yearly_255_trees_partitioned(REFERENCE_CSV_FILENAME),
  rounds=100000,
)


In [None]:
#@title Generate Means GeoTif.

def get_xgb_isoscape_prediction():
  """Predict an isoscape over the cellulose raster's extent with the trained XGBoost model.

  The grid of coordinates is derived from the extent of the cellulose isoscape
  GeoTIFF. For every grid point the four input rasters (relative humidity,
  temperature, vapor pressure deficit, atmospheric oxygen isoscape) are sampled
  and, together with the point's lon/lat, fed to the notebook-level
  `yearly_255_trees_xgb_model`. Predictions are made one x-column at a time.

  Grid points for which sampling raises ValueError or IndexError (masked or
  out-of-bounds coordinates) are skipped and stay masked in the result.

  Returns:
    A numpy masked array of shape (raster_size_x, raster_size_y, 1) holding the
    predictions; entries without a prediction remain masked.
  """
  bounds = raster.get_extent(raster.cellulose_isoscape_geotiff().gdal_dataset)
  features = [raster.relative_humidity_geotiff(),
              raster.temperature_geotiff(),
              raster.vapor_pressure_deficit_geotiff(),
              raster.atmosphere_isoscape_geotiff()]
  image_feature_names = ["rh", "temp", "vpd", "atmosphere_oxygen_ratio"]

  # Only a single band is predicted. Per-month prediction (12 bands plus a
  # "month_of_year" feature) is currently disabled, so the month/band index is
  # fixed at 0 for both raster sampling and the output array.
  month = 0

  # Start fully masked; entries are unmasked as predictions are written.
  shape = [bounds.raster_size_x, bounds.raster_size_y, 1]
  predicted_isoscape = np.ma.array(np.zeros(shape, dtype=float),
                                   mask=np.ones(shape, dtype=bool))

  # Loop invariants: the column order the model expects, and the
  # coordinate axes of the grid.
  model_feature_order = yearly_255_trees_xgb_model.get_booster().feature_names
  x_coords = np.arange(bounds.minx, bounds.maxx, bounds.pixel_size_x, dtype=float)
  y_coords = np.arange(bounds.miny, bounds.maxy, -bounds.pixel_size_y, dtype=float)

  for x_idx, x in enumerate(tqdm(x_coords)):
    rows = []
    y_indexes = []
    for y_idx, y in enumerate(y_coords):
      try:
        row = {feature_name: raster.get_data_at_coords(feature, x, y, month)
               for feature, feature_name in zip(features, image_feature_names)}
      except (ValueError, IndexError):
        # masked and out-of-bounds coordinates
        continue
      row["lon"] = x
      row["lat"] = y
      rows.append(row)
      y_indexes.append(y_idx)

    if not rows:
      continue

    # Present the columns in the order the model was trained with.
    reordered = pd.DataFrame(rows)[model_feature_order]
    predictions = yearly_255_trees_xgb_model.predict(reordered)
    # Write the whole column at once: unmask, then store the values.
    predicted_isoscape.mask[x_idx, y_indexes, month] = False
    predicted_isoscape.data[x_idx, y_indexes, month] = predictions

  return predicted_isoscape

# Recompute the XGBoost isoscape only when requested. When the flag is False
# nothing happens here and later cells rely on an existing
# predicted_isoscape_xgboost.tiff in the model directory.
if REGENERATE_OXYGEN_XGB_ISOSCAPE:
  xgb_isoscape_prediction = get_xgb_isoscape_prediction()
  # Save the prediction as a GeoTIFF, using the extent of the cellulose
  # isoscape raster (the same extent the prediction grid was built from).
  raster.save_numpy_to_geotiff(raster.get_extent(
      raster.cellulose_isoscape_geotiff().gdal_dataset),
                               xgb_isoscape_prediction,
                               raster.get_model_path("predicted_isoscape_xgboost.tiff"))
  # Quick visual check of the prediction.
  # NOTE(review): the array is indexed [x, y, band], so the preview is
  # presumably transposed relative to map orientation — confirm.
  plt.imshow(xgb_isoscape_prediction)

# TODO: TESTME!

In [None]:
#@title Generate Variance GeoTif.
import gaussian

# Always read the predicted isoscape back from disk (it is not taken from the
# in-memory result of the previous cell), so the GeoTIFF must already exist.
predicted_cellulose_isoscape_geotiff = raster.load_raster(raster.get_model_path("predicted_isoscape_xgboost.tiff"))
plt.imshow(predicted_cellulose_isoscape_geotiff.yearly_masked_image)

# Plot one example 2D gaussian over the raster's extent as a visual aid.
# NOTE(review): the arguments are presumably (longitude, latitude, stdev) —
# confirm against gaussian.get_2d_gaussian.
rv = gaussian.get_2d_gaussian(-60.16, 4.11, 1)
gaussian.plot_gaussian(rv, raster.get_extent(predicted_cellulose_isoscape_geotiff.gdal_dataset))

# Recompute the means/variances rasters only when requested; otherwise the
# next cell loads whatever is already saved in the model directory.
if REGENERATE_OXYGEN_XGB_MEANS_VARIANCES:
  xgb_means, xgb_variances = gaussian.gaussian_kernel(predicted_cellulose_isoscape_geotiff, stdev_in_degrees=0.1)
  bds = raster.get_extent(predicted_cellulose_isoscape_geotiff.gdal_dataset)
  # A trailing axis of length 1 is added to each result before saving
  # (presumably the band axis expected by save_numpy_to_geotiff — confirm).
  raster.save_numpy_to_geotiff(bds, np.expand_dims(xgb_means, axis=2),
                               raster.get_model_path("xgb_means_oxygen_isoscape.tiff"))
  raster.save_numpy_to_geotiff(bds, np.expand_dims(xgb_variances, axis=2),
                                raster.get_model_path("xgb_variances_oxygen_isoscape.tiff"))


In [None]:
#@title Display Results.
# Load the saved means and variances rasters from the model directory.
xgb_means_oxygen_geotiff = raster.load_raster(raster.get_model_path("xgb_means_oxygen_isoscape.tiff"))
xgb_variances_oxygen_geotiff = raster.load_raster(raster.get_model_path("xgb_variances_oxygen_isoscape.tiff"))

# Until we re-generate the map
# Temporary correction: scale the loaded variances by 5/4. This changes only
# the in-memory images; the GeoTIFF on disk is not rewritten here.
# NOTE(review): the reason for the 5/4 factor is not shown in this notebook —
# confirm, and remove once the variance map has been regenerated. Re-running
# this cell without reloading would apply the factor again.
xgb_variances_oxygen_geotiff.yearly_masked_image *= (5 / 4)
xgb_variances_oxygen_geotiff.masked_image *= (5 / 4)
# Plot band 0 of each raster.
raster.plot_band(xgb_means_oxygen_geotiff, 0)
raster.plot_band(xgb_variances_oxygen_geotiff, 0)
