Copyright 2023 Google LLC.
SPDX-License-Identifier: Apache-2.0

# Imports

In [None]:
# Install third-party packages into the running kernel's environment
# (%pip, unlike !pip, targets the kernel's own environment).
# NOTE(review): versions are unpinned, so a rerun may pick up different
# releases -- consider pinning once known-good versions are identified.
%pip install opencv-python
%pip install matplotlib
%pip install pandas
%pip install tqdm
%pip install xgboost

In [None]:
import data
import geotiffs as gts
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
from numpy.random import MT19937, RandomState, SeedSequence
import pandas as pd
from tqdm import tqdm
import math
import raster
import xgb_lib as xgb

rc('animation', html='jshtml')

In [None]:
#@title Debugging
# See https://zohaib.me/debugging-in-google-collab-notebook/ for tips,
# as well as docs for pdb and ipdb.
# Set DEBUG to True to install ipdb, import it, and switch on the %pdb
# post-mortem debugger for the rest of the session. With DEBUG False this
# cell only defines the flag.
DEBUG = False #@param {type:"boolean"}
if DEBUG:
    %pip install -Uqq ipdb
    import ipdb
    %pdb on

In [None]:
# Raster directory. Contains:
# iso_O_cellulose.tif: Isoscape of 18O from Precipitation; <-- MODELING TARGET
# Iso_Oxi_Stack.tif: Isoscape of 18O from Precipitation; <-- Model input
# R.rh_Stack.tif: Atmospheric Relative humidity <-- Model input
# R.vpd_Stack.tif: Vapor Pressure Deficit - VPD <-- Model input
# Temperature_Stack.tif: Atmospheric Temperature <-- Model input
RASTER_BASE = "/MyDrive/amazon_rainforest_files/amazon_rasters/" #@param
SAMPLE_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_sample_data/" #@param
TEST_DATA_BASE = "/MyDrive/amazon_rainforest_files/amazon_test_data/" #@param
ANIMATIONS_BASE = "/MyDrive/amazon_rainforest_files/amazon_animations/" #@param
GDRIVE_BASE = "/content/drive" #@param

REBUILD_MODEL = False #@param {type:"boolean"}
MODEL_BASE = "/MyDrive/amazon_rainforest_files/amazon_isoscape_models/" #@param

# Used to compute invalid terrain when making predictions. Leave disabled if you are low on memory.
LOAD_WATER_MASK_GEOTIFF = False #@param {type:"boolean"}
LOAD_TREE_MASK_GEOTIFF = False #@param {type:"boolean"}

# If true, requires soil and plant soil nitrogen geotiffs. Also requires the following files:
# RASTER_BASE/raster_krig_d15N_soil_plant.tiff
# RASTER_BASE/raster_krig_d15N_soil.tiff
REGENERATE_PLANT_NITROGEN_GEOTIFF = False #@param {type:"boolean"}

# Use Global Params to access files

In [None]:
def get_raster_path(filename: str) -> str:
  """Returns the location of `filename` inside the raster directory.

  The result is GDRIVE_BASE (or an empty string when GDRIVE_BASE is falsy)
  followed by RASTER_BASE and `filename`. The pieces are concatenated as-is;
  no path separators are inserted.
  """
  drive_prefix = GDRIVE_BASE or ""
  return "{}{}{}".format(drive_prefix, RASTER_BASE, filename)


def get_animations_path(filename: str) -> str:
  """Returns the location of `filename` inside the animations directory.

  The result is GDRIVE_BASE (or an empty string when GDRIVE_BASE is falsy)
  followed by ANIMATIONS_BASE and `filename`. The pieces are concatenated
  as-is; no path separators are inserted.
  """
  drive_prefix = GDRIVE_BASE or ""
  return "{}{}{}".format(drive_prefix, ANIMATIONS_BASE, filename)

# Load Rasters

In [None]:
# Build the GeoTIFF accessor from the global params. get_raster_path('')
# yields the raster directory prefix with no filename appended.
# NOTE(review): the three flags are presumably (load water mask, load tree
# mask, regenerate plant nitrogen GeoTIFF) in that positional order --
# confirm against gts.Geotiffs.
geotiffs = gts.Geotiffs(LOAD_WATER_MASK_GEOTIFF, LOAD_TREE_MASK_GEOTIFF, REGENERATE_PLANT_NITROGEN_GEOTIFF, get_raster_path(''))

brazil_map_geotiff = geotiffs.brazil_map_geotiff() # mean annual precipitation
# Will be used to compute isoscapes for carbon and nitrogen

# Model inputs, plus the cellulose isoscape that is the modeling target
# (see the raster directory notes in the params cell).
relative_humidity_geotiff = geotiffs.relative_humidity_geotiff()
temperature_geotiff = geotiffs.temperature_geotiff()
vapor_pressure_deficit_geotiff = geotiffs.vapor_pressure_deficit_geotiff()
atmosphere_isoscape_geotiff = geotiffs.atmosphere_isoscape_geotiff()
cellulose_isoscape_geotiff = geotiffs.cellulose_isoscape_geotiff()

# Soil Geotiffs are not necessary to load, but required to build plant nitrogen geotiff.
soil_plant_nitrogen_difference_isoscape_geotiff = geotiffs.soil_plant_nitrogen_difference_isoscape_geotiff()
soil_nitrogen_isoscape_geotiff = geotiffs.soil_nitrogen_isoscape_geotiff()
plant_nitrogen_isoscape_geotiff = geotiffs.plant_nitrogen_isoscape_geotiff()

carbon_means_krig_isoscape_geotiff = geotiffs.carbon_means_krig_isoscape_geotiff()

# Masks; whether these are actually loaded presumably depends on the
# LOAD_*_MASK_GEOTIFF flags passed to gts.Geotiffs above -- TODO confirm.
land_water_mask_geotiff = geotiffs.land_water_mask_geotiff()
possible_tree_mask_geotiff = geotiffs.possible_tree_mask_geotiff()

# NOTE(review): this repeats the plant_nitrogen_isoscape_geotiff assignment
# made a few lines earlier and looks redundant -- confirm the second call has
# no needed side effect before removing it.
plant_nitrogen_isoscape_geotiff = geotiffs.plant_nitrogen_isoscape_geotiff()

# Data Validation

Do we use coordinates correctly?
Ideally, we should create a sample image and make this a unit test.


In [None]:
raster.get_data_at_coords(relative_humidity_geotiff, -65, -5, 0)

In [None]:
raster.get_data_at_coords(relative_humidity_geotiff, -43, -10, 0)

## Plots

### GeoTIFFs

In [None]:
#raster.animate(land_water_mask_geotiff, 1, 1)

In [None]:
# Each of the following cells displays one raster as an inline animation.
# NOTE(review): the second argument is presumably the number of frames/bands
# to animate (1 for the nitrogen rasters, 12 for the monthly stacks) --
# confirm against raster.animate.
raster.animate(soil_nitrogen_isoscape_geotiff, 1, 1)

In [None]:
raster.animate(soil_plant_nitrogen_difference_isoscape_geotiff, 1, 1)

In [None]:
raster.animate(plant_nitrogen_isoscape_geotiff, 1, 1)

In [None]:
raster.animate(relative_humidity_geotiff, 12, 1)

In [None]:
raster.animate(temperature_geotiff, 12, 1)

In [None]:
raster.animate(vapor_pressure_deficit_geotiff, 12, 1)

In [None]:
raster.animate(atmosphere_isoscape_geotiff, 12, 1)

In [None]:
raster.animate(cellulose_isoscape_geotiff, 12, 1)

In [None]:
# Each of the following cells rebuilds one animation and saves it as a GIF
# under the animations directory at 1 frame per second.
# NOTE(review): writer='imagemagick' presumably needs ImageMagick available
# on the runtime -- confirm it is installed before running these cells.
raster.animate(relative_humidity_geotiff, 12, 1).save(get_animations_path('relative_humidity.gif'), writer='imagemagick', fps=1)

In [None]:
raster.animate(temperature_geotiff, 12, 1).save(get_animations_path('temperature.gif'), writer='imagemagick', fps=1)

In [None]:
raster.animate(vapor_pressure_deficit_geotiff, 12, 1).save(get_animations_path('vapor_pressure_deficit.gif'), writer='imagemagick', fps=1)

In [None]:
raster.animate(atmosphere_isoscape_geotiff, 12, 1).save(get_animations_path('atmospheric_isoscape.gif'), writer='imagemagick', fps=1)

In [None]:
raster.animate(cellulose_isoscape_geotiff, 12, 1).save(get_animations_path('cellulose_isoscape.gif'), writer='imagemagick', fps=1)

In [None]:
# Make sure this is Gaussian for the next step
# Histogram of the unmasked values of the yearly cellulose isoscape image.
# The title and labels let the figure stand on its own when skimmed.
plt.title("Cellulose isoscape: yearly image values")
plt.xlabel("Cellulose isoscape value")
plt.ylabel("Count of unmasked values")
_ = plt.hist(cellulose_isoscape_geotiff.yearly_masked_image.data[np.logical_not(cellulose_isoscape_geotiff.yearly_masked_image.mask)], bins=100)

In [None]:
# Histogram of the unmasked values of the (non-yearly) cellulose isoscape
# image, for comparison with the yearly histogram.
plt.title("Cellulose isoscape: monthly image values")
plt.xlabel("Cellulose isoscape value")
plt.ylabel("Count of unmasked values")
_ = plt.hist(cellulose_isoscape_geotiff.masked_image.data[np.logical_not(cellulose_isoscape_geotiff.masked_image.mask)], bins=100)

If we squint, the distribution of monthly samples (bottom) looks like it could be Gaussian; the distribution of annual means (top) clearly is not. Additionally, when we measure an individual cellulose sample, we are sampling from this monthly distribution, not from a yearly distribution of means (which has an artificially lower standard deviation). For these reasons, we will compare samples against monthly point-in-time measurements instead of yearly means.


**When we capture real training data, it will be important to also capture corresponding point-in-time measurements at the same location so our z-scores are coming from the same distribution.**

## Investigate Tree Samples

* Do they fit the Craig-Gordon model?
* If so, how well?
* RMSE, r, r^2, variance

In [None]:
# Location of the existing tree-sample CSV, exposed as a form parameter so it
# can be changed without editing the read call.
# NOTE(review): the default is an absolute path on one workstation and will
# not exist on other machines -- point it at a shared location
# (e.g. under SAMPLE_DATA_BASE) once the file's home is confirmed.
SAMPLES_CSV_PATH = "/usr/local/google/home/nicholasroth/Existing Samples - Jamari1_flona_tapajos_18sampes_Nicholas.csv" #@param {type:"string"}
samples = pd.read_csv(SAMPLES_CSV_PATH)

In [None]:
# Show the yearly cellulose isoscape image as a map with a colorbar.
# `extent` is taken from the raster's GDAL dataset so the axes presumably read
# in the dataset's coordinates rather than pixel indices -- TODO confirm what
# raster.get_extent(...).to_matplotlib() returns.
# interpolation='none' shows the raster cells without smoothing.
plt.title("Expected Values (Craig-Gordon)")
im = plt.imshow(cellulose_isoscape_geotiff.yearly_masked_image,
                extent=raster.get_extent(cellulose_isoscape_geotiff.gdal_dataset).to_matplotlib(), interpolation='none')
_ = plt.colorbar(im)

In [None]:
# Compare each measured tree sample against the cellulose isoscape
# (Craig-Gordon) value at the same coordinates, then report RMSE, r and r^2
# and plot the agreement.
#
# Reads `samples` (columns used: 'sample_value', 'long', 'lat', 'date') and
# `cellulose_isoscape_geotiff`. The expected value for a sample is the mean of
# the 12 values returned for indices 0..11 at the sample's location.

# Set to True to print the monthly / expected / actual value of every sample.
PRINT_PER_SAMPLE_VALUES = False

expected_values = []
actual_values = []
for _, row in samples.iterrows():
  actual_value = float(row['sample_value'])
  # The second "-"-separated field of 'date' is used as a 1-based month and
  # converted to a 0-based index.
  month_index = int(row["date"].split("-")[1]) - 1
  # get_data_at_coords lives in the `raster` module (as used in the Data
  # Validation cells); the bare name is not defined in this notebook.
  # The monthly value is only used by the optional printout below.
  monthly_craig_gordon_value = raster.get_data_at_coords(
      cellulose_isoscape_geotiff, row["long"], row["lat"], month_index)
  craig_gordon_values = [
      raster.get_data_at_coords(
          cellulose_isoscape_geotiff, row["long"], row["lat"], month)
      for month in range(12)
  ]
  expected_craig_gordon_value = np.mean(craig_gordon_values)
  expected_values.append(expected_craig_gordon_value)
  actual_values.append(actual_value)
  if PRINT_PER_SAMPLE_VALUES:
    print(f"Monthly: {monthly_craig_gordon_value:.05f}")
    print(f"Expected: {expected_craig_gordon_value:.05f}")
    print(f"Actual: {actual_value:.05f}")
    print()

# Summary statistics of actual vs expected.
residuals = np.array(actual_values)-np.array(expected_values)
rmse = np.sqrt(np.mean(residuals**2))
# r is the 2x2 correlation matrix; the off-diagonal entry is Pearson's r.
r = np.corrcoef(np.array(actual_values), np.array(expected_values))
print(f"RMSE: {rmse:.05f}")
print(f"r = {r[1,0]:.05f}")
print(f"r^2 = {r[1,0]**2:.05f}")

# Scatter with the identity line: points on the line mean expected == actual.
plt.title("Expected (Craig-Gordon) vs Actual Sample Values w/ expected=actual Line")
plt.xlabel("Actual sample value")
plt.ylabel("Expected value (Craig-Gordon)")
plt.scatter(actual_values, expected_values)
_ = plt.plot([min(actual_values), max(actual_values)],
         [min(actual_values), max(actual_values)], color='black')
plt.show()

# Scatter with the least-squares line of expected on actual
# (polyfit returns the slope first, then the intercept).
plt.title("Expected (Craig-Gordon) vs Actual Sample Values w/ Regression Line")
plt.xlabel("Actual sample value")
plt.ylabel("Expected value (Craig-Gordon)")
plt.scatter(actual_values, expected_values)
b, a = np.polyfit(actual_values, expected_values, deg=1)
xseq = np.linspace(min(actual_values), max(actual_values), num=100)
plt.plot(xseq, a + b * xseq, color="k", lw=2.5);
plt.show()

# Distribution of the residuals (actual - expected).
plt.title("Residuals")
plt.xlabel("Actual - expected")
plt.ylabel("Number of samples")
_ = plt.hist(residuals, bins=5)