# Getting started with authentication and package installation

In [None]:
# Sign the notebook user in through google.colab's auth helper.
# NOTE(review): presumably this provides the credentials used by the gsutil
# upload in the last cell -- confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Run the Earth Engine command-line authentication flow (IPython "!" shell escape).
!earthengine authenticate
import ee
# Initialize the Earth Engine client before the ee.* calls made in later cells.
ee.Initialize()

In [None]:
import subprocess
import sys

# Install geemap on demand: only shell out to pip when the import fails.
try:
    import geemap
except ImportError:
    print('geemap package not installed. Installing ...')
    # Launch pip with the interpreter that is running this notebook, so the
    # package is installed into the environment the kernel actually imports
    # from (a bare "python" on PATH may be a different interpreter).
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'geemap'])

In [None]:
# Install geopandas (imported as gpd in the next cell) via an IPython shell escape.
!pip install geopandas

In [None]:
import geemap
import geopandas as gpd
import os
import glob
import pandas as pd
import numpy as np

# Prepare datasets for extracting raster values at buffered points

In [None]:
# Covariate sources used by the extraction cells below.
# Soil, terrain, and LULC layers were handled in GEE and are not repeated here;
# they may be processed in Colab instead if GEE capacity was exceeded.
# Alternative Landsat and MODIS datasets may be substituted depending on the
# scale and resolution of interest.

# Daily climate collection (the prcp / tmin / tmax / vp bands are selected later).
DAYMET = ee.ImageCollection('NASA/ORNL/DAYMET_V4')

# MODIS collections; the band taken from each one later is noted alongside.
MODISLST = ee.ImageCollection('MODIS/006/MOD11A1')      # "LST_Day_1km"
MODISGPP = ee.ImageCollection('MODIS/006/MOD17A2H')     # "Gpp"
MODISTree = ee.ImageCollection('MODIS/006/MOD44B')      # "Percent_Tree_Cover"
MODISEVI = ee.ImageCollection('MODIS/MOD09GA_006_EVI')  # "EVI"
NDWI = ee.ImageCollection('MODIS/MOD09GA_006_NDWI')

# User-owned site tables (replace "username" with the asset owner).
sitedate = ee.FeatureCollection('users/username/Moisture/Site_date')
siteyear = ee.FeatureCollection('users/username/Moisture/site_year')

In [None]:
# Climate data processing: split DAYMET into one single-band collection per variable.
# NOTE(review): `vpd` holds the DAYMET "vp" band; the variable name suggests
# vapor pressure deficit -- confirm that "vp" is the intended quantity.
precipitation, airTmin, airTmax, vpd = (
    DAYMET.select(band_name) for band_name in ("prcp", "tmin", "tmax", "vp")
)

# Use loop function to automatically extract all covariates

In [None]:
# Define output directory for the per-record zonal-statistics CSV files.
# expanduser() is kept so a "~"-style path can be substituted for '.'.
out_dir = os.path.expanduser('.')
# exist_ok=True replaces the separate exists() check: it is a no-op when the
# directory is already there and cannot race with a concurrent creation.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Define function for automatic extraction
def Extresult(i):
    """Build the stacked covariate image for record ``i`` of ``site_date_filtered``.

    Reads the module-level ``site_date_filtered`` table (columns ``Date`` in
    MM/dd/YYYY form and ``Water_year``) together with the module-level
    climate and MODIS collections.

    Args:
        i: Label passed to ``Series.get`` to look up the record's date and
            water year.

    Returns:
        An ``ee.Image`` made by concatenating, in this order: the
        precipitation image, the mean of tmin and tmax renamed "tavg", the
        vapor-pressure image, "Gpp", "EVI", the NDWI image renamed "ndwi",
        "LST_Day_1km" renamed "LST", and "Percent_Tree_Cover" renamed "tree".
    """
    record_date = ee.Date.parse('MM/dd/YYYY', site_date_filtered['Date'].get(i))
    # Single-date filter reused by every daily product.
    same_day = ee.Filter.date(record_date)
    # The GPP product is matched within +/- 8 days of the record date.
    gpp_window = ee.Filter.date(
        record_date.advance(-8, 'day'), record_date.advance(8, 'day')
    )
    water_year = str(site_date_filtered['Water_year'].get(i))

    def first_on_day(collection):
        # First image of `collection` that matches the record date.
        return collection.filter(same_day).first()

    ppt = first_on_day(precipitation)
    Tmean = first_on_day(airTmin).add(first_on_day(airTmax)).divide(2).rename("tavg")
    VPD = first_on_day(vpd)
    modisndwi = first_on_day(NDWI).rename("ndwi")
    modisgpp = MODISGPP.filter(gpp_window).first().divide(16).select("Gpp")
    modisevi = first_on_day(MODISEVI).select("EVI")
    modislst = first_on_day(MODISLST).select("LST_Day_1km").rename("LST")
    # The yearly tree-cover image is matched on the water year in its system:index.
    modistree = (
        MODISTree.filterMetadata("system:index", "contains", water_year)
        .first()
        .select("Percent_Tree_Cover")
        .rename("tree")
    )
    return ee.Image.cat(
        ppt, Tmean, VPD, modisgpp, modisevi, modisndwi, modislst, modistree
    )

In [None]:
def loop(i):
    """Export mean zonal statistics for the i-th retained record of the batch.

    Uses the module-level ``listofimages``, ``site_Buffer``, ``good_list`` and
    ``out_dir`` prepared by the driver cell. The result is written to
    ``test<i>.csv`` inside ``out_dir``.

    Args:
        i: Position of the record among those that passed the availability
            check (i.e. an index into ``listofimages`` / ``good_list``).
    """
    covariate_image = ee.ImageCollection([listofimages.get(i)]).toBands()
    # `i` counts only the records that were kept, while site_Buffer still
    # holds one buffer per record of the unfiltered batch. Map back through
    # good_list so the image is reduced over its own site's buffer even when
    # earlier records in the batch were dropped.
    batch_position = good_list[i]
    roi_buffer = geemap.geopandas_to_ee(site_Buffer.iloc[[batch_position]])
    out_stats = os.path.join(out_dir, 'test' + str(i) + '.csv')
    geemap.zonal_statistics(
        covariate_image, roi_buffer, out_stats, statistics_type='MEAN', scale=90
    )

In [None]:
for k in range (1, 500000, 5000): # Customize the value based on the total record number; set a reasonable step size for intermediate exports
  low = k;
  high = k+5000 # Set the value to the step size
  for j in range (low, high, 5): # Define a step size for processing subsample; smaller value usually corresponds to faster processing but can take more memory space
    lb = j;
    hb = j+5;
    subset = sitedate.filter(ee.Filter.And(ee.Filter.gte('Num', lb),ee.Filter.lt('Num', hb)))
    site_date = geemap.ee_to_geopandas(subset, selectors = ['Num','Site','Water_year','Date'])
    site_year = geemap.ee_to_geopandas(subset, selectors = ['Num','Site','Water_year'])
    site = geemap.ee_to_geopandas(subset, selectors = ['Site', 'Num', 'ID', 'Network', 'Depth', 'Date', 'Water_day', 'Water_year'])
    site_buffer = site.buffer(0.0008084837557075693617); #90m 
    site_Buffer = gpd.GeoDataFrame(geometry=gpd.GeoSeries(site_buffer),crs="EPSG:4326")
    ROI_Buffer = geemap.geopandas_to_ee(site_Buffer)
    good_list = [] # Get a list of records with values; or else the automatic run can be interrupted
    for i in range(0,subset.size().getInfo(),1):
      ppt_test = precipitation.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY', site_date['Date'].get(i)))).size().getInfo();
      Tmin_test = airTmin.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)))).size().getInfo();
      Tmax_test = airTmax.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)))).size().getInfo();
      VPD_test = vpd.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)))).size().getInfo();
      modisndwi_test = NDWI.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)))).size().getInfo();
      modisgpp_test = MODISGPP.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)).advance(-8, 'day'), ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)).advance(8, 'day'))).size().getInfo();
      modisevi_test = MODISEVI.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)))).size().getInfo();   
      modislst_test = MODISLST.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)))).size().getInfo(); 
      modistree_test = MODISTree.filterMetadata("system:index", "contains", str(site_year['Water_year'].get(i))).size().getInfo();   
      if (ppt_test != 0) and (Tmin_test != 0) and (Tmax_test != 0) and (VPD_test != 0) and (modisndwi_test != 0) and (modisgpp_test != 0) and (modisevi_test != 0) and (modislst_test != 0) and (modistree_test != 0):
        good_list.append(i)
    site_date_filtered = site_date[site_date.index.isin(good_list)].reset_index() # Filter out the records without values
    merged_col = ee.ImageCollection([])
    for i in range(0,len(good_list),1):
      col = ee.ImageCollection(Extresult(i))
      merged_col = merged_col.merge(col)
      listofimages = merged_col.toList(merged_col.size());
    for i in range(0,len(good_list),1):
      loop(i)
    file_list = []
    record_list = []
    for files in glob.glob("test*.csv"):
      file_list.append(files)
      file_name_nopath = os.path.basename(files)
      file_name = [os.path.splitext(file_name_nopath)[0]]
      record_list.append(file_name) 
    name = pd.DataFrame (record_list, columns = ['File_name'])
    result_list = []
    for file in file_list:
      result = pd.read_csv(file, usecols = [0,1,2,3,4,5,6,7], header = 0)
      result.columns = ['ppt', 'Tavg', 'VPD', 'GPP', 'EVI', 'NDWI', 'LST', 'Tree']
      result['No']= os.path.basename(file)[4:-4]
      siteresult = pd.DataFrame(result)
      result_list.append(siteresult)
    result = pd.DataFrame(np.concatenate(result_list))
    result.columns = ['ppt', 'Tavg', 'VPD', 'GPP', 'EVI', 'NDWI', 'LST', 'Tree', 'No']
    site_date_filtered['No'] = site_date_filtered.index
    site_date_filtered = pd.DataFrame(site_date_filtered)
    site_date_filtered["No"] = site_date_filtered["No"].astype(int)
    result["No"] = result["No"].astype(int)
    Final_result = site_date_filtered.merge(result, left_on='No', right_on='No')
    Final_result.to_csv('Final_result'+ str(j) +'.csv') 
  Final_list = []
  for files in glob.glob("Final*.csv"):
    Final_list.append(files)
  comb_list = []
  for file in Final_list:
    comb = pd.read_csv(file)
    combresult = pd.DataFrame(comb)
    comb_list.append(combresult)
  combdata = pd.DataFrame(np.concatenate(comb_list)) 
  combdata.to_csv('combdata.csv') 
  !gsutil cp combdata.csv gs://bucket/Moisture/Test/"covset_"$j".csv" # Customize the output pathway
  !rm *.csv 