# Getting started with authentication and package installation

In [None]:
# Authenticate the Google account for this Colab session.
# NOTE(review): presumably this is what authorizes the `gsutil cp` upload to
# the gs:// bucket in the extraction cell further down; confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Run the Earth Engine command-line authentication flow, then import the
# Earth Engine client library and initialize it. Every `ee.*` call in the
# cells below relies on this cell having been run first.
!earthengine authenticate
import ee
ee.Initialize()

In [None]:
import subprocess
import sys

# Install geemap on demand if it is not already importable.
try:
  import geemap
except ImportError:
  print('geemap package not installed. Installing ...')
  # Use the interpreter that is running this kernel (sys.executable) rather
  # than whatever "python" resolves to on PATH, so the package is installed
  # into the environment the notebook actually imports from.
  subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'geemap'])

In [None]:
!pip install geopandas

In [None]:
import geemap
import geopandas as gpd
import os
import glob
import pandas as pd
import numpy as np

# Prepare datasets for extracting raster values

In [None]:
# Import datasets
# Both asset paths below are placeholders and must be replaced with your own.
# NLDAS: image collection that the extraction cells filter by date; bands
# B0, B1 and B3 are read from it.
NLDAS = ee.ImageCollection("pathway/NLDAS") # Use pathway for where NLDAS data was deposited from the previous step
# sitedate: feature collection of site/date records. The extraction loop
# filters it on the 'Num' property and reads the properties 'Site', 'ID',
# 'Network', 'Depth', 'Date', 'Water_day' and 'Water_year'.
sitedate = ee.FeatureCollection("users/username/Moisture/Site_date")

# Extract NLDAS values for all data points automatically (for large datasets)

In [None]:
# Define output directory (the per-record CSV files are written here).
out_dir = os.path.expanduser('.')
# exist_ok=True creates the directory only when it is missing, without the
# separate check-then-create step.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Define function for automatic extraction
def Extresult(i):
    """Return the NLDAS image for the date of filtered record ``i``.

    Reads the 'Date' value at label ``i`` of the global ``site_date_filtered``
    table, parses it with the 'MM/dd/YYYY' pattern, filters the global
    ``NLDAS`` collection with a date filter built from that date, and takes
    the first matching image.

    Args:
        i: Row label in ``site_date_filtered``.

    Returns:
        The first matching image restricted to bands 'B0', 'B1' and 'B3'.
    """
    record_date = ee.Date.parse('MM/dd/YYYY', site_date_filtered['Date'].get(i))
    matching_images = NLDAS.filter(ee.Filter.date(record_date))
    first_image = matching_images.first()
    # Select relevant depth bands
    return first_image.select(['B0', 'B1', 'B3'])

In [None]:
def loop(i):
    """Write the mean zonal statistics of image ``i`` over buffer ``i`` to CSV.

    Takes element ``i`` of the global ``listofimages``, wraps it in a
    one-image collection and flattens it with ``toBands()``. The buffer is
    row ``i`` (by position) of the global ``site_Buffer``. The result is
    written to ``test<i>.csv`` inside the global ``out_dir``.

    NOTE: the image and the buffer are both looked up by the same position
    ``i``, so the caller must keep ``listofimages`` and ``site_Buffer``
    aligned.

    Args:
        i: Position of the record in ``listofimages`` / ``site_Buffer``.
    """
    banded_image = ee.ImageCollection([listofimages.get(i)]).toBands()
    buffer_fc = geemap.geopandas_to_ee(site_Buffer.iloc[[i]])
    csv_path = os.path.join(out_dir, 'test' + str(i) + '.csv')
    geemap.zonal_statistics(banded_image, buffer_fc, csv_path, statistics_type='MEAN', scale=30)

In [None]:
for k in range (0, 500000, 5000): # Customize the value based on the total record number; set a reasonable step size for intermediate exports
  low = k;
  high = k+5000 # Set the value to the step size
  for j in range (low, high, 5): # Define a step size for processing subsample; smaller value usually corresponds to faster processing but can take more memory space
    lb = j;
    hb = j+5;
    subset = sitedate.filter(ee.Filter.And(ee.Filter.gte('Num', lb),ee.Filter.lt('Num', hb)))
    site_date = geemap.ee_to_geopandas(subset, selectors = ['Num','Site','Water_year','Date'])
    site = geemap.ee_to_geopandas(subset, selectors = ['Site', 'Num', 'ID', 'Network', 'Depth', 'Date', 'Water_day', 'Water_year'])
    site_buffer = site.buffer(0.0008084837557075693617); #90m 
    site_Buffer = gpd.GeoDataFrame(geometry=gpd.GeoSeries(site_buffer),crs="EPSG:4326")
    ROI_Buffer = geemap.geopandas_to_ee(site_Buffer)
    good_list = [] # Get a list of records with values; or else the automatic run can be interrupted
    for i in range(0,subset.size().getInfo(),1):
      NLDAS_test = NLDAS.filter(ee.Filter.date(ee.Date.parse('MM/dd/YYYY',site_date['Date'].get(i)))).size().getInfo(); 
      if (NLDAS_test != 0):
        good_list.append(i)
    site_date_filtered = site_date[site_date.index.isin(good_list)].reset_index()
    merged_col = ee.ImageCollection([])
    for i in range(0,len(good_list),1):
      col = ee.ImageCollection(Extresult(i))
      merged_col = merged_col.merge(col)
      listofimages = merged_col.toList(merged_col.size());
    for i in range(0,len(good_list),1):
      loop(i)
    file_list = []
    record_list = []
    for files in glob.glob("test*.csv"):
      file_list.append(files)
      file_name_nopath = os.path.basename(files)
      file_name = [os.path.splitext(file_name_nopath)[0]]
      record_list.append(file_name) 
    name = pd.DataFrame (record_list, columns = ['File_name'])
    result_list = []
    for file in file_list:
      result = pd.read_csv(file, usecols = [0,1,2], header = 0)
      result.columns = ['B0', 'B1', 'B2']
      result['No']= os.path.basename(file)[4:-4]
      siteresult = pd.DataFrame(result)
      result_list.append(siteresult)
    result = pd.DataFrame(np.concatenate(result_list))
    result.columns = ['B0', 'B1', 'B2', 'No']
    site_date_filtered['No'] = site_date_filtered.index
    site_date_filtered = pd.DataFrame(site_date_filtered)
    site_date_filtered["No"] = site_date_filtered["No"].astype(int)
    result["No"] = result["No"].astype(int)
    Final_result = site_date_filtered.merge(result, left_on='No', right_on='No')
    Final_result.to_csv('Final_result'+ str(j) +'.csv') 
  Final_list = []
  for files in glob.glob("Final*.csv"):
    Final_list.append(files)
  comb_list = []
  for file in Final_list:
    comb = pd.read_csv(file)
    combresult = pd.DataFrame(comb)
    comb_list.append(combresult)
  combdata = pd.DataFrame(np.concatenate(comb_list)) 
  combdata.to_csv('combdata.csv') 
  !gsutil cp combdata.csv gs://bucket/Moisture/Test2/"covset_"$j".csv" # Customize the output pathway
  !rm *.csv 