# Data Processing script for the NSM/SWEML v2.0
This notebook uses Python modules to retrieve NASA ASO observations, locate the nearest SNOTEL sites, connect SNOTEL observations with ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

# Next steps 
- Revisit other scripts and convert to PyArrow/Parquet Brotli-compressed file storage
- replace for loops with lambda functions
- connect precip to DF,
- VIIRS
- add new sites (e.g., regionval) to training DF with all the respective spatial resolution information
- connect regional data together to train model
- connect different regions
- add precipitation phase features (seasonal accumulated rain precip, seasonal accumulated snow precip as a function of temperature)
- explore adding other features stemming from SNOTEL, remote sensing (LULC), Snow Classifications (Sturms), energy balance
- add snotel script to functions

Put all units in SI. This should not matter for model training, since the features are normalized, but SI units will be more interpretable.

In [1]:
from ASOget import ASODownload, ASODataProcessing

# Inputs for fetching ASO data for a region.
# `region` and `output_res` set here are read again by the next code cell.
short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
region = 'S_Sierras'
output_res = 300 #desired spatial resolution in meters (m)
directory = "Raw_ASO_Data"
# Relative folder (region/directory) handed to the tif -> parquet converter below
folder_name = f"{region}/{directory}"

#Get ASO data: look up the region's bounding box, search for matching files, then download them
data_tool = ASODownload(short_name, version)
b_box = data_tool.BoundingBox(region)  
# url_list is not referenced again in this cell; cmr_download takes no URL argument here
url_list = data_tool.cmr_search(time_start, time_end, region, b_box)
data_tool.cmr_download(directory, region)

#Convert ASO tifs to parquet at the requested output resolution
data_processor = ASODataProcessing()
data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, region) 

Bounding Box collected for S_Sierras: -120.3763448720203,36.29256774541929,-118.292253412863,38.994985247736324
Fetching file URLs in progress for S_Sierras from 2013-04-02T00:00:00Z to 2019-07-19T23:59:59Z
Querying for data:
	https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS&sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000&short_name=ASO_50M_SWE&version=001&version=01&version=1&temporal[]=2013-04-02T00:00:00Z,2019-07-19T23:59:59Z&bounding_box=-120.3763448720203,36.29256774541929,-118.292253412863,38.994985247736324

Found 131 matches.
['https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.03/ASO_50M_SWE_USCATB_20130403.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.03/ASO_50M_SWE_USCATB_20130403.tif.xml', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.29/ASO_50M_SWE_USCATB_20130429.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.29/ASO_50M_SWE_USCATB_20130429.tif

100%|██████████| 262/262 [00:00<00:00, 2180.47it/s]




All NASA ASO data collected for given date range and can be found in /home/rjohnson18/SWEMLv2.0/data/ASO/S_Sierras/Raw_ASO_Data...
Files with .xml extension moved to the destination folder.
Converting .tif to parquet
Converting 131 ASO tif files to parquet'


100%|██████████| 131/131 [00:00<00:00, 331.35it/s]


Checking to make sure all files successfully converted...


100%|██████████| 99/99 [00:30<00:00,  3.22it/s]


# Code for generating ML dataframe using nearest in situ monitoring sites

In [2]:
import GeoDF

# GeoDF is used to create a dataframe for ML model development. Its function is to connect in situ observations to gridded locations
# The two self-assignments below are no-ops: they only succeed if `region` and
# `output_res` already exist in the kernel (set by the ASO download cell above).
region = region #Should be done in above code block
output_res = output_res

#load snotel meta location data, use haversine function
#GeoDF.fetch_snotel_sites_for_cellids(region, output_res) # Using known up to date sites, can this be threaded?

# Get geophysical attributes for each site, need to see how to add output resolution
gdf = GeoDF.GeoSpatial(region, output_res)
#gdf = gdf.head(100)
#use geodataframe with lat/long meta of all sites to determine slope, aspect, and elevation
metadf = GeoDF.extract_terrain_data_threaded(gdf, region, output_res)




Loading geospatial data for S_Sierras
Converting to geodataframe
Calculating dataframe bounding box
Retrieving Copernicus 90m DEM tiles


100%|██████████| 30/30 [00:00<00:00, 294681.78it/s]

There are 30 tiles in the region





Unnamed: 0_level_0,sliceID
tileID,Unnamed: 1_level_1
Copernicus_DSM_COG_30_N39_00_W121_00_DEM,0
Copernicus_DSM_COG_30_N39_00_W120_00_DEM,1
Copernicus_DSM_COG_30_N39_00_W119_00_DEM,2
Copernicus_DSM_COG_30_N39_00_W118_00_DEM,3
Copernicus_DSM_COG_30_N39_00_W117_00_DEM,4
Copernicus_DSM_COG_30_N38_00_W121_00_DEM,5
Copernicus_DSM_COG_30_N38_00_W120_00_DEM,6
Copernicus_DSM_COG_30_N38_00_W119_00_DEM,7
Copernicus_DSM_COG_30_N38_00_W118_00_DEM,8
Copernicus_DSM_COG_30_N38_00_W117_00_DEM,9


Determining Grid Cell Spatial Features


100%|██████████| 1478371/1478371 [06:00<00:00, 4100.33it/s]


Job complete for getting geospatial metadata, processing dataframe


146570it [1:42:08,  1.51it/s]Exception ignored in: <function CachingFileManager.__del__ at 0x7f2084df6af0>
Traceback (most recent call last):
  File "/home/rjohnson18/envs/SWEML_env2/lib/python3.9/site-packages/xarray/backends/file_manager.py", line 240, in __del__
146573it [1:42:09,  1.99it/s]    ref_count = self._ref_counter.decrement(self._key)
  File "/home/rjohnson18/envs/SWEML_env2/lib/python3.9/site-packages/xarray/backends/file_manager.py", line 293, in decrement
    count = self._counts[name] - 1
KeyError: [<function open at 0x7f207ea39f70>, ('https://elevationeuwest.blob.core.windows.net/copernicus-dem/COP90_hh/Copernicus_DSM_COG_30_N38_00_W120_00_DEM.tif?st=2024-04-28T16%3A29%3A47Z&se=2024-04-29T17%3A14%3A47Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2024-04-28T03%3A05%3A11Z&ske=2024-05-05T03%3A05%3A11Z&sks=b&skv=2021-06-08&sig=aW9CJdjpeUWeySz%2BzKJLM0VHPFOeqn5NVkphak8IXs0%3D',), 'r', (('sharing', False

Saving S_Sierras dataframe in /home/rjohnson18/SWEMLv2.0/data/TrainingDFs/S_Sierras/300M_Resolution


In [2]:
import os
import pandas as pd
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

region = 'S_Sierras'
output_res = 300

HOME = os.path.expanduser('~')
# Build the path from `region` and `output_res` so that changing either value
# above loads the matching metadata file instead of a hard-coded one.
path = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution/{region}_metadata.parquet"
meta = pd.read_parquet(path, engine='fastparquet')

In [3]:
meta['Elevation_m'].unique()

array([1987, 1989, 1997, ..., 1533, 1544, 1791])

In [4]:
meta.head()

Unnamed: 0_level_0,cen_lat,cen_lon,Elevation_m,Slope_Deg,Aspect_Deg
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S_Sierras_300M_37.627_-119.028,37.62726,-119.028352,1987,3,288
S_Sierras_300M_37.627_-119.027,37.62726,-119.027352,1989,1,0
S_Sierras_300M_37.627_-119.026,37.62726,-119.026352,1989,2,180
S_Sierras_300M_37.627_-119.031,37.626927,-119.030685,1997,15,161
S_Sierras_300M_37.627_-119.03,37.626927,-119.030352,1988,6,211


In [5]:
# Load one processed ASO SWE snapshot (2013-04-03) for inspection.
# The resolution in both the folder and the file name follows `output_res`.
path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet/ASO_{output_res}M_SWE_20130403.parquet"
aso = pd.read_parquet(path, engine='fastparquet')
aso.head()

Unnamed: 0,cen_lat,cen_lon,swe_m,cell_id
45264,38.186188,-119.589588,0.42043,S_Sierras_300M_38.186_-119.59
45265,38.186188,-119.589255,0.397254,S_Sierras_300M_38.186_-119.589
45266,38.186188,-119.588921,0.281826,S_Sierras_300M_38.186_-119.589
45267,38.186188,-119.588588,0.222769,S_Sierras_300M_38.186_-119.589
47120,38.185854,-119.590588,1.332055,S_Sierras_300M_38.186_-119.591


In [7]:
aso[aso['swe_m']>=0]

Unnamed: 0,cen_lat,cen_lon,swe_m,cell_id
45264,38.186188,-119.589588,0.420430,S_Sierras_300M_38.186_-119.59
45265,38.186188,-119.589255,0.397254,S_Sierras_300M_38.186_-119.589
45266,38.186188,-119.588921,0.281826,S_Sierras_300M_38.186_-119.589
45267,38.186188,-119.588588,0.222769,S_Sierras_300M_38.186_-119.589
47120,38.185854,-119.590588,1.332055,S_Sierras_300M_38.186_-119.591
...,...,...,...,...
2537275,37.739521,-119.272588,0.081362,S_Sierras_300M_37.74_-119.273
2537276,37.739521,-119.272255,0.092054,S_Sierras_300M_37.74_-119.272
2537277,37.739521,-119.271921,0.122147,S_Sierras_300M_37.74_-119.272
2537278,37.739521,-119.271588,0.202335,S_Sierras_300M_37.74_-119.272


In [1]:
import os
import pandas as pd
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import pickle as pkl
import numpy as np
# tqdm._tqdm_notebook is deprecated (tqdm itself warns about it); tqdm.notebook
# exports the same tqdm_notebook class.
from tqdm.notebook import tqdm_notebook

region = 'S_Sierras'
output_res = 300

HOME = os.path.expanduser('~')


print('Connecting site observations with nearest monitoring network obs')
# SNOTEL observations
snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data/"
Snotelobs_path = f"{snotel_path}ground_measures_train_featuresALLDATES.parquet"
# Folder holding the nearest-SNOTEL lookup (nearest_SNOTEL.pkl)
nearest_snotel_dict_path = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution"
# ASO observations
aso_swe_files_folder_path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"

# Output folder for the observation dataframes; exist_ok makes a prior
# existence check unnecessary.
obsdf_path = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution/Obsdf"
os.makedirs(obsdf_path, exist_ok=True)

# Get sites/snotel observations from 2013-2019
print('Loading observations from 2013-2019')
# NOTE(review): the file carries a .parquet extension but is read with read_csv;
# the recorded run loaded it without hitting the fallback, so it is presumably
# CSV content. Confirm before switching to read_parquet.
try:
    snotel_data = pd.read_csv(Snotelobs_path)
except FileNotFoundError:
    # Only a missing local file should trigger the download; parse errors and
    # interrupts now surface instead of being swallowed by a bare except.
    print("Snotel obs not found, retreiving from AWS S3")
    os.makedirs(snotel_path, exist_ok=True)
    key = "NSMv2.0"+Snotelobs_path.split("SWEMLv2.0",1)[1]
    # NOTE(review): S3 and BUCKET_NAME are not defined anywhere in the visible
    # notebook; this branch raises NameError unless they are created beforehand.
    S3.meta.client.download_file(BUCKET_NAME, key,Snotelobs_path)
    snotel_data = pd.read_csv(Snotelobs_path)

# Load dictionary of nearest sites (keyed by cell_id in the cells that use it).
# pickle executes code on load; only open files produced by this project.
print(f"Loading {output_res}M resolution grids for {region} region")
with open(f"{nearest_snotel_dict_path}/nearest_SNOTEL.pkl", 'rb') as handle:
    nearest_snotel = pkl.load(handle)

# Rename the date columns from YYYY-MM-DD to YYYYMMDD so they match the date
# stamp embedded in the ASO file names. The first column is left untouched.
print('Processing datetime component of SNOTEL observation dataframe')
date_columns = snotel_data.columns[1:]
new_column_names = {col: pd.to_datetime(col, format='%Y-%m-%d').strftime('%Y%m%d') for col in date_columns}
snotel_data_f = snotel_data.rename(columns=new_column_names)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


Connecting site observations with nearest monitoring network obs
Loading observations from 2013-2019
Loading 300M resolution grids for S_Sierras region
Processing datetime component of SNOTEL observation dataframe


In [2]:
# Single ASO snapshot used to prototype the SNOTEL-to-ASO join in the cells below
aso_swe_file = f"ASO_300M_SWE_20130403.parquet"
# Empty accumulator that the processing cell concatenates its result onto
Obsdf = pd.DataFrame()
# Bundle the inputs into one tuple; the processing cell unpacks it in this same order
args = aso_swe_files_folder_path, aso_swe_file, new_column_names, snotel_data_f, nearest_snotel , Obsdf, obsdf_path

In [3]:
def vectorized_cell_id_2_topography(row, timestamp,transposed_data,nearest_snotel, snotel_data_f):
    """Store the SNOTEL observations of a cell's nearest stations in ``transposed_data``.

    Parameters
    ----------
    row : mapping
        Must provide ``row['cell_id']``.
    timestamp : str
        Name of the observation column to read from ``snotel_data_f``.
    transposed_data : dict
        Output container, mutated in place: ``transposed_data[cell_id]`` is set
        to a one-row DataFrame indexed by ``timestamp`` whose columns are
        ``nearest_site_1`` ... ``nearest_site_N``.
    nearest_snotel : dict
        Maps a cell_id to its sequence of station ids; position ``i`` becomes
        ``nearest_site_{i+1}`` (presumably ordered nearest-first; confirm).
    snotel_data_f : pandas.DataFrame
        Has a ``station_id`` column and one column per timestamp.

    Returns
    -------
    None
    """
    cell_id = row['cell_id']
    station_ids = nearest_snotel[cell_id]
    station_mapping = {old_id: f"nearest_site_{i+1}" for i, old_id in enumerate(station_ids)}

    # Select rows and columns in a single .loc call, then relabel the stations
    # with .assign so a new frame is built. Writing the column into a slice
    # would raise SettingWithCopyWarning and leaves snotel_data_f's fate unclear.
    is_nearest = snotel_data_f['station_id'].isin(station_ids)
    selected_snotel_data = snotel_data_f.loc[is_nearest, ['station_id', timestamp]].assign(
        station_id=lambda frame: frame['station_id'].map(station_mapping)
    )

    # Transpose so stations become columns and the timestamp becomes the index
    transposed_data[cell_id] = selected_snotel_data.set_index('station_id').T



In [8]:
# Unpack the inputs bundled in `args` (same order as they were packed)
aso_swe_files_folder_path, aso_swe_file, new_column_names, snotel_data_f, nearest_snotel , Obsdf, obsdf_path = args

# The date stamp is the last underscore-separated token of the file name, minus the extension
timestamp = aso_swe_file.split('_')[-1].split('.')[0]

# Load the SWE data from ASO
aso_swe_data = pd.read_parquet(os.path.join(aso_swe_files_folder_path, aso_swe_file), engine='fastparquet')

aso_swe_data.reset_index(inplace=True)
# Filled by vectorized_cell_id_2_topography: cell_id -> one-row frame of nearest-site obs
transposed_data = {}

if timestamp in new_column_names.values():
    # SNOTEL has a column for this date: attach the nearest-site observations
    tqdm_notebook.pandas()
    aso_swe_data.progress_apply(lambda row: vectorized_cell_id_2_topography(row, timestamp,transposed_data,nearest_snotel, snotel_data_f), axis =1)

    # Convert dictionary of sites to dataframe; the dict keys and each frame's
    # index become a two-level index
    transposed_df = pd.concat(transposed_data, axis=0)

    # Reset index and rename the two unnamed index levels
    transposed_df.reset_index(inplace = True)
    transposed_df.rename(columns={'level_0': 'cell_id', 'level_1': 'Date'}, inplace = True)
    transposed_df['Date'] = pd.to_datetime(transposed_df['Date'])

    aso_swe_data['Date'] = pd.to_datetime(timestamp)
    aso_swe_data = aso_swe_data[['cell_id', 'Date', 'swe_m']]
    merged_df = pd.merge(aso_swe_data, transposed_df, how='left', on=['cell_id', 'Date'])

    Obsdf = pd.concat([Obsdf, merged_df], ignore_index=True)

else:
    # No SNOTEL column for this date: keep the ASO observations only
    aso_swe_data['Date'] = pd.to_datetime(timestamp)
    aso_swe_data = aso_swe_data[['cell_id', 'Date', 'swe_m']]

    # No need to merge in this case, directly concatenate
    Obsdf = pd.concat([Obsdf, aso_swe_data], ignore_index=True)

cols = [
'cell_id', 'Date', 'swe_m', 'nearest_site_1', 'nearest_site_2', 'nearest_site_3', 'nearest_site_4', 
'nearest_site_5', 'nearest_site_6'
]

# reindex instead of Obsdf[cols]: in the else branch, or when a cell has fewer
# than six nearest sites, some nearest_site_* columns do not exist. Plain
# selection would raise KeyError; reindex fills the missing columns with NaN.
Obsdf = Obsdf.reindex(columns=cols)

100%|██████████| 10000/10000 [00:13<00:00, 750.42it/s]


In [29]:
Obsdf

Unnamed: 0,cell_id,Date,swe_m,nearest_site_1,nearest_site_2,nearest_site_3,nearest_site_4,nearest_site_5,nearest_site_6
0,S_Sierras_300M_38.186_-119.59,2013-04-03,0.420430,44.52,49.0,0.0,0.1,18.9,18.8
1,S_Sierras_300M_38.186_-119.589,2013-04-03,0.397254,44.52,49.0,0.0,0.1,18.9,18.8
2,S_Sierras_300M_38.186_-119.589,2013-04-03,0.281826,44.52,49.0,0.0,0.1,18.9,18.8
3,S_Sierras_300M_38.186_-119.589,2013-04-03,0.222769,44.52,49.0,0.0,0.1,18.9,18.8
4,S_Sierras_300M_38.186_-119.591,2013-04-03,1.332055,44.52,49.0,0.0,0.1,18.9,18.8
...,...,...,...,...,...,...,...,...,...
95,S_Sierras_300M_38.184_-119.588,2013-04-03,0.572257,44.52,49.0,0.0,0.1,18.9,18.8
96,S_Sierras_300M_38.184_-119.587,2013-04-03,0.748074,44.52,49.0,0.0,0.1,18.9,18.8
97,S_Sierras_300M_38.184_-119.587,2013-04-03,0.900428,44.52,49.0,0.0,0.1,18.9,18.8
98,S_Sierras_300M_38.184_-119.587,2013-04-03,0.575085,44.52,49.0,0.0,0.1,18.9,18.8


## Connect Snotel to each ASO obs

In [1]:
import Obs_to_DF
# Region and grid resolution passed to the module-level pipeline below
region = "S_Sierras"
output_res = 300

#Connect nearest snotel observations with ASO data, makes a parquet file for each date
finaldf = Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res) 

Connecting site observations with nearest monitoring network obs
Loading observations from 2013-2019
Loading 300M resolution grids for S_Sierras region
Processing datetime component of SNOTEL observation dataframe
Loading all available processed ASO observations for the S_Sierras at 300M resolution


100%|██████████| 99/99 [00:00<00:00, 1200104.32it/s]


Connecting 99 timesteps of observations for S_Sierras


100%|██████████| 99/99 [00:01<00:00, 59.07it/s]


Job complete for connecting SNOTEL obs to sites/dates
Adding geospatial data to 20160614 observations...


  0%|          | 0/25504 [00:00<?, ?it/s]

Adding geospatial data to 20160626 observations...


  0%|          | 0/25504 [00:00<?, ?it/s]

Adding geospatial data to 20170128 observations...


  0%|          | 0/25504 [00:00<?, ?it/s]

Adding geospatial data to 20160621 observations...


  0%|          | 0/25504 [00:00<?, ?it/s]

Adding geospatial data to 20160607 observations...


  0%|          | 0/29776 [00:00<?, ?it/s]

Adding geospatial data to 20160708 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20130503 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20170129 observations...


  0%|          | 0/1083515 [00:00<?, ?it/s]

Adding geospatial data to 20130608 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20130601 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20130525 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20130403 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20130429 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20150428 observations...


  0%|          | 0/1406580 [00:00<?, ?it/s]

Adding geospatial data to 20140429 observations...


  0%|          | 0/1406580 [00:00<?, ?it/s]

Adding geospatial data to 20150609 observations...


  0%|          | 0/246750 [00:00<?, ?it/s]

Adding geospatial data to 20160509 observations...


  0%|          | 0/1083515 [00:00<?, ?it/s]

Adding geospatial data to 20160401 observations...


  0%|          | 0/1083515 [00:00<?, ?it/s]

Adding geospatial data to 20160426 observations...


  0%|          | 0/1075906 [00:00<?, ?it/s]

Adding geospatial data to 20170717 observations...


  0%|          | 0/1083515 [00:00<?, ?it/s]

Adding geospatial data to 20160326 observations...


  0%|          | 0/1083515 [00:00<?, ?it/s]

Adding geospatial data to 20140527 observations...


  0%|          | 0/2600741 [00:00<?, ?it/s]

Adding geospatial data to 20160527 observations...


  0%|          | 0/1084383 [00:00<?, ?it/s]

Adding geospatial data to 20150217 observations...


  0%|          | 0/2600741 [00:00<?, ?it/s]

Adding geospatial data to 20190309 observations...


  0%|          | 0/25418 [00:00<?, ?it/s]

Adding geospatial data to 20190611 observations...


  0%|          | 0/592738 [00:00<?, ?it/s]

Adding geospatial data to 20190703 observations...


  0%|          | 0/945465 [00:00<?, ?it/s]

Adding geospatial data to 20180425 observations...


  0%|          | 0/812700 [00:00<?, ?it/s]

Adding geospatial data to 20190715 observations...


  0%|          | 0/25418 [00:00<?, ?it/s]

Adding geospatial data to 20190329 observations...


  0%|          | 0/767635 [00:00<?, ?it/s]

Adding geospatial data to 20190604 observations...


  0%|          | 0/945465 [00:00<?, ?it/s]

Adding geospatial data to 20190326 observations...


  0%|          | 0/1232092 [00:00<?, ?it/s]

Adding geospatial data to 20190716 observations...


  0%|          | 0/945465 [00:00<?, ?it/s]

In [None]:
import GeoDF

# Region and grid resolution passed to GeoDF below
region = 'S_Sierras'
output_res = 300

#Connect cell ids with ASO obs and snotel obs to geospatial features
GeoDF.add_geospatial_threaded(region, output_res)

In [None]:
import get_Precip

#gets precipitation for each location, accumulates it through the water year

#set start/end date for a water year
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
region = 'S_Sierras'
# NOTE(review): output_res is 100 here while the ASO/GeoDF cells in this notebook use 300 - confirm which is intended
output_res = 100
# One call per year in `years`
for year in years:
    get_Precip.get_precip_threaded(year, region, output_res)

In [None]:
import os
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

HOME = os.path.expanduser('~')

region = 'S_Sierras'
year = 2013
output_res = 100

# Folder of NLDAS precipitation files for this region / resolution / year
Precippath = f"{HOME}/SWEMLv2.0/data/Precipitation/{region}/{output_res}M_NLDAS_Precip/{year}"

# NOTE(review): the file has a .parquet extension but is read with read_csv, so it is
# presumably CSV text - confirm. The next cells rewrite it as Brotli-compressed parquet.
ppt = pd.read_csv(f"{Precippath}/NLDAS_PPT_2013-04-03.parquet")

# Index by cell_id in place (re-running this cell alone would fail once cell_id is the index)
ppt.set_index('cell_id', inplace=True)

ppt

In [None]:
#Convert DataFrame to Apache Arrow Table (`ppt` comes from the previous cell)
table = pa.Table.from_pandas(ppt)

# Parquet with Brotli compression, written next to the source file under a PYARROW_ prefix
pq.write_table(table, f"{Precippath}/PYARROW_NLDAS_PPT_2013-04-03.parquet", compression='BROTLI')

In [None]:
pptparquet = pd.read_parquet(f"{Precippath}/PYARROW_NLDAS_PPT_2013-04-03.parquet")
pptparquet

In [None]:
ppt

In [None]:
# Merge with metadata
# NOTE(review): `final_df` and `metadata` are not defined in any visible cell of this
# notebook (other cells use `finaldf`); this cell fails on a fresh kernel unless they exist.
req_cols = ['cell_id', 'lat', 'lon', 'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
            'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat', 'geometry']
Result = final_df.merge(metadata[req_cols], how='left', on='cell_id')

# Column renaming and ordering ('geometry' is merged in above but not kept below)
# NOTE(review): the site columns here use spaces ('nearest site 1') whereas the Obsdf
# cells use underscores ('nearest_site_1') - confirm which schema final_df carries.
Result.rename(columns={'swe': 'ASO_SWE_in'}, inplace=True)
Result = Result[['cell_id', 'Date', 'ASO_SWE_in', 'lat', 'lon', 'nearest site 1', 'nearest site 2',
                    'nearest site 3', 'nearest site 4', 'nearest site 5', 'nearest site 6',
                    'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
                    'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat']]

# Save the merged data to a new file
# NOTE(review): to_csv writes CSV text into a file named *.parquet - confirm readers expect that.
output_filename = f"{HOME}/SWEML/data/NSMv2.0/data/TrainingDFs/Merged_aso_snotel_data.parquet"
Result.to_csv(output_filename, index=False)
display(Result.head(10))
print("Processed and saved data")

In [None]:
region = 'S_Sierras'
# NOTE(review): read_csv on a *.parquet file name - presumably CSV content; confirm.
ASO_meta_loc_DF = pd.read_csv(f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/ASO_meta.parquet")

In [None]:
#Connect nearest snotel with ASO data, this should be last for now, need to add geophysical characteristics to the site first, then this...
# Relies on `region`, `output_res` and the GeoDF import from earlier cells.
finaldf = GeoDF.Nearest_Snotel_2_obs(region, output_res, dropna = True) 

In [None]:
"""
A Simple implementation of parallel processing using concurrency it takes so long to execute,
Explore terrain_daskconcurrency and terrain-processing_cluster python for more optimized implementations.
"""

def _copernicus_tile_id(lat, lon):
    """Build the Copernicus DEM tile ID for a latitude/longitude pair.

    The 'N' field takes floor(lat) and the 'W' field takes ceil(abs(lon)),
    which yields IDs such as Copernicus_DSM_COG_30_N38_00_W120_00_DEM.
    Only the N/W naming is produced, so northern-latitude / western-longitude
    input is assumed.
    """
    return ('Copernicus_DSM_COG_30_N' + str(math.floor(lat))
            + '_00_W' + str(math.ceil(abs(lon))) + '_00_DEM')


def process_single_location(args):
    """Return (elevation, slope, aspect) for one grid-cell centre.

    Parameters
    ----------
    args : tuple
        ``(lat, lon, regions, tiles)``. ``regions`` is a DataFrame indexed by
        tile ID with a ``sliceID`` column, and ``tiles`` is the list that
        ``sliceID`` indexes into.

    Returns
    -------
    tuple
        ``(elev, slop, asp)``, each rounded. Results are memoised in the
        module-level ``elevation_cache`` keyed by ``(lat, lon)``.

    Raises
    ------
    KeyError
        If the computed tile ID is not present in ``regions``.
    """
    lat, lon, regions, tiles = args

    if (lat, lon) in elevation_cache:
        return elevation_cache[(lat, lon)]

    # Latitude selects the N band and longitude the W band. With the two
    # swapped, the ID never matches the tile index for (cen_lat, cen_lon) input.
    tile_id = _copernicus_tile_id(lat, lon)
    index_id = regions.loc[tile_id]['sliceID']

    signed_asset = planetary_computer.sign(tiles[index_id].assets["data"])
    elevation = rxr.open_rasterio(signed_asset.href)

    slope = elevation.copy()
    aspect = elevation.copy()

    # always_xy=True means the transformer takes (lon, lat) order
    transformer = Transformer.from_crs("EPSG:4326", elevation.rio.crs, always_xy=True)
    xx, yy = transformer.transform(lon, lat)

    tilearray = np.around(elevation.values[0]).astype(int)
    geo = (math.floor(float(lon)), 90, 0.0, math.ceil(float(lat)), 0.0, -90)

    # Round-trip the tile through an in-memory GDAL raster; the band is Float32,
    # so the array that comes back is float32.
    no_data_value = -9999
    driver = gdal.GetDriverByName('MEM')
    temp_ds = driver.Create('', tilearray.shape[1], tilearray.shape[0], 1, gdalconst.GDT_Float32)

    temp_ds.GetRasterBand(1).WriteArray(tilearray)
    temp_ds.GetRasterBand(1).SetNoDataValue(no_data_value)
    temp_ds.SetProjection('EPSG:4326')
    temp_ds.SetGeoTransform(geo)

    tilearray_np = temp_ds.GetRasterBand(1).ReadAsArray()
    # NOTE(review): np.gradient returns the row- and column-direction gradients,
    # not slope and aspect. `slope_arr` is therefore the raw row gradient, and
    # the arctan2 below only uses the first two rows of the column gradient.
    # Left numerically unchanged; confirm the intended slope/aspect formulas.
    slope_arr, aspect_arr = np.gradient(tilearray_np)
    aspect_arr = np.rad2deg(np.arctan2(aspect_arr[0], aspect_arr[1]))

    slope.values[0] = slope_arr
    aspect.values[0] = aspect_arr

    elev = round(elevation.sel(x=xx, y=yy, method="nearest").values[0])
    slop = round(slope.sel(x=xx, y=yy, method="nearest").values[0])
    asp = round(aspect.sel(x=xx, y=yy, method="nearest").values[0])

    elevation_cache[(lat, lon)] = (elev, slop, asp)
    return elev, slop, asp

def extract_terrain_data_threaded(metadata_df, bounding_box, max_workers=10):
    """Add elevation, slope and aspect columns to ``metadata_df`` in place.

    Searches the Planetary Computer STAC API for ``cop-dem-glo-90`` tiles that
    intersect ``bounding_box``, then runs ``process_single_location`` for each
    row's ``(cen_lat, cen_lon)`` on a thread pool.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain ``cen_lat`` and ``cen_lon`` columns. It gains the columns
        ``Elevation_m``, ``Slope_Deg`` and ``Aspect_L``.
    bounding_box : sequence
        ``((min_x, min_y), (max_x, max_y))``.
    max_workers : int
        Thread pool size.

    Returns
    -------
    None
    """
    global elevation_cache

    # Reset the per-run cache that process_single_location reads and writes
    elevation_cache = {}
    min_x, min_y, max_x, max_y = *bounding_box[0], *bounding_box[1]

    client = Client.open(
            "https://planetarycomputer.microsoft.com/api/stac/v1",
            ignore_conformance=True,
        )

    search = client.search(
                    collections=["cop-dem-glo-90"],
                    intersects = {
                            "type": "Polygon",
                            "coordinates": [[
                            [min_x, min_y],
                            [max_x, min_y],
                            [max_x, max_y],
                            [min_x, max_y],
                            [min_x, min_y]
                        ]]})

    tiles = list(search.items())

    # Lookup table: tile ID (index) -> position of that tile in `tiles`
    print("Retrieving Copernicus 90m DEM tiles")
    regions = [[i, tiles[i].id] for i in tqdm(range(0, len(tiles)))]
    regions = pd.DataFrame(columns = ['sliceID', 'tileID'], data = regions)
    regions = regions.set_index(regions['tileID'])
    del regions['tileID']

    print("Interpolating Grid Cell Spatial Features")

    # Zip the two columns once instead of calling .iloc[i] twice per row
    coordinates = zip(metadata_df['cen_lat'], metadata_df['cen_lon'])
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_single_location, (lat, lon, regions, tiles))
                   for lat, lon in tqdm(coordinates, total=len(metadata_df))]

        # Read the futures in submission order. as_completed() yields them in
        # completion order, which would assign terrain values to the wrong rows.
        results = [future.result() for future in tqdm(futures, total=len(futures))]

    # An empty frame yields empty columns instead of an unpacking error
    elevations, slopes, aspects = zip(*results) if results else ((), (), ())
    # NOTE(review): the metadata parquet loaded elsewhere in this notebook names
    # this column 'Aspect_Deg'; 'Aspect_L' is kept here for existing callers.
    metadata_df['Elevation_m'], metadata_df['Slope_Deg'], metadata_df['Aspect_L'] = elevations, slopes, aspects

In [None]:
# NOTE(review): absolute, user-specific path; move it to a configurable data directory.
metadata_df = pd.read_csv(r"/home/vgindi/Provided_Data/Merged_aso_nearest_sites1.csv")
# Take an independent 20-row sample. extract_terrain_data_threaded adds columns
# in place, which on a bare .head() slice triggers SettingWithCopyWarning.
metadata_df = metadata_df.head(20).copy()
bounding_box = ((-120.3763448720203, 36.29256774541929), (-118.292253412863, 38.994985247736324))

extract_terrain_data_threaded(metadata_df, bounding_box)

# Display the results
metadata_df.head(10)

In [None]:
"""
This code block crops the global coverage VIIRS data to south sierras subregion. 
"""

def crop_sierras(input_file_path, output_file_path, shapes):
    """Crop a raster to ``shapes`` and write the result as a GeoTIFF.

    Parameters
    ----------
    input_file_path : str
        Raster to read.
    output_file_path : str
        Destination GeoTIFF path.
    shapes : iterable
        Geometries passed to ``rasterio.mask.mask`` with ``crop=True``.
    """
    with rasterio.open(input_file_path) as src:
        out_image, out_transform = rasterio.mask.mask(src, shapes, crop=True)
        # Dataset metadata lives on `src.meta`; rasterio datasets have no
        # `out_meta` attribute. Copy it so the update below stays local.
        out_meta = src.meta.copy()

    # out_image is (bands, rows, cols), so height and width come from axes 1 and 2
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})

    with rasterio.open(output_file_path, "w", **out_meta) as dest:
        dest.write(out_image)

def download_viirs_sca(input_dir, output_dir, shapefile_path):
    """Crop every ``.tif`` under ``input_dir``'s year folders to the shapefile geometries.

    Despite the name, nothing is downloaded: files are read from ``input_dir``,
    cropped with ``crop_sierras`` and written to ``output_dir/<year>/``.

    Parameters
    ----------
    input_dir : str
        Directory containing one sub-folder per year (e.g. ``WY2013``).
    output_dir : str
        Root directory for the cropped files; year sub-folders are created.
    shapefile_path : str
        Shapefile whose feature geometries define the crop mask.
    """
    # Load shapes from the shapefile
    with fiona.open(shapefile_path, 'r') as shapefile:
        shapes = [feature["geometry"] for feature in shapefile]

    # Iterate through each year directory in the input directory
    for year_folder in os.listdir(input_dir):
        year_folder_path = os.path.join(input_dir, year_folder)
        if not os.path.isdir(year_folder_path):
            continue

        # Extract the first 4-digit run from the folder name. Folders without
        # one are skipped rather than crashing on `None.group()`.
        year_match = re.search(r'\d{4}', year_folder)
        if year_match is None:
            print(f"Skipping {year_folder_path}: no 4-digit year in folder name")
            continue
        output_year_folder = os.path.join(output_dir, year_match.group())
        os.makedirs(output_year_folder, exist_ok=True)

        for file_name in os.listdir(year_folder_path):
            if not file_name.endswith('.tif'):
                continue
            # Output name keeps only the first three underscore-separated tokens
            parts = file_name.split('_')
            output_file_name = '_'.join(parts[:3]) + '.tif'
            output_file_path = os.path.join(output_year_folder, output_file_name)
            input_file_path = os.path.join(year_folder_path, file_name)
            crop_sierras(input_file_path, output_file_path, shapes)
            print(f"Processed and saved {output_file_path}")

# Driver for the crop step.
# NOTE(review): the three paths below are absolute and user-specific.
if __name__ == "__main__":
    
    input_directory = r"/home/vgindi/VIIRS_Data"
    output_directory = r"/home/vgindi/VIIRS_Sierras"
    shapefile_path = r"/home/vgindi/Provided_Data/low_sierras_points.shp"
    download_viirs_sca(input_directory, output_directory, shapefile_path)

In [None]:
"""
This code cell transforms the raw VIIRS tiff files to 100m resolution and saves each file in .csv format
"""
def processing_VIIRS(input_file, output_res):
    """Warp one VIIRS GeoTIFF to EPSG:4326 and return its pixels as a DataFrame.

    The warped TIFF is written under ``/home/vgindi/Processed_VIIRS/<parent folder>/``
    with the same file name as ``input_file``. Returns ``None`` (after printing a
    message) if the file cannot be opened or any step raises.

    NOTE(review): ``output_res`` is passed straight to ``xRes``/``yRes`` while the
    target SRS is EPSG:4326; callers in this notebook describe it as meters.
    Presumably GDAL reads these in target-SRS units (degrees); confirm.
    """
    try:
        # Mirror the input's parent folder name under the processed-output root
        parent_dir, tif_name = os.path.split(input_file)
        output_folder_tiff = os.path.join("/home/vgindi/Processed_VIIRS", os.path.basename(parent_dir))
        os.makedirs(output_folder_tiff, exist_ok=True)
        output_file = os.path.join(output_folder_tiff, tif_name)

        source = gdal.Open(input_file)
        if source is None:
            print(f"Failed to open '{input_file}'. Make sure the file is a valid GeoTIFF file.")
            return None

        # Reproject and resample; the result is read back from disk below
        gdal.Warp(output_file, source, dstSRS="EPSG:4326", xRes=output_res, yRes=-output_res, resampleAlg="bilinear")

        # Flatten the warped raster into one row per pixel
        raster = rxr.open_rasterio(output_file)
        raster = raster.squeeze().drop("spatial_ref").drop("band")
        raster.name = "data"
        return raster.to_dataframe().reset_index()
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def process_and_convert_viirs(input_dir, output_res):
    """Convert every ``.tif`` in ``input_dir``'s sub-folders to a CSV.

    Each file is passed to ``processing_VIIRS``. When that returns a DataFrame
    it is saved to ``/home/vgindi/Processed_VIIRS/VIIRS_csv/<name>.csv``.
    """
    for year in os.listdir(input_dir):
        year_dir = os.path.join(input_dir, year)
        # Only descend into sub-directories
        if not os.path.isdir(year_dir):
            continue

        for file_name in os.listdir(year_dir):
            if not file_name.endswith('.tif'):
                continue

            frame = processing_VIIRS(os.path.join(year_dir, file_name), output_res)
            # processing_VIIRS returns None on failure; skip those files
            if frame is None:
                continue

            csv_folder = os.path.join("/home/vgindi/Processed_VIIRS", "VIIRS_csv")
            os.makedirs(csv_folder, exist_ok=True)
            csv_file_path = os.path.join(csv_folder, file_name.replace('.tif', '.csv'))

            frame.to_csv(csv_file_path, index=False)
            print(f"Processed and saved {csv_file_path}")

# Driver for the resample-and-convert step.
# NOTE(review): the input path is absolute and user-specific.
if __name__ == "__main__":
    input_directory = "/home/vgindi/VIIRS_Sierras"
    output_res = 100  # Desired resolution in meters
    process_and_convert_viirs(input_directory, output_res)

In [None]:
"""
This code cell fetches the cell id using grid_cells_meta_idx metadata for each lat/lon pair for VIIRS csv file
"""
def create_polygon(self, row):
    """Build a Polygon from a row's four corner coordinates.

    Corners are taken in the order bottom-left, bottom-right, upper-right,
    upper-left, each as a (longitude, latitude) pair. ``self`` is unused.
    """
    bottom_left = (row['BL_Coord_Long'], row['BL_Coord_Lat'])
    bottom_right = (row['BR_Coord_Long'], row['BR_Coord_Lat'])
    upper_right = (row['UR_Coord_Long'], row['UR_Coord_Lat'])
    upper_left = (row['UL_Coord_Long'], row['UL_Coord_Lat'])
    corners = [bottom_left, bottom_right, upper_right, upper_left]
    return Polygon(corners)
    
def process_folder(self, input_folder, metadata_path, output_folder):
    """Attach a ``cell_id`` to every point in each CSV of ``input_folder``.

    The grid-cell metadata is turned into polygons, each CSV's ``x``/``y``
    points are spatially joined to them, and the rows that fall in a cell are
    written to ``output_folder`` under the same file name. CSVs that already
    exist in ``output_folder`` are skipped.

    Parameters
    ----------
    self : object
        Must provide ``create_polygon(row)``.
        NOTE(review): this looks copied from a class; as a module-level
        function it still needs an object with that method. Confirm callers.
    input_folder : str
        Folder of CSV files with ``x``, ``y`` and ``data`` columns.
    metadata_path : str
        CSV with ``cell_id`` and the eight ``*_Coord_Long`` / ``*_Coord_Lat`` columns.
    output_folder : str
        Destination folder; created if missing.
    """
    # Import the metadata into a pandas DataFrame
    pred_obs_metadata_df = pd.read_csv(metadata_path)

    # Drop the saved index column if present, then add polygon geometries
    pred_obs_metadata_df = pred_obs_metadata_df.drop(columns=['Unnamed: 0'], errors='ignore')
    pred_obs_metadata_df['geometry'] = pred_obs_metadata_df.apply(self.create_polygon, axis=1)

    # Convert the DataFrame to a GeoDataFrame
    metadata = gpd.GeoDataFrame(pred_obs_metadata_df, geometry='geometry')

    # Drop coordinates columns now that the polygon carries them
    metadata = metadata.drop(columns=['BL_Coord_Long', 'BL_Coord_Lat',
                                      'BR_Coord_Long', 'BR_Coord_Lat',
                                      'UR_Coord_Long', 'UR_Coord_Lat',
                                      'UL_Coord_Long', 'UL_Coord_Lat'])

    # Ensure the destination exists before the first write
    os.makedirs(output_folder, exist_ok=True)

    # List all CSV files in the input folder
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

    for csv_file in csv_files:
        input_path = os.path.join(input_folder, csv_file)
        output_path = os.path.join(output_folder, csv_file)

        # Check if the output file already exists
        if os.path.exists(output_path):
            print(f"CSV file {csv_file} already exists in the output folder.")
            continue

        # Process each CSV file
        viirs_sca_df = pd.read_csv(input_path)

        # Convert the VIIRS frame into a GeoDataFrame with point geometries
        geometry = [Point(xy) for xy in zip(viirs_sca_df['x'], viirs_sca_df['y'])]
        viirs_sca_geo = gpd.GeoDataFrame(viirs_sca_df, geometry=geometry)
        # Only `predicate` is passed: the legacy `op` keyword is deprecated in
        # geopandas and conflicts with a non-default `predicate`.
        result = gpd.sjoin(viirs_sca_geo, metadata, how='left', predicate='within')

        # Select and rename by assignment; an in-place rename on a column
        # slice triggers SettingWithCopyWarning.
        Final_df = result[['y', 'x', 'data', 'cell_id']].rename(columns={'data': 'VIIRS_SCA'})

        # Drop rows that did not fall inside any grid cell
        Final_df = Final_df.dropna(subset=['cell_id'])

        # Save the processed DataFrame to a CSV file
        Final_df.to_csv(output_path, index=False)
        print(f"Processed {csv_file}")

# Driver for the cell-id lookup step; the three paths are left blank to be filled in.
# NOTE(review): process_folder is declared with a leading `self` parameter, so this
# three-argument call is one argument short - confirm how it is meant to be invoked.
if __name__ == "__main__":
    input_folder = r""
    metadata_path = r""
    output_folder = r""
    process_folder(input_folder, metadata_path, output_folder)

In [None]:
#Applying polygon geometries
# input_folder = f"ASO/{region}/{output_res}M_SWE_parquet/"
# metadata_file = f"grid_cells_meta.csv"
# output_folder = f"ASO/{region}/Processed_SWE"
# data_processor = ASODataProcessing()
# data_processor.process_folder(input_folder, metadata_file, output_folder) 