# Data Processing script for the NSM/SWEML v2.0
This .ipynb notebook uses Python modules to retrieve NASA ASO observations, locate the nearest SNOTEL sites, connect SNOTEL observations with ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

# Next steps 
- Revisit other scripts and convert them to PyArrow/parquet Brotli-compressed file storage
- replace row-by-row for loops with lambda functions or list comprehensions, e.g., replace `for aso_swe_file in tqdm(os.listdir(f"{TrainingDFpath}/Obsdf")): aso_swe_files.append(aso_swe_file)  #add file names to aso_swe_files` with a list comprehension such as `tiff_files = [filename for filename in os.listdir(folder_path) if filename.endswith(".tif")]`
- VIIRS
- connect precip to DF,
- add new sites (e.g., regionval) to training DF with all the respective spatial resolution information
- connect regional data together to train model
- connect different regions
- add precipitation phase features (seasonal accumulated rain precip, seasonal accumulated snow precip as a function of temperature)
- explore adding other features stemming from SNOTEL, remote sensing (LULC), snow classifications (Sturm), and energy balance
- add snotel script to functions

Put all units in SI. While this should not matter for model training, since the features are normalized, SI units will make the data more interpretable.

In [2]:
# Names of every modeling region, one entry per line.
region_list = [
    'N_Sierras',
    'Greater_Yellowstone',
    'N_Co_Rockies',
    'SW_Mont',
    'SW_Co_Rockies',
    'GBasin',
    'N_Wasatch',
    'N_Cascade',
    'S_Wasatch',
    'SW_Mtns',
    'E_WA_N_Id_W_Mont',
    'S_Wyoming',
    'SE_Co_Rockies',
    'Sawtooth',
    'Ca_Coast',
    'E_Or',
    'N_Yellowstone',
    'S_Cascade',
    'Wa_Coast',
    'Greater_Glacier',
]

In [3]:
# Download ASO SWE GeoTIFFs for each region and convert them to parquet.
from ASOget import ASODownload, ASODataProcessing

# Inputs for fetching ASO data for a region
short_name = 'ASO_50M_SWE'  # product short name handed to ASODownload
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
region_list = ['S_Sierras']  # NOTE: replaces any region_list defined earlier in the notebook
output_res = 300 #desired spatial resolution in meters (m)
directory = "Raw_ASO_Data"

#Get ASO data
for region in region_list:
    folder_name = f"{region}/{directory}"
    data_tool = ASODownload(short_name, version)
    b_box = data_tool.BoundingBox(region)  
    # url_list is not read again in this cell
    # NOTE(review): presumably cmr_download relies on state set by cmr_search - confirm
    url_list = data_tool.cmr_search(time_start, time_end, region, b_box)
    data_tool.cmr_download(directory, region)

    #Convert ASO tifs to parquet
    data_processor = ASODataProcessing()
    data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, region) 

Bounding Box collected for S_Sierras: -120.3763448720203,36.29256774541929,-118.292253412863,38.994985247736324
Fetching file URLs in progress for S_Sierras from 2013-04-02T00:00:00Z to 2019-07-19T23:59:59Z
Querying for data:
	https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS&sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000&short_name=ASO_50M_SWE&version=001&version=01&version=1&temporal[]=2013-04-02T00:00:00Z,2019-07-19T23:59:59Z&bounding_box=-120.3763448720203,36.29256774541929,-118.292253412863,38.994985247736324

Found 131 matches.
['https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.03/ASO_50M_SWE_USCATB_20130403.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.03/ASO_50M_SWE_USCATB_20130403.tif.xml', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.29/ASO_50M_SWE_USCATB_20130429.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.29/ASO_50M_SWE_USCATB_20130429.tif

100%|██████████| 262/262 [00:00<00:00, 2744.26it/s]




KeyboardInterrupt: 






# Code for generating ML dataframe using nearest in situ monitoring sites

In [None]:
import GeoDF

# GeoDF used to create a dataframe for ML model development. Its function is to connect in situ observations to gridded locations
# NOTE: region_list and output_res are not defined in this cell; they must already exist in the kernel.
for region in region_list:
    #load snotel meta location data, use haversine function
    GeoDF.fetch_snotel_sites_for_cellids(region, output_res) # Using known up to date sites, can this be threaded?

    # Get geophysical attributes for each site, need to see how to add output resolution
    gdf = GeoDF.GeoSpatial(region, output_res)

    #use geodataframe with lat/long meta of all sites to determine slope, aspect, and elevation
    # metadf is overwritten on every iteration, so only the last region's result remains after the loop
    metadf = GeoDF.extract_terrain_data_threaded(gdf, region, output_res)




## Connect Snotel to each ASO obs

In [1]:
import importlib

import Obs_to_DF

# Reload only AFTER the import: calling importlib.reload(Obs_to_DF) before the
# name is bound raises NameError on a fresh kernel. The reload picks up edits
# made to Obs_to_DF.py since it was first imported in this kernel.
importlib.reload(Obs_to_DF)

region = "S_Sierras"
output_res = 300

#Connect nearest snotel observations with ASO data, makes a parquet file for each date  -  test to see if this works
Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res)

Connecting site observations with nearest monitoring network obs
Loading observations from 2013-2019
Loading 300M resolution grids for S_Sierras region
Processing datetime component of SNOTEL observation dataframe
Loading 99 processed ASO observations for the S_Sierras at 300M resolution
There are 99 aso dates in snotel obs
There are 0 missing snotel obs
Getting CDEC and SNOTEL observations for the following dates: []


0it [00:00, ?it/s]

Updating local meta and saving.





Connecting 1 timesteps of observations for S_Sierras


100%|██████████| 1/1 [00:00<00:00,  1.74it/s]


Adding geospatial data to 20160614 observations...


  0%|          | 0/25504 [00:00<?, ?it/s]

Job complete for connecting SNOTEL obs to sites/dates


In [None]:
import GeoDF

# Target region and grid resolution for this step
region, output_res = 'S_Sierras', 300

# Join the geospatial features onto the cell ids that carry ASO and SNOTEL observations
GeoDF.add_geospatial_threaded(region, output_res)

# Get NASA VIIRS fractional snow-covered area (fSCA) for each location

In [None]:
import get_VIIRS_SCA

# Run settings: region, grid resolution, and the threshold handed to the SCA augmentation
region, output_res, threshold = 'S_Sierras', 300, 20

# Use the local VIIRS data when present; otherwise fetch it from CIROH AWS
get_VIIRS_SCA.get_VIIRS_from_AWS()

# Attach the VIIRS data to the dataframes
get_VIIRS_SCA.augment_SCA_mutliprocessing(region, output_res, threshold)

In [None]:
#load access key
import pandas as pd
import numpy as np
import os
HOME = os.path.expanduser('~')
region = 'S_Sierras'
output_res = 300
threshold = 20

ViirsFolder = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution/VIIRSGeoObsDFs/20_fSCA_Thresh"

#Get list of GeoObsDF dataframes
GeoObsDF_files = [filename for filename in os.listdir(ViirsFolder)]

file = pd.read_parquet(os.path.join(ViirsFolder, GeoObsDF_files[1]), engine = 'fastparquet')

file

In [None]:
# Rows whose VIIRS_SCA value is missing
file.loc[file['VIIRS_SCA'].isna()]

In [None]:
# Keep rows with positive VIIRS_SCA and swe_m above 0.5, then list the distinct VIIRS_SCA values in ascending order
has_sca = file['VIIRS_SCA'] > 0
deep_swe = file['swe_m'] > 0.5
notna = file[has_sca & deep_swe]
np.sort(notna['VIIRS_SCA'].unique())

In [None]:
# Use the canonical lowercase spelling: the np.NaN alias was removed in NumPy 2.0
np.nan

In [None]:
import get_Precip

# Fetch precipitation for each location and accumulate it through the water year.
# NOTE(review): output_res is 100 here while other cells in this notebook use 300 - confirm this is intended.
region = 'S_Sierras'
output_res = 100

# One water year per entry, 2013 through 2019 inclusive
years = list(range(2013, 2020))

for year in years:
    get_Precip.get_precip_threaded(year, region, output_res)

In [None]:
import os
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

HOME = os.path.expanduser('~')

# Region, water year and grid resolution that select the precipitation folder
region, year, output_res = 'S_Sierras', 2013, 100

Precippath = f"{HOME}/SWEMLv2.0/data/Precipitation/{region}/{output_res}M_NLDAS_Precip/{year}"

# Read a single day of NLDAS precipitation and key it by cell_id
ppt = pd.read_parquet(f"{Precippath}/NLDAS_PPT_2013-04-03.parquet").set_index('cell_id')

ppt

In [None]:
# Display the precipitation dataframe `ppt` again; it must already exist in the kernel.
ppt

In [None]:
# Merge with metadata
# NOTE(review): final_df, metadata and HOME are not defined in this cell; they must
# already exist in the kernel, so this cell fails under Restart & Run All unless an
# earlier cell creates them.
req_cols = ['cell_id', 'lat', 'lon', 'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
            'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat', 'geometry']
Result = final_df.merge(metadata[req_cols], how='left', on='cell_id')

# Column renaming and ordering ('geometry' is merged in above but not kept in the output)
Result = Result.rename(columns={'swe': 'ASO_SWE_in'})
Result = Result[['cell_id', 'Date', 'ASO_SWE_in', 'lat', 'lon', 'nearest site 1', 'nearest site 2',
                    'nearest site 3', 'nearest site 4', 'nearest site 5', 'nearest site 6',
                    'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
                    'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat']]

# Save the merged data to a new file
# NOTE(review): this path is under SWEML/data/NSMv2.0, while other cells use SWEMLv2.0 - confirm.
output_filename = f"{HOME}/SWEML/data/NSMv2.0/data/TrainingDFs/Merged_aso_snotel_data.parquet"
# The file name ends in .parquet, so write real parquet; to_csv would put CSV text
# in this file and a later pd.read_parquet on it would fail.
Result.to_parquet(output_filename, index=False)
display(Result.head(10))
print("Processed and saved data")