# Data Processing script for the NSM/SWEML v2.0
This notebook uses Python modules to retrieve NASA ASO observations, locate the nearest SNOTEL sites, connect SNOTEL observations with ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

# Next steps 
- Revisit other scripts and convert to PyArrow/Parquet Brotli-compressed file storage
- Replace row-by-row for loops with lambda functions or list comprehensions. For example, `for aso_swe_file in tqdm(os.listdir(f"{TrainingDFpath}/Obsdf")): aso_swe_files.append(aso_swe_file)  # add file names to aso_swe_files` becomes a comprehension in the style of `tiff_files = [filename for filename in os.listdir(folder_path) if filename.endswith(".tif")]`
- VIIRS
- connect precip to DF,
- add new sites (e.g., regionval) to training DF with all the respective spatial resolution information
- connect regional data together to train model
- connect different regions
- add precipitation phase features (seasonal accumulated rain precip, seasonal accumulated snow precip as a function of temperature)
- explore adding other features stemming from SNOTEL, remote sensing (LULC), Snow Classifications (Sturms), energy balance
- add snotel script to functions

Put all units in SI. While this should not matter for model training because the features are normalized, SI units will be more interpretable.

In [None]:
# Names of the regions, one entry per line.
region_list = [
    'N_Sierras',
    'Greater_Yellowstone',
    'N_Co_Rockies',
    'SW_Mont',
    'SW_Co_Rockies',
    'GBasin',
    'N_Wasatch',
    'N_Cascade',
    'S_Wasatch',
    'SW_Mtns',
    'E_WA_N_Id_W_Mont',
    'S_Wyoming',
    'SE_Co_Rockies',
    'Sawtooth',
    'Ca_Coast',
    'E_Or',
    'N_Yellowstone',
    'S_Cascade',
    'Wa_Coast',
    'Greater_Glacier',
]

In [None]:
from ASOget import ASODownload, ASODataProcessing

# Search for and download ASO data for each region, then convert the downloaded tifs to parquet.

# Inputs for fetching ASO data for a region
short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
# NOTE: this assignment replaces any region_list defined earlier in the notebook with a single region
region_list = ['S_Sierras']
output_res = 300 #desired spatial resolution in meters (m)
directory = "Raw_ASO_Data"

#Get ASO data
for region in region_list:
    # Region-specific folder handed to the tif-to-parquet conversion below
    folder_name = f"{region}/{directory}"
    data_tool = ASODownload(short_name, version)
    b_box = data_tool.BoundingBox(region)  
    # url_list is not used again in this cell; presumably cmr_download relies on the
    # search results kept by data_tool — TODO confirm
    url_list = data_tool.cmr_search(time_start, time_end, region, b_box)
    data_tool.cmr_download(directory, region)

    #Convert ASO tifs to parquet
    data_processor = ASODataProcessing()
    data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, region) 

# Code for generating ML dataframe using nearest in situ monitoring sites

In [None]:
import GeoDF

# GeoDF used to create a dataframe for ML model development. Its function is to connect in situ observations to gridded locations
# region_list and output_res are not set in this cell; they come from a cell run earlier.
for region in region_list:
    #load snotel meta location data, use haversine function
    GeoDF.fetch_snotel_sites_for_cellids(region, output_res) # Using known up to date sites, can this be threaded?

    # Get geophysical attributes for each site, need to see how to add output resolution
    gdf = GeoDF.GeoSpatial(region, output_res)

    #use geodataframe with lat/long meta of all sites to determine slope, aspect, and elevation
    # NOTE(review): gdf and metadf are reassigned on every iteration, so only the last
    # region's objects remain bound after the loop.
    metadf = GeoDF.extract_terrain_data_threaded(gdf, region, output_res)




## Connect Snotel to each ASO obs

In [1]:
import Obs_to_DF
region = "S_Sierras"
output_res = 300

#Connect nearest snotel observations with ASO data, makes a parquet file for each date  -  test to see if this works
# The returned objects are inspected by the cells that follow: aso_swe_files and
# new_column_names are compared to count matching dates, and snotel_data is previewed.
aso_swe_files, aso_swe_files_folder_path, new_column_names, snotel_data = Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res) 

Connecting site observations with nearest monitoring network obs
Loading observations from 2013-2019
Loading 300M resolution grids for S_Sierras region
Processing datetime component of SNOTEL observation dataframe
Loading all available processed ASO observations for the S_Sierras at 300M resolution


In [2]:
# Compare the date stamp at the end of each ASO file name with the values of
# new_column_names and tally which stamps are present and which are not.
ts = []    # stamps found among the new_column_names values
nots = []  # stamps not found among the new_column_names values
for aso_swe_file in aso_swe_files:
    # The stamp is the text after the last underscore, up to the first dot
    timestamp = aso_swe_file.rsplit('_', 1)[-1].partition('.')[0]
    bucket = ts if timestamp in new_column_names.values() else nots
    bucket.append(timestamp)

print(f"There are {len(ts)} aso dates in snotel obs")
print(f"There are {len(nots)} missing snotel obs")

There are 33 aso dates in snotel obs
There are 66 missing snotel obs


In [6]:
# Preview the first 30 rows of the transposed in situ table
snotel_data.transpose().head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,754,755,756,757,758,759,760,761,762,763
station_id,CDEC:ADM,CDEC:AGP,CDEC:ALP,CDEC:BCB,CDEC:BCH,CDEC:BFL,CDEC:BGP,CDEC:BIM,CDEC:BKL,CDEC:BLA,...,SNOTEL:988_ID_SNTL,SNOTEL:989_ID_SNTL,SNOTEL:990_WA_SNTL,SNOTEL:991_WA_SNTL,SNOTEL:992_UT_SNTL,SNOTEL:994_WA_SNTL,SNOTEL:995_WA_SNTL,SNOTEL:996_WA_SNTL,SNOTEL:998_WA_SNTL,SNOTEL:999_WA_SNTL
2013-01-01,5.9,17.52,12.75,4.3,2.88,6.6,7.2,19.32,16.56,4.92,...,19.2,9.0,27.5,19.171154,4.1,19.171154,19.171154,19.171154,48.2,33.0
2013-01-08,5.9,17.54,13.32,4.42,3.0,7.92,7.28,19.32,17.04,5.64,...,19.6,9.1,28.6,19.928846,4.1,19.928846,19.928846,19.928846,53.0,36.3
2013-01-15,6.5,17.85,14.26,4.62,3.48,8.76,7.79,20.16,18.72,7.32,...,21.6,10.7,31.5,22.063462,4.4,22.063462,22.063462,22.063462,61.3,40.6
2013-01-22,6.5,17.39,14.02,4.53,3.84,9.0,7.8,20.28,19.2,7.56,...,21.9,11.1,31.8,22.096154,4.5,22.096154,22.096154,22.096154,62.0,42.4
2013-01-29,7.4,18.03,13.39,4.67,3.96,10.44,8.35,20.52,19.08,8.28,...,23.3,11.7,33.1,23.247059,4.8,23.247059,23.247059,23.247059,66.3,46.6
2013-02-05,7.6,17.7,13.25,4.9,4.44,10.8,8.12,20.16,20.16,9.6,...,23.8,14.1,33.8,25.207843,5.1,25.207843,25.207843,25.207843,69.8,50.2
2013-02-12,7.4,17.65,14.3,4.9,5.4,11.16,8.21,21.12,21.72,8.52,...,24.6,14.5,35.2,26.375,5.2,26.375,26.375,26.375,72.9,53.9
2013-02-19,8.0,16.66,13.95,5.06,5.16,11.28,8.28,21.12,22.32,8.52,...,25.2,14.5,36.5,26.980769,5.2,26.980769,26.980769,26.980769,76.9,56.4
2013-02-26,8.0,17.21,15.73,5.11,3.6,12.0,8.22,21.12,24.36,8.16,...,26.7,17.0,38.1,29.275,6.0,29.275,29.275,29.275,82.0,61.6


In [4]:
import numpy as np

# Sort the unmatched ASO date stamps; YYYYMMDD strings sort in chronological order
np.sort(np.asarray(nots))

array(['20140320', '20140323', '20140324', '20140406', '20140407',
       '20140414', '20140420', '20140423', '20140428', '20140502',
       '20140503', '20140511', '20140512', '20140517', '20140531',
       '20140605', '20150305', '20150325', '20150326', '20150403',
       '20150409', '20150412', '20150415', '20150426', '20150427',
       '20150503', '20150527', '20150528', '20150531', '20150608',
       '20160407', '20160416', '20170718', '20170719', '20170727',
       '20170815', '20170816', '20180304', '20180422', '20180423',
       '20180426', '20180528', '20180601', '20180602', '20190315',
       '20190316', '20190317', '20190324', '20190325', '20190417',
       '20190418', '20190421', '20190427', '20190428', '20190501',
       '20190502', '20190503', '20190605', '20190608', '20190609',
       '20190613', '20190614', '20190704', '20190705', '20190713',
       '20190714'], dtype='<U8')

In [5]:
import pandas as pd

# Check how a date string is rendered in the YYYY-MM-DD format
pd.Timestamp('2014-03-20').strftime('%Y-%m-%d')

'2014-03-20'

In [1]:
import get_InSitu_obs

# Request in situ observations for a single date. 2014-03-20 is the first entry in the
# sorted list of ASO dates that had no matching in situ column.
dates = ['2014-03-20']
# NOTE(review): despite its name, SWE_dict is displayed as a table indexed by station_id
# in the following cells — confirm the return type against get_InSitu_obs.
SWE_dict, CDECsites = get_InSitu_obs.Get_Monitoring_Data_Threaded(dates)



ADM 2014-03-19 2014-03-20
AGP 2014-03-19 2014-03-20
ALP 2014-03-19 2014-03-20
BCB 2014-03-19 2014-03-20
BCH 2014-03-19 2014-03-20
BFL 2014-03-19 2014-03-20
BGP 2014-03-19 2014-03-20
BIM 2014-03-19 2014-03-20
BKL 2014-03-19 2014-03-20
BLA 2014-03-19 2014-03-20
BLC 2014-03-19 2014-03-20
BLD 2014-03-19 2014-03-20
BLK 2014-03-19 2014-03-20
BLS 2014-03-19 2014-03-20
BMW 2014-03-19 2014-03-20
BNK 2014-03-19 2014-03-20
BSK 2014-03-19 2014-03-20
CDP 2014-03-19 2014-03-20
CHM 2014-03-19 2014-03-20
CHP 2014-03-19 2014-03-20
CRL 2014-03-19 2014-03-20
CSL 2014-03-19 2014-03-20
CSV 2014-03-19 2014-03-20
CWD 2014-03-19 2014-03-20
CWF 2014-03-19 2014-03-20
CXS 2014-03-19 2014-03-20
DAN 2014-03-19 2014-03-20
DDM 2014-03-19 2014-03-20
DPO 2014-03-19 2014-03-20
DSS 2014-03-19 2014-03-20
EBB 2014-03-19 2014-03-20
EP5 2014-03-19 2014-03-20
FDC 2014-03-19 2014-03-20
FLL 2014-03-19 2014-03-20
FRN 2014-03-19 2014-03-20
GEM 2014-03-19 2014-03-20
GIN 2014-03-19 2014-03-20
GKS 2014-03-19 2014-03-20
GNF 2014-03

<suds.sax.document.Document object at 0x7f0bae360100>
<suds.sax.document.Document object at 0x7f0bafeaf670>


Snotel data fail, SNOTEL:878_WY_SNTL
Snotel data fail, SNOTEL:549_NV_SNTL


<suds.sax.document.Document object at 0x7f0bafeaf880>
<suds.sax.document.Document object at 0x7f0bac1d84c0>


Snotel data fail, SNOTEL:549_NV_SNTL
Attempt 2 for site SNOTEL:549_NV_SNTL


<suds.sax.document.Document object at 0x7f0ba559e070>


Snotel data fail, SNOTEL:878_WY_SNTL
Attempt 2 for site SNOTEL:878_WY_SNTL
Snotel data fail, SNOTEL:998_WA_SNTL


<suds.sax.document.Document object at 0x7f0bafeaf1f0>
<suds.sax.document.Document object at 0x7f0ba5594e50>


Snotel data fail, SNOTEL:549_NV_SNTL
Attempt 3 for site SNOTEL:549_NV_SNTL


<suds.sax.document.Document object at 0x7f0bae360100>
<suds.sax.document.Document object at 0x7f0bafeaf700>


Snotel data fail, SNOTEL:998_WA_SNTL
Attempt 2 for site SNOTEL:998_WA_SNTL
Snotel data fail, SNOTEL:878_WY_SNTL
Attempt 3 for site SNOTEL:878_WY_SNTL
Snotel data fail, SNOTEL:549_NV_SNTL
Attempt 4 for site SNOTEL:549_NV_SNTL


<suds.sax.document.Document object at 0x7f11e88bb070>


Snotel data fail, SNOTEL:305_CO_SNTL


<suds.sax.document.Document object at 0x7f0ba2269e20>
<suds.sax.document.Document object at 0x7f0ba5594e80>
<suds.sax.document.Document object at 0x7f0ba1e2a850>


Snotel data fail, SNOTEL:878_WY_SNTL
Attempt 4 for site SNOTEL:878_WY_SNTL
Snotel data fail, SNOTEL:998_WA_SNTL
Attempt 3 for site SNOTEL:998_WA_SNTL


<suds.sax.document.Document object at 0x7f11e88bb6d0>


Snotel data fail, SNOTEL:549_NV_SNTL
Snotel data fail, SNOTEL:305_CO_SNTL
Attempt 2 for site SNOTEL:305_CO_SNTL


<suds.sax.document.Document object at 0x7f1184d4df40>
<suds.sax.document.Document object at 0x7f0bc9757280>


Snotel data fail, SNOTEL:878_WY_SNTL
Snotel data fail, SNOTEL:998_WA_SNTL
Attempt 4 for site SNOTEL:998_WA_SNTL


<suds.sax.document.Document object at 0x7f11c5c84520>
<suds.sax.document.Document object at 0x7f1185abc7c0>


Snotel data fail, SNOTEL:305_CO_SNTL
Attempt 3 for site SNOTEL:305_CO_SNTL
Snotel data fail, SNOTEL:998_WA_SNTL


<suds.sax.document.Document object at 0x7f11f10965e0>


Snotel data fail, SNOTEL:305_CO_SNTL
Attempt 4 for site SNOTEL:305_CO_SNTL


<suds.sax.document.Document object at 0x7f10a6ecff40>
<suds.sax.document.Document object at 0x7f116597eaf0>


Snotel data fail, SNOTEL:379_WY_SNTL
Snotel data fail, SNOTEL:305_CO_SNTL


<suds.sax.document.Document object at 0x7f10a6ecf460>


Snotel data fail, SNOTEL:379_WY_SNTL
Attempt 2 for site SNOTEL:379_WY_SNTL


<suds.sax.document.Document object at 0x7f11f01f81c0>


Snotel data fail, SNOTEL:379_WY_SNTL
Attempt 3 for site SNOTEL:379_WY_SNTL


<suds.sax.document.Document object at 0x7f11471e9f40>


Snotel data fail, SNOTEL:379_WY_SNTL
Attempt 4 for site SNOTEL:379_WY_SNTL


<suds.sax.document.Document object at 0x7f10c6d274f0>


Snotel data fail, SNOTEL:379_WY_SNTL


In [2]:
# Show the stations that report a value below zero for 2014-03-20
is_negative = SWE_dict['2014-03-20'] < 0
SWE_dict[is_negative]



Unnamed: 0_level_0,2014-03-20
station_id,Unnamed: 1_level_1
CDEC:ADM,-0.1
CDEC:BLC,-0.05
CDEC:BNK,-0.24
CDEC:CWF,-0.2
CDEC:FLL,-0.2
CDEC:HIG,-0.46
CDEC:PSN,-3.7
CDEC:RBP,-0.38
CDEC:SCT,-0.52
CDEC:TCC,-0.2


In [3]:
# Display everything returned for the requested date
SWE_dict

Unnamed: 0_level_0,2014-03-20
station_id,Unnamed: 1_level_1
CDEC:ADM,-0.10
CDEC:AGP,11.94
CDEC:ALP,11.06
CDEC:BCB,19.41
CDEC:BCH,0.12
...,...
SNOTEL:989_ID_SNTL,16.40
SNOTEL:990_WA_SNTL,34.60
SNOTEL:992_UT_SNTL,10.00
SNOTEL:998_WA_SNTL,-9999.00


In [None]:
import numpy as np  # needed for np.nan below, so the cell also runs on a fresh kernel
import pandas as pd
import ulmo

# Stand-alone check: request daily SWE for a single SNOTEL site over a short window.
SNOTELtest = {}  # not used further in this cell

# This is the latest CUAHSI API endpoint
wsdlurl = 'https://hydroportal.cuahsi.org/Snotel/cuahsi_1_1.asmx?WSDL'

# Daily SWE
variablecode = 'SNOTEL:WTEQ_D'
sitecode = 'SNOTEL:379_WY_SNTL'
start_date = '2013-03-19'
end_date = '2013-03-20'

# Request data from the server (single attempt; no retry is performed in this cell)
site_values = ulmo.cuahsi.wof.get_values(wsdlurl, sitecode, variablecode, start=start_date, end=end_date)
# Convert to a Pandas DataFrame
SNOTEL_SWE = pd.DataFrame.from_dict(site_values['values'])
# Parse the datetime values as UTC; taking .values stores them as timezone-naive timestamps
SNOTEL_SWE['datetime'] = pd.to_datetime(SNOTEL_SWE['datetime'], utc=True).values
SNOTEL_SWE = SNOTEL_SWE.set_index('datetime')
# Convert values to float and replace -9999 nodata values with NaN
SNOTEL_SWE['value'] = pd.to_numeric(SNOTEL_SWE['value']).replace(-9999, np.nan)
# Remove any records flagged with lower quality
SNOTEL_SWE = SNOTEL_SWE[SNOTEL_SWE['quality_control_level_code'] == '1']


NameError: name 'pd' is not defined

In [10]:
import os
import pandas as pd

# Locate the SNOTEL metadata table under the user's home directory
HOME = os.path.expanduser('~')
snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data/"
Snotelmeta_path = f"{snotel_path}ground_measures_metadata.csv"

# Sites that are not present in the entire dataset and are dropped below
incom_sites = [
    'SNOTEL:1148_UT_SNTL',
    'SNOTEL:1251_CO_SNTL',
    'SNOTEL:1252_CO_SNTL',
    'SNOTEL:1256_WA_SNTL',
    'SNOTEL:1257_WA_SNTL',
    'SNOTEL:1258_CA_SNTL',
    'SNOTEL:1259_WA_SNTL',
    'SNOTEL:1261_UT_SNTL',
    'SNOTEL:1262_NV_SNTL',
    'SNOTEL:1263_WA_SNTL',
    'SNOTEL:1269_UT_SNTL',
    'SNOTEL:1271_AZ_SNTL',
    'SNOTEL:1272_NV_SNTL',
    'SNOTEL:1277_CA_SNTL',
    'SNOTEL:1278_UT_SNTL',
    'SNOTEL:1280_UT_SNTL',
    'SNOTEL:1286_MT_SNTL',
    'SNOTEL:1287_MT_SNTL',
]

# Read the metadata and keep only the rows whose station_id is not in incom_sites
snotel_file = pd.read_csv(Snotelmeta_path)
keep_rows = ~snotel_file['station_id'].isin(incom_sites)
snotel_file = snotel_file[keep_rows]
snotel_file

Unnamed: 0,station_id,name,elevation_m,latitude,longitude,state
0,CDEC:ADM,Adin Mountain,1889.760000,41.237000,-120.792000,California
1,CDEC:AGP,Agnew Pass,2880.360000,37.726631,-119.141731,California
2,CDEC:ALP,Alpha (Smud),2316.480000,38.804192,-120.215652,California
3,CDEC:BCB,Blackcap Basin,3139.440000,37.066685,-118.773010,California
4,CDEC:BCH,Beach Meadows,2331.720000,36.126095,-118.293457,California
...,...,...,...,...,...,...
695,SNOTEL:989_ID_SNTL,Moscow Mountain,1432.560059,46.805000,-116.853500,Idaho
696,SNOTEL:990_WA_SNTL,Beaver Pass,1106.423950,48.879299,-121.255501,Washington
697,SNOTEL:992_UT_SNTL,Bear River RS,2675.229492,40.885201,-110.827698,Utah
698,SNOTEL:998_WA_SNTL,Easy Pass,1606.296021,48.859329,-121.438950,Washington


In [None]:
import GeoDF

region = 'S_Sierras'
output_res = 300

#Connect cell ids with ASO obs and snotel obs to geospatial features
# The return value is not captured; presumably the results are written to disk — TODO confirm
GeoDF.add_geospatial_threaded(region, output_res)

# Get NASA VIIRS fraction snow covered area for each location 

In [None]:
import get_VIIRS_SCA
region = 'S_Sierras'
output_res = 300
# Passed to augment_SCA_mutliprocessing below; presumably the fSCA threshold that names the
# "20_fSCA_Thresh" folder read by a later cell — TODO confirm
threshold = 20

#check to see if the VIIRS data is available locally, if not, get from CIROH AWS
get_VIIRS_SCA.get_VIIRS_from_AWS()

#Connect VIIRS data to dataframes
get_VIIRS_SCA.augment_SCA_mutliprocessing(region, output_res, threshold)

In [None]:
import os

import numpy as np
import pandas as pd

# Load one of the VIIRS-augmented dataframes for the chosen region, resolution, and threshold
HOME = os.path.expanduser('~')
region = 'S_Sierras'
output_res = 300
threshold = 20

# Build the folder name from threshold so the path follows the setting above
# (with threshold = 20 this is the same "20_fSCA_Thresh" folder as before)
ViirsFolder = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution/VIIRSGeoObsDFs/{threshold}_fSCA_Thresh"

#Get list of GeoObsDF dataframes. os.listdir returns entries in arbitrary order, so sort
# them to make the file picked below the same on every run
GeoObsDF_files = sorted(os.listdir(ViirsFolder))

# Read the second file of the sorted listing
file = pd.read_parquet(os.path.join(ViirsFolder, GeoObsDF_files[1]), engine = 'fastparquet')

file

In [None]:
# Rows that have no VIIRS_SCA value
sca_missing = file['VIIRS_SCA'].isna()
file[sca_missing]

In [None]:
# Distinct VIIRS_SCA values, ascending, among rows with VIIRS_SCA > 0 and swe_m > 0.5
sca_positive = file['VIIRS_SCA'] > 0
swe_above_half = file['swe_m'] > 0.5
notna = file[sca_positive & swe_above_half]
np.sort(notna['VIIRS_SCA'].unique())

In [None]:
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
np.nan

In [None]:
import get_Precip

# Gets precipitation for each location and accumulates it through the water year,
# one call per water year listed in years.
region = 'S_Sierras'
output_res = 100
years = list(range(2013, 2020))  # 2013 through 2019

for year in years:
    get_Precip.get_precip_threaded(year, region, output_res)

In [None]:
import os
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

# Load one NLDAS precipitation parquet file for the region and index it by cell_id
HOME = os.path.expanduser('~')

region = 'S_Sierras'
year = 2013
output_res = 100

Precippath = f"{HOME}/SWEMLv2.0/data/Precipitation/{region}/{output_res}M_NLDAS_Precip/{year}"

# Read and index in one chained expression instead of modifying the frame in place
ppt = pd.read_parquet(f"{Precippath}/NLDAS_PPT_2013-04-03.parquet").set_index('cell_id')

ppt

In [None]:
# Display the precipitation table again
ppt

In [None]:
# NOTE(review): final_df, metadata, and HOME are not defined in this cell; they must
# already exist in the kernel before it is run.

# Merge with metadata
req_cols = ['cell_id', 'lat', 'lon', 'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
            'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat', 'geometry']
Result = final_df.merge(metadata[req_cols], how='left', on='cell_id')

# Column renaming and ordering ('geometry' is merged in above but not kept in the output)
Result = Result.rename(columns={'swe': 'ASO_SWE_in'})
Result = Result[['cell_id', 'Date', 'ASO_SWE_in', 'lat', 'lon', 'nearest site 1', 'nearest site 2',
                    'nearest site 3', 'nearest site 4', 'nearest site 5', 'nearest site 6',
                    'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
                    'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat']]

# Save the merged data to a new file. The target has a .parquet extension, so write
# Parquet; to_csv would have stored CSV text under a .parquet name.
output_filename = f"{HOME}/SWEML/data/NSMv2.0/data/TrainingDFs/Merged_aso_snotel_data.parquet"
Result.to_parquet(output_filename, index=False)
display(Result.head(10))
print("Processed and saved data")