# Data Processing script for the NSM/SWEML v2.0
This notebook uses the project's Python modules to retrieve NASA ASO observations, locate the nearest SNOTEL sites, connect the SNOTEL observations with the ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

Note: sometimes you have to change the ipywidgets version (e.g., 8.1.3 or 7.7.3) for the progress bars to work, which is annoying.


In [2]:
pip install --force-reinstall -v "ipywidgets==7.7.5"

Using pip 24.0 from /uufs/chpc.utah.edu/common/home/u1531554/envs/SWEML_310/lib/python3.10/site-packages/pip (python 3.10)
Collecting ipywidgets==7.7.5
  Obtaining dependency information for ipywidgets==7.7.5 from https://files.pythonhosted.org/packages/34/65/e97c8528ce10091a7467fe82ade2d101270a233b9fb7324012ed0ebd0586/ipywidgets-7.7.5-py2.py3-none-any.whl.metadata
  Downloading ipywidgets-7.7.5-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting ipykernel>=4.5.1 (from ipywidgets==7.7.5)
  Obtaining dependency information for ipykernel>=4.5.1 from https://files.pythonhosted.org/packages/fc/c7/b445faca8deb954fe536abebff4ece5b097b923de482b26e78448c89d1dd/ipykernel-6.30.1-py3-none-any.whl.metadata
  Using cached ipykernel-6.30.1-py3-none-any.whl.metadata (6.2 kB)
Collecting ipython-genutils~=0.2.0 (from ipywidgets==7.7.5)
  Obtaining dependency information for ipython-genutils~=0.2.0 from https://files.pythonhosted.org/packages/fa/bc/9bd3b5c2b4774d5f33b2d544f1460be9df7df2fe42f352135381c347c

In [1]:
import os
import model_Domain
import pyproj

# Working directory of the kernel when this cell runs; later cells build their
# data paths from HOME, so run this cell before anything changes directory.
HOME = os.getcwd()

# If you get a proj.db error below, run this block: it points PROJ_LIB at the
# PROJ data directory that ships with pyproj and prints where proj.db is
# expected, so the same path can also be used from the terminal.
# NOTE(review): newer PROJ releases read PROJ_DATA in preference to PROJ_LIB;
# confirm which one the installed PROJ honours if the error persists.
proj_data_dir = pyproj.datadir.get_data_dir()
proj_db_path = f"{proj_data_dir}/proj.db"
os.environ['PROJ_LIB'] = proj_data_dir
print(proj_db_path)

# Make the SWEMLv2.0 modeling domain for the western USA
region_list = model_Domain.modeldomain()
region_list.remove('NorthernRockies')  # There is no ASO data for this region
# Manual override of the regions to process; other options seen in this
# notebook: 'Northwest', 'Southwest'
region_list = ['SouthernRockies']
region_list

/uufs/chpc.utah.edu/common/home/u1154915/.conda/envs/SWEML_310/lib/python3.10/site-packages/pyproj/proj_dir/share/proj/proj.db
Checking for required files
ground_measures_metadata.parquet is local


['SouthernRockies']

In [3]:
# Move into the Dataprocessing folder so its modules can be imported.
# The absolute path (built from HOME, set in the first cell) keeps this cell
# re-runnable: a relative os.chdir('Dataprocessing') raises FileNotFoundError
# once the kernel is already inside that folder. os.chdir() returns None, so
# ModuleDir is read back with os.getcwd() afterwards.
os.chdir(os.path.join(HOME, 'Dataprocessing'))
ModuleDir = os.getcwd()
from ASOget import ASODownload, ASODataProcessing

# # import earthaccess https://earthaccess.readthedocs.io/en/latest/howto/authenticate/
# # earthaccess.login(persist=True)

# Inputs for fetching ASO data for a region
short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
output_res = 750  # desired spatial resolution in meters (m)
directory = "Raw_ASO_Data"

# Get ASO data. Sometimes a site gives an error and breaks the code; most
# times you can just rerun it using the data_processor section below
# (e.g., with the download lines commented out, as they are here).
for region in region_list:
    print(region)
    folder_name = f"{region}/{directory}"
    # data_tool = ASODownload(short_name, version)
    # b_box = data_tool.BoundingBox(region)
    # url_list = data_tool.cmr_search(time_start, time_end, region, b_box)
    # data_tool.cmr_download(directory, region)

    # Convert ASO tifs to parquet
    # NOTE: 2019-5-1 and 2019-06-11 seem to be bad; manually removed from the SW region
    data_processor = ASODataProcessing()
    data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, region)

SouthernRockies
Converting .tif to parquet
Converting 1 ASO tif files to parquet


  0%|          | 0/1 [00:00<?, ?it/s]

Checking to make sure all files successfully converted...


  0%|          | 0/1 [00:00<?, ?it/s]

## Get Snotel and CDEC in situ observations
- Clean the in situ observations, specifically the CDEC sites; a data processing method is needed to remove outliers and NaN/0 observations.
- Ideas: add the nearest sites' elevation and distance from the cell, so that sites with bad data can be bypassed.

In [5]:
# Move into the Dataprocessing folder so its modules can be imported.
# The absolute path (built from HOME, set in the first cell) keeps this cell
# re-runnable: a relative os.chdir('Dataprocessing') raises FileNotFoundError
# once the kernel is already inside that folder. os.chdir() returns None, so
# ModuleDir is read back with os.getcwd() afterwards.
os.chdir(os.path.join(HOME, 'Dataprocessing'))
ModuleDir = os.getcwd()

# Get in situ observations
import get_InSitu_obs
import numpy as np

# Years and month-day window to retrieve, chosen to align with the ASO
# observations (they go as early as Jan-29 and as far out as July-17)
years = np.arange(2013, 2020, 1)
start_month_day = '10-01'
end_month_day = '08-31'
#datelist = get_InSitu_obs.make_dates(years, start_month_day, end_month_day, WY = True)

# Retrieve the observations for every year
get_InSitu_obs.Get_Monitoring_Data_Threaded_Updated(years, start_month_day, end_month_day, WY = True)

# Combine years
get_InSitu_obs.combine_dfs(years)

getting in situ snow obs metadata
Getting SNOTEL and CDEC observations for 2013
Getting California Data Exchange Center SWE data from 130 sites...


  0%|          | 0/130 [00:00<?, ?it/s]

Start retrieving data for Farewell Gap, FRW
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/FRW:CA:MSNT%257Cid=%2522%2522%257Cname/2012-10-01,2013-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Bonanza King, BNK
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/BNK:CA:MSNT%257Cid=%2522%2522%257Cname/2012-10-01,2013-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Cedar Pass, CDP
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CDP:CA:MSNT%257Cid=%2522%2522%257Cname/2012-10-01,2013-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Charlotte Lake, CRL
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CRL:CA:MSNT%257Cid=%2522%2522%257Cname/2012-10-01,2013-08-31/WTEQ::value?fitToS

  0%|          | 0/839 [00:00<?, ?it/s]

Start retrieving data for Adin Mtn, 301_CA_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/301:CA:SNTL%7Cid=%22%22%7Cname/2012-10-01,2013-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Agua Canyon, 907_UT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/907:UT:SNTL%7Cid=%22%22%7Cname/2012-10-01,2013-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Albro Lake, 916_MT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/916:MT:SNTL%7Cid=%22%22%7Cname/2012-10-01,2013-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Alexander Lake, 1267_AK_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/1267:AK:SNTL%7Cid=%22%22%7Cname/2012-10-01,2013-08-

  0%|          | 0/130 [00:00<?, ?it/s]

Start retrieving data for Farewell Gap, FRW
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/FRW:CA:MSNT%257Cid=%2522%2522%257Cname/2013-10-01,2014-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Bonanza King, BNK
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/BNK:CA:MSNT%257Cid=%2522%2522%257Cname/2013-10-01,2014-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Cedar Pass, CDP
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CDP:CA:MSNT%257Cid=%2522%2522%257Cname/2013-10-01,2014-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Charlotte Lake, CRL
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CRL:CA:MSNT%257Cid=%2522%2522%257Cname/2013-10-01,2014-08-31/WTEQ::value?fitToS

  0%|          | 0/839 [00:00<?, ?it/s]

Start retrieving data for Adin Mtn, 301_CA_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/301:CA:SNTL%7Cid=%22%22%7Cname/2013-10-01,2014-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Agua Canyon, 907_UT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/907:UT:SNTL%7Cid=%22%22%7Cname/2013-10-01,2014-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Albro Lake, 916_MT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/916:MT:SNTL%7Cid=%22%22%7Cname/2013-10-01,2014-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Alexander Lake, 1267_AK_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/1267:AK:SNTL%7Cid=%22%22%7Cname/2013-10-01,2014-08-

  0%|          | 0/130 [00:00<?, ?it/s]

Start retrieving data for Farewell Gap, FRW
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/FRW:CA:MSNT%257Cid=%2522%2522%257Cname/2014-10-01,2015-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Bonanza King, BNK
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/BNK:CA:MSNT%257Cid=%2522%2522%257Cname/2014-10-01,2015-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Cedar Pass, CDP
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CDP:CA:MSNT%257Cid=%2522%2522%257Cname/2014-10-01,2015-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Charlotte Lake, CRL
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CRL:CA:MSNT%257Cid=%2522%2522%257Cname/2014-10-01,2015-08-31/WTEQ::value?fitToS

  0%|          | 0/839 [00:00<?, ?it/s]

Start retrieving data for Adin Mtn, 301_CA_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/301:CA:SNTL%7Cid=%22%22%7Cname/2014-10-01,2015-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Agua Canyon, 907_UT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/907:UT:SNTL%7Cid=%22%22%7Cname/2014-10-01,2015-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Albro Lake, 916_MT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/916:MT:SNTL%7Cid=%22%22%7Cname/2014-10-01,2015-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Alexander Lake, 1267_AK_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/1267:AK:SNTL%7Cid=%22%22%7Cname/2014-10-01,2015-08-

  0%|          | 0/130 [00:00<?, ?it/s]

Start retrieving data for Farewell Gap, FRW
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/FRW:CA:MSNT%257Cid=%2522%2522%257Cname/2015-10-01,2016-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Bonanza King, BNK
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/BNK:CA:MSNT%257Cid=%2522%2522%257Cname/2015-10-01,2016-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Cedar Pass, CDP
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CDP:CA:MSNT%257Cid=%2522%2522%257Cname/2015-10-01,2016-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Charlotte Lake, CRL
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CRL:CA:MSNT%257Cid=%2522%2522%257Cname/2015-10-01,2016-08-31/WTEQ::value?fitToS

  0%|          | 0/839 [00:00<?, ?it/s]

Start retrieving data for Adin Mtn, 301_CA_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/301:CA:SNTL%7Cid=%22%22%7Cname/2015-10-01,2016-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Agua Canyon, 907_UT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/907:UT:SNTL%7Cid=%22%22%7Cname/2015-10-01,2016-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Albro Lake, 916_MT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/916:MT:SNTL%7Cid=%22%22%7Cname/2015-10-01,2016-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Alexander Lake, 1267_AK_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/1267:AK:SNTL%7Cid=%22%22%7Cname/2015-10-01,2016-08-

  0%|          | 0/130 [00:00<?, ?it/s]

Start retrieving data for Farewell Gap, FRW
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/FRW:CA:MSNT%257Cid=%2522%2522%257Cname/2016-10-01,2017-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Bonanza King, BNK
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/BNK:CA:MSNT%257Cid=%2522%2522%257Cname/2016-10-01,2017-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Cedar Pass, CDP
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CDP:CA:MSNT%257Cid=%2522%2522%257Cname/2016-10-01,2017-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Charlotte Lake, CRL
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CRL:CA:MSNT%257Cid=%2522%2522%257Cname/2016-10-01,2017-08-31/WTEQ::value?fitToS

  0%|          | 0/839 [00:00<?, ?it/s]

Start retrieving data for Adin Mtn, 301_CA_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/301:CA:SNTL%7Cid=%22%22%7Cname/2016-10-01,2017-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Agua Canyon, 907_UT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/907:UT:SNTL%7Cid=%22%22%7Cname/2016-10-01,2017-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Albro Lake, 916_MT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/916:MT:SNTL%7Cid=%22%22%7Cname/2016-10-01,2017-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Alexander Lake, 1267_AK_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/1267:AK:SNTL%7Cid=%22%22%7Cname/2016-10-01,2017-08-

  0%|          | 0/130 [00:00<?, ?it/s]

Start retrieving data for Farewell Gap, FRW
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/FRW:CA:MSNT%257Cid=%2522%2522%257Cname/2017-10-01,2018-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Bonanza King, BNK
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/BNK:CA:MSNT%257Cid=%2522%2522%257Cname/2017-10-01,2018-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Cedar Pass, CDP
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CDP:CA:MSNT%257Cid=%2522%2522%257Cname/2017-10-01,2018-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Charlotte Lake, CRL
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CRL:CA:MSNT%257Cid=%2522%2522%257Cname/2017-10-01,2018-08-31/WTEQ::value?fitToS

  0%|          | 0/839 [00:00<?, ?it/s]

Start retrieving data for Adin Mtn, 301_CA_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/301:CA:SNTL%7Cid=%22%22%7Cname/2017-10-01,2018-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Agua Canyon, 907_UT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/907:UT:SNTL%7Cid=%22%22%7Cname/2017-10-01,2018-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Albro Lake, 916_MT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/916:MT:SNTL%7Cid=%22%22%7Cname/2017-10-01,2018-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Alexander Lake, 1267_AK_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/1267:AK:SNTL%7Cid=%22%22%7Cname/2017-10-01,2018-08-

  0%|          | 0/130 [00:00<?, ?it/s]

Start retrieving data for Farewell Gap, FRW
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/FRW:CA:MSNT%257Cid=%2522%2522%257Cname/2018-10-01,2019-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Bonanza King, BNK
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/BNK:CA:MSNT%257Cid=%2522%2522%257Cname/2018-10-01,2019-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Cedar Pass, CDP
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CDP:CA:MSNT%257Cid=%2522%2522%257Cname/2018-10-01,2019-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Charlotte Lake, CRL
https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/CRL:CA:MSNT%257Cid=%2522%2522%257Cname/2018-10-01,2019-08-31/WTEQ::value?fitToS

  0%|          | 0/839 [00:00<?, ?it/s]

Start retrieving data for Adin Mtn, 301_CA_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/301:CA:SNTL%7Cid=%22%22%7Cname/2018-10-01,2019-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Agua Canyon, 907_UT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/907:UT:SNTL%7Cid=%22%22%7Cname/2018-10-01,2019-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Albro Lake, 916_MT_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/916:MT:SNTL%7Cid=%22%22%7Cname/2018-10-01,2019-08-31/WTEQ::value?fitToScreen=false
Start retrieving data for Alexander Lake, 1267_AK_SNTL using https://wcc.sc.egov.usda.gov/reportGenerator/view_csv/customMultiTimeSeriesGroupByStationReport/daily/start_of_period/1267:AK:SNTL%7Cid=%22%22%7Cname/2018-10-01,2019-08-

# Code for generating ML dataframe using nearest in situ monitoring sites

In [6]:
# Move into the Dataprocessing folder so its modules can be imported.
# The absolute path (built from HOME, set in the first cell) keeps this cell
# re-runnable: a relative os.chdir('Dataprocessing') raises FileNotFoundError
# once the kernel is already inside that folder. os.chdir() returns None, so
# ModuleDir is read back with os.getcwd() afterwards.
os.chdir(os.path.join(HOME, 'Dataprocessing'))
ModuleDir = os.getcwd()

import GeoDF
#output_res = 1000

#region_list = ['SouthernRockies']

# GeoDF is used to create a dataframe for ML model development. Its function
# is to connect in situ observations to gridded locations.
for region in region_list:
    # path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    path = f"{HOME}/data/ASO/{region}/{output_res}M_SWE_parquet"

    # Only regions with processed ASO parquet files can be handled
    if os.path.isdir(path):
        print(region)
        # Load SNOTEL meta location data, use haversine function
        GeoDF.fetch_snotel_sites_for_cellids(region, output_res)  # Using known up to date sites

        # Get geophysical attributes for each site, need to see how to add output resolution
        gdf = GeoDF.GeoSpatial(region, output_res)

        # Use geodataframe with lat/long meta of all sites to determine slope, aspect, and elevation
        metadf = GeoDF.extract_terrain_data_threaded(gdf, region, output_res)
    else:
        print(f"No ASO data for {region}")




SouthernRockies
Loading all Geospatial prediction/observation files and concatenating into one dataframe


  0%|          | 0/1 [00:00<?, ?it/s]

Identifying unique sites to create geophysical information dataframe
converting to geodataframe
Processing snotel geometry
Calculating haversine distance for 1498 locations to in situ OBS, and saving cell-obs relationships in dictionary


  0%|          | 0/1498 [00:00<?, ?it/s]

Saving nearest SNOTEL in SouthernRockies for each cell id in a pkl file
Loading geospatial data for SouthernRockies
Converting to geodataframe
Calculating dataframe bounding box
-108 37 -105 41
Retrieving Copernicus 90m DEM tiles


  0%|          | 0/20 [00:00<?, ?it/s]

There are 20 tiles in the region
Determining Grid Cell Spatial Features


  0%|          | 0/1498 [00:00<?, ?it/s]

Saving SouthernRockies dataframe in /uufs/chpc.utah.edu/common/home/civil-group1/Johnson/SWEMLv2.0/data/TrainingDFs/SouthernRockies/750M_Resolution


## Connect Snotel to each ASO obs


In [9]:
ModuleDir = os.getcwd()
# ModuleDir = os.chdir('Dataprocessing')
# ModuleDir = os.getcwd()

import Obs_to_DF
#region_list = ['Northwest','SouthernRockies', 'Southwest']
#output_res = 1000

# Connect the nearest SNOTEL observations with the ASO data; makes a parquet
# file for each date.
# TODO: test to see if this works - need to just load the SNOTEL file, not
# collect them as in the function.
for region in region_list:
    # path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    path = f"{HOME}/data/ASO/{region}/{output_res}M_SWE_parquet"

    # Regions without processed ASO parquet files are reported and skipped
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    dates = []
    manual = False
    Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res, manual, dates)


SouthernRockies
Connecting site observations with nearest monitoring network obs
Loading observations from 2013-2019
Loading 750M resolution grids for SouthernRockies region
Processing datetime component of SNOTEL observation dataframe
Loading 1 processed ASO observations for the SouthernRockies at 750M resolution
There are 1 aso dates in snotel obs
There are 0 missing snotel obs
Connecting 1 timesteps of observations for SouthernRockies


100%|██████████| 1/1 [00:04<00:00,  4.44s/it]

Site processing complete, adding observtional data to 20180330 df...


  0%|          | 0/1498 [00:00<?, ?it/s]




/uufs/chpc.utah.edu/common/home/civil-group1/Johnson/SWEMLv2.0/data/TrainingDFs/SouthernRockies/750M_Resolution/Obsdf/20180330_ObsDF.parquet
Job complete for connecting SNOTEL obs to sites/dates


In [8]:
# Display the directory currently stored in ModuleDir (sanity check of the working directory)
ModuleDir

'/uufs/chpc.utah.edu/common/home/civil-group1/Johnson/SWEMLv2.0/Dataprocessing'

In [10]:
# Move into the Dataprocessing folder so its modules can be imported.
# The absolute path (built from HOME, set in the first cell) keeps this cell
# re-runnable: a relative os.chdir('Dataprocessing') raises FileNotFoundError
# once the kernel is already inside that folder. os.chdir() returns None, so
# ModuleDir is read back with os.getcwd() afterwards.
os.chdir(os.path.join(HOME, 'Dataprocessing'))
ModuleDir = os.getcwd()
import GeoDF

#output_res = 1000

# Connect cell ids with ASO obs and SNOTEL obs to geospatial features
for region in region_list:
    #path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    path = f"{HOME}/data/ASO/{region}/{output_res}M_SWE_parquet"
    # Only regions with processed ASO parquet files can be handled
    if os.path.isdir(path):
        print(region)
        GeoDF.add_geospatial_threaded(region, output_res)
    else:
        print(f"No ASO data for {region}")

SouthernRockies
Loading goeospatial meta data for grids in SouthernRockies
Loading all available processed ASO observations for the SouthernRockies at 750M resolution
Concatenating 1 with geospatial data...


  0%|          | 0/1 [00:00<?, ?it/s]

Job complete for connecting obs with geospatial data, the files can be found in /uufs/chpc.utah.edu/common/home/civil-group1/Johnson/SWEMLv2.0/data/TrainingDFs/SouthernRockies/750M_Resolution/GeoObsDFs


# Get NASA VIIRS fraction snow covered area for each location 

* Make sure the code grabs all dates for each region, may have to run multiple times
* run until "No granules found for DATE, requesting data from NSIDC..." no longer occurs


In [13]:
# Imported here so this cell also works when the notebook is run top to
# bottom; otherwise the module is first imported by a later cell and this
# call raises NameError.
import get_VIIRS_SCA

# Per the notebook's notes: checks whether the VIIRS data is available locally
# and, if not, gets it from CIROH AWS.
get_VIIRS_SCA.get_VIIRS_from_AWS()

In [12]:
ModuleDir = os.getcwd()
# ModuleDir = os.chdir('Dataprocessing')
# ModuleDir = os.getcwd()

import get_VIIRS_SCA
#output_res = 1000
threshold = 20  # value passed to augment_SCA_mutliprocessing; it shows up as "20_fSCA_Thresh" in the output folder name

# Check to see if the VIIRS data is available locally, if not, get from CIROH AWS
# NOTE: I think all of this data is for the incorrect year...
#get_VIIRS_SCA.get_VIIRS_from_AWS()

#region_list = ['Southwest']

# Connect VIIRS data to dataframes
for region in region_list:
    # path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    path = f"{HOME}/data/ASO/{region}/{output_res}M_SWE_parquet"

    # Regions without processed ASO parquet files are reported and skipped
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    get_VIIRS_SCA.augment_SCA_mutliprocessing(region, output_res, threshold)


SouthernRockies
Getting VIIRS fsca values for 1 timesteps of observations for SouthernRockies
20180330
dataprocessing VIIRS for 20180330 complete...
Job complete for connecting VIIRS fsca to sites/dates, files can be found in /uufs/chpc.utah.edu/common/home/civil-group1/Johnson/SWEMLv2.0/data/TrainingDFs/SouthernRockies/750M_Resolution/VIIRSGeoObsDFs/20_fSCA_Thresh


In [6]:
# Move into the Dataprocessing folder so its modules can be imported.
# The absolute path (built from HOME, set in the first cell) keeps this cell
# re-runnable: a relative os.chdir('Dataprocessing') raises FileNotFoundError
# once the kernel is already inside that folder. os.chdir() returns None, so
# ModuleDir is read back with os.getcwd() afterwards.
os.chdir(os.path.join(HOME, 'Dataprocessing'))
ModuleDir = os.getcwd()

import get_Precip

# NOTE: if using python > 3.9, you will likely need to change the ee package
# to use `from io import StringIO`.

# import os
# HOME = os.path.expanduser('~')

# Gets precipitation for each location and accumulates it through the water year.
# This step could be made much more efficient by collecting all of the tiles
# in one step, then multiprocessing later.

# Water years to process
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
#output_res = 1000
threshold = 20

for region in region_list:
    # path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    path = f"{HOME}/data/ASO/{region}/{output_res}M_SWE_parquet"

    # Skip the whole region when there is no processed ASO data. Previously
    # Make_Precip_DF still ran for regions just reported as having no ASO data.
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    get_Precip.get_precip_threaded(region, output_res, years)

    # Connect precipitation to processed DFs
    get_Precip.Make_Precip_DF(region, output_res, threshold)


SouthernRockies
['2018-03-30']
No ASO observations for WY2013
No ASO observations for WY2014
No ASO observations for WY2015
No ASO observations for WY2016
No ASO observations for WY2017
No ASO observations for WY2019
[2018] 2017-09-30 2018-03-31
Getting daily precipitation data for 1498 sites


100%|██████████| 1498/1498 [00:00<00:00, 10975.92it/s]


Job complete for getting precipiation datdata for WY2019, processing dataframes for file storage
Adding precipitation features to ML dataframe for the SouthernRockies region.
Connecting precipitation to ASO observations for SouthernRockies on 2018-03-30


  0%|          | 0/1498 [00:00<?, ?it/s]

## Add seasonality metrics to the dataframe

In [10]:
# Move into the Dataprocessing folder so its modules can be imported.
# The absolute path (built from HOME, set in the first cell) keeps this cell
# re-runnable: a relative os.chdir('Dataprocessing') raises FileNotFoundError
# once the kernel is already inside that folder. os.chdir() returns None, so
# ModuleDir is read back with os.getcwd() afterwards.
os.chdir(os.path.join(HOME, 'Dataprocessing'))
ModuleDir = os.getcwd()

import get_Seasonality

#output_res = 1000
threshold = 20
for region in region_list:
    # Process SNOTEL sites to make "snow hydrograph features" to determine
    # above/below average WY conditions.
    # NOTE(review): this call takes no region argument, so it presumably does
    # the same work on every iteration and could run once before the loop -
    # confirm against get_Seasonality before hoisting.
    get_Seasonality.seasonal_snotel()

    # Get the Day of Season metric for each dataframe
    get_Seasonality.add_Seasonality(region, output_res, threshold)

Adding Day of Season, seasonal nearest monitoring site averages, and seasonal nearest monitoring site relationship to averages to all SouthernRockies dataframes...


  0%|          | 0/1 [00:00<?, ?it/s]


  0%|          | 0/1498 [00:00<?, ?it/s][A
 15%|█▍        | 224/1498 [00:00<00:00, 2238.15it/s][A
 30%|██▉       | 448/1498 [00:00<00:00, 2238.01it/s][A
 45%|████▍     | 672/1498 [00:00<00:00, 2232.31it/s][A
 60%|█████▉    | 896/1498 [00:00<00:00, 2227.52it/s][A
 75%|███████▍  | 1119/1498 [00:00<00:00, 2223.68it/s][A
100%|██████████| 1498/1498 [00:00<00:00, 2221.14it/s][A


# Use Sturm's snow classification as features within model framework

Using the originally created env, it looks like the rasterio package does not contain the correct ECS driver. Trying to address this with conda install conda-forge::rasterio in my SWEML_310 env from the shell in CHPC

In [14]:
# The directory change is disabled here; presumably the kernel is already in
# the Dataprocessing folder from an earlier cell - verify before running alone.
# ModuleDir = os.getcwd()
# ModuleDir = os.chdir('Dataprocessing')
# ModuleDir = os.getcwd()

import sturm_processer as stpro

#outputres =1000
thres = 20

# Download the Sturm data (the captured output shows it reports when the data
# is already downloaded)
stpro.get_Sturm_data()

for region in region_list:
    # Snow classification raster, https://nsidc.org/data/nsidc-0768/versions/1
    sturm_file = f"{HOME}/data/SnowClassification/SnowClass_NA_300m_10.0arcsec_2021_v01.0.tif"

    # Read the seasonality dataframes, write the Sturm-augmented ones
    input_directory = f"{HOME}/data/TrainingDFs/{region}/{output_res}M_Resolution/Seasonality_PrecipVIIRSGeoObsDFs/{thres}_fSCA_Thresh"
    output_directory = f"{HOME}/data/TrainingDFs/{region}/{output_res}M_Resolution/Sturm_Seasonality_PrecipVIIRSGeoObsDFs/{thres}_fSCA_Thresh"

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    stpro.process_sturm_data_for_files(input_directory, sturm_file, output_directory)

sturm data already downloaded
Sturm file bounds: BoundingBox(left=-180.0, bottom=0.0, right=-10.0, top=90.0)


Processing Parquet Files:   0%|          | 0/1 [00:00<?, ?it/s]
Sampling Sturm Data: 100%|██████████| 1498/1498 [00:00<00:00, 525647.74it/s]
Processing Parquet Files: 100%|██████████| 1/1 [00:03<00:00,  3.90s/it]


# Add vegetation data to the dataframe from the North American Land Change Monitoring System (NALCMS)

In [16]:
# The directory change is disabled here; presumably the kernel is already in
# the Dataprocessing folder from an earlier cell - verify before running alone.
# ModuleDir = os.getcwd()
# ModuleDir = os.chdir('Dataprocessing')
# ModuleDir = os.getcwd()

import vegetation_processer as vegpro
import os

# Get the land cover archive
url = "http://www.cec.org/files/atlas_layers/1_terrestrial_ecosystems/1_01_0_land_cover_2020_30m/usa_land_cover_2020v2_30m_tif.zip"
output_path = f"{HOME}/data/LandCover/"
file = "usa_land_cover_2020v2_30m_tif.zip"
vegpro.get_data(url, output_path, file)
# Unzip the file if not already done
#vegpro.unzip_LC_data(output_path, file)
#output = 1000

region_list = ['SouthernRockies']

# The land cover raster does not depend on the region, so name it once
vegetation_file = f"{HOME}/data/LandCover/usa_land_cover_2020v2_30m_tif/USA_NALCMS_landcover_2020v2_30m/data/USA_NALCMS_landcover_2020v2_30m.tif"

for region in region_list:
    # Read the Sturm dataframes, write the vegetation-augmented ones
    input_directory = f"{HOME}/data/TrainingDFs/{region}/{output_res}M_Resolution/Sturm_Seasonality_PrecipVIIRSGeoObsDFs/20_fSCA_Thresh"
    output_directory = f"{HOME}/data/TrainingDFs/{region}/{output_res}M_Resolution/Vegetation_Sturm_Seasonality_PrecipVIIRSGeoObsDFs/20_fSCA_Thresh"

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    vegpro.process_vegetation_data_for_files(input_directory, vegetation_file, output_directory)

File downloaded successfully!
Vegetation file bounds: BoundingBox(left=-2043060.0, bottom=-2113150.0, right=2529600.0, top=732440.0)
Vegetation CRS: PROJCS["WGS_1984_Lambert_Azimuthal_Equal_Area",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Lambert_Azimuthal_Equal_Area"],PARAMETER["latitude_of_center",45],PARAMETER["longitude_of_center",-100],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1],AXIS["Easting",EAST],AXIS["Northing",NORTH]]


Processing Parquet Files:   0%|          | 0/1 [00:00<?, ?it/s]
Sampling Vegetation Data: 100%|██████████| 1498/1498 [00:00<00:00, 22298.57it/s]
Processing Parquet Files: 100%|██████████| 1/1 [00:20<00:00, 20.64s/it]


## Next steps
* Explore why errors in precip sites above
* Add in situ obs seasonality based on the historical nearest x monitoring stations - e.g., a historical average to-date SWE value (a unit hydrograph based on the day of year). This will include the historical normal SWE value for the time of year and the current SWE value compared to normal.
* albedo metric


In [None]:
# Scratch cell: loads a region's metadata, pushes it into the training
# dataframes via UpdateDataFrame, then reads one training file back.
# Relies on `os` and `region_list` defined by earlier cells.
import pandas as pd

# NOTE: this rebinds HOME to the user's home directory (earlier cells use the
# working directory), which is why the paths below include "SWEMLv2.0".
HOME = os.path.expanduser('~')
region = 'Southwest'
output_res = '300'

dfpath = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution"

# Metadata for the region set above; not used again in this cell
SWmeta = pd.read_parquet(f"{dfpath}/{region}_metadata.parquet")

import UpdateDataFrame

#need to update the topographic features for every dataframe
output_res = '300'
training_cats = ['Obsdf']
fSCA = '' #'20_fSCA_Thresh'


for training_cat in training_cats:
    print(training_cat)

    # `region` and `dfpath` are rebound here for every entry of region_list
    for region in region_list:
        print(region)
        dfpath = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution"
        #file to be used to update the training DF
        updatefile = pd.read_parquet(f"{dfpath}/{region}_metadata.parquet")


        #Update Dataframe
        UpdateDataFrame.updateTrainingDF(region, output_res, training_cat, fSCA, updatefile)

# Uses dfpath and training_cat as left by the last loop iteration; with
# fSCA == '' the path contains an empty segment ("//").
trainfile = pd.read_parquet(f"{dfpath}/{training_cat}/{fSCA}/Sturm_Season_Precip_VIIRS_GeoObsDF_20150406.parquet")

import matplotlib.pyplot as plt
import geopandas as gpd

from mpl_toolkits.axes_grid1 import make_axes_locatable

def SpatialAnalysis(EvalDF):
    """Plot the point locations in *EvalDF*, coloured by ``Elevation_m``.

    EvalDF must expose ``cen_lon`` and ``cen_lat`` (used as the x and y of the
    point geometry) and an ``Elevation_m`` column. The plot is drawn without a
    legend and nothing is returned.
    """
    # Build point geometry from the lon/lat columns, then wrap as a GeoDataFrame
    cell_points = gpd.points_from_xy(EvalDF.cen_lon, EvalDF.cen_lat)
    geo_frame = gpd.GeoDataFrame(EvalDF, geometry=cell_points)

    geo_frame.plot(column='Elevation_m', legend=False)
    
SpatialAnalysis(trainfile)