# Data Processing script for the NSM/SWEML v2.0
This .ipynb script uses python module for retrieving NASA ASO observations, locating nearest SNOTEL sites, connecting SNOTEL obs with ASO obs, and add geospatial features to the ML training/testing/hindcast dataframes.

# Next steps 

- the SE and SW rockies have the same number of sites, make sure they are not the same...
- process ASO data, e.g. swe_m < 0.1 = 0, convert to cm to be consistent with monitoring sites and traditional mesurement. 
- document scripts
- add new sites (e.g., regionval) to training DF with all the respective spatial resolution information
- add precipitation phase features (seasonal accumulated rain precip, seasonal accumulated snow precip as a function of temperature)
- explore adding other features stemming from SNOTEL, remote sensing (LULC), Snow Classifications (Sturms), energy balance

In [3]:
import os
import model_Domain
HOME = os.path.expanduser('~')

#make SWEMLv2.0 modeling domain for western USA
region_list = model_Domain.modeldomain()
region_list

['Southwest', 'Northwest', 'NorthernRockies', 'SouthernRockies']

In [2]:
from ASOget import ASODownload, ASODataProcessing

# Inputs for fetching ASO data for a region
short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
#region_list = ['S_Sierras']
output_res = 300 #desired spatial resoultion in meters (m)
directory = "Raw_ASO_Data"

#Get ASO data
for region in region_list:
    print(region)
    folder_name = f"{region}/{directory}"
    data_tool = ASODownload(short_name, version)
    b_box = data_tool.BoundingBox(region)  
    url_list = data_tool.cmr_search(time_start, time_end, region, b_box)
    data_tool.cmr_download(directory, region)

    #Convert ASO tifs to parquet
    data_processor = ASODataProcessing()
    data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, region) 

Southwest
Bounding Box collected for Southwest: -122.801796139143,36.29256774541929,-112.4801535246097,43.48412459980747
Fetching file URLs in progress for Southwest from 2013-04-02T00:00:00Z to 2019-07-19T23:59:59Z
Querying for data:
	https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS&sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000&short_name=ASO_50M_SWE&version=001&version=01&version=1&temporal[]=2013-04-02T00:00:00Z,2019-07-19T23:59:59Z&bounding_box=-122.801796139143,36.29256774541929,-112.4801535246097,43.48412459980747

Found 131 matches.
['https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.03/ASO_50M_SWE_USCATB_20130403.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.03/ASO_50M_SWE_USCATB_20130403.tif.xml', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.29/ASO_50M_SWE_USCATB_20130429.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2013.04.29/ASO_50M_SWE_USCATB_2013

100%|██████████| 262/262 [00:00<00:00, 10473.17it/s]




HTTP error 503, Service Unavailable
All NASA ASO data collected for given date range and can be found in /home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Raw_ASO_Data...
Files with .xml extension moved to the destination folder.
Converting .tif to parquet
Converting 131 ASO tif files to parquet'


100%|██████████| 131/131 [00:00<00:00, 382.67it/s]


An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20150428.tif' not recognized as a supported file format.
An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20160509.tif' not recognized as a supported file format.
An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20170816.tif' not recognized as a supported file format.
An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20180423.tif' not recognized as a supported file format.
An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20180601.tif' not recognized as a supported file format.
An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20180528.tif' not recognized as a supported file format.
An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_

ERROR 1: TIFFReadEncodedStrip:Read error at scanline 4294967295; got 7598 bytes, expected 7912
ERROR 1: TIFFReadEncodedStrip() failed.
ERROR 1: /home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20190611.tif, band 1: IReadBlock failed at X offset 0, Y offset 0: TIFFReadEncodedStrip() failed.
ERROR 1: TIFFReadEncodedStrip:Read error at scanline 4294967295; got 7604 bytes, expected 7912
ERROR 1: TIFFReadEncodedStrip() failed.
ERROR 1: /home/rjohnson18/SWEMLv2.0/data/ASO/Southwest/Processed_300M_SWE/ASO_300M_20190703.tif, band 1: IReadBlock failed at X offset 0, Y offset 0: TIFFReadEncodedStrip() failed.


Checking to make sure all files successfully converted...


100%|██████████| 99/99 [00:01<00:00, 51.22it/s]


Northwest
Bounding Box collected for Northwest: -123.3407853096148,42.07988450615146,-120.65482261009741,48.92977030870274
Fetching file URLs in progress for Northwest from 2013-04-02T00:00:00Z to 2019-07-19T23:59:59Z
Querying for data:
	https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS&sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000&short_name=ASO_50M_SWE&version=001&version=01&version=1&temporal[]=2013-04-02T00:00:00Z,2019-07-19T23:59:59Z&bounding_box=-123.3407853096148,42.07988450615146,-120.65482261009741,48.92977030870274

Found 2 matches.
['https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2016.02.08/ASO_50M_SWE_USWAOL_20160208.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2016.02.08/ASO_50M_SWE_USWAOL_20160208.tif.xml', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2016.03.29/ASO_50M_SWE_USWAOL_20160329.tif', 'https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2016.03.29/ASO_50M_SWE_USWAOL_20

100%|██████████| 4/4 [00:00<00:00, 354.00it/s]


All NASA ASO data collected for given date range and can be found in /home/rjohnson18/SWEMLv2.0/data/ASO/Northwest/Raw_ASO_Data...
Files with .xml extension moved to the destination folder.
Converting .tif to parquet
Converting 2 ASO tif files to parquet'


100%|██████████| 2/2 [00:00<00:00,  4.88it/s]


Checking to make sure all files successfully converted...


100%|██████████| 2/2 [00:00<00:00, 17.82it/s]

NorthernRockies
Bounding Box collected for NorthernRockies: -119.92718722996061,42.57135262910201,-107.1261944312574,48.97106570807965
Fetching file URLs in progress for NorthernRockies from 2013-04-02T00:00:00Z to 2019-07-19T23:59:59Z
Querying for data:
	https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS&sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000&short_name=ASO_50M_SWE&version=001&version=01&version=1&temporal[]=2013-04-02T00:00:00Z,2019-07-19T23:59:59Z&bounding_box=-119.92718722996061,42.57135262910201,-107.1261944312574,48.97106570807965






Found no matches.
getting credentials NSIDC


0it [00:00, ?it/s]


All NASA ASO data collected for given date range and can be found in /home/rjohnson18/SWEMLv2.0/data/ASO/NorthernRockies/Raw_ASO_Data...
Files with .xml extension moved to the destination folder.
Converting .tif to parquet
The folder 'NorthernRockies/Raw_ASO_Data' is empty.
SouthernRockies
Bounding Box collected for SouthernRockies: -113.0550753064462,33.35825378630481,-105.0780355834649,42.57135262910201
Fetching file URLs in progress for SouthernRockies from 2013-04-02T00:00:00Z to 2019-07-19T23:59:59Z
Querying for data:
	https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS&sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000&short_name=ASO_50M_SWE&version=001&version=01&version=1&temporal[]=2013-04-02T00:00:00Z,2019-07-19T23:59:59Z&bounding_box=-113.0550753064462,33.35825378630481,-105.0780355834649,42.57135262910201

Found 19 matches.
['https://n5eil01u.ecs.nsidc.org/DP1/ASO/ASO_50M_SWE.001/2015.04.06/ASO_50M_SWE_USCOCJ_20150406.tif', 'h

100%|██████████| 38/38 [00:00<00:00, 2875.04it/s]


All NASA ASO data collected for given date range and can be found in /home/rjohnson18/SWEMLv2.0/data/ASO/SouthernRockies/Raw_ASO_Data...
Files with .xml extension moved to the destination folder.
Converting .tif to parquet
Converting 19 ASO tif files to parquet'


100%|██████████| 19/19 [00:00<00:00, 50.66it/s]


Checking to make sure all files successfully converted...


100%|██████████| 14/14 [00:00<00:00, 48.73it/s]


## Get Snotel and CDEC in situ observations
- clean in situ observations, specifically the CDEC sites, need a data processing method to remove outtliers and nan/0 obs
- Ideas - add nearest sites elevation, distance from cell, then can bypass sites with bad data. 

In [8]:
# Get in situ observations
import get_InSitu_obs
import numpy as np

#make a list of dates to aligns with the ASO observations (they go as early as Jan-29 and as far out as the July-17)
years = np.arange(2013,2020,1)
start_month_day = '10-01'
end_month_day = '07-31'
#datelist = get_InSitu_obs.make_dates(years, start_month_day, end_month_day, WY = True)

# observations 
get_InSitu_obs.Get_Monitoring_Data_Threaded_dp(years, start_month_day, end_month_day, WY = True)

#combine years
get_InSitu_obs.combine_dfs(years)

Getting SNOTEL and CDEC observations for 2013
Getting California Data Exchange Center SWE data from 100 sites...


  0%|          | 0/100 [00:00<?, ?it/s]

Getting NRCS SNOTEL SWE data from 571 sites...


  0%|          | 0/571 [00:00<?, ?it/s]

<suds.sax.document.Document object at 0x7f07cc1d9150>


Snotel data fail, SNOTEL:1187_CO_SNTL


<suds.sax.document.Document object at 0x7f07ce924880>


Snotel data fail, SNOTEL:1187_CO_SNTL
Attempt 2 for site SNOTEL:1187_CO_SNTL


<suds.sax.document.Document object at 0x7f07ce9275e0>


Snotel data fail, SNOTEL:1187_CO_SNTL
Attempt 3 for site SNOTEL:1187_CO_SNTL


<suds.sax.document.Document object at 0x7f07cc1d9db0>


Snotel data fail, SNOTEL:1187_CO_SNTL
Attempt 4 for site SNOTEL:1187_CO_SNTL


<suds.sax.document.Document object at 0x7f07ce9f6500>
<suds.sax.document.Document object at 0x7f077794ad10>
<suds.sax.document.Document object at 0x7f0776d42aa0>
<suds.sax.document.Document object at 0x7f0777fa7220>
<suds.sax.document.Document object at 0x7f07ce973160>


Snotel data fail, SNOTEL:1187_CO_SNTL
Snotel data fail, SNOTEL:1236_UT_SNTL
Snotel data fail, SNOTEL:1242_NV_SNTL
Snotel data fail, SNOTEL:1243_NV_SNTL
Snotel data fail, SNOTEL:1244_NV_SNTL


<suds.sax.document.Document object at 0x7f07ce9f5540>
<suds.sax.document.Document object at 0x7f07ce910520>
<suds.sax.document.Document object at 0x7f0776d43a30>
<suds.sax.document.Document object at 0x7f07cc1da2f0>
<suds.sax.document.Document object at 0x7f0777825120>


Snotel data fail, SNOTEL:1247_UT_SNTL
Snotel data fail, SNOTEL:1236_UT_SNTL
Attempt 2 for site SNOTEL:1236_UT_SNTL
Snotel data fail, SNOTEL:1242_NV_SNTL
Attempt 2 for site SNOTEL:1242_NV_SNTL
Snotel data fail, SNOTEL:1248_UT_SNTL
Snotel data fail, SNOTEL:1243_NV_SNTL
Attempt 2 for site SNOTEL:1243_NV_SNTL


<suds.sax.document.Document object at 0x7f07771be260>
<suds.sax.document.Document object at 0x7f0777948400>
<suds.sax.document.Document object at 0x7f07779ff640>
<suds.sax.document.Document object at 0x7f0776d404c0>
<suds.sax.document.Document object at 0x7f07ce9734c0>


Snotel data fail, SNOTEL:1244_NV_SNTL
Attempt 2 for site SNOTEL:1244_NV_SNTL
Snotel data fail, SNOTEL:1236_UT_SNTL
Attempt 3 for site SNOTEL:1236_UT_SNTL
Snotel data fail, SNOTEL:1247_UT_SNTL
Attempt 2 for site SNOTEL:1247_UT_SNTL
Snotel data fail, SNOTEL:1242_NV_SNTL
Attempt 3 for site SNOTEL:1242_NV_SNTL
Snotel data fail, SNOTEL:1248_UT_SNTL
Attempt 2 for site SNOTEL:1248_UT_SNTL


<suds.sax.document.Document object at 0x7f07771bda80>
<suds.sax.document.Document object at 0x7f0776d43bb0>
<suds.sax.document.Document object at 0x7f07842fa9e0>
<suds.sax.document.Document object at 0x7f07771bde40>
<suds.sax.document.Document object at 0x7f07842f96f0>


Snotel data fail, SNOTEL:1243_NV_SNTL
Attempt 3 for site SNOTEL:1243_NV_SNTL
Snotel data fail, SNOTEL:1244_NV_SNTL
Attempt 3 for site SNOTEL:1244_NV_SNTL
Snotel data fail, SNOTEL:1236_UT_SNTL
Attempt 4 for site SNOTEL:1236_UT_SNTL
Snotel data fail, SNOTEL:1247_UT_SNTL
Attempt 3 for site SNOTEL:1247_UT_SNTL
Snotel data fail, SNOTEL:1242_NV_SNTL
Attempt 4 for site SNOTEL:1242_NV_SNTL


<suds.sax.document.Document object at 0x7f0777fa5030>
<suds.sax.document.Document object at 0x7f07771bf880>
<suds.sax.document.Document object at 0x7f07842fbe20>
<suds.sax.document.Document object at 0x7f07ce9f53f0>
<suds.sax.document.Document object at 0x7f0776d421a0>
<suds.sax.document.Document object at 0x7f07ce913b50>


Snotel data fail, SNOTEL:1243_NV_SNTL
Attempt 4 for site SNOTEL:1243_NV_SNTL
Snotel data fail, SNOTEL:1248_UT_SNTL
Attempt 3 for site SNOTEL:1248_UT_SNTL
Snotel data fail, SNOTEL:1244_NV_SNTL
Attempt 4 for site SNOTEL:1244_NV_SNTL
Snotel data fail, SNOTEL:1236_UT_SNTL
Snotel data fail, SNOTEL:1247_UT_SNTL
Attempt 4 for site SNOTEL:1247_UT_SNTL
Snotel data fail, SNOTEL:1242_NV_SNTL


<suds.sax.document.Document object at 0x7f07ce9f75e0>
<suds.sax.document.Document object at 0x7f07842f85b0>
<suds.sax.document.Document object at 0x7f07ce9f5c30>
<suds.sax.document.Document object at 0x7f07ce9f6890>
<suds.sax.document.Document object at 0x7f07779ffe50>


Snotel data fail, SNOTEL:1248_UT_SNTL
Attempt 4 for site SNOTEL:1248_UT_SNTL
Snotel data fail, SNOTEL:1243_NV_SNTL
Snotel data fail, SNOTEL:1244_NV_SNTL
Snotel data fail, SNOTEL:1249_UT_SNTL
Snotel data fail, SNOTEL:1247_UT_SNTL


<suds.sax.document.Document object at 0x7f07ce912f80>
<suds.sax.document.Document object at 0x7f07ce927250>
<suds.sax.document.Document object at 0x7f07842fa7d0>


Snotel data fail, SNOTEL:1254_NM_SNTL
Snotel data fail, SNOTEL:1248_UT_SNTL
Snotel data fail, SNOTEL:1249_UT_SNTL
Attempt 2 for site SNOTEL:1249_UT_SNTL


<suds.sax.document.Document object at 0x7f07771bf760>
<suds.sax.document.Document object at 0x7f07ce926e60>


Snotel data fail, SNOTEL:1254_NM_SNTL
Attempt 2 for site SNOTEL:1254_NM_SNTL
Snotel data fail, SNOTEL:1249_UT_SNTL
Attempt 3 for site SNOTEL:1249_UT_SNTL


<suds.sax.document.Document object at 0x7f07ce971180>
<suds.sax.document.Document object at 0x7f07ce9f6dd0>


Snotel data fail, SNOTEL:1254_NM_SNTL
Attempt 3 for site SNOTEL:1254_NM_SNTL
Snotel data fail, SNOTEL:1249_UT_SNTL
Attempt 4 for site SNOTEL:1249_UT_SNTL


<suds.sax.document.Document object at 0x7f07ce971180>
<suds.sax.document.Document object at 0x7f07ce9240a0>


Snotel data fail, SNOTEL:1254_NM_SNTLSnotel data fail, SNOTEL:1249_UT_SNTL

Attempt 4 for site SNOTEL:1254_NM_SNTL


<suds.sax.document.Document object at 0x7f07771bfcd0>


Snotel data fail, SNOTEL:1254_NM_SNTL
Getting SNOTEL and CDEC observations for 2014
Getting California Data Exchange Center SWE data from 100 sites...


  0%|          | 0/100 [00:00<?, ?it/s]

Getting NRCS SNOTEL SWE data from 571 sites...


  0%|          | 0/571 [00:00<?, ?it/s]

Getting SNOTEL and CDEC observations for 2015
Getting California Data Exchange Center SWE data from 100 sites...


  0%|          | 0/100 [00:00<?, ?it/s]

Getting NRCS SNOTEL SWE data from 571 sites...


  0%|          | 0/571 [00:00<?, ?it/s]

Getting SNOTEL and CDEC observations for 2016
Getting California Data Exchange Center SWE data from 100 sites...


  0%|          | 0/100 [00:00<?, ?it/s]

Getting NRCS SNOTEL SWE data from 571 sites...


  0%|          | 0/571 [00:00<?, ?it/s]

Getting SNOTEL and CDEC observations for 2017
Getting California Data Exchange Center SWE data from 100 sites...


  0%|          | 0/100 [00:00<?, ?it/s]

Getting NRCS SNOTEL SWE data from 571 sites...


  0%|          | 0/571 [00:00<?, ?it/s]

Getting SNOTEL and CDEC observations for 2018
Getting California Data Exchange Center SWE data from 100 sites...


  0%|          | 0/100 [00:00<?, ?it/s]

Getting NRCS SNOTEL SWE data from 571 sites...


  0%|          | 0/571 [00:00<?, ?it/s]

<suds.sax.document.Document object at 0x7f07ce926200>


Snotel data fail, SNOTEL:677_ID_SNTL


<suds.sax.document.Document object at 0x7f0776d43df0>


Snotel data fail, SNOTEL:677_ID_SNTL
Attempt 2 for site SNOTEL:677_ID_SNTL


<suds.sax.document.Document object at 0x7f07849068c0>


Snotel data fail, SNOTEL:677_ID_SNTL
Attempt 3 for site SNOTEL:677_ID_SNTL


<suds.sax.document.Document object at 0x7f07ce927a00>


Snotel data fail, SNOTEL:677_ID_SNTL
Attempt 4 for site SNOTEL:677_ID_SNTL


<suds.sax.document.Document object at 0x7f077707c8e0>


Snotel data fail, SNOTEL:677_ID_SNTL
Getting SNOTEL and CDEC observations for 2019
Getting California Data Exchange Center SWE data from 100 sites...


  0%|          | 0/100 [00:00<?, ?it/s]

Getting NRCS SNOTEL SWE data from 571 sites...


  0%|          | 0/571 [00:00<?, ?it/s]

<suds.sax.document.Document object at 0x7f07843f6c80>


Snotel data fail, SNOTEL:1005_CO_SNTL


# Code for generating ML dataframe using nearest in situ monitoring sites

In [15]:
import GeoDF

# GeoDF used to create a dataframe for ML model development. Its function is to connect in situ observations to gridded locations
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if os.path.isdir(path) == True:
        print(region)
        #load snotel meta location data, use haversive function
        GeoDF.fetch_snotel_sites_for_cellids(region, output_res) # Using known up to date sites

        # Get geophysical attributes for each site, need to see how to add output resolution
        gdf = GeoDF.GeoSpatial(region, output_res)

        #use geodataframe with lat/long meta of all sites to determine slope, aspect, and elevation
        metadf = GeoDF.extract_terrain_data_threaded(gdf, region, output_res)
    else:
        print(f"No ASO data for {region}")




Southwest
Loading all Geospatial prediction/observation files and concatenating into one dataframe


100%|██████████| 99/99 [00:04<00:00, 19.92it/s]


Identifying unique sites to create geophysical information dataframe
converting to geodataframe
Processing snotel geometry
Calculating haversine distance for 299555 locations to in situ OBS, and saving cell-obs relationships in dictionary


  0%|          | 0/299555 [00:00<?, ?it/s]

Saving nearest SNOTEL in Southwest for each cell id in a pkl file
Loading geospatial data for Southwest
Converting to geodataframe
Calculating dataframe bounding box
Retrieving Copernicus 90m DEM tiles


100%|██████████| 30/30 [00:00<00:00, 200364.84it/s]


There are 30 tiles in the region
Determining Grid Cell Spatial Features


100%|██████████| 299555/299555 [00:43<00:00, 6867.49it/s]


Job complete for getting geospatial metadata, processing dataframe


299555it [8:14:20, 10.10it/s]


Saving Southwest dataframe in /home/rjohnson18/SWEMLv2.0/data/TrainingDFs/Southwest/300M_Resolution
Northwest
Loading all Geospatial prediction/observation files and concatenating into one dataframe


100%|██████████| 2/2 [00:00<00:00, 18.29it/s]

Identifying unique sites to create geophysical information dataframe





converting to geodataframe
Processing snotel geometry
Calculating haversine distance for 85185 locations to in situ OBS, and saving cell-obs relationships in dictionary


  0%|          | 0/85185 [00:00<?, ?it/s]

Saving nearest SNOTEL in Northwest for each cell id in a pkl file
Loading geospatial data for Northwest
Converting to geodataframe
Calculating dataframe bounding box
Retrieving Copernicus 90m DEM tiles


100%|██████████| 22/22 [00:00<00:00, 169001.26it/s]


There are 22 tiles in the region
Determining Grid Cell Spatial Features


100%|██████████| 85185/85185 [00:12<00:00, 6842.00it/s]


Job complete for getting geospatial metadata, processing dataframe


85185it [2:15:11, 10.50it/s]


Saving Northwest dataframe in /home/rjohnson18/SWEMLv2.0/data/TrainingDFs/Northwest/300M_Resolution
No ASO data for NorthernRockies
SouthernRockies
Loading all Geospatial prediction/observation files and concatenating into one dataframe


100%|██████████| 14/14 [00:00<00:00, 34.29it/s]


Identifying unique sites to create geophysical information dataframe
converting to geodataframe
Processing snotel geometry
Calculating haversine distance for 139011 locations to in situ OBS, and saving cell-obs relationships in dictionary


  0%|          | 0/139011 [00:00<?, ?it/s]

Saving nearest SNOTEL in SouthernRockies for each cell id in a pkl file
Loading geospatial data for SouthernRockies
Converting to geodataframe
Calculating dataframe bounding box
Retrieving Copernicus 90m DEM tiles


100%|██████████| 36/36 [00:00<00:00, 361231.92it/s]


There are 36 tiles in the region
Determining Grid Cell Spatial Features


100%|██████████| 139011/139011 [00:25<00:00, 5401.52it/s]


Job complete for getting geospatial metadata, processing dataframe


139011it [3:47:42, 10.17it/s]


Saving SouthernRockies dataframe in /home/rjohnson18/SWEMLv2.0/data/TrainingDFs/SouthernRockies/300M_Resolution


## Connect Snotel to each ASO obs

- change nearest_sites name to ns

In [5]:
import importlib
importlib.reload(Obs_to_DF)

NameError: name 'importlib' is not defined

In [6]:
import Obs_to_DF
output_res = 300

#Connect nearest snotel observations with ASO data, makes a parquet file for each date  -  test to see if this works - need to just load the SNOTEL file, not collect them as in the function
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if os.path.isdir(path) == True:
        print(region)
        dates = []
        manual = False
        Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res, manual, dates) 
    else:
        print(f"No ASO data for {region}")


Southwest
Connecting site observations with nearest monitoring network obs
Loading observations from 2013-2019
Loading 300M resolution grids for Southwest region
Processing datetime component of SNOTEL observation dataframe
Loading 99 processed ASO observations for the Southwest at 300M resolution
There are 97 aso dates in snotel obs
There are 2 missing snotel obs
Connecting 99 timesteps of observations for Southwest


  0%|          | 0/99 [00:00<?, ?it/s]

Site processing complete, adding observtional data to 20130403 df...


  0%|          | 0/16523 [00:00<?, ?it/s]

KeyError: "['20130403'] not in index"

In [3]:
import pandas as pd

snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data/"
Snotelobs_path = f"{snotel_path}ground_measures.parquet"
#Snotelobs_path = f"{snotel_path}ground_measures_dp.parquet"
snotel_data = pd.read_parquet(Snotelobs_path)
snotel_data

Unnamed: 0_level_0,2013-01-01,2013-01-08,2013-01-15,2013-01-22,2013-01-29,2013-02-05,2013-02-12,2013-02-19,2013-02-26,2013-03-05,...,2019-07-05,2019-07-13,2019-07-14,2019-07-15,2019-07-16,2019-12-03,2019-12-10,2019-12-17,2019-12-24,2019-12-31
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CDEC:ADM,5.90,5.90,6.50,6.50,7.40,7.60,7.40,8.00,8.00,8.00,...,7.65006,7.65006,7.65006,0.990189,0.735849,0.700000,1.200000,3.400000,3.700000,3.400000
CDEC:AGP,17.52,17.54,17.85,17.39,18.03,17.70,17.65,16.66,17.21,16.26,...,7.65006,7.65006,7.65006,0.990189,0.735849,0.000000,0.600000,0.200000,0.000000,0.000000
CDEC:ALP,12.75,13.32,14.26,14.02,13.39,13.25,14.30,13.95,15.73,15.41,...,0.00000,0.00000,0.00000,0.000000,0.000000,5.690000,8.040000,10.740000,12.670000,12.570000
CDEC:BCB,4.30,4.42,4.62,4.53,4.67,4.90,4.90,5.06,5.11,5.23,...,7.65006,7.65006,7.65006,0.990189,0.735849,6.667174,8.578298,9.606105,10.934149,11.442903
CDEC:BCH,2.88,3.00,3.48,3.84,3.96,4.44,5.40,5.16,3.60,1.80,...,0.24000,0.24000,0.24000,0.240000,0.240000,2.880000,4.560000,4.680000,5.040000,6.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SNOTEL:988_ID_SNTL,19.20,19.60,21.60,21.90,23.30,23.80,24.60,25.20,26.70,28.20,...,0.00000,0.00000,0.00000,0.000000,0.000000,5.300000,7.800000,8.700000,14.500000,14.400000
SNOTEL:989_ID_SNTL,9.00,9.10,10.70,11.10,11.70,14.10,14.50,14.50,17.00,18.40,...,0.00000,0.00000,0.00000,0.000000,0.000000,0.300000,0.200000,1.600000,2.700000,3.000000
SNOTEL:990_WA_SNTL,27.50,28.60,31.50,31.80,33.10,33.80,35.20,36.50,38.10,39.60,...,0.00000,0.00000,0.00000,0.000000,0.000000,0.900000,0.800000,2.300000,8.700000,8.600000
SNOTEL:992_UT_SNTL,4.10,4.10,4.40,4.50,4.80,5.10,5.20,5.20,6.00,6.60,...,0.00000,0.00000,0.00000,0.000000,0.000000,2.000000,2.200000,3.500000,3.600000,3.800000


In [32]:
cols = snotel_data.columns
cols

Index(['2013-01-01', '2013-01-08', '2013-01-15', '2013-01-22', '2013-01-29',
       '2013-02-05', '2013-02-12', '2013-02-19', '2013-02-26', '2013-03-05',
       ...
       '2019-07-05', '2019-07-13', '2019-07-14', '2019-07-15', '2019-07-16',
       '2019-12-03', '2019-12-10', '2019-12-17', '2019-12-24', '2019-12-31'],
      dtype='object', length=312)

In [7]:
snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data/"
#Snotelobs_path = f"{snotel_path}ground_measures.parquet"
Snotelobs_path = f"{snotel_path}ground_measures_dp.parquet"
snotel_data = pd.read_parquet(Snotelobs_path)
snotel_data = snotel_data.T
snotel_data.reset_index(inplace = True)
snotel_data.rename(columns={'index':'station_id'}, inplace = True)
snotel_data

dates,station_id,2012-10-01,2012-10-02,2012-10-03,2012-10-04,2012-10-05,2012-10-06,2012-10-07,2012-10-08,2012-10-09,...,2019-07-22,2019-07-23,2019-07-24,2019-07-25,2019-07-26,2019-07-27,2019-07-28,2019-07-29,2019-07-30,2019-07-31
0,CDEC:ADM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1,CDEC:AGP,-9999.0,-9999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2,CDEC:ALP,2.4,2.2,2.1,1.9,1.5,1.4,1.1,0.9,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CDEC:BCB,9.8,9.8,9.7,9.7,9.8,9.9,9.9,10.0,10.1,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
4,CDEC:BCH,-9999.0,-9999.0,0.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.3,...,0.6,0.9,0.6,0.9,0.9,0.9,0.9,0.9,0.6,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,SNOTEL:988_ID_SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,SNOTEL:989_ID_SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,SNOTEL:990_WA_SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,SNOTEL:992_UT_SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import GeoDF

output_res = 300

#Connect cell ids with ASO obs and snotel obs to geospatial features
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if os.path.isdir(path) == True:
        print(region)
        GeoDF.add_geospatial_threaded(region, output_res)
    else:
        print(f"No ASO data for {region}")

# Get NASA VIIRS fraction snow covered area for each location 

* Make sure the code grabs all dates for each region


In [None]:
# import importlib
# importlib.reload(get_VIIRS_SCA)

import get_VIIRS_SCA
output_res = 300
threshold = 20

#check to see if the VIIRS data is available locally, if not, get from CIROH AWS - I think all of this data is for the incorrect year...
#get_VIIRS_SCA.get_VIIRS_from_AWS()

#Connect VIIRS data to dataframes
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if os.path.isdir(path) == True:
        print(region)
        get_VIIRS_SCA.augment_SCA_mutliprocessing(region, output_res, threshold)
    else:
        print(f"No ASO data for {region}")


In [4]:
import get_Precip

'''
note*, if using python > 3.9, you will likely need to change the ee package to from io import StringIO
'''

import os
HOME = os.path.expanduser('~')

#gets precipitation for each location, accumulates it through the water year

#set start/end date for a water year
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
output_res = 300
threshold = 20

region_list = ['GBasin']

for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if os.path.isdir(path) == True:
        print(region)
        get_Precip.get_precip_threaded(region, output_res, years)
    else:
        print(f"No ASO data for {region}")

    #Connect precipitation to processed DFs
    get_Precip.Make_Precip_DF(region, output_res, threshold)


GBasin
['2013-04-03', '2013-04-29', '2013-05-03', '2013-05-25', '2013-06-01', '2013-06-08', '2014-03-20', '2014-03-23', '2014-03-24', '2014-04-06', '2014-04-07', '2014-04-14', '2014-04-20', '2014-04-23', '2014-04-28', '2014-04-29', '2014-05-02', '2014-05-03', '2014-05-11', '2014-05-12', '2014-05-17', '2014-05-27', '2014-05-31', '2014-06-05', '2015-02-17', '2015-03-05', '2015-03-25', '2015-03-26', '2015-04-03', '2015-04-09', '2015-04-12', '2015-04-15', '2015-04-26', '2015-04-27', '2015-04-28', '2015-05-03', '2015-05-27', '2015-05-28', '2015-05-31', '2015-06-08', '2015-06-09', '2016-03-26', '2016-04-01', '2016-04-07', '2016-04-16', '2016-04-26', '2016-05-09', '2016-05-27', '2016-06-07', '2016-06-14', '2016-06-21', '2016-06-26', '2016-07-08', '2017-01-28', '2017-01-29', '2017-07-17', '2017-07-18', '2017-07-19', '2017-07-27', '2017-08-15', '2017-08-16', '2018-03-04', '2018-04-22', '2018-04-23', '2018-04-25', '2018-04-26', '2018-05-28', '2018-06-01', '2018-06-02', '2019-03-09', '2019-03-15'

  0%|          | 0/299705 [00:00<?, ?it/s]

100%|██████████| 299705/299705 [01:01<00:00, 4852.16it/s]


KeyboardInterrupt: 

In [None]:
import get_Seasonality

region = 'N_Co_Rockies'
output_res = 300
threshold = 20

#get the Day of season metric for each dataframe
get_Seasonality.get_DOS(region, output_res, threshold)

## Next steps
* Explore why errors in precip sites above
* add in situ obs - seasonality based on the historical neareste x monitoring stations - like a historical average to-date swe value unit hydrograph based on the day of year? This will include a historical time of year of normal swe value and a swe value of year compared to normal
* albedo metric


In [None]:
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

HOME = os.path.expanduser('~')

region = 'N_Co_Rockies'
output_res = 300

Precippath = f"{HOME}/SWEMLv2.0/data/Precipitation/{region}/{output_res}M_NLDAS_Precip/sites/"

pptfiles = [filename for filename in os.listdir(Precippath)]

ppt = pd.read_parquet(f"{Precippath}NLDAS_PPT_N_Co_Rockies_300M_39.015_-107.027.parquet")
ppt.rename(columns={'datetime':'Date'}, inplace = True)
#ppt.set_index('cell_id', inplace=True)

ppt.head()


In [None]:
DFpath = '/home/rjohnson18/SWEMLv2.0/data/TrainingDFs/N_Co_Rockies/300M_Resolution/PrecipVIIRSGeoObsDFs_20_fSCA_Thresh'
geofile = 'Precip_VIIRS_GeoObsDF_20160404.parquet'

GDF = pd.read_parquet(os.path.join(DFpath, geofile))
GDF

In [None]:
import importlib
importlib.reload(get_Seasonality)

In [None]:
import get_Seasonality

region = 'N_Co_Rockies'
output_res = 300
threshold = 20

#process snotel sites to make "snow hydrograph features" to determine above/below average WY conditions
get_Seasonality.seasonal_snotel()


#get the Day of season metric for each dataframe
get_Seasonality.add_Seasonality(region, output_res, threshold)

In [None]:
import datetime
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
import os
import warnings
import pickle as pkl
warnings.filterwarnings("ignore")

HOME = os.path.expanduser('~')

region = 'N_Co_Rockies'
output_res = 300
threshold = 20


DFpath = f'{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution/Seasonality_PrecipVIIRSGeoObsDFs_{threshold}_fSCA_Thresh'
files = [filename for filename in os.listdir(DFpath)]

df = pd.read_parquet(os.path.join(DFpath, files[0]))
df

In [None]:
# make a unit hydrograph ish meetric for each site

#load data
DFpath = f'{HOME}/SWEMLv2.0/data/SNOTEL_Data'
snotel =  pd.read_parquet(os.path.join(DFpath, 'seasonal_snotel.parquet'))

#find location average peak swe and divide dataframe by this number
#snotel = snotel/snotel.max(0)
snotel

import pandas as pd
import numpy as np

snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data"
year_df = pd.read_parquet(f"{snotel_path}/2015_ground_measures_dp.parquet")

year_df = year_df.replace({-9999.0: np.nan})
year_df.head(5)

cols = year_df.columns
year_df[cols[0]] = pd.Series(year_df[cols[0]].values).interpolate(method='nearest').values

import matplotlib.pyplot as plt
import numpy as np

cols = year_df.columns
year_df.reset_index(inplace=True)

for s in np.arange(0,10,1):

       site = cols[s]

       fig, ax = plt.subplots(figsize=(22, 12))
       ax.plot(year_df.index, year_df[site])

       ax.set(xlabel='date', ylabel='SWE',
              title=f'{site} SWE time series')
       #ax.grid()
       plt.xticks(rotation=70)
       #fig.savefig("test.png")
       plt.show()

In [None]:
#load data
DFpath = f'{HOME}/SWEMLv2.0/data/SNOTEL_Data'
snotel =  pd.read_parquet(os.path.join(DFpath, 'ground_measures.parquet'))

#find location average peak swe and divide dataframe by this number
#snotel = snotel/snotel.max(0)
snotel = snotel.T

#change bad values = 7.65006, 9.60454, 27.139000,22.172265, 31.247021	  change - values to 0
cols = snotel.columns
for col in cols:
    snotel[col][(snotel[col]> 7.65) & (snotel[col]< 7.651)] = 0
    snotel[col][(snotel[col]> 9.604) & (snotel[col]< 9.605)] = 0
    snotel[col][(snotel[col]> 27.139) & (snotel[col]< 23.140)] = 0
    snotel[col][(snotel[col]> 22.172265) & (snotel[col]< 22.172266)] = 0
    snotel[col][(snotel[col]> 31.242265) & (snotel[col]< 31.242266)] = 0
    snotel[col][snotel[col]<0] = 0
snotel.reset_index(inplace = True)

#build in data checking script to fix outliers

In [None]:
snotel.loc[250:300]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import numpy as np

snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data"
year_df = pd.read_parquet(f"{snotel_path}/ground_measures_dp.parquet")

year_df = year_df.replace({-9999.0: np.nan})
year_df.head(5)


site = cols[0]

fig, ax = plt.subplots(figsize=(22, 12))
ax.plot(snotel.index, snotel[site])

ax.set(xlabel='date', ylabel='SWE',
       title=f'{site} SWE time series')
#ax.grid()
plt.xticks(rotation=70)
#fig.savefig("test.png")
plt.show()

In [None]:
def zscore(s, window, thresh=2, return_all=False):
    roll = s.rolling(window=window, min_periods=1, center=True)
    avg = roll.mean()
    std = roll.std(ddof=0)
    z = s.sub(avg).div(std)   
    m = z.between(-thresh, thresh)
    
    if return_all:
        return z, avg, std, m
    return s.where(m, avg)


N = 1000
np.random.seed(1)
#df = pd.DataFrame({'MW': np.sin(np.linspace(0, 10, num=N))+np.random.normal(scale=0.6, size=N)})

df =pd.DataFrame(snotel[cols[0]])

z, avg, std, m = zscore(df[cols[0]], window=2, return_all=True)

ax = plt.subplots(figsize=(22, 12))

df[cols[0]].plot(label='data')
avg.plot(label='mean')
df.loc[~m, cols[0]].plot(label='outliers', marker='o', ls='')
avg[~m].plot(label='replacement', marker='o', ls='')
plt.legend()

In [None]:
N = 1000
np.random.seed(1)
df = pd.DataFrame({'MW': np.sin(np.linspace(0, 10, num=N))+np.random.normal(scale=0.6, size=N)})

z, avg, std, m = zscore(df['MW'], window=50, return_all=True)

ax = plt.subplots(figsize=(22, 12))

df['MW'].plot(label='data')
avg.plot(label='mean')
df.loc[~m, 'MW'].plot(label='outliers', marker='o', ls='')
avg[~m].plot(label='replacement', marker='o', ls='')
plt.legend()