# Data Processing script for the NSM/SWEML v2.0
This .ipynb script uses Python modules to retrieve NASA ASO observations, locate the nearest SNOTEL sites, connect SNOTEL observations with ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

In [1]:
import os
import model_Domain
# Home directory; the later cells build their data paths under it.
HOME = os.path.expanduser('~')

# Make the SWEMLv2.0 modeling domain for the western USA
region_list = model_Domain.modeldomain()
region_list.remove('NorthernRockies') # There is no ASO data for this region
# NOTE(review): this hard-coded list replaces the list built above, so the
# value returned by modeldomain() is discarded - confirm the override is intended.
region_list = ['Northwest', 'SouthernRockies', 'Southwest']
# Bare expression: displays the final region list as the cell output.
region_list

Checking for required files
ground_measures_metadata.parquet is local


['Northwest', 'SouthernRockies', 'Southwest']

In [None]:
from ASOget import ASODownload, ASODataProcessing

# Optional earthaccess login, left commented out
# (see https://earthaccess.readthedocs.io/en/latest/howto/authenticate/):
# import earthaccess
# earthaccess.login(persist=True)

# Inputs for fetching ASO data for a region
short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
output_res = 300  # desired spatial resolution in meters (m)
directory = "Raw_ASO_Data"

# Search for, download, and convert the ASO data one region at a time
for region in region_list:
    print(region)

    # A new download helper is created for every region
    data_tool = ASODownload(short_name, version)
    b_box = data_tool.BoundingBox(region)
    # NOTE(review): url_list is not used again in this cell; presumably
    # cmr_search stores what cmr_download needs on data_tool - confirm.
    url_list = data_tool.cmr_search(time_start, time_end, region, b_box)
    data_tool.cmr_download(directory, region)

    # Convert the ASO tifs to parquet
    folder_name = f"{region}/{directory}"
    data_processor = ASODataProcessing()
    data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, region)

## Get Snotel and CDEC in situ observations
- Clean the in situ observations, specifically the CDEC sites; a data processing method is needed to remove outliers and NaN/0 observations.
- Ideas: add the nearest sites' elevation and distance from the cell, so that sites with bad data can be bypassed.

In [None]:
# Get in situ observations
import get_InSitu_obs
import numpy as np

#make a list of dates to aligns with the ASO observations (they go as early as Jan-29 and as far out as the July-17)
years = np.arange(2013,2020,1)
start_month_day = '10-01'
end_month_day = '08-31'
#datelist = get_InSitu_obs.make_dates(years, start_month_day, end_month_day, WY = True)

# observations 
get_InSitu_obs.Get_Monitoring_Data_Threaded_dp(years, start_month_day, end_month_day, WY = True)

#combine years
get_InSitu_obs.combine_dfs(years)

# Code for generating ML dataframe using nearest in situ monitoring sites

In [None]:
import GeoDF

output_res = 300

# NOTE(review): region_list is reassigned here, so any cell run afterwards
# that loops over region_list only sees 'Southwest' unless it is reset.
region_list = ['Southwest']

# GeoDF is used to create a dataframe for ML model development. Its function
# is to connect in situ observations to gridded locations.
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    # Load snotel meta location data, use haversine function
    #GeoDF.fetch_snotel_sites_for_cellids(region, output_res) # Using known up to date sites

    # Get geophysical attributes for each site (still need to see how to add output resolution)
    gdf = GeoDF.GeoSpatial(region, output_res)

    # Use the geodataframe with lat/long meta of all sites to determine slope, aspect, and elevation
    metadf = GeoDF.extract_terrain_data_threaded(gdf, region, output_res)




Southwest
Loading geospatial data for Southwest
Converting to geodataframe
Calculating dataframe bounding box
-121 35 -117 40
Retrieving Copernicus 90m DEM tiles


  0%|          | 0/30 [00:00<?, ?it/s]

There are 30 tiles in the region
Determining Grid Cell Spatial Features


  0%|          | 0/299557 [00:00<?, ?it/s]

Southwest_300M_38.163_-119.389 does not have copernicus DEM data, manual input


In [None]:
# Load the Southwest metadata, push each region's metadata into its training
# dataframes, then read one training file back for the plot below.
import pandas as pd

# NOTE: os is not imported in this cell; it relies on the import made in an earlier cell.
HOME = os.path.expanduser('~')
region = 'Southwest'
# output_res is a string in this cell; here it is interpolated into paths and passed to updateTrainingDF.
output_res = '300'

dfpath = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution"

# Southwest metadata; not referenced again in this cell.
SWmeta = pd.read_parquet(f"{dfpath}/{region}_metadata.parquet")

import UpdateDataFrame

# Need to update the topographic features for every dataframe
output_res = '300'
training_cats = ['Obsdf']
fSCA = '' #'20_fSCA_Thresh'


for training_cat in training_cats:
    print(training_cat)

    # region and dfpath set above are overwritten on every iteration of this loop.
    for region in region_list:
        print(region)
        dfpath = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution"
        # File used to update the training DF
        updatefile = pd.read_parquet(f"{dfpath}/{region}_metadata.parquet")


        #Update Dataframe
        UpdateDataFrame.updateTrainingDF(region, output_res, training_cat, fSCA, updatefile)

# NOTE(review): dfpath and training_cat keep the values from the last loop
# iteration, so this reads from the last region in region_list; with fSCA = ''
# the path contains an empty component ("//"). Confirm this is the intended file.
trainfile = pd.read_parquet(f"{dfpath}/{training_cat}/{fSCA}/Sturm_Season_Precip_VIIRS_GeoObsDF_20150406.parquet")

import matplotlib.pyplot as plt
import geopandas as gpd

from mpl_toolkits.axes_grid1 import make_axes_locatable

def SpatialAnalysis(EvalDF):
    """Plot the rows of ``EvalDF`` as points using the ``Elevation_m`` column.

    The point geometry is built from the ``cen_lon`` (x) and ``cen_lat`` (y)
    columns of ``EvalDF``. The plot is drawn without a legend and nothing is
    returned.
    """
    # Build the point geometry first, then wrap the frame as a geopandas DF
    point_geometry = gpd.points_from_xy(EvalDF.cen_lon, EvalDF.cen_lat)
    points_gdf = gpd.GeoDataFrame(EvalDF, geometry=point_geometry)

    points_gdf.plot(column='Elevation_m', legend=False)


# Plot the training file loaded earlier in this cell
SpatialAnalysis(trainfile)

Obsdf
Northwest


  0%|          | 0/2 [00:00<?, ?it/s]

/home/whitelightning450/SWEMLv2.0/data/TrainingDFs/Northwest/300M_Resolution/Obsdf//                                     Date     swe_m   ns_1   ns_2  ns_3  \
cell_id                                                                   
Northwest_300M_48.09_-123.57   2016-03-29  0.000000  111.8  157.2  88.1   
Northwest_300M_48.09_-123.568  2016-03-29  0.000951  111.8  157.2  88.1   
Northwest_300M_48.09_-123.565  2016-03-29  0.000589  111.8  157.2  88.1   
Northwest_300M_48.09_-123.562  2016-03-29  0.000000  111.8  157.2  88.1   
Northwest_300M_48.09_-123.56   2016-03-29  0.000000  111.8  157.2  88.1   
...                                   ...       ...    ...    ...   ...   
Northwest_300M_47.496_-123.892 2016-03-29  0.000000  157.2  111.8  88.1   
Northwest_300M_47.496_-123.889 2016-03-29  0.000000  157.2  111.8  88.1   
Northwest_300M_47.496_-123.886 2016-03-29  0.000000  157.2  111.8  88.1   
Northwest_300M_47.496_-123.884 2016-03-29  0.000000  157.2  111.8  88.1   
Northwest_300M_4

  0%|          | 0/14 [00:00<?, ?it/s]

/home/whitelightning450/SWEMLv2.0/data/TrainingDFs/SouthernRockies/300M_Resolution/Obsdf//                                           Date  swe_m  ns_1  ns_2  ns_3  \
cell_id                                                                    
SouthernRockies_300M_37.981_-107.569 2016-04-03    0.0  50.8  37.1  50.3   
SouthernRockies_300M_37.981_-107.567 2016-04-03    0.0  50.8  37.1  50.3   
SouthernRockies_300M_37.981_-107.564 2016-04-03    0.0  50.8  37.1  50.3   
SouthernRockies_300M_37.981_-107.561 2016-04-03    0.0  50.8  37.1  50.3   
SouthernRockies_300M_37.981_-107.559 2016-04-03    0.0  50.8  37.1  50.3   
...                                         ...    ...   ...   ...   ...   
SouthernRockies_300M_37.392_-106.471 2016-04-03    0.0  24.4  36.1  66.5   
SouthernRockies_300M_37.392_-106.468 2016-04-03    0.0  24.4  36.1  66.5   
SouthernRockies_300M_37.392_-106.465 2016-04-03    0.0  24.4  36.1  66.5   
SouthernRockies_300M_37.392_-106.462 2016-04-03    0.0  24.4  36.1  66.5  

  0%|          | 0/99 [00:00<?, ?it/s]

/home/whitelightning450/SWEMLv2.0/data/TrainingDFs/Southwest/300M_Resolution/Obsdf//                                     Date  swe_m  ns_1  ns_2  ns_3  ns_4  \
cell_id                                                                    
Southwest_300M_38.193_-119.804 2014-04-07    0.0  57.0  69.9  73.7  58.0   
Southwest_300M_38.193_-119.802 2014-04-07    0.0  57.0  69.9  73.7  58.0   
Southwest_300M_38.193_-119.799 2014-04-07    0.0  57.0  69.9  73.7  58.0   
Southwest_300M_38.193_-119.796 2014-04-07    0.0  57.0  69.9  73.7  58.0   
Southwest_300M_38.193_-119.794 2014-04-07    0.0  57.0  69.9  73.7  58.0   
...                                   ...    ...   ...   ...   ...   ...   
Southwest_300M_37.729_-119.2   2014-04-07    0.0  33.7  24.9   8.1  40.1   
Southwest_300M_37.729_-119.197 2014-04-07    0.0  33.7  24.9   8.1  40.8   
Southwest_300M_37.729_-119.194 2014-04-07    0.0  33.7  24.9   8.1  40.8   
Southwest_300M_37.729_-119.192 2014-04-07    0.0  33.7  24.9   8.1  40.8   
Sout

## Connect Snotel to each ASO obs


In [None]:
import Obs_to_DF

output_res = 300

# Connect the nearest snotel observations with ASO data; makes a parquet file for each date.
# TODO: test to see if this works - need to just load the SNOTEL file, not collect them as in the function
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    # A new empty dates list and the manual flag are passed for each region
    dates = []
    manual = False
    Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res, manual, dates)


In [None]:
import GeoDF

output_res = 300

# Connect cell ids with ASO obs and snotel obs to geospatial features
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    GeoDF.add_geospatial_threaded(region, output_res)

# Get NASA VIIRS fraction snow covered area for each location 

* Make sure the code grabs all dates for each region


In [None]:
import get_VIIRS_SCA

output_res = 300
threshold = 20

# Check to see if the VIIRS data is available locally; if not, get it from CIROH AWS.
# NOTE (author): I think all of this data is for the incorrect year...
#get_VIIRS_SCA.get_VIIRS_from_AWS()

# Connect VIIRS data to dataframes
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    get_VIIRS_SCA.augment_SCA_mutliprocessing(region, output_res, threshold)


In [None]:
import get_Precip

# Note: if using python > 3.9, you will likely need to change the ee package
# to use `from io import StringIO`.

import os
HOME = os.path.expanduser('~')

# Gets precipitation for each location and accumulates it through the water year.

# Water years to retrieve, plus the resolution/threshold passed to the calls below
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
output_res = 300
threshold = 20

for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        # Without ASO data there is nothing to retrieve or to connect, so skip
        # the whole region (previously Make_Precip_DF still ran for it).
        print(f"No ASO data for {region}")
        continue

    print(region)
    get_Precip.get_precip_threaded(region, output_res, years)

    # Connect precipitation to processed DFs
    get_Precip.Make_Precip_DF(region, output_res, threshold)


In [None]:
import get_Seasonality

output_res = 300
threshold = 20

# Process snotel sites to make "snow hydrograph features" to determine
# above/below average WY conditions. The call takes no region argument, so it
# is run once up front instead of once per region; the guard keeps the
# previous behavior of not running it when there are no regions.
if region_list:
    get_Seasonality.seasonal_snotel()

for region in region_list:
    # Get the day-of-season metric for each dataframe
    get_Seasonality.add_Seasonality(region, output_res, threshold)

# Use Sturm's snow classification as features within model framework

In [None]:
import sturm_processer as stpro

# Sturm snow classification raster, https://nsidc.org/data/nsidc-0768/versions/1
# (the same file is used for every region)
sturm_file = f"{HOME}/SWEMLv2.0/data/SnowClassification/SnowClass_NA_300m_10.0arcsec_2021_v01.0.tif"

for region in region_list:
    # Read the seasonality dataframes and write the Sturm-augmented ones next to them
    input_directory = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/300M_Resolution/Seasonality_PrecipVIIRSGeoObsDFs/20_fSCA_Thresh"
    output_directory = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/300M_Resolution/Sturm_Seasonality_PrecipVIIRSGeoObsDFs/20_fSCA_Thresh"

    # Make sure the output directory exists before processing
    os.makedirs(output_directory, exist_ok=True)

    stpro.process_sturm_data_for_files(input_directory, sturm_file, output_directory)

In [1]:
import vegetation_processer as vegpro
import os

HOME = os.path.expanduser('~')
region_list = ['Northwest', 'SouthernRockies', 'Southwest']

# Land cover raster (the same file is used for every region)
vegetation_file = f"{HOME}/SWEMLv2.0/data/LandCover/USA_NALCMS_landcover_2020v2_30m.tif"

for region in region_list:
    # Read the Sturm dataframes and write the vegetation-augmented ones next to them
    input_directory = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/300M_Resolution/Sturm_Seasonality_PrecipVIIRSGeoObsDFs/20_fSCA_Thresh"
    output_directory = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/300M_Resolution/Vegetation_Sturm_Seasonality_PrecipVIIRSGeoObsDFs/20_fSCA_Thresh"

    # Make sure the output directory exists before processing
    os.makedirs(output_directory, exist_ok=True)

    vegpro.process_vegetation_data_for_files(input_directory, vegetation_file, output_directory)

Vegetation file bounds: BoundingBox(left=-2043060.0, bottom=-2113150.0, right=2529600.0, top=732440.0)
Vegetation CRS: PROJCS["WGS_1984_Lambert_Azimuthal_Equal_Area",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Lambert_Azimuthal_Equal_Area"],PARAMETER["latitude_of_center",45],PARAMETER["longitude_of_center",-100],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1],AXIS["Easting",EAST],AXIS["Northing",NORTH]]


Sampling Vegetation Data: 100%|██████████| 85185/85185 [00:03<00:00, 26538.25it/s]
Sampling Vegetation Data: 100%|██████████| 85185/85185 [00:03<00:00, 23469.18it/s]
Processing Parquet Files: 100%|██████████| 2/2 [01:01<00:00, 30.83s/it]


Vegetation file bounds: BoundingBox(left=-2043060.0, bottom=-2113150.0, right=2529600.0, top=732440.0)
Vegetation CRS: PROJCS["WGS_1984_Lambert_Azimuthal_Equal_Area",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Lambert_Azimuthal_Equal_Area"],PARAMETER["latitude_of_center",45],PARAMETER["longitude_of_center",-100],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1],AXIS["Easting",EAST],AXIS["Northing",NORTH]]


Sampling Vegetation Data: 100%|██████████| 47402/47402 [00:02<00:00, 20751.53it/s]
Sampling Vegetation Data: 100%|██████████| 16241/16241 [00:00<00:00, 23256.92it/s]
Sampling Vegetation Data: 100%|██████████| 12458/12458 [00:00<00:00, 17874.43it/s]
Sampling Vegetation Data: 100%|██████████| 23791/23791 [00:00<00:00, 29297.85it/s]
Sampling Vegetation Data: 100%|██████████| 23791/23791 [00:00<00:00, 23804.06it/s]
Sampling Vegetation Data: 100%|██████████| 29145/29145 [00:01<00:00, 26038.44it/s]
Sampling Vegetation Data: 100%|██████████| 33987/33987 [00:01<00:00, 24757.26it/s]
Sampling Vegetation Data: 100%|██████████| 9378/9378 [00:00<00:00, 28095.87it/s]
Sampling Vegetation Data: 100%|██████████| 2545/2545 [00:00<00:00, 27943.73it/s]
Sampling Vegetation Data: 100%|██████████| 47402/47402 [00:02<00:00, 22029.96it/s]
Sampling Vegetation Data: 100%|██████████| 9378/9378 [00:00<00:00, 30168.67it/s]
Sampling Vegetation Data: 100%|██████████| 9378/9378 [00:00<00:00, 21750.30it/s]
Sampling Veg

Vegetation file bounds: BoundingBox(left=-2043060.0, bottom=-2113150.0, right=2529600.0, top=732440.0)
Vegetation CRS: PROJCS["WGS_1984_Lambert_Azimuthal_Equal_Area",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Lambert_Azimuthal_Equal_Area"],PARAMETER["latitude_of_center",45],PARAMETER["longitude_of_center",-100],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1],AXIS["Easting",EAST],AXIS["Northing",NORTH]]


Sampling Vegetation Data: 100%|██████████| 21472/21472 [00:00<00:00, 25235.93it/s]
Sampling Vegetation Data: 100%|██████████| 18780/18780 [00:00<00:00, 29962.09it/s]
Sampling Vegetation Data: 100%|██████████| 16523/16523 [00:00<00:00, 30785.22it/s]
Sampling Vegetation Data: 100%|██████████| 3741/3741 [00:00<00:00, 30897.74it/s]
Sampling Vegetation Data: 100%|██████████| 20022/20022 [00:00<00:00, 27120.29it/s]
Sampling Vegetation Data: 100%|██████████| 49436/49436 [00:02<00:00, 24185.04it/s]
Sampling Vegetation Data: 100%|██████████| 19140/19140 [00:00<00:00, 27153.63it/s]
Sampling Vegetation Data: 100%|██████████| 16508/16508 [00:00<00:00, 29336.66it/s]
Sampling Vegetation Data: 100%|██████████| 16523/16523 [00:00<00:00, 31083.24it/s]
Sampling Vegetation Data: 100%|██████████| 9169/9169 [00:00<00:00, 29712.80it/s]
Sampling Vegetation Data: 100%|██████████| 9056/9056 [00:00<00:00, 26920.77it/s]
Sampling Vegetation Data: 100%|██████████| 39617/39617 [00:01<00:00, 30220.62it/s]
Sampling V

## Next steps
* Explore why there are errors in the precip sites above
* Add in situ obs seasonality based on the historical nearest x monitoring stations - e.g., a historical average to-date SWE value (a unit hydrograph based on the day of year?). This will include a historical time of year of normal SWE value and a SWE value for the year compared to normal
* albedo metric
