# Data Processing script for the NSM/SWEML v2.0
This .ipynb script uses Python modules to retrieve NASA ASO observations, locate the nearest SNOTEL sites, connect SNOTEL observations with ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

# Next steps 

- the SE and SW rockies have the same number of sites, make sure they are not the same...
- process ASO data, e.g., set swe_m < 0.1 to 0 and convert to cm to be consistent with monitoring sites and traditional measurement.
- document scripts
- add new sites (e.g., regionval) to training DF with all the respective spatial resolution information
- add precipitation phase features (seasonal accumulated rain precip, seasonal accumulated snow precip as a function of temperature)
- explore adding other features stemming from SNOTEL, remote sensing (LULC), Snow Classifications (Sturms), energy balance

In [1]:
import os
import model_Domain
# Home directory of the current user; later cells build their data paths from it.
HOME = os.path.expanduser('~')

#make SWEMLv2.0 modeling domain for western USA
region_list = model_Domain.modeldomain()
region_list.remove('NorthernRockies') # There is no ASO data for this region
# NOTE(review): the hard-coded list below replaces the list built above, so the
# two preceding lines only matter for whatever side effects modeldomain() has.
# Confirm whether this override is still intended.
region_list = ['Northwest', 'SouthernRockies', 'Southwest']
# Display the regions that the remaining cells iterate over.
region_list

['Northwest', 'SouthernRockies', 'Southwest']

In [None]:
from ASOget import ASODownload, ASODataProcessing

# Query parameters used to fetch ASO data for each region
short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
output_res = 300  # desired spatial resolution in meters (m)
directory = "Raw_ASO_Data"

# For every region: search and download the ASO granules, then convert the
# downloaded tifs to parquet at the requested resolution.
for region in region_list:
    print(region)

    downloader = ASODownload(short_name, version)
    b_box = downloader.BoundingBox(region)
    url_list = downloader.cmr_search(time_start, time_end, region, b_box)
    downloader.cmr_download(directory, region)

    # Convert ASO tifs to parquet
    folder_name = f"{region}/{directory}"
    ASODataProcessing().convert_tiff_to_parquet_multiprocess(folder_name, output_res, region)

## Get Snotel and CDEC in situ observations
- clean in situ observations, specifically the CDEC sites; need a data processing method to remove outliers and nan/0 obs
- Ideas - add nearest sites elevation, distance from cell, then can bypass sites with bad data. 

In [None]:
# Get in situ observations
import get_InSitu_obs
import numpy as np

# Date window chosen to align with the ASO observations
# (they go as early as Jan-29 and as far out as July-17).
start_month_day, end_month_day = '10-01', '08-31'
years = np.arange(2013, 2020)
#datelist = get_InSitu_obs.make_dates(years, start_month_day, end_month_day, WY = True)

# Retrieve the in situ observations for every year in the window ...
get_InSitu_obs.Get_Monitoring_Data_Threaded_dp(years, start_month_day, end_month_day, WY=True)

# ... then combine the per-year results.
get_InSitu_obs.combine_dfs(years)

# Code for generating ML dataframe using nearest in situ monitoring sites

In [None]:
import GeoDF
output_res = 300

# GeoDF is used to create a dataframe for ML model development: it connects
# in situ observations to gridded locations.
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        # Regions without a processed ASO parquet folder are skipped.
        print(f"No ASO data for {region}")
        continue

    print(region)

    # load snotel meta location data, use haversine function
    # (using known up-to-date sites)
    GeoDF.fetch_snotel_sites_for_cellids(region, output_res)

    # Get geophysical attributes for each site, need to see how to add output resolution
    gdf = GeoDF.GeoSpatial(region, output_res)

    # use geodataframe with lat/long meta of all sites to determine slope, aspect, and elevation
    metadf = GeoDF.extract_terrain_data_threaded(gdf, region, output_res)




## Connect Snotel to each ASO obs

- change nearest_sites name to ns

In [None]:
import Obs_to_DF
output_res = 300

# Connect nearest snotel observations with ASO data; makes a parquet file for each date.
# TODO: test to see if this works - need to just load the SNOTEL file, not collect
# them as in the function.
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    manual = False
    dates = []
    Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res, manual, dates)


In [None]:
import GeoDF

output_res = 300

# Connect cell ids with ASO obs and snotel obs to geospatial features.
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    GeoDF.add_geospatial_threaded(region, output_res)

# Get NASA VIIRS fraction snow covered area for each location 

* Make sure the code grabs all dates for each region


In [None]:
import get_VIIRS_SCA
output_res = 300
threshold = 20

# Check to see if the VIIRS data is available locally; if not, get it from CIROH AWS.
# (I think all of this data is for the incorrect year...)
#get_VIIRS_SCA.get_VIIRS_from_AWS()

# Connect VIIRS data to dataframes
for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        print(f"No ASO data for {region}")
        continue

    print(region)
    get_VIIRS_SCA.augment_SCA_mutliprocessing(region, output_res, threshold)


In [None]:
import get_Precip

# NOTE: if using python > 3.9, you will likely need to change the ee package
# to use `from io import StringIO`.

import os
HOME = os.path.expanduser('~')

# Gets precipitation for each location and accumulates it through the water year.

# Water years to process
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
output_res = 300
threshold = 20

for region in region_list:
    path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
    if not os.path.isdir(path):
        # No precipitation is fetched for a region without ASO data, so the
        # merge step below is skipped as well (it previously ran regardless).
        print(f"No ASO data for {region}")
        continue

    print(region)
    get_Precip.get_precip_threaded(region, output_res, years)

    # Connect precipitation to processed DFs
    get_Precip.Make_Precip_DF(region, output_res, threshold)


In [None]:
import importlib

# The module has to be imported before it can be reloaded; without this import
# the cell raises NameError when the notebook is run top to bottom.
import get_Seasonality

# Pick up edits made to get_Seasonality.py without restarting the kernel.
importlib.reload(get_Seasonality)

In [2]:
import get_Seasonality

output_res = 300
threshold = 20

# Process snotel sites to make "snow hydrograph features" to determine
# above/below average WY conditions. The call takes no region argument, so it
# is run once up front instead of being repeated for every region.
get_Seasonality.seasonal_snotel()

for region in region_list:
    # get the Day of season metric for each dataframe
    get_Seasonality.add_Seasonality(region, output_res, threshold)

Adding Day of Season, seasonal nearest monitoring site averages, and seasonal nearest monitoring site relationship to averages to all Northwest dataframes...


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 85185/85185 [01:37<00:00, 875.15it/s]
100%|██████████| 85185/85185 [01:38<00:00, 867.57it/s]


Adding Day of Season, seasonal nearest monitoring site averages, and seasonal nearest monitoring site relationship to averages to all SouthernRockies dataframes...


  0%|          | 0/14 [00:00<?, ?it/s]

100%|██████████| 2545/2545 [00:02<00:00, 882.86it/s]
100%|██████████| 9378/9378 [00:10<00:00, 862.86it/s]
100%|██████████| 9378/9378 [00:10<00:00, 860.93it/s]
100%|██████████| 9378/9378 [00:10<00:00, 859.09it/s]
100%|██████████| 12317/12317 [00:14<00:00, 857.30it/s]
100%|██████████| 12458/12458 [00:14<00:00, 869.09it/s]
100%|██████████| 16241/16241 [00:18<00:00, 873.45it/s]
100%|██████████| 23791/23791 [00:27<00:00, 869.63it/s]
100%|██████████| 23791/23791 [00:27<00:00, 871.83it/s]
100%|██████████| 28353/28353 [00:32<00:00, 859.67it/s]
100%|██████████| 29145/29145 [00:33<00:00, 861.76it/s]
100%|██████████| 33987/33987 [00:39<00:00, 859.01it/s]
100%|██████████| 47402/47402 [00:54<00:00, 870.28it/s]
100%|██████████| 47402/47402 [00:54<00:00, 866.26it/s]


Adding Day of Season, seasonal nearest monitoring site averages, and seasonal nearest monitoring site relationship to averages to all Southwest dataframes...


  0%|          | 0/99 [00:00<?, ?it/s]

100%|██████████| 389/389 [00:01<00:00, 300.72it/s]
100%|██████████| 389/389 [00:01<00:00, 213.81it/s]
100%|██████████| 390/390 [00:01<00:00, 260.43it/s]
100%|██████████| 389/389 [00:02<00:00, 155.85it/s]
100%|██████████| 389/389 [00:04<00:00, 86.90it/s]
100%|██████████| 399/399 [00:07<00:00, 56.56it/s]
100%|██████████| 389/389 [00:07<00:00, 51.45it/s]
100%|██████████| 3741/3741 [00:45<00:00, 82.83it/s] 
100%|██████████| 3741/3741 [00:50<00:00, 74.56it/s] 
100%|██████████| 3741/3741 [00:48<00:00, 77.81it/s] 
100%|██████████| 3741/3741 [00:45<00:00, 81.48it/s] ]
100%|██████████| 6830/6830 [00:57<00:00, 118.72it/s]
100%|██████████| 9169/9169 [01:09<00:00, 132.10it/s]]
100%|██████████| 12384/12384 [01:10<00:00, 176.35it/s]
100%|██████████| 13569/13569 [01:18<00:00, 172.08it/s]
100%|██████████| 15824/15824 [01:20<00:00, 197.65it/s]
100%|██████████| 16508/16508 [01:21<00:00, 203.73it/s]
100%|██████████| 390/390 [00:01<00:00, 341.28it/s]
100%|██████████| 15817/15817 [01:27<00:00, 179.91it/s]


## Next steps
* Explore why errors in precip sites above
* add in situ obs - seasonality based on the historical nearest x monitoring stations, e.g., a historical average to-date SWE value (a unit-hydrograph-like curve based on the day of year). This will include the historical normal SWE value for the time of year and the current SWE value compared to normal
* albedo metric


In [None]:
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

HOME = os.path.expanduser('~')

region, output_res = 'N_Co_Rockies', 300

# Folder holding the per-site NLDAS precipitation files for this region/resolution
Precippath = f"{HOME}/SWEMLv2.0/data/Precipitation/{region}/{output_res}M_NLDAS_Precip/sites/"

# os.listdir already returns a list of the file names
pptfiles = os.listdir(Precippath)

# Load a single site file and label its time column as 'Date'
ppt = pd.read_parquet(f"{Precippath}NLDAS_PPT_N_Co_Rockies_300M_39.015_-107.027.parquet")
ppt = ppt.rename(columns={'datetime': 'Date'})
#ppt.set_index('cell_id', inplace=True)

ppt.head()


In [None]:
# Build the path from the current user's home directory rather than a
# machine-specific absolute path, so the cell runs for any user.
DFpath = os.path.join(
    os.path.expanduser('~'),
    'SWEMLv2.0/data/TrainingDFs/N_Co_Rockies/300M_Resolution/PrecipVIIRSGeoObsDFs_20_fSCA_Thresh',
)
geofile = 'Precip_VIIRS_GeoObsDF_20160404.parquet'

# Load and display one dated training dataframe.
GDF = pd.read_parquet(os.path.join(DFpath, geofile))
GDF

In [None]:
import datetime
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
import os
import warnings
import pickle as pkl
warnings.filterwarnings("ignore")

HOME = os.path.expanduser('~')

region, output_res, threshold = 'N_Co_Rockies', 300, 20

DFpath = f'{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution/Seasonality_PrecipVIIRSGeoObsDFs_{threshold}_fSCA_Thresh'
# os.listdir already returns a list; its order is not guaranteed to be sorted,
# so "first file" below is whichever entry the OS lists first.
files = os.listdir(DFpath)

# Load and display the first listed dataframe.
df = pd.read_parquet(os.path.join(DFpath, files[0]))
df

In [None]:
# make a unit hydrograph ish metric for each site

#load data
DFpath = f'{HOME}/SWEMLv2.0/data/SNOTEL_Data'
snotel =  pd.read_parquet(os.path.join(DFpath, 'seasonal_snotel.parquet'))

#find location average peak swe and divide dataframe by this number
#snotel = snotel/snotel.max(0)
snotel

import pandas as pd
import numpy as np

# Load one year of ground measurements (2015).
snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data"
year_df = pd.read_parquet(f"{snotel_path}/2015_ground_measures_dp.parquet")

# Replace -9999.0 with NaN (presumably a missing-data sentinel — confirm).
year_df = year_df.replace({-9999.0: np.nan})
year_df.head(5)

# Fill gaps in the FIRST column only, using nearest-value interpolation.
cols = year_df.columns
year_df[cols[0]] = pd.Series(year_df[cols[0]].values).interpolate(method='nearest').values

import matplotlib.pyplot as plt
import numpy as np

# cols is captured before reset_index, so it does not include the column that
# reset_index adds to the dataframe.
cols = year_df.columns
year_df.reset_index(inplace=True)

# Plot one figure for each of the first 10 columns.
for s in np.arange(0,10,1):

       site = cols[s]

       fig, ax = plt.subplots(figsize=(22, 12))
       # NOTE(review): after reset_index the x values are the default integer
       # index, although the axis is labelled 'date'.
       ax.plot(year_df.index, year_df[site])

       ax.set(xlabel='date', ylabel='SWE',
              title=f'{site} SWE time series')
       #ax.grid()
       plt.xticks(rotation=70)
       #fig.savefig("test.png")
       plt.show()

In [None]:
# Load the ground measurements, transpose them, and zero out known-bad readings.
DFpath = f'{HOME}/SWEMLv2.0/data/SNOTEL_Data'
snotel = pd.read_parquet(os.path.join(DFpath, 'ground_measures.parquet'))

#find location average peak swe and divide dataframe by this number
#snotel = snotel/snotel.max(0)
snotel = snotel.T

# Known bad readings (7.65006, 9.60454, 27.139000, 22.172265, 31.247021), each
# matched by an open interval (low, high).
bad_value_ranges = [
    (7.65, 7.651),
    (9.604, 9.605),
    # Upper bound was 23.140, which made this interval impossible to satisfy.
    # NOTE(review): a reading of exactly 27.139 still fails the strict lower
    # bound — confirm the stored value.
    (27.139, 27.140),
    (22.172265, 22.172266),
    # NOTE(review): the list above names 31.247021 but this interval brackets
    # 31.242265 — confirm which value is the bad one.
    (31.242265, 31.242266),
]

cols = snotel.columns
for col in cols:
    values = snotel[col]
    # Negative readings are also reset to 0.
    bad = values < 0
    for low, high in bad_value_ranges:
        bad |= (values > low) & (values < high)
    # Single .loc assignment instead of chained indexing (snotel[col][mask] = 0),
    # which may write to a temporary copy and is a no-op under Copy-on-Write.
    snotel.loc[bad, col] = 0
snotel.reset_index(inplace = True)

#build in data checking script to fix outliers

In [None]:
# Display the rows labelled 250 through 300 (label slicing with .loc includes both ends).
snotel.loc[250:300]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import numpy as np

# Load the combined ground measurements and replace -9999.0 with NaN.
snotel_path = f"{HOME}/SWEMLv2.0/data/SNOTEL_Data"
year_df = pd.read_parquet(f"{snotel_path}/ground_measures_dp.parquet").replace({-9999.0: np.nan})
year_df.head(5)

# Plot the first column of the `snotel` table (note: `snotel`, not `year_df`).
site = cols[0]

fig, ax = plt.subplots(figsize=(22, 12))
ax.plot(snotel.index, snotel[site])
ax.set_xlabel('date')
ax.set_ylabel('SWE')
ax.set_title(f'{site} SWE time series')
#ax.grid()
plt.xticks(rotation=70)
#fig.savefig("test.png")
plt.show()

In [None]:
def zscore(s, window, thresh=2, return_all=False):
    """Flag rolling z-score outliers in a series and replace them with the rolling mean.

    The mean and population standard deviation (ddof=0) are computed over a
    centered rolling window that accepts as few as one observation. A value is
    an inlier when its z-score lies within [-thresh, thresh] (inclusive).

    Parameters
    ----------
    s : pandas.Series
        Values to screen.
    window : int
        Rolling window size passed to ``Series.rolling``.
    thresh : float, default 2
        Absolute z-score limit for inliers.
    return_all : bool, default False
        If True, return the intermediate series instead of the cleaned one.

    Returns
    -------
    pandas.Series or tuple
        With ``return_all=False``: ``s`` with every non-inlier (including NaN
        entries) replaced by the rolling mean. With ``return_all=True``: the
        tuple ``(z, avg, std, m)`` where ``m`` is the boolean inlier mask.
    """
    roll = s.rolling(window=window, min_periods=1, center=True)
    avg = roll.mean()
    std = roll.std(ddof=0)
    z = s.sub(avg).div(std)
    # A window with zero spread gives 0/0 = NaN, which would mark every point
    # of a flat stretch as an outlier. Such a point equals its window mean, so
    # its z-score is 0. Missing inputs keep a NaN z-score.
    z = z.mask(std.eq(0) & s.notna(), 0.0)
    m = z.between(-thresh, thresh)

    if return_all:
        return z, avg, std, m
    return s.where(m, avg)


# Run the rolling z-score filter on the first SNOTEL column and plot the data,
# the rolling mean, the flagged points, and their replacements.
# (The unused N and random seed left over from the synthetic demo are dropped.)
df = pd.DataFrame(snotel[cols[0]])

# NOTE(review): with window=2 any two distinct values sit exactly one standard
# deviation from their mean, so thresh=2 cannot flag a genuine spike — confirm
# the intended window size.
z, avg, std, m = zscore(df[cols[0]], window=2, return_all=True)

# plt.subplots returns (figure, axes); unpack it so `ax` really is the Axes,
# and draw every series onto it explicitly.
fig, ax = plt.subplots(figsize=(22, 12))

df[cols[0]].plot(ax=ax, label='data')
avg.plot(ax=ax, label='mean')
df.loc[~m, cols[0]].plot(ax=ax, label='outliers', marker='o', ls='')
avg[~m].plot(ax=ax, label='replacement', marker='o', ls='')
ax.legend()

In [None]:
# Demonstrate the rolling z-score filter on a synthetic noisy sine wave.
N = 1000
np.random.seed(1)  # fixed seed so the noise is reproducible
df = pd.DataFrame({'MW': np.sin(np.linspace(0, 10, num=N))+np.random.normal(scale=0.6, size=N)})

z, avg, std, m = zscore(df['MW'], window=50, return_all=True)

# plt.subplots returns (figure, axes); unpack it so `ax` really is the Axes,
# and draw every series onto it explicitly.
fig, ax = plt.subplots(figsize=(22, 12))

df['MW'].plot(ax=ax, label='data')
avg.plot(ax=ax, label='mean')
df.loc[~m, 'MW'].plot(ax=ax, label='outliers', marker='o', ls='')
avg[~m].plot(ax=ax, label='replacement', marker='o', ls='')
ax.legend()