In [None]:
pwd

In [None]:
cd ~/cg1/Golitzin/SWEMLv2.0/Dataprocessing

In [None]:
import numpy as np
import pandas as pd
import ee #pip install earthengine-api
import geemap
import os
from tqdm import tqdm
from tqdm.notebook import tqdm
import concurrent.futures as cf
import pyarrow as pa
import pyarrow.parquet as pq
import pickle as pkl
# ee.Authenticate()
# ee.Initialize()
import warnings
import boto3
import s3fs
warnings.filterwarnings("ignore")

import pydaymet as daymet
import pynldas2 as nldas
import pygridmet as gridmet
from pygridmet import GridMET
import rasterio
import geopandas as gpd
from shapely.geometry import box
import pyproj
from rasterio.plot import show
import geopandas as gpd
import rioxarray as rxr
import contextily as cx
import xarray as xr

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [None]:
os.environ['PROJ_LIB'] =pyproj.datadir.get_data_dir()

In [None]:
# Step up to the parent directory and remember it as HOME; every data path in
# this notebook is built relative to HOME.
# NOTE: the chdir is relative, so re-running this cell climbs one more level.
os.chdir('..')
HOME = os.getcwd()

import utils.EE_funcs as EE_funcs

# Optional AWS credentials: a CSV with 'Access key ID' and 'Secret access key'
# columns; only the first row is used.
KEYPATH = "utils/AWSaccessKeys.csv"
print(f"{HOME}/{KEYPATH}")

if not os.path.isfile(f"{HOME}/{KEYPATH}"):
    print("no AWS credentials present, skipping")
else:
    ACCESS = pd.read_csv(f"{HOME}/{KEYPATH}")

    # Open a boto3 session with the stored key pair.
    SESSION = boto3.Session(
        aws_access_key_id=ACCESS['Access key ID'][0],
        aws_secret_access_key=ACCESS['Secret access key'][0],
    )
    S3 = SESSION.resource('s3')

    # Bucket handle used for project data on S3.
    BUCKET_NAME = 'national-snow-model'
    BUCKET = S3.Bucket(BUCKET_NAME)

In [None]:
ASO_Key=pd.read_csv(f"{HOME}/utils/ASONameKey.csv",header=3)
ASO_Key.head()

In [None]:
# Percent bias of the XGBoost SWE prediction against ASO for one hold-out file:
# 100 * sum(ASO - XGBoost) / sum(ASO). A positive value means the predicted
# total is lower than the ASO total.
test = pd.read_parquet(f'{HOME}/Predictions/NLDASDaymet_Vegetation_Sturm_Seasonality_VIIRSGeoObsDFs/1000M_Resolution/10_fSCA_Thresh/2024/HoldWYsout_1000M_Resolution_Taylor_2024-04-04.parquet')
pb = np.sum(-test['XGBoost_swe_cm'] + test['ASO_swe_cm'])/np.sum(test['ASO_swe_cm'])*100
# (test['XGBoost_swe_cm'] - test['ASO_swe_cm']).describe()
pb

In [None]:
def filename_parse(filename):
    """Extract the observation date and a normalised name from an ASO file name.

    The date is the first underscore-separated token of the file stem that
    starts with ``'20'``. Tokens written with a month abbreviation (for
    example ``2019Apr17`` or ``2019Apr17-18``) are converted to ``YYYYMMDD``
    using the first day of the range.

    Names beginning with ``ASO_50M_SWE_`` carry a six-character site code that
    is looked up in the module-level ``ASO_Key`` table; otherwise the second
    token of the stem is taken as the site name.

    Returns
    -------
    tuple
        ``(date, newfilename)`` where ``newfilename`` is
        ``{sitename}_{sitecode}_{date}`` or ``{sitename}_{date}``.
    """
    stem = os.path.splitext(filename)[0]
    date = next(token for token in stem.split("_") if token.startswith('20'))

    # A non-digit in position 4 means the month is spelled out (e.g. 2019Apr17).
    if not date[4].isnumeric():
        first_day = os.path.splitext(date)[0].split("-")[0]
        date = datetime.strptime(first_day, "%Y%b%d").strftime('%Y%m%d')

    # No site code in the name: fall back to the basin token.
    if not filename.startswith("ASO_50M_SWE_"):
        sitename = stem.split("_")[1]
        return (date, f"{sitename}_{date}")

    # Site-code naming: resolve the human-readable site name from ASO_Key.
    sitecode = filename[12:18]
    matches = ASO_Key['SITE CODE'] == sitecode
    sitename = ASO_Key.loc[matches, 'SITE NAME'].item().replace(" ", "_")
    return (date, f"{sitename}_{sitecode}_{date}")

In [None]:
# getting stuck here b/c file structure is still governed by metadata for entire WY regardless of basin
# is there a way to set up the pipeline so the file dependencies are agnostic to basin and date? 
training_df_path = f"{HOME}/data/TrainingDFs/{2013}/{1000}M_Resolution/VIIRSGeoObsDFs/{20}_fSCA_Thresh/VIIRS_GeoObsDF_20130403.parquet"
training_df = pd.read_parquet(training_df_path)
meta = training_df[['cell_id','cen_lat','cen_lon']]
meta.head()

In [None]:
def get_daymet_precip(WY,output_res,thresh):
    """Fetch Daymet precipitation for every raw ASO raster of a water year.

    For each ``.tif`` in ``{HOME}/data/ASO/{WY}/Raw_ASO_Data`` the raster is
    reprojected to EPSG:4326, its bounds are padded by 0.1 degrees, and Daymet
    ``prcp`` is requested from Oct 1 of ``WY-1`` through the date parsed from
    the file name. The nearest Daymet series is sampled at every cell of the
    matching VIIRS training dataframe that falls strictly inside the padded
    box, and the result is written as one parquet file per raster.

    Parameters
    ----------
    WY : int
        Water year; the request starts on ``{WY-1}-10-01``.
    output_res
        Value used for the ``{output_res}M_Resolution`` path component.
    thresh
        Value used for the ``{thresh}_fSCA_Thresh`` path component.

    Returns
    -------
    None
        Output goes to
        ``{HOME}/data/Precipitation/{WY}/{output_res}M_Daymet_Precip``.
    """
    # The water year starts on Oct 1 of the preceding calendar year.
    obs_start = datetime(WY-1, 10, 1).strftime('%Y-%m-%d')
    print("Water Year start date:",obs_start)

    # Basins and dates are driven by the raw ASO rasters on disk.
    ASO_dir = f"{HOME}/data/ASO/{WY}/Raw_ASO_Data"
    files = []
    for filename in os.listdir(ASO_dir):
        if filename.endswith(".tif"):
            files.append(filename)
    print(files)

    for file in files:
        date, newfilename = filename_parse(file)
        obs_end = f'{date[:4]}-{date[4:6]}-{date[6:]}'
        print("Getting precipitation data for",obs_end)

        with rxr.open_rasterio(f'{ASO_dir}/{file}') as src:
            # Raster extent in WGS84, padded by 0.1 degrees on every side.
            west, south, east, north = src.rio.reproject(rasterio.crs.CRS.from_epsg('4326')).rio.bounds()
            bbox = rasterio.coords.BoundingBox(west - 0.1, south - 0.1, east + 0.1, north + 0.1)
            print(bbox)

        obs_precip = daymet.get_bygeom(bbox,dates=(obs_start,obs_end),variables="prcp",crs=4326)
        obs_precip_transformed = obs_precip.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))

        # Cell ids and centroids come from the training dataframe of the same date.
        training_df_path = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh/VIIRS_GeoObsDF_{date}.parquet"
        meta = pd.read_parquet(training_df_path)[['cell_id','cen_lat','cen_lon']]
        # coordinates get rounded in get_VIIRS script, reassess later if need more precision
        print(meta['cen_lon'].min(),meta['cen_lon'].max(),meta['cen_lat'].min(),meta['cen_lat'].max())

        precip_arr = []
        season_precip_cm = []
        for i in range(len(meta)):
            cell = meta.iloc[i]
            lat, lon, cellid = cell['cen_lat'], cell['cen_lon'], cell['cell_id']
            # Skip cells that are not strictly inside the padded raster extent.
            if not (bbox[0] < lon < bbox[2] and bbox[1] < lat < bbox[3]):
                continue
            prcp = obs_precip_transformed.sel(x=lon,y=lat,method='nearest')['prcp']
            series = np.array(prcp.values)
            precip_arr.append([cellid,lat,lon,series])
            # Season total: sum of the series divided by 10, rounded to 2 decimals.
            season_precip_cm.append(np.round(series.sum()/10,2))

        precip_df = pd.DataFrame(precip_arr,columns = ['cell_id','cen_lat','cen_lon','precip'])
        precip_df['season_precip_cm'] = season_precip_cm

        # One output file per basin and date.
        precip_data_path = f"{HOME}/data/Precipitation/{WY}/{output_res}M_Daymet_Precip"
        if not os.path.exists(precip_data_path):
            os.makedirs(precip_data_path, exist_ok=True)

        pq.write_table(pa.Table.from_pandas(precip_df), f"{precip_data_path}/Daymet_{newfilename}.parquet", compression='BROTLI')

In [None]:
def get_daymet_precip_multithreading(WY,output_res,thresh):
    """Run ``get_daymet_precip_single_date`` for every training dataframe of a water year.

    One task is submitted per ``.parquet`` file found in
    ``{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh``.
    Tasks run in a thread pool sized from the module-level ``CPUS`` value.

    Parameters
    ----------
    WY : int
        Water year; the request window starts on ``{WY-1}-10-01``.
    output_res
        Value used for the ``{output_res}M_Resolution`` path component.
    thresh
        Value used for the ``{thresh}_fSCA_Thresh`` path component.

    Returns
    -------
    None
        A failure in one file is printed and does not stop the others.
    """
    # set start date for precip obs to Oct 1 of previous year
    obs_start = datetime(WY-1, 10, 1).strftime('%Y-%m-%d')
    print("Water Year start date:",obs_start)

    # select basins, dates by training DF
    training_df_dir = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh"
    files = [filename for filename in os.listdir(training_df_dir)
             if filename.endswith(".parquet")
            ]

    # max_workers must be a positive integer; CPUS/2 would be a float.
    n_workers = max(1, int(CPUS // 2))

    with cf.ThreadPoolExecutor(max_workers=n_workers) as executor:
        # The worker takes six positional arguments, so they are passed
        # individually (a single tuple would fail with TypeError in the worker).
        futures = {
            executor.submit(get_daymet_precip_single_date,
                            file, training_df_dir, obs_start, WY, output_res, thresh): file
            for file in files
        }
        # Retrieve every result so worker exceptions are reported, not dropped.
        for future in tqdm(cf.as_completed(futures), total=len(futures)):
            try:
                future.result()
            except Exception as exc:
                print(f"Daymet precipitation failed for {futures[future]}: {exc!r}")
       

In [None]:
def get_daymet_precip_single_date(file,training_df_dir,obs_start,WY,output_res,thresh):
    """Fetch and store Daymet precipitation for one training dataframe.

    The region and date are taken from the last two underscore-separated
    tokens of ``file``. The bounding box is the min/max of the dataframe's
    cell centroids padded by 0.1 degrees. Daymet ``prcp`` is requested from
    ``obs_start`` through the file's date and sampled at the nearest grid
    point for every cell.

    Parameters
    ----------
    file : str
        Parquet file name ending in ``..._<region>_<YYYYMMDD>.parquet``.
    training_df_dir : str
        Directory that contains ``file``.
    obs_start : str
        First day of the request window, ``YYYY-MM-DD``.
    WY, output_res
        Used to build the output directory
        ``{HOME}/data/Precipitation/{WY}/{output_res}M_Daymet_Precip``.
    thresh
        Accepted for signature parity with the other functions; not used here.

    Returns
    -------
    None
        Writes ``Daymet_{region}_{timestamp}.parquet``.
    """
    print(file)

    # Region and date are encoded at the end of the file name.
    name_parts = file.split('_')
    timestamp = name_parts[-1].split('.')[0]
    region = name_parts[-2]
    obs_end = f'{timestamp[:4]}-{timestamp[4:6]}-{timestamp[6:]}'

    print(f"Getting precipitation data for {obs_end} at {region}")

    training_df = pd.read_parquet(f'{training_df_dir}/{file}')

    # Bounding box from the centroid extremes, padded by 0.1 degrees per side.
    lons, lats = training_df['cen_lon'], training_df['cen_lat']
    bbox = rasterio.coords.BoundingBox(lons.min() - 0.1, lats.min() - 0.1,
                                       lons.max() + 0.1, lats.max() + 0.1)
    print(bbox)

    # Daymet precipitation from the start of the water year to the observation date.
    obs_precip = daymet.get_bygeom(bbox,dates=(obs_start,obs_end),variables="prcp",crs=4326)
    obs_precip_transformed = obs_precip.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))

    # coordinates get rounded in get_VIIRS script, reassess later if need more precision
    meta = training_df[['cell_id','cen_lat','cen_lon']]
    precip_arr = []
    season_precip_cm = []
    for i in range(len(meta)):
        cell = meta.iloc[i]
        lat, lon, cellid = cell['cen_lat'], cell['cen_lon'], cell['cell_id']
        prcp = obs_precip_transformed.sel(x=lon,y=lat,method='nearest')['prcp']
        series = np.array(prcp.values)
        precip_arr.append([cellid,lat,lon,series])
        # Season total: sum of the series divided by 10, rounded to 2 decimals.
        season_precip_cm.append(np.round(series.sum()/10,2))

    precip_df = pd.DataFrame(precip_arr,columns = ['cell_id','cen_lat','cen_lon','precip'])
    precip_df['season_precip_cm'] = season_precip_cm

    # One output file per basin and date.
    precip_data_path = f"{HOME}/data/Precipitation/{WY}/{output_res}M_Daymet_Precip"
    if not os.path.exists(precip_data_path):
        os.makedirs(precip_data_path, exist_ok=True)

    pq.write_table(pa.Table.from_pandas(precip_df), f"{precip_data_path}/Daymet_{region}_{timestamp}.parquet", compression='BROTLI')

In [None]:
# get_daymet_precip(2013,750,10)

In [None]:
# def get_daymet_precip_df(WY,output_res,thresh):
#     # get daymet precip by grabbing bounding box from previous training DF for basin + date
#     # set start date for precip obs to Oct 1 of previous year
#     WY_start = datetime(WY-1, 10, 1)
#     obs_start = WY_start.strftime('%Y-%m-%d')
#     print("Water Year start date:",obs_start)
    
#     # select basins, dates by training DF
#     training_df_dir = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh"
#     files = [filename for filename in os.listdir(training_df_dir)
#              if filename.endswith(".parquet")
#             ]
#     # print(files)
#     for file in files:
#         filepath = f'{training_df_dir}/{file}'
#         #Get timestamp
#         timestamp = file.split('_')[-1].split('.')[0]
#         #Get region
#         region = file.split('_')[-2]
#         # print(timestamp,region)
#         obs_end = f'{timestamp[:4]}-{timestamp[4:6]}-{timestamp[6:]}'
        
#         print(f"Getting precipitation data for {obs_end} at {region}")
        
#         training_df = pd.read_parquet(filepath)
#         # get bounding box by min/max coordinates
#         left, right = training_df['cen_lon'].min(), training_df['cen_lon'].max()
#         bottom, top = training_df['cen_lat'].min(), training_df['cen_lat'].max()
#         # add some padding to bbox
#         left -= 0.1
#         bottom -= 0.1
#         right += 0.1
#         top += 0.1
#         bbox = rasterio.coords.BoundingBox(left, bottom, right, top)
#         print(bbox)    

#         # get precip from Daymet server from beginning of WY through observation date and reproject
#         obs_precip = daymet.get_bygeom(bbox,dates=(obs_start,obs_end),variables="prcp",crs="epsg:4326")
#         obs_precip_transformed = obs_precip.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))  
#         print(bbox)    
        
#         # extract metadata 
#         meta = training_df[['cell_id','cen_lat','cen_lon']]
#         # coordinates get rounded in get_VIIRS script, reassess later if need more precision
#         # print(meta['cen_lon'].min(),meta['cen_lon'].max(),meta['cen_lat'].min(),meta['cen_lat'].max())
#         precip_arr = []
#         season_precip_cm = []
#         nsites = len(meta)
#         for i in range(nsites):
#             lat, lon = meta.iloc[i]['cen_lat'],meta.iloc[i]['cen_lon']
#             cellid = meta.iloc[i]['cell_id']
#             # if ((lon>bbox[0] and lon<bbox[2]) and (lat>bbox[1] and lat<bbox[3])):
#                 # print('got here')
#             prcp = obs_precip_transformed.sel(x=lon,y=lat,method='nearest')['prcp']
#             season_precip = np.round(np.array(prcp.values).sum()/10,2)
#             # if season_precip >= 0:
#             precip_arr.append([cellid,lat,lon,np.array(prcp.values)])
#             season_precip_cm.append(season_precip)
#         precip_df = pd.DataFrame(precip_arr,columns = ['cell_id','cen_lat','cen_lon','precip'])
#         precip_df['season_precip_cm'] = season_precip_cm    
#         # print(season_precip_cm)
#         # print(precip_df.head())
        
#         # save raw data for each basin and date
#         precip_data_path = f"{HOME}/data/Precipitation/{WY}/{output_res}M_Daymet_Precip"
#         if not os.path.exists(precip_data_path):
#             os.makedirs(precip_data_path, exist_ok=True)
            
#         table = pa.Table.from_pandas(precip_df)
#         pq.write_table(table, f"{precip_data_path}/Daymet_{region}_{timestamp}.parquet", compression='BROTLI')
        
#     # return season_precip

In [None]:
def get_hyriver_precip_df(WY,output_res,thresh,dataset):
    """Fetch precipitation from Daymet, gridMET or NLDAS for each training dataframe.

    For every ``.parquet`` file in the VIIRS training directory of ``WY`` the
    bounding box of the cell centroids (padded by 0.1 degrees) is used to
    request precipitation from Oct 1 of ``WY-1`` through the file's date. The
    nearest grid series is sampled per cell and written to
    ``{HOME}/data/Precipitation/{WY}/{output_res}M_{dataset}_Precip``.

    Parameters
    ----------
    WY : int
        Water year.
    output_res
        Value used for the ``{output_res}M_Resolution`` path component.
    thresh
        Value used for the ``{thresh}_fSCA_Thresh`` path component.
    dataset : str
        ``'daymet'``, ``'gridmet'`` or ``'nldas'`` (case-insensitive).

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the three supported names.
    """
    valid = ['daymet','gridmet','nldas']
    key = dataset.lower()
    if key not in valid:
        raise ValueError("dataset must be one of %r." % valid)

    # Canonical spelling used in output directory and file names.
    dataset = {'daymet': 'Daymet', 'nldas': 'NLDAS', 'gridmet': 'gridMET'}[key]
    # gridMET calls its precipitation variable 'pr'; the other two use 'prcp'.
    var = 'pr' if dataset == 'gridMET' else 'prcp'

    # set start date for precip obs to Oct 1 of previous year
    obs_start = datetime(WY-1, 10, 1).strftime('%Y-%m-%d')
    print("Water Year start date:",obs_start)

    # select basins, dates by training DF
    training_df_dir = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh"
    files = []
    for filename in os.listdir(training_df_dir):
        if filename.endswith(".parquet"):
            files.append(filename)

    for file in files:
        # Region and date are encoded at the end of the file name.
        name_parts = file.split('_')
        timestamp = name_parts[-1].split('.')[0]
        region = name_parts[-2]
        obs_end = f'{timestamp[:4]}-{timestamp[4:6]}-{timestamp[6:]}'

        print(f"Getting precipitation data for {obs_end} at {region}")

        training_df = pd.read_parquet(f'{training_df_dir}/{file}')

        # Bounding box from the centroid extremes, padded by 0.1 degrees per side.
        lons, lats = training_df['cen_lon'], training_df['cen_lat']
        bbox = rasterio.coords.BoundingBox(lons.min() - 0.1, lats.min() - 0.1,
                                           lons.max() + 0.1, lats.max() + 0.1)

        # Each client has its own call signature.
        if dataset == 'Daymet':
            obs_precip = daymet.get_bygeom(bbox,dates=(obs_start,obs_end),variables=var,crs="epsg:4326")
        elif dataset == 'gridMET':
            obs_precip = gridmet.get_bygeom(bbox,dates=(obs_start,obs_end),variables=var,crs="epsg:4326")
        else:
            obs_precip = nldas.get_bygeom(bbox,obs_start,obs_end,variables=var,geo_crs=4326,source='netcdf')
        obs_precip_transformed = obs_precip.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))

        meta = training_df[['cell_id','cen_lat','cen_lon']]
        precip_arr = []
        season_precip_cm = []
        for i in range(len(meta)):
            cell = meta.iloc[i]
            lat, lon, cellid = cell['cen_lat'], cell['cen_lon'], cell['cell_id']
            prcp = obs_precip_transformed.sel(x=lon,y=lat,method='nearest')[var]
            series = np.array(prcp.values)
            precip_arr.append([cellid,lat,lon,series])
            # precip given in mm, convert to cm
            season_precip_cm.append(np.round(series.sum()/10,2))

        precip_df = pd.DataFrame(precip_arr,columns = ['cell_id','cen_lat','cen_lon','precip'])
        precip_df['season_precip_cm'] = season_precip_cm

        # One output file per basin and date.
        precip_data_path = f"{HOME}/data/Precipitation/{WY}/{output_res}M_{dataset}_Precip"
        if not os.path.exists(precip_data_path):
            os.makedirs(precip_data_path, exist_ok=True)

        pq.write_table(pa.Table.from_pandas(precip_df), f"{precip_data_path}/{dataset}_{region}_{timestamp}.parquet", compression='BROTLI')

In [None]:
get_hyriver_precip_df(2019,1000,10,'nldas')
# get_hyriver_precip_df(2017,1000,10,'gridmet')
# nldas.get_bygeom(

#### Use HydroShare module to import AORC precip from AWS

In [None]:
# Exploratory cell: open the public AORC v1.1 1 km zarr stores on S3, clip one
# bounding box and aggregate precipitation to daily totals.

# Variable name to retrieve data (look at the following table for valid variable names)
variable_name = 'APCP_surface'
# User-defined aggregation interval - valid values are 'hour','day','month','year'
agg_interval = 'day'
# Start date - In Year-Month-Day format the earliest start date can be '1979-02-01'
start_datetime = '2016-10-01'
# End date - In Year-Month-Day format the latest end date can be '2023-01-31'
end_datetime = '2017-08-31'

## Create a list of years to retrieve data
WY = 2017
WY_start = datetime(WY-1, 10, 1)
# obs_start = WY_start.strftime('%Y-%m-%d')
start_yr = WY_start.year # datetime.strptime(start_datetime, '%Y-%m-%d').year
end_yr = datetime.strptime(end_datetime, '%Y-%m-%d').year
yrs = list(range(start_yr, end_yr+1))

## Loading data (AORC data are organized by years, look at https://noaa-nws-aorc-v1-1-1km.s3.amazonaws.com/index.html)
# Base URL (already includes the s3:// scheme)
base_url = f's3://noaa-nws-aorc-v1-1-1km'
# Creating a connection to Amazon S3 bucket using the s3fs library (https://s3fs.readthedocs.io/en/latest/api.html).
s3_out = s3fs.S3FileSystem(anon=True)              # access S3 as if it were a file system.
fileset = [s3fs.S3Map(                             # maps each year's Zarr dataset from S3 to a local-like object.
            root=f"{base_url}/{yr}.zarr",          # Zarr dataset for each year; base_url must not be prefixed with s3:// again
            s3=s3_out,                             # connection
            check=False                            # do not check that the dataset exists before trying to load it
        ) for yr in yrs]                           # loops through each year

## Load data for specified years and variable of interest using the xarray library
ds_yrs = xr.open_mfdataset(fileset, engine='zarr')
da_yrs_var = ds_yrs[variable_name]
variable_long_name = da_yrs_var.attrs.get('long_name')
left,bottom,right,top = -119.892, 37.64, -119.107, 38.281
da_bbox = da_yrs_var.sel(latitude=slice(bottom, top), longitude=slice(left, right))

if variable_name == 'APCP_surface':
    units = f"mm/{agg_interval}"
    # Temporal aggregation
    da_bbox_TimeAgg = da_bbox.loc[dict(time=slice(start_datetime, end_datetime))].resample(time='d').sum()

obs_precip_transformed = da_bbox_TimeAgg.rio.reproject(rasterio.crs.CRS.from_epsg('4326')) #.load()
obs_precip_transformed

In [None]:
obs_precip_transformed.sel(time='2017-01-05').plot.pcolormesh()

In [None]:
obs_precip_transformed.sel(x=-119.555,y=37.95,time='2017-01-05',method='nearest').item()

In [None]:
def get_aorc_precip(WY,output_res,thresh):
    """Extract AORC precipitation for every training dataframe of a water year.

    The two yearly AORC zarr stores covering the water year (``WY-1`` and
    ``WY``) are opened anonymously from the ``noaa-nws-aorc-v1-1-1km`` S3
    bucket, ``APCP_surface`` is resampled to daily sums, and for each
    ``.parquet`` file in the VIIRS training directory the series is clipped to
    the padded centroid bounding box and to Oct 1 of ``WY-1`` through the
    file's date. The nearest grid series is sampled per cell and written to
    ``{HOME}/data/Precipitation/{WY}/{output_res}M_AORC_Precip``.

    Parameters
    ----------
    WY : int
        Water year.
    output_res
        Value used for the ``{output_res}M_Resolution`` path component.
    thresh
        Value used for the ``{thresh}_fSCA_Thresh`` path component.

    Returns
    -------
    None
        Writes one ``AORC_{region}_{timestamp}.parquet`` per training file.
    """
    # define start of water year
    WY_start = datetime(WY-1, 10, 1)
    obs_start = WY_start.strftime('%Y-%m-%d')
    print("Water Year start date:",obs_start)

    ## Create a list of years to retrieve data
    yrs = [WY-1, WY]

    ## Loading data (AORC data are organized by years, look at https://noaa-nws-aorc-v1-1-1km.s3.amazonaws.com/index.html)
    # Grab data for entire WY before clipping to each basin per training DFs
    # Base URL (already includes the s3:// scheme)
    base_url = 's3://noaa-nws-aorc-v1-1-1km'
    # Creating a connection to Amazon S3 bucket using the s3fs library (https://s3fs.readthedocs.io/en/latest/api.html).
    s3_out = s3fs.S3FileSystem(anon=True)              # access S3 as if it were a file system.
    fileset = [s3fs.S3Map(                             # maps each year's Zarr dataset from S3 to a local-like object.
                root=f"{base_url}/{yr}.zarr",          # base_url must not be prefixed with s3:// a second time
                s3=s3_out,                             # connection
                check=False                            # do not check that the dataset exists before trying to load it
            ) for yr in yrs]                           # loops through each year

    ## Load data for specified years and variable of interest using the xarray library
    var = 'APCP_surface'
    ds_yrs = xr.open_mfdataset(fileset, engine='zarr')
    # Tag the data as EPSG:4326 so rioxarray can reproject it; missing values become 0.
    da_yrs_var = ds_yrs[var].rio.write_crs(4326,inplace=True).fillna(0)
    # Temporal aggregation to daily sums
    da_TimeAgg = da_yrs_var.resample(time='d').sum()

    # select basins, dates by training DF
    training_df_dir = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh"
    files = [filename for filename in os.listdir(training_df_dir)
             if filename.endswith(".parquet")
            ]
    for file in files:
        filepath = f'{training_df_dir}/{file}'
        #Get timestamp
        timestamp = file.split('_')[-1].split('.')[0]
        #Get region
        region = file.split('_')[-2]
        obs_end = f'{timestamp[:4]}-{timestamp[4:6]}-{timestamp[6:]}'

        print(f"Getting precipitation data for {obs_end} at {region}")

        training_df = pd.read_parquet(filepath)
        # get bounding box by min/max coordinates
        left, right = training_df['cen_lon'].min(), training_df['cen_lon'].max()
        bottom, top = training_df['cen_lat'].min(), training_df['cen_lat'].max()
        # add some padding to bbox
        left -= 0.1
        bottom -= 0.1
        right += 0.1
        top += 0.1
        # Clip in time first, then in space, before loading anything into memory.
        da_WYagg = da_TimeAgg.loc[dict(time=slice(obs_start, obs_end))]
        da_bbox = da_WYagg.sel(latitude=slice(bottom, top), longitude=slice(left, right))

        obs_precip_transformed = da_bbox.rio.reproject(rasterio.crs.CRS.from_epsg('4326')).load()

        # extract metadata
        meta = training_df[['cell_id','cen_lat','cen_lon']]
        precip_arr = []
        season_precip_cm = []
        nsites = len(meta)
        for i in range(nsites):
            lat, lon = meta.iloc[i]['cen_lat'],meta.iloc[i]['cen_lon']
            cellid = meta.iloc[i]['cell_id']
            prcp = obs_precip_transformed.sel(x=lon,y=lat,method='nearest')
            season_precip = np.round(np.array(prcp.values).sum()/10,2) # precip given in mm, convert to cm
            precip_arr.append([cellid,lat,lon,np.array(prcp.values)])
            season_precip_cm.append(season_precip)
        precip_df = pd.DataFrame(precip_arr,columns = ['cell_id','cen_lat','cen_lon','precip'])
        precip_df['season_precip_cm'] = season_precip_cm

        # save raw data for each basin and date
        precip_data_path = f"{HOME}/data/Precipitation/{WY}/{output_res}M_AORC_Precip"
        if not os.path.exists(precip_data_path):
            os.makedirs(precip_data_path, exist_ok=True)

        table = pa.Table.from_pandas(precip_df)
        pq.write_table(table, f"{precip_data_path}/AORC_{region}_{timestamp}.parquet", compression='BROTLI')

In [None]:
years = np.arange(2013,2025)
for WY in years:
    get_aorc_precip(WY,1000,10)

#### Try PRISM packages

In [None]:
import subprocess
import urllib.request
import zipfile

In [None]:
def _progress_hook(block_num, block_size, total_size, t):
    """
    Callback function to update tqdm progress bar during file download.

    Intended as the ``reporthook`` of ``urllib.request.urlretrieve``.

    Parameters
    ----------
    block_num : int
        Number of blocks transferred so far (0 on the initial call).
    block_size : int
        Size of one block in bytes.
    total_size : int
        Total size of the file in bytes; not positive when unknown.
    t : progress bar
        Object exposing ``n``, ``total`` and ``update()`` (a tqdm instance).
    """
    downloaded = block_num * block_size
    if total_size > 0:
        # Size is known: set it as the bar total, and cap the count so the
        # final partial block does not overshoot.
        t.total = total_size
        downloaded = min(downloaded, total_size)
    # Advance by the difference to the bar's current position, so the initial
    # block_num == 0 call adds nothing.
    t.update(downloaded - t.n)

# Root of the PRISM web service used for daily 4 km grids.
PRISM_BASE_URL = "https://services.nacse.org/prism/data/get/us/4km"

def prism_download(start,stop,path,var,base_url=PRISM_BASE_URL):
    """Download one PRISM file per day from ``start`` through ``stop`` (inclusive).

    Parameters
    ----------
    start, stop : datetime
        First and last day to download.
    path : str
        Directory that receives the downloads; each file is named ``YYYYMMDD``.
    var : str
        PRISM element placed in the request URL (e.g. ``"ppt"``).
    base_url : str, optional
        Service root. It is an explicit parameter so the function no longer
        depends on whatever global ``base_url`` happens to be defined.
    """
    base_url = base_url.rstrip('/')  # avoid '//' when the root ends with a slash
    while start <= stop:
        day = start.strftime("%Y%m%d")
        url = f"{base_url}/{var}/{day}?format=nc"
        output_file = os.path.join(path, day)

        with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=f'Downloading {day}') as t:
            urllib.request.urlretrieve(url, output_file, reporthook=lambda block_num, block_size, total_size: _progress_hook(block_num, block_size, total_size, t))

        start += timedelta(days=1)

def unzip_prism(start,stop,zipped_path,unzipped_path):
    """Extract and delete the daily archives downloaded by ``prism_download``.

    For each day from ``start`` through ``stop`` (inclusive) the archive named
    ``YYYYMMDD`` in ``zipped_path`` is extracted into ``unzipped_path`` and
    then removed. Missing archives are reported and skipped.
    """
    while start <= stop:
        day = start.strftime("%Y%m%d")
        zip_file_path = os.path.join(zipped_path, day)

        # Check if the ZIP file exists
        if os.path.exists(zip_file_path):
            # Unzip the file
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall(unzipped_path)
                print(f"UNZIP file for {day} is completed.")
            os.remove(zip_file_path)
        else:
            print(f"ZIP file for {day} not found.")

        start += timedelta(days=1)

year = 2013
def get_prism(WY):
    """Download a full water year of PRISM daily precipitation and merge it.

    Daily ``ppt`` files for Oct 1 of ``WY-1`` through Sep 30 of ``WY`` are
    downloaded and unzipped under ``{HOME}/data/Precipitation/{WY}/prism_data/``,
    then all extracted ``.nc`` files are concatenated along a daily ``time``
    index and saved as ``{WY}.nc`` in that directory.
    """
    prism_path = f'{HOME}/data/Precipitation/{WY}/prism_data/'
    zipped_path = f'{prism_path}/zipped'
    unzipped_path = f'{prism_path}/unzipped'
    os.makedirs(zipped_path,exist_ok=True)
    os.makedirs(unzipped_path,exist_ok=True)

    var = "ppt"
    start = datetime.strptime(f"{WY-1}-10-01", "%Y-%m-%d")
    stop = datetime.strptime(f"{WY}-09-30", "%Y-%m-%d")

    # Pass the service root explicitly; a local variable here is not visible
    # inside prism_download.
    prism_download(start,stop,zipped_path,var,base_url=PRISM_BASE_URL)
    unzip_prism(start,stop,zipped_path,unzipped_path)

    # One index entry per day of the water year.
    # NOTE(review): this assumes exactly one .nc per day and that the glob
    # returns them in date order -- confirm against the extracted file names.
    date_idx = pd.Index(pd.date_range(start=start,end=stop),name='time')
    timeser = xr.open_mfdataset(f'{unzipped_path}/*.nc',
                                    combine='nested',
                                    concat_dim=[date_idx,]
                                    )
    timeser.to_netcdf(path=f'{prism_path}/{WY}.nc')

In [None]:
get_prism(2016)

In [None]:
# prism_path only exists as a local inside get_prism(), so rebuild it here for
# the water year downloaded in the previous cell before listing its NetCDF files.
prism_path = f'{HOME}/data/Precipitation/2016/prism_data/'
files = [filename for filename in os.listdir(prism_path) if filename.endswith('.nc')]

In [None]:
# test = xr.open_dataset(f'{prism_path}/prism_ppt_us_25m_20121215.nc')
# # test.crs
# test = test.rio.write_crs('EPSG:4269')
# test_reproj = test.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))
# # test['Band1'].plot.pcolormesh()
# test['Band1'].sel(lat=40.723857, lon=-111.884616,method='nearest')

date_idx = pd.Index(pd.date_range(start="2014-10-01",end="2015-09-30"),name='time')
path = f'{HOME}/data/Precipitation/2015/prism_data/unzipped'
# date_idx
test_timeser = xr.open_mfdataset(f'{path}/*.nc', 
                                 combine='nested',
                                 concat_dim=[date_idx,]
                                 )

In [None]:
test_timeser['Band1'].sel(lat=40.5,lon=-118.5,method='nearest').plot.line()

In [None]:
path=f'{HOME}/data/Precipitation/2015/prism_data/'
test_timeser.to_netcdf(path=f'{path}/2015.nc')

#### Plot results for sanity check

In [None]:
test = pd.read_parquet(f'{HOME}/data/Precipitation/2013/500M_gridMET_Precip/gridMET_USCATB_20130403.parquet')
test.head()

In [None]:
# Load the season precipitation of one basin/date from each of the four
# products and turn each table into a point GeoDataFrame for mapping.
def _season_precip_gdf(precip_df):
    """Build a point GeoDataFrame of season_precip_cm from a frame's own centroids."""
    return gpd.GeoDataFrame(precip_df['season_precip_cm'],
                            geometry=gpd.points_from_xy(precip_df['cen_lon'], precip_df['cen_lat']),
                            crs="EPSG:4326")

test  = pd.read_parquet(f'{HOME}/data/Precipitation/2019/1000M_gridMET_Precip/gridMET_USCOGT_20190408.parquet')
test2 = pd.read_parquet(f'{HOME}/data/Precipitation/2019/1000M_Daymet_Precip/Daymet_USCOGT_20190408.parquet')
test3 = pd.read_parquet(f'{HOME}/data/Precipitation/2019/1000M_NLDAS_Precip/NLDAS_USCOGT_20190408.parquet')
test4 = pd.read_parquet(f'{HOME}/data/Precipitation/2019/1000M_AORC_Precip/AORC_USCOGT_20190408.parquet')

# Each GeoDataFrame takes its geometry from its own table (gdf3 and gdf4
# previously reused test2's coordinates).
gdf  = _season_precip_gdf(test)
gdf2 = _season_precip_gdf(test2)
gdf3 = _season_precip_gdf(test3)
gdf4 = _season_precip_gdf(test4)
# print(test.head)
# print(test2.head)
# gdf3.head()

In [None]:
fig,axs = plt.subplots(2,2,figsize=(8,8),layout='tight',dpi=150)
gdf.plot(ax=axs[0][0],column='season_precip_cm',cmap='viridis',legend=True)
gdf2.plot(ax=axs[0][1],column='season_precip_cm',cmap='viridis',legend=True)
gdf3.plot(ax=axs[1][0],column='season_precip_cm',cmap='viridis',legend=True)
gdf4.plot(ax=axs[1][1],column='season_precip_cm',cmap='viridis',legend=True)
axs[0][0].set_title('GridMET')
axs[0][1].set_title('Daymet')
axs[1][0].set_title('NLDAS')
axs[1][1].set_title('AORC')
plt.suptitle('USCOGT 2019-04-08')
plt.savefig(f'{HOME}/Images/Precip_Comparison_20190408_Taylor.png')
plt.show()

In [None]:
# Side-by-side maps intended to compare two NLDAS season-precipitation results.
# NOTE(review): gdf4 as built earlier in this notebook holds the AORC table and
# only carries 'season_precip_cm' plus geometry, so column='NLDAS' will not
# resolve unless gdf4 is redefined elsewhere -- confirm which frame and column
# were intended for the "Prev Method" panel.
fig,axs = plt.subplots(1,2,figsize=(12,5),layout='tight')
gdf3.plot(ax=axs[0],column='season_precip_cm',cmap='viridis',legend=True)
axs[0].set_title('NLDAS - New Method')
gdf4.plot(ax=axs[1],column='NLDAS',cmap='viridis',legend=True)
axs[1].set_title('NLDAS - Prev Method')


In [None]:
geom = rasterio.coords.BoundingBox(left=-119.896, bottom=37.644999, right=-119.102, top=38.284)
obs_start = '2016-10-01'
obs_end = '2017-07-18'
obs_precip_nl = nldas.get_bygeom(geom,obs_start,obs_end,variables="prcp",geo_crs=4326,source='netcdf')
obs_precip_gm = gridmet.get_bygeom(geom,(obs_start,obs_end),variables="pr",crs=4326,)

In [None]:
obs_precip_nl = obs_precip_nl.rename({'y':'lat',
                                      'x':'lon'}) #.prcp.sel(time='2024-01-11').plot.pcolormesh()

In [None]:
obs_precip_nl.sum(dim="time")['prcp'].plot.pcolormesh() #.sel(time='2024-01-11')

In [None]:
obs_precip_gm.sum(dim="time")['pr'].plot.pcolormesh() #sel(time='2024-01-11')

In [None]:
geom = rasterio.coords.BoundingBox(left=-119.896, bottom=37.644999999999996, right=-119.102, top=38.284)
obs_start = '2023-10-01'
obs_end = '2024-05-31'
obs_precip = gridmet.get_bygeom(geom,dates=(obs_start,obs_end),variables="pr",crs="epsg:4326")
obs_precip

In [None]:
# Fetch Daymet precipitation for each requested water year and resolution.
# get_daymet_precip_df only exists as commented-out code in this notebook;
# get_hyriver_precip_df with dataset='daymet' performs the same steps.
WYs = [2013]
res = [750]
for year in WYs:
    for output_res in res:
        get_hyriver_precip_df(year,output_res,thresh=10,dataset='daymet')

In [None]:
#set multiprocessing limits
# Use the CPUs this process may run on where the platform reports them
# (os.sched_getaffinity is not available everywhere); otherwise fall back to
# the machine's CPU count.
try:
    _available_cpus = len(os.sched_getaffinity(0))
except AttributeError:
    _available_cpus = os.cpu_count() or 1
# Half of the CPUs minus two, but never below one: executors reject
# max_workers <= 0, which the plain formula yields on machines with <= 4 cores.
CPUS = max(1, int((_available_cpus/2)-2))

In [None]:
def Make_Precip_DF(WY,output_res,thresh,dataset):
    """Add seasonal precipitation to every training dataframe of one water year.

    Every file found in the VIIRSGeoObsDFs folder for ``WY`` / ``output_res`` /
    ``thresh`` is handed to ``single_date_add_daymet_precip`` in a worker process,
    which writes the augmented dataframe into the PrecipVIIRSGeoObsDFs folder.

    Parameters
    ----------
    WY : water year used to build the input and output paths.
    output_res : resolution label used in the folder names (``{output_res}M_...``).
    thresh : fSCA threshold label used in the folder names.
    dataset : precipitation product name used in the precipitation folder name.

    Returns
    -------
    None. Failures of individual files are reported on stdout; they do not stop
    the remaining files from being processed.
    """
    print(f"Adding precipitation features to ML dataframe for WY {WY}")
    precip_data_path = f"{HOME}/data/Precipitation/{WY}/{output_res}M_{dataset}_Precip"
    training_df_path = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh"

    #make precip df path
    precip_df_path = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/PrecipVIIRSGeoObsDFs/{thresh}_fSCA_Thresh"
    os.makedirs(precip_df_path, exist_ok=True)

    #Get list of dataframes
    GeoObsDF_files = sorted(os.listdir(training_df_path))

    # Multiprocessing
    failed = []
    with cf.ProcessPoolExecutor(max_workers=CPUS) as executor:
        # Keep the futures: an exception raised in a worker is only visible through
        # future.result(), so discarding them hides every failure.
        futures = {
            executor.submit(
                single_date_add_daymet_precip,
                (training_df_path, precip_data_path, geofile, precip_df_path, WY, dataset),
            ): geofile
            for geofile in GeoObsDF_files
        }
        for future in cf.as_completed(futures):
            geofile = futures[future]
            try:
                future.result()
            except Exception as err:
                failed.append(geofile)
                print(f"Failed to add precipitation to {geofile}: {err!r}")

    if failed:
        print(f"{len(failed)} of {len(GeoObsDF_files)} files failed: {sorted(failed)}")

In [None]:
# Inspect which precipitation parquet files exist for each observation date.
WY = 2015
output_res = 1000
thresh = 20
dataset = 'Daymet'
precip_data_path = f"{HOME}/data/Precipitation/{WY}/{output_res}M_{dataset}_Precip"
training_df_path = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{thresh}_fSCA_Thresh"
pptfiles = [filename for filename in os.listdir(precip_data_path) if filename.endswith('.parquet')]

# the date is the last underscore-separated token of the name, without the extension
pptfiles_dates = np.array([name.split('_')[-1].split('.parquet')[0] for name in pptfiles])
unique_dates = np.unique(pptfiles_dates)
print(f'there are {len(unique_dates)} unique dates')

for i, date in enumerate(unique_dates):
    # positions of every file carrying this date
    idxarr = np.where(pptfiles_dates == date)
    date_obs = [pptfiles[idx] for idx in idxarr[0]]
    print(i, date, date_obs)


In [None]:
def single_date_add_daymet_precip(args):
    """Attach seasonal precipitation to one region/date training dataframe.

    Parameters
    ----------
    args : tuple
        ``(training_df_path, precip_data_path, geofile, precip_df_path, WY, dataset)``
        packed into one tuple so the function can be submitted to an executor.
        ``geofile`` must end in ``<region>_<YYYYMMDD>.parquet``; the precipitation
        file with the same region and date tokens is looked up in ``precip_data_path``.

    Side effects
    ------------
    Writes ``Precip{dataset}_{geofile}`` (``dataset`` is only included in the name
    when it equals 'Daymet') to ``precip_df_path`` as a Brotli-compressed parquet
    file, indexed by ``cell_id`` and with a ``season_precip_cm`` column added.
    Cells that are missing from the precipitation file keep 0.0 and are reported
    on stdout.

    Raises
    ------
    FileNotFoundError
        If no precipitation file matches the region and date of ``geofile``.
    """
    training_df_path, precip_data_path, geofile, precip_df_path, WY, dataset = args

    def _region_date(filename):
        # "<anything>_<region>_<date>.parquet" -> "<region>_<date>"; None if the name has no such tokens
        tokens = filename.split('_')
        if len(tokens) < 2:
            return None
        return f"{tokens[-2]}_{tokens[-1].split('.parquet')[0]}"

    #get date information
    date = geofile.split('_')[-1].split('.parquet')[0]
    region = geofile.split('_')[-2]
    region_date = f"{region}_{date}"
    strdate = f"{date[:4]}-{date[4:6]}-{date[6:]}"
    print(f"Connecting precipitation to ASO observations for {WY} on {strdate} at {region}")

    GDF = pd.read_parquet(os.path.join(training_df_path, geofile))
    GDF.set_index('cell_id', inplace = True)
    GDF['season_precip_cm'] = 0.0

    # Connect the dataframe to its precipitation file by comparing the parsed
    # region and date tokens exactly; a substring test would also accept a longer
    # region name that merely ends with this one.
    pptfiles = [filename for filename in os.listdir(precip_data_path) if filename.endswith('.parquet')]
    ppt_filename = [filename for filename in pptfiles if _region_date(filename) == region_date]
    if not ppt_filename:
        raise FileNotFoundError(f"No precipitation file for {region_date} in {precip_data_path}")

    ppt_filepath = f"{precip_data_path}/{ppt_filename[0]}"
    ppt = pd.read_parquet(ppt_filepath)

    # One lookup table instead of a boolean scan of ppt per cell; the first row wins
    # when a cell_id occurs more than once.
    precip_by_cell = (
        ppt.drop_duplicates(subset='cell_id', keep='first')
        .set_index('cell_id')['season_precip_cm']
        .round(1)
    )
    has_precip = GDF.index.isin(precip_by_cell.index)
    GDF.loc[has_precip, 'season_precip_cm'] = precip_by_cell.reindex(GDF.index[has_precip]).to_numpy()
    for site in GDF.index[~has_precip]:
        print(f"{site} is bad, delete file from folder and rerun the get precipitation script")

    #Convert DataFrame to Apache Arrow Table
    table = pa.Table.from_pandas(GDF)
    # Parquet with Brotli compression
    pq.write_table(table, f"{precip_df_path}/Precip{dataset if dataset == 'Daymet' else ''}_{geofile}", compression='BROTLI')
#          

In [None]:
## figure out how to interpolate 1000m daymet precip to 750m 

test = pd.read_parquet(f'{HOME}/data/Precipitation/2024/1000M_Daymet_Precip/Daymet_BigThompson_20240421.parquet')
gdf  = gpd.GeoDataFrame(test['season_precip_cm'],
                       geometry=gpd.points_from_xy(test['cen_lon'], test['cen_lat']), 
                        crs="EPSG:4326")
test.head()
# # test.loc[:,['cen_lat','cen_lon','season_precip_cm']]
# xtest = xr.DataArray(test['season_precip_cm'].values,
#                      dims=['x','y'],
#                      coords={'x':test['cen_lon'],'y':test['cen_lat']})
# xtest

gdf.plot(column='season_precip_cm',cmap='viridis',legend=True)


In [None]:
test=pd.read_parquet(f'{HOME}/data/TrainingDFs/2024/750M_Resolution/Vegetation_Sturm_Seasonality_VIIRSGeoObsDFs/10_fSCA_Thresh/Vegetation_Sturm_Season_VIIRS_GeoObsdf_American_20240211.parquet')
test.head()

In [None]:
from scipy.interpolate import LinearNDInterpolator

In [None]:
# Prototype: resample the 1000M Daymet seasonal precipitation of one basin/date onto the
# cell centres of the matching 750M training dataframe.
test_trainingdf = pd.read_parquet(f'{HOME}/data/TrainingDFs/2024/750M_Resolution/Vegetation_Sturm_Seasonality_VIIRSGeoObsDFs/10_fSCA_Thresh/Vegetation_Sturm_Season_VIIRS_GeoObsdf_BigThompson_20240421.parquet')
# test_trainingdf.head()
# sorted unique cell-centre coordinates of the target frame; they become the axes of the target grid
df_x = test_trainingdf['cen_lon'].sort_values().unique()
df_y = test_trainingdf['cen_lat'].sort_values().unique()
test_data = pd.read_parquet(f'{HOME}/data/Precipitation/2024/1000M_Daymet_Precip/Daymet_BigThompson_20240421.parquet')
# source points (lon, lat) and the value to interpolate
test_grid = test_data[['cen_lon','cen_lat']].values
precip = test_data['season_precip_cm']

# linear interpolation from the scattered source points onto the full lon x lat mesh
interpolator = LinearNDInterpolator(test_grid,precip)
X,Y = np.meshgrid(df_x,df_y)
precip_interp = interpolator(X.flat,Y.flat).reshape(X.shape)
xarr = xr.DataArray(precip_interp,dims=['lat','lon'],coords=[df_y,df_x])
# fill the NaNs the interpolator leaves (by default, points outside the convex hull of the
# source points) by linear extrapolation, first along lon and then along lat
xarr = xarr.interpolate_na(dim="lon", method="linear", fill_value="extrapolate").interpolate_na(dim="lat", method="linear", fill_value="extrapolate")
# extract metadata
meta = test_trainingdf[['cell_id','cen_lat','cen_lon']]
season_precip_cm = []
nsites = len(meta)
# look up each training cell on the interpolated grid, one row at a time
for i in range(nsites):
    lat, lon = meta.iloc[i]['cen_lat'],meta.iloc[i]['cen_lon']
    # cellid is read but not used in this prototype
    cellid = meta.iloc[i]['cell_id']
    prcp = xarr.sel(lon=lon,lat=lat,method='nearest').item()
    prcp=np.round(prcp,1)
    # if season_precip >= 0:
    season_precip_cm.append(prcp)


In [None]:
import fnmatch

In [None]:
# Interpolate the 1000M Daymet seasonal precipitation onto the cell centres of every 750M
# training dataframe and save the result with an added 'Daymet' column.
training_df_path=f'{HOME}/data/TrainingDFs/2024/750M_Resolution/Vegetation_Sturm_Seasonality_VIIRSGeoObsDFs/10_fSCA_Thresh'
precip_df_path=f'{HOME}/data/TrainingDFs/2024/750M_Resolution/Daymet_Vegetation_Sturm_Seasonality_VIIRSGeoObsDFs/10_fSCA_Thresh'
os.makedirs(precip_df_path,exist_ok=True)
# `not` is required here: on a Python bool `~` is integer inversion (~True == -2, ~False == -1),
# both of which are truthy, so `~fnmatch.fnmatch(...)` never excluded the Conejos file.
training_files = [filename for filename in os.listdir(training_df_path)
                  if filename.endswith('.parquet')
                  and not fnmatch.fnmatch(filename,'*Conejos_20240508.parquet')]
for file in training_files:
    date = file.split('_')[-1].split('.parquet')[0]
    region = file.split('_')[-2]
    region_date = f"{region}_{date}"

    # skip early when there is no 1000M Daymet precip for this region/date
    prcp_df_path = f'{HOME}/data/Precipitation/2024/1000M_Daymet_Precip/Daymet_{region_date}.parquet'
    if not os.path.exists(prcp_df_path):
        print('skipping',region_date)
        continue

    # get training df from previous step and extract cell locations;
    # read the file being iterated rather than a name rebuilt from its tokens
    training_df = pd.read_parquet(os.path.join(training_df_path, file))
    meta = training_df[['cell_id','cen_lat','cen_lon']]
    df_x = meta['cen_lon'].sort_values().unique()
    df_y = meta['cen_lat'].sort_values().unique()

    # get 1000M daymet precip and extract locations and values
    prcp_df = pd.read_parquet(prcp_df_path)
    prcp_grid = prcp_df[['cen_lon','cen_lat']].values
    precip = prcp_df['season_precip_cm']

    # interpolate onto the full lon x lat mesh of the training cells, then fill the NaNs the
    # interpolator leaves by linear extrapolation along lon and then lat
    interpolator = LinearNDInterpolator(prcp_grid,precip)
    X,Y = np.meshgrid(df_x,df_y)
    precip_interp = interpolator(X.flat,Y.flat).reshape(X.shape)
    xarr = xr.DataArray(precip_interp,dims=['lat','lon'],coords=[df_y,df_x])
    xarr = xarr.interpolate_na(dim="lon", method="linear", fill_value="extrapolate").interpolate_na(dim="lat", method="linear", fill_value="extrapolate")

    # match every training cell to its grid node in one vectorized (pointwise) selection:
    # indexers sharing the 'cell' dimension pick one value per row instead of an outer product
    cell_lon = xr.DataArray(meta['cen_lon'].to_numpy(), dims='cell')
    cell_lat = xr.DataArray(meta['cen_lat'].to_numpy(), dims='cell')
    season_precip_cm = np.round(xarr.sel(lon=cell_lon, lat=cell_lat, method='nearest').to_numpy(), 1)
    training_df['Daymet'] = season_precip_cm

    table = pa.Table.from_pandas(training_df)
    # Parquet with Brotli compression
    pq.write_table(table, f"{precip_df_path}/PrecipDaymet_{file}", compression='BROTLI')

In [None]:
# Quick visual check: one map of the 'Daymet' column per file written to precip_df_path.
new_precip_files = [filename for filename in os.listdir(precip_df_path)]
for file in new_precip_files:
    df = pd.read_parquet(f'{precip_df_path}/{file}')
    gdf = gpd.GeoDataFrame(df['Daymet'],
                       geometry=gpd.points_from_xy(df['cen_lon'], df['cen_lat']),
                        crs="EPSG:4326")
    fig,ax=plt.subplots()
    gdf.plot(column='Daymet',cmap='viridis',legend=True,ax=ax)
    ax.set_title(file)
    # render this figure now, then release it so a long file list does not
    # accumulate open figures in memory
    plt.show()
    plt.close(fig)

In [None]:
xarr.plot.pcolormesh()

In [None]:
test.head()

In [None]:
test = pd.read_parquet(f'{HOME}/data/TrainingDFs/2024/750M_Resolution/Daymet_Vegetation_Sturm_Seasonality_VIIRSGeoObsDFs/10_fSCA_Thresh/PrecipDaymet_Vegetation_Sturm_Season_VIIRS_GeoObsdf_WindyGap_20240321.parquet')
gdf  = gpd.GeoDataFrame(test['Daymet'],
                       geometry=gpd.points_from_xy(test['cen_lon'], test['cen_lat']), 
                        crs="EPSG:4326")
test.head()

gdf.plot(column='Daymet',cmap='viridis',legend=True)

In [None]:
WYs = [2024]
for year in WYs:
    Make_Precip_DF(year,output_res=1000,thresh=10,dataset=dataset)

In [None]:
test1 = f'{HOME}/data/Precipitation/2024/1000M_Daymet_Precip/Daymet_WindyGap_20240414.parquet'
# test3 = f'{HOME}/data/TrainingDFs/2017/1000M_Resolution/PrecipVIIRSGeoObsDFs/10_fSCA_Thresh/PrecipDaymet_VIIRS_GeoObsdf_USCATB_20170727.parquet'
test1 = pd.read_parquet(test1)
# test3 = pd.read_parquet(test3)
print(test1.loc[0,'precip'].shape)
test1.head()
# test3[test3['season_precip_cm'].isna()]

In [None]:
test2 = f'/uufs/chpc.utah.edu/common/home/civil-group1/Johnson/SWEMLv2.0/data/TrainingDFs/Southwest/1000M_Resolution/VIIRSGeoObsDFs/20_fSCA_Thresh/VIIRS_GeoObsDF_20150403.parquet'
# test2 = f'{HOME}/data/TrainingDFs/2015/1000M_Resolution/PrecipVIIRSGeoObsDFs/20_fSCA_Thresh/PrecipDaymet_VIIRS_GeoObsDF_20150403.parquet'
test2 = pd.read_parquet(test2)
# test2[test2[test_site]>0]
test2.shape
# test2.loc['2015_1000M_38.19_-119.801']
# test2.iloc[int(test2.shape[0]/2):int(test2.shape[0]/2)+5]

In [None]:
HOME

In [None]:
# Reproject the raster at `file` to EPSG:4326 and build a bounding box padded by 0.1 on every side.
# NOTE(review): `file` is not assigned in this cell. The cell that sets it to a raster path
# (from `filepath` and `files[-1]`) comes later in the notebook, while the closest earlier
# assignment is the parquet-file loop variable — confirm the intended execution order.
file
with rxr.open_rasterio(file) as src:
            # reproject to WGS84
            transformed = src.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))
            left, bottom, right, top = transformed.rio.bounds()
            # add some padding to bbox
            left -= 0.1
            bottom -= 0.1
            right += 0.1
            top += 0.1
            bbox = rasterio.coords.BoundingBox(left, bottom, right, top)

In [None]:
# Collect the raw ASO GeoTIFF names available for this water year.
WY = 2024
filepath = f"{HOME}/data/ASO/{WY}/Raw_ASO_Data"
files = [name for name in os.listdir(filepath) if name.endswith(".tif")]
files

In [None]:
# Open the last raster listed in `files`, draw it, and unpack its bounds for the bounding-box cells.
file = f"{filepath}/{files[-1]}"
tiff = rasterio.open(file)
show(tiff)
left,bottom,right,top = tiff.bounds
# Only the final expression of a cell is echoed, so the bare tiff.bounds line below shows nothing.
tiff.bounds
tiff.crs # looks like this is in UTM zone, probs want WGS84 for consistency?

In [None]:
tiff.crs == rasterio.CRS.from_epsg(32611)

In [None]:
raster = rxr.open_rasterio(file)
transformed = raster.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))
transformed[0]

In [None]:
left, bottom, right, top = transformed.rio.bounds()
left

In [None]:
transformed.where(transformed > -1).plot()

In [None]:
daily = daymet.get_bygeom(tiff.bounds,dates=('2023-10-01','2024-05-27'),variables="prcp",crs=32611)
daily

In [None]:
daily['time']

In [None]:
daily_transformed = daily.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))

In [None]:
# `daily` is requested for 2023-10-01..2024-05-27, so a 2013 label is not on its time axis;
# plot the most recent day by position instead of a fixed date label.
daily_transformed.isel(time=-1)['prcp'].plot.pcolormesh()

In [None]:
# cg1/Golitzin/SWEMLv2.0/data/TrainingDFs/2013/1000M_Resolution/2013_metadata.parquet
WY = [2015]
output_res=1000
meta_path = f"{HOME}/data/TrainingDFs/{WY[0]}/{output_res}M_Resolution/{WY[0]}_metadata.parquet"
# ASO_meta_path = f"{HOME}/data/TrainingDFs/{region}/{output_res}M_Resolution/ASO_meta.parquet"
meta = pd.read_parquet(meta_path)
# ASO_meta = pd.read_parquet(ASO_meta_path)

In [None]:
meta.head()

In [None]:
meta['cen_lat'].min(),meta['cen_lat'].max(),meta['cen_lon'].min(),meta['cen_lon'].max()

In [None]:
# choose a random point within study area to check precip values 
test_lat, test_lon = meta.iloc[500]['cen_lat'],meta.iloc[500]['cen_lon']
test_lat,test_lon

In [None]:
# check values, are the data real? 
daily_transformed.sel(x=test_lon,y=test_lat,method='nearest')['prcp'].plot()

In [None]:
# NOTE(review): this re-reads ASO_Key from "{HOME}/../ASOKey.csv", whereas the setup cell near the
# top of the notebook reads "{HOME}/utils/ASONameKey.csv" — confirm which file is current.
# filename_parse looks up 'SITE CODE' / 'SITE NAME' in whichever frame was loaded last.
ASO_Key=pd.read_csv(f"{HOME}/../ASOKey.csv",header=3)
ASO_Key.head()

In [None]:
# parse dates and site name from ASO observation
# code borrowed from ASOfigs
def filename_parse(filename):
    """Extract the observation date and a standardized name from an ASO file name.

    The date is the first underscore-separated token of the file stem that starts
    with '20'. Tokens written with a month abbreviation (``2019Apr08`` or a range
    such as ``2019Apr08-09``) are converted to ``YYYYMMDD`` using the first day.

    Parameters
    ----------
    filename : str
        ASO file name, with or without extension. Names starting with
        ``ASO_50M_SWE_`` carry a six-character site code that is translated to a
        site name through the module-level ``ASO_Key`` table; for all other names
        the second underscore-separated token is used as the site name.

    Returns
    -------
    tuple of (str, str)
        ``(date, newfilename)`` where ``newfilename`` is ``ASO_<site>_<date>`` or
        ``ASO_<site>_<sitecode>_<date>``.

    Raises
    ------
    ValueError
        If the name contains no date token, the date token cannot be parsed, or
        the site code does not match exactly one row of ``ASO_Key``.
    """
    stem = os.path.splitext(filename)[0]
    # give next() a default: a bare next() on an exhausted generator raises StopIteration,
    # which says nothing about the offending file name
    date = next((element for element in stem.split("_") if element.startswith('20')), None)
    if date is None:
        raise ValueError(f"No date token starting with '20' found in ASO filename: {filename}")
    # a non-numeric fifth character means the month is spelled out (e.g. 2019Apr08);
    # slicing keeps tokens shorter than five characters from raising IndexError
    if not date[4:5].isnumeric():
        date_singleday = os.path.splitext(date)[0].split("-")[0]
        date = datetime.strptime(date_singleday, "%Y%b%d").strftime('%Y%m%d')
    #identify basin from site code if applicable, else identify basin from name
    if filename[:12] == "ASO_50M_SWE_":
        sitecode = filename[12:18]
        matches = ASO_Key.loc[ASO_Key['SITE CODE'] == sitecode, 'SITE NAME']
        if len(matches) != 1:
            raise ValueError(
                f"Expected exactly one ASO_Key entry for site code {sitecode}, found {len(matches)}"
            )
        sitename = matches.item().replace(" ", "_")
        newfilename = f"ASO_{sitename}_{sitecode}_{date}"
    else:
        sitename = stem.split("_")[1]
        newfilename = f"ASO_{sitename}_{date}"
    return (date, newfilename)

In [None]:
# this is going to be very annoying
# fix later (7/16)
# Parse date and standardized name from the last raster, then keep the date both as a
# datetime object (`date`) and as an ISO string (`date_str`).
filename = files[-1]
date, filename_std = filename_parse(filename)
date = datetime.strptime(date, '%Y%m%d')
date_str = date.strftime('%Y-%m-%d')
date_str, filename_std

In [None]:
filename_std[4:]

In [None]:
obs_start = datetime.strftime(datetime(date.year-1, 10, 1),'%Y-%m-%d')
obs_end = date_str
obs_start,obs_end

In [None]:
tiff.crs

In [None]:
# Grow the raster bounds by a fixed margin (in the raster's own CRS units) so that every
# value in the ASO observation is represented in the precipitation request.
left, bottom, right, top = tiff.bounds
pad = 1500
new_bounds = rasterio.coords.BoundingBox(left - pad, bottom - pad, right + pad, top + pad)
print(tiff.bounds,'\n', new_bounds)

In [None]:
# Fetch Daymet "prcp" for the padded bounds over obs_start..obs_end, then reproject to EPSG:4326.
# NOTE(review): crs=32611 is hard-coded here; tiff.crs is only compared against EPSG:32611 in a
# separate cell — confirm it holds for every raster this is run on.
obs_precip = daymet.get_bygeom(new_bounds,dates=(obs_start,obs_end),variables="prcp",crs=32611)
obs_precip_transformed = obs_precip.rio.reproject(rasterio.crs.CRS.from_epsg('4326'))

In [None]:
obs_precip_transformed

In [None]:
# obs_precip_transformed spans obs_start..obs_end, which are derived from the raster's own date,
# so select the last day of that window by position instead of a fixed 2013 label.
obs_precip_transformed.isel(time=-1)['prcp'].plot.pcolormesh()

In [None]:
obs_precip_transformed.sel(x=test_lon,y=test_lat,method='nearest')['prcp']

In [None]:
# theoretically want seasonally accumulated precip up until ASO obs date for each obs
# possibly better to get daily precip values, then sum later when adding to training DF
# want to include basin label and date in any precip filename (per RJ)
# One record per metadata cell: [cell id, lat, lon, precipitation time series at the nearest grid node].
precip_arr = []

nsites = len(meta)
# nsites = 100 # for now
for cellid, lat, lon in zip(meta.index, meta['cen_lat'], meta['cen_lon']):
    prcp = obs_precip_transformed.sel(x=lon, y=lat, method='nearest')['prcp']
    precip_arr.append([cellid, lat, lon, np.array(prcp.values)])

In [None]:
precip_df = pd.DataFrame(precip_arr,columns = ['cell_id','cen_lat','cen_lon','precip'])
precip_df

In [None]:
# NOTE(review): within the visible part of the notebook, Precippath is first assigned at module
# level in a later cell (the one that writes the Daymet parquet); confirm it exists before this runs.
Precippath

In [None]:
# check if values exist & are real
# choose a random cell and plot precip over WY
test_idx = np.random.randint(1,nsites,size=1)
print(test_idx[0])
print(precip_df.iloc[test_idx[0]]['precip'].sum())
plt.plot(precip_df.iloc[test_idx[0]]['precip'])
plt.show()

In [None]:
# Season total per cell: sum of the stored series divided by 10, rounded to two decimals.
season_precip_cm = [
    np.round(precip_df.iloc[row]['precip'].sum() / 10, 2)
    for row in range(nsites)
]

In [None]:
# Compare the first 50 new season totals with the values stored in the existing training dataframe.
# NOTE(review): within the visible part of the notebook, trainingDF is first assigned in a later
# cell (the one that reads Precip_VIIRS_GeoObsDF_20130608.parquet) — confirm the execution order.
print(season_precip_cm[:50])
print(trainingDF.iloc[:50]['season_precip_cm'].values)

In [None]:
precip_df['season_precip_cm'] = season_precip_cm

In [None]:
precip_df.head()

In [None]:
# Save precip_df as a Brotli-compressed parquet file named after the standardized ASO file name.
# NOTE(review): `year` is hard-coded to 2013 here, while the raster list built earlier comes from
# the WY 2024 folder — confirm the output folder matches the data being written.
year = 2013
output_res = 1000

Precippath = f"{HOME}/data/Precipitation/{year}/{output_res}M_Daymet_Precip"
if not os.path.exists(Precippath):
    os.makedirs(Precippath, exist_ok=True)

# DataFrame -> Apache Arrow table -> parquet
pq.write_table(
    pa.Table.from_pandas(precip_df),
    f"{Precippath}/Daymet_{filename_std[4:]}.parquet",
    compression='BROTLI',
)

In [None]:
# test_filepath = f'{HOME}/data/Precipitation/Southwest/1000M_NLDAS_Precip/sites/Southwest_1000M_37.816_-119.259.parquet'
# just wanna see what the old "sites" parquet looks like 
# just season accumulated precip for a single cell for all ASO obs dates at that cellid 
test_filepath = f'{HOME}/data/Precipitation/Southwest/1000M_NLDAS_Precip/sites/NLDAS_PPT_Southwest_1000M_38.181_-119.585.parquet'
test_file = pd.read_parquet(test_filepath)
test_file[test_file['datetime']=='2013-06-08']

In [None]:
test_file

In [None]:
# check structure of training DF w precip
filepath = f'{HOME}/data/TrainingDFs/2013/1000M_Resolution/PrecipVIIRSGeoObsDFs/20_fSCA_Thresh/Precip_VIIRS_GeoObsDF_20130608.parquet'
trainingDF = pd.read_parquet(filepath)
trainingDF.head()

In [None]:
# compare random entry in new precip DF to old training DF
test_idx = np.random.randint(1,nsites,size=1)[0]
print('testing index',test_idx)
print('total daymet precip =',precip_df.iloc[test_idx]['precip'].sum()/10)
print(precip_df.iloc[test_idx]['cell_id'])
print(trainingDF.iloc[test_idx].name)
print('total NLDAS precip =',trainingDF.iloc[test_idx]['season_precip_cm'])

In [None]:
timedelt = datetime(2013,5,3)-datetime(2012,10,1)
timedelt.days

In [None]:
f"{HOME}/data/TrainingDFs/{region}/{output_res}M_Resolution/VIIRSGeoObsDFs/{20}_fSCA_Thresh"

In [None]:
WY = 2013
output_res = 1000
threshold = 20

In [None]:
def Make_Precip_DF(WY, output_res, threshold):
    """Serially add Daymet seasonal precipitation to every training dataframe of one water year.

    Each file in the VIIRSGeoObsDFs folder for ``WY`` / ``output_res`` / ``threshold`` is
    passed to ``single_date_add_precip`` one after another; that function writes the
    augmented dataframe into the PrecipVIIRSGeoObsDFs folder created here.

    Note: this definition reuses the name of the four-argument ``Make_Precip_DF`` defined
    earlier in the notebook and replaces it once this cell has run.
    """
    print(f"Adding precipitation features to ML dataframe for {WY}.")

    precip_dir = f"{HOME}/data/Precipitation/{WY}/{output_res}M_Daymet_Precip/"
    geo_dir = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/VIIRSGeoObsDFs/{threshold}_fSCA_Thresh"
    out_dir = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/PrecipVIIRSGeoObsDFs/{threshold}_fSCA_Thresh"

    # make sure the output folder exists before any file is written
    if not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)

    # process every file in the input folder, one at a time
    for geofile in os.listdir(geo_dir):
        single_date_add_precip((geo_dir, precip_dir, geofile, out_dir, WY))

In [None]:
# NOTE(review): the three-argument Make_Precip_DF calls single_date_add_precip, whose definition
# appears in a later cell — confirm that cell has been run before this one.
Make_Precip_DF(WY,output_res,threshold)

In [None]:
    pptfiles = [filename for filename in os.listdir(Precippath) if filename.endswith('parquet')]
    print(pptfiles)
    
pptfiles[0].split('_')[-1].split('.parquet')[0]

In [None]:
def single_date_add_precip(args):
    """Attach seasonal precipitation to the training dataframe of one observation date.

    Parameters
    ----------
    args : tuple
        ``(DFpath, Precippath, geofile, PrecipDFpath, WY)``. ``geofile`` is expected to
        be named ``...VIIRS_GeoObsDF_<YYYYMMDD>.parquet``; the first file in
        ``Precippath`` whose last underscore-separated token carries the same date
        supplies the precipitation values.

    Side effects
    ------------
    Writes ``PrecipDaymet_{geofile}`` to ``PrecipDFpath`` as a Brotli-compressed
    parquet file, indexed by ``cell_id`` and with a ``season_precip_cm`` column added.
    Cells that are missing from the precipitation file keep 0.0 and are reported
    on stdout.

    Raises
    ------
    FileNotFoundError
        If no precipitation file carries the date of ``geofile``.
    """
    DFpath, Precippath, geofile, PrecipDFpath, WY = args
    #get date information
    date = geofile.split('VIIRS_GeoObsDF_')[-1].split('.parquet')[0]
    strdate = f"{date[:4]}-{date[4:6]}-{date[6:]}"
    print(f"Connecting precipitation to ASO observations for {WY} on {strdate}")

    GDF = pd.read_parquet(os.path.join(DFpath, geofile))
    GDF.set_index('cell_id', inplace = True)
    GDF['season_precip_cm'] = 0.0

    # connect GDF to correct precip file by date (first match in directory order)
    ppt_name = next(
        (name for name in os.listdir(Precippath)
         if name.split('_')[-1].split('.parquet')[0] == date),
        None,
    )
    if ppt_name is None:
        raise FileNotFoundError('Failed to connect precip observations to dataframe')
    ppt = pd.read_parquet(f"{Precippath}/{ppt_name}")

    # One lookup table instead of a boolean scan of ppt per cell; the first row wins
    # when a cell_id occurs more than once.
    precip_by_cell = (
        ppt.drop_duplicates(subset='cell_id', keep='first')
        .set_index('cell_id')['season_precip_cm']
        .round(1)
    )
    has_precip = GDF.index.isin(precip_by_cell.index)
    GDF.loc[has_precip, 'season_precip_cm'] = precip_by_cell.reindex(GDF.index[has_precip]).to_numpy()
    for site in GDF.index[~has_precip]:
        print(f"{site} is bad, delete file from folder and rerun the get precipitation script")

    #Convert DataFrame to Apache Arrow Table
    table = pa.Table.from_pandas(GDF)
    # Parquet with Brotli compression
    pq.write_table(table, f"{PrecipDFpath}/PrecipDaymet_{geofile}", compression='BROTLI')


In [None]:
new_training_df = pd.read_parquet(f'{HOME}/data/TrainingDFs/2013/1000M_Resolution/PrecipVIIRSGeoObsDFs/20_fSCA_Thresh/PrecipDaymet_VIIRS_GeoObsDF_20130608.parquet')
new_training_df.head()

In [None]:
precip_df.iloc[1]['cell_id']

In [None]:
precip_df.iloc[1]['precip'].sum()/10

In [None]:
plt.plot(precip_df.iloc[1]['precip'])

In [None]:
old_file = pd.read_parquet(f'{HOME}/data/Precipitation/Northwest/1000M_NLDAS_Precip/sites/NLDAS_PPT_Northwest_1000M_47.888_-123.856.parquet')

In [None]:
meta.iloc[1].name

In [None]:
old_file

In [None]:
print(meta['cen_lat'].min(),meta['cen_lat'].max())
print(meta['cen_lon'].min(),meta['cen_lon'].max())

In [None]:
lat,long = meta[['cen_lat','cen_lon']].median()
location = ee.Geometry.Point(long,lat)

In [None]:
print(lat,long)

In [None]:
startdate='2015-10'
enddate='2019-07'
precip = ee.ImageCollection('NASA/NLDAS/FORA0125_H002').select('total_precipitation').filterDate(startdate, enddate)
daymet_precip = ee.ImageCollection("NASA/ORNL/DAYMET_V4").select('prcp').filterDate(startdate,enddate)

In [None]:
precip_poi = precip.getRegion(location,scale=1000).getInfo()

In [None]:
# getInfo must be called: without the parentheses the bound method itself is stored
# instead of the region data (the NLDAS request in the preceding cell does call it).
daymet_precip_poi = daymet_precip.getRegion(location,scale=1000).getInfo()

In [None]:
pwd

In [None]:
# NOTE(review): daymet_precip_poi is derived from getRegion(...).getInfo rather than being an
# Earth Engine collection object; geemap.ee_to_df presumably expects an ee.FeatureCollection —
# confirm this call works, or use EE_funcs.ee_array_to_df as the next cell does.
daymet_gdf = geemap.ee_to_df(daymet_precip_poi)

In [None]:
site_precip = EE_funcs.ee_array_to_df(precip_poi,['total_precipitation'])
daymet_site_precip = EE_funcs.ee_array_to_df(daymet_precip_poi,['prcp'])

In [None]:
# Aggregate the NLDAS point series to one row per day and express it in centimetres.
# This cell rebinds site_precip, so it is meant to be run once per fresh site_precip.
temporal_resample = 'D'
kgm2_to_cm = 0.1

# sum everything within each resample period, then bring datetime back as a column
site_precip = (
    site_precip
    .set_index('datetime')
    .resample(temporal_resample)
    .sum()
    .reset_index()
)

# scale to cm, rename accordingly, drop the summed 'time' column and index by datetime
site_precip = (
    site_precip
    .assign(total_precipitation=lambda frame: frame['total_precipitation'] * kgm2_to_cm)
    .rename(columns={'total_precipitation': 'daily_precipitation_cm'})
    .drop(columns='time')
    .set_index('datetime')
)


In [None]:
WYdict = {2016,2019}

In [None]:
precip_daymet = ee.ImageCollection('NASA/ORNL/DAYMET_V4').select('prcp').filterDate(startdate, enddate)
precip_daymet_poi = precip_daymet.getRegion(location,scale=1000).getInfo()

In [None]:
site_precip_daymet = EE_funcs.ee_array_to_df(precip_daymet_poi,['prcp'])

In [None]:
# Divide the Daymet point series by 10 (presumably mm -> cm — confirm), drop the 'time' column,
# index by datetime and give the value column the same name as the NLDAS frame.
site_precip_daymet = (
    site_precip_daymet
    .assign(prcp=lambda frame: frame['prcp'] / 10)
    .drop(columns='time')
    .set_index('datetime')
    .rename(columns={'prcp': 'daily_precipitation_cm'})
)

In [None]:
site_precip_daymet