# Data Processing script v2 for the SWEML v2.0
This .ipynb script uses Python modules to process pre-downloaded NASA ASO observations by water year, locate the nearest SNOTEL sites, connect SNOTEL observations with ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

In [None]:
import os
import numpy as np
HOME = os.getcwd()

# Workaround for "proj.db" errors raised further down: point the PROJ_LIB
# environment variable at the data directory reported by pyproj.
import pyproj

proj_data_dir = pyproj.datadir.get_data_dir()
proj_db_path = f"{proj_data_dir}/proj.db"
os.environ['PROJ_LIB'] = proj_data_dir
#set multiprocessing limits
# Budget roughly half of the usable cores minus two for worker pools, but never
# fewer than one: the unclamped formula yields 0 or a negative count on
# machines with fewer than six usable cores.
try:
    CPUS = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity is not provided on every platform; fall back to the
    # total core count (cpu_count() may return None).
    CPUS = os.cpu_count() or 1
CPUS = max(1, CPUS // 2 - 2)

# Step up one directory and record it as HOME (intended to be the head of the
# SWEMLv2.0 directory). os.chdir() returns None, so only getcwd() is assigned.
# NOTE: re-running this cell moves up one more level each time.
os.chdir('..')
HOME = os.getcwd()

#Add your module here
from utils.ASOget import ASODataProcessing_v2
import utils.get_InSitu_obs as get_InSitu_obs
import utils.GeoDF as GeoDF 
import utils.Obs_to_DF as Obs_to_DF 
import utils.get_VIIRS_SCA as get_VIIRS_SCA
import utils.get_Precip as get_Precip
import utils.get_Seasonality as get_Seasonality
import utils.vegetation_processer as vegpro
import utils.sturm_processer as stpro



#make SWEMLv2.0 modeling domain for western USA
# The full candidate list is kept for reference; a short range is run first to prove the concept, then it can be expanded.
# WY_list = ['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']
# np.arange excludes the stop value, so this selects water years 2022 and 2023.
WY_list = np.arange(2022,2024)
output_res = 1000 #desired spatial resolution in meters (m)
# threshold is passed to the VIIRS, seasonality and precipitation helpers in later cells
# and appears in the "{threshold}_fSCA_Thresh" directory names.
threshold = 10
print(f"The current session is using {WY_list} years, {output_res}m resolution, and {CPUS} CPUs")

In [None]:
# Inputs for fetching ASO data for a region
# NOTE(review): short_name is not referenced again in the visible cells -- confirm it is still needed.
short_name = 'ASO_50M_SWE'
directory = "Raw_ASO_Data"

#Get ASO data. Sometimes a site raises an error and stops the run; most times you can just rerun using the data_processor section below (e.g., comment out the other parts).
for WY in WY_list:
    #Convert ASO tifs to parquet
    print(f"Converting ASO images for WY: {WY}")
    # The raw files for a water year are addressed as "<WY>/Raw_ASO_Data".
    folder_name = f"{WY}/{directory}"
    # A new processor instance is created for every water year.
    data_processor = ASODataProcessing_v2() #note, 2019-5-1, 2019-06-11 seems to be bad, manually removed from SW region
    data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, WY) 

## Get Snotel and CDEC in situ observations
- Ideas: add the nearest site's elevation and its distance from the cell, so that sites with bad data can be bypassed.

In [None]:
# Only needed once. Other spatial resolutions can use the same data
#Get in situ observations

#make a list of dates to align with the ASO observations (they go as early as Jan-29 and as far out as July-17)
# np.arange excludes the stop value, so this covers 2013 through 2024.
years = np.arange(2013,2025,1)#Needs to go 1yr out
# Month-day bounds handed to the fetch helper together with WY = True.
start_month_day = '10-01'
end_month_day = '08-31'

# observations 
get_InSitu_obs.Get_Monitoring_Data_Threaded_Updated(years, start_month_day, end_month_day, WY = True)

#combine years
get_InSitu_obs.combine_dfs(years)

## Code for generating ML dataframe using nearest in situ monitoring sites

In [None]:
# GeoDF builds the dataframe used for ML model development by connecting
# in situ observations to gridded locations.
for WY in WY_list:
    path = f"{HOME}/data/ASO/{WY}/{output_res}M_SWE_parquet"

    # Water years without a converted ASO parquet directory are reported and skipped.
    if not os.path.isdir(path):
        print(f"No ASO data for {WY}")
        continue

    print(WY)

    # Load the SNOTEL site locations for the cell ids (known, up-to-date sites).
    GeoDF.fetch_snotel_sites_for_cellids(WY, output_res)

    # Geophysical attributes for each site at the requested output resolution.
    gdf = GeoDF.GeoSpatial(WY, output_res)

    # Use the geodataframe with lat/long metadata of all sites to determine
    # slope, aspect, and elevation.
    metadf = GeoDF.extract_terrain_data_threaded(gdf, WY, output_res)




## Connect Snotel to each ASO obs


In [None]:
#Rerun

# Connect the nearest SNOTEL observations with the ASO data; a parquet file is
# made for each date. TODO from the original notes: test whether this works
# when the SNOTEL file is just loaded instead of being collected in the function.
for WY in WY_list:
    path = f"{HOME}/data/ASO/{WY}/{output_res}M_SWE_parquet"

    # Nothing to connect when the water year has no ASO parquet directory.
    if not os.path.isdir(path):
        print(f"No ASO data for {WY}")
        continue

    print(WY)
    manual = False
    dates = []
    Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(WY, output_res, manual, dates)


In [None]:
#Connect cell ids with ASO obs and snotel obs to geospatial features
for WY in WY_list:
    path = f"{HOME}/data/ASO/{WY}/{output_res}M_SWE_parquet"
    if os.path.isdir(path):
        print(WY)
        GeoDF.add_geospatial_threaded(WY, output_res)
    else:
        # Report the water year: the earlier message formatted `region`, which
        # is not defined at this point and raised NameError instead of printing.
        print(f"No ASO data for {WY}")

## Get NASA VIIRS fraction snow covered area for each location 

* Make sure the code grabs all dates for each region, may have to run multiple times
* run until "No granules found for DATE, requesting data from NSIDC..." no longer occurs


In [None]:
# Run the project's VIIRS download helper before the dataframes are augmented
# (see the notes above about re-running until all dates are retrieved).
get_VIIRS_SCA.get_VIIRS_from_AWS()

In [None]:
# Optional: if the VIIRS data is not available locally, fetch it from CIROH AWS
# first. (Original note: "I think all of this data is for the incorrect year...")
#get_VIIRS_SCA.get_VIIRS_from_AWS()

# Connect the VIIRS data to the dataframes, one water year at a time.
for WY in WY_list:
    path = f"{HOME}/data/ASO/{WY}/{output_res}M_SWE_parquet"

    # Skip water years that have no ASO parquet directory.
    if not os.path.isdir(path):
        print(f"No ASO data for {WY}")
        continue

    print(WY)
    get_VIIRS_SCA.augment_SCA_multiprocessing(WY, output_res, threshold)


## Add seasonality metrics to the dataframe

In [None]:
for WY in WY_list:
    #process snotel sites to make "snow hydrograph features" to determine above/below average WY conditions
    # NOTE(review): seasonal_snotel() receives no WY argument yet is called once per water year -- confirm whether a single call before the loop is enough.
    get_Seasonality.seasonal_snotel()


    #get the Day of season metric for each dataframe
    # NOTE(review): unlike the other per-WY cells, there is no check here that ASO data exists for WY.
    get_Seasonality.add_Seasonality(WY, output_res, threshold)

## Use Sturm's snow classification as features within model framework

Using the originally created env, it looks like the rasterio package does not contain the correct ECS driver. Trying to address this with conda install conda-forge::rasterio in my SWEML_310 env from the shell in CHPC

In [None]:
# Fetch the Sturm snow classification data.
stpro.get_Sturm_data()

for WY in WY_list:
    print(WY)

    # Shared pieces of the per-water-year training dataframe locations.
    resolution_root = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution"
    thresh_folder = f"{threshold}_fSCA_Thresh"

    input_directory = f"{resolution_root}/Seasonality_VIIRSGeoObsDFs/{thresh_folder}"
    output_directory = f"{resolution_root}/Sturm_Seasonality_VIIRSGeoObsDFs/{thresh_folder}"
    # Snow classification raster, see https://nsidc.org/data/nsidc-0768/versions/1
    sturm_file = f"{HOME}/data/SnowClassification/SnowClass_NA_300m_10.0arcsec_2021_v01.0.tif"

    # The output folder may not exist yet.
    os.makedirs(output_directory, exist_ok=True)

    stpro.process_sturm_data_for_files(input_directory, sturm_file, output_directory)

## Add vegetation data to the dataframe from the North American Land Change Monitoring System (NALCMS)

This step needs to be multiprocessed; it is currently too slow.

In [None]:
#get data
# USA NALCMS 2020 land cover (30 m) archive. The host/path portion of this URL
# was corrupted ("http://www.cread_parquetiles/..."), which is not a resolvable
# address; it is restored here to the CEC file server location.
url = "http://www.cec.org/files/atlas_layers/1_terrestrial_ecosystems/1_01_0_land_cover_2020_30m/usa_land_cover_2020v2_30m_tif.zip"
output_path = f"{HOME}/data/LandCover/"
file = "usa_land_cover_2020v2_30m_tif.zip"
vegpro.get_data(url, output_path, file)
#unzip the file if not already done
#vegpro.unzip_LC_data(output_path, file)

for WY in WY_list:
    print(WY)
    # Input is the output of the Sturm step; results go to a sibling
    # "Vegetation_..." folder.
    input_directory = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/Sturm_Seasonality_VIIRSGeoObsDFs/{threshold}_fSCA_Thresh"
    vegetation_file = f"{HOME}/data/LandCover/usa_land_cover_2020v2_30m_tif/USA_NALCMS_landcover_2020v2_30m/data/USA_NALCMS_landcover_2020v2_30m.tif"
    output_directory = f"{HOME}/data/TrainingDFs/{WY}/{output_res}M_Resolution/Vegetation_Sturm_Seasonality_VIIRSGeoObsDFs/{threshold}_fSCA_Thresh"

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    vegpro.process_vegetation_data_for_files(input_directory, vegetation_file, output_directory)

## Get Daymet Precipitation for each cell

In [None]:
# Rebinds WY_list (set in the first cell) so the cells that follow process only water year 2023.
WY_list = [2023]

In [None]:
dataset = 'Daymet'

for WY in WY_list:
    path = f"{HOME}/data/ASO/{WY}/{output_res}M_SWE_parquet"

    if not os.path.isdir(path):
        print(f"No ASO data for {WY}, {path}")
    else:
        print(WY)
        # Download step, currently disabled:
        # get_Precip.get_daymet_precip(WY, output_res, years)

    # Connect precipitation to the processed DFs.
    # NOTE(review): this runs for every water year, including those reported
    # above as having no ASO data -- confirm that is intended.
    get_Precip.Make_Precip_DF(WY, output_res, threshold, dataset)

## Get NLDAS Precipitation for each cell

In [None]:
'''
note*, if using python > 3.9, you will likely need to change the ee package to from io import StringIO
sometimes there will be an ASO file that is inproperly named in the ASO yr folder, it typically also makes a ,ipynb checkpoint that crash the code
'''
# Gets precipitation for each location and accumulates it through the water year.
# This step could be made much more efficient by collecting all of the tiles in
# one step, then multiprocessing later.

# Years 2013 through 2024 (range excludes the stop value).
years = list(range(2013, 2025))
dataset = 'NLDAS'

for WY in WY_list:
    path = f"{HOME}/data/ASO/{WY}/{output_res}M_SWE_parquet"

    if not os.path.isdir(path):
        print(f"No ASO data for {WY}, {path}")
    else:
        print(WY)
        # Download step, currently disabled:
        # get_Precip.get_precip_threaded(WY, output_res, years)

    # Connect precipitation to the processed DFs.
    # NOTE(review): this runs for every water year, including those reported
    # above as having no ASO data -- confirm that is intended.
    get_Precip.Make_Precip_DF(WY, output_res, threshold, dataset)


## Next steps
* Explore why errors in precip sites above
* add in situ obs - seasonality based on the historical nearest x monitoring stations - like a historical average to-date SWE value (unit hydrograph) based on the day of year? This would include the historical time of year of normal SWE and the current SWE value compared to normal
* albedo metric


In [None]:
import pandas as pd

# NOTE: this scratch cell rebinds HOME (to the user's home directory) and
# output_res (to a string); the cells above expect the values set in the first
# cell, so re-run that cell before going back to them.
HOME = os.path.expanduser('~')
region = 'Southwest'
output_res = '300'

dfpath = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution"

SWmeta = pd.read_parquet(f"{dfpath}/{region}_metadata.parquet")

import UpdateDataFrame

#need to update the topographic features for every dataframe
# region_list was never defined, so the loop below raised NameError. Default to
# the single region configured above; extend the list to process more regions.
region_list = [region]
training_cats = ['Obsdf']
fSCA = '' #'20_fSCA_Thresh'


for training_cat in training_cats:
    print(training_cat)

    for region in region_list:
        print(region)
        dfpath = f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/{output_res}M_Resolution"
        #file to be used to update the training DF
        updatefile = pd.read_parquet(f"{dfpath}/{region}_metadata.parquet")

        #Update Dataframe
        UpdateDataFrame.updateTrainingDF(region, output_res, training_cat, fSCA, updatefile)

# Uses dfpath and training_cat as left by the last loop iteration.
trainfile = pd.read_parquet(f"{dfpath}/{training_cat}/{fSCA}/Sturm_Season_Precip_VIIRS_GeoObsDF_20150406.parquet")

import matplotlib.pyplot as plt
import geopandas as gpd

from mpl_toolkits.axes_grid1 import make_axes_locatable

def SpatialAnalysis(EvalDF):
    """Plot the rows of ``EvalDF`` as points colored by ``Elevation_m``.

    ``EvalDF`` must provide ``cen_lon`` and ``cen_lat`` (used as the x/y point
    coordinates) and an ``Elevation_m`` column. Nothing is returned.
    """
    # Build point geometries from the cell-center coordinates.
    point_geometry = gpd.points_from_xy(EvalDF.cen_lon, EvalDF.cen_lat)
    geo_frame = gpd.GeoDataFrame(EvalDF, geometry=point_geometry)

    geo_frame.plot(column='Elevation_m', legend=False)
    
# Plot the training dataframe read above as a point map colored by Elevation_m.
SpatialAnalysis(trainfile)