In [31]:
# hydrological packages
import hydroeval as he
from hydrotools.nwm_client import utils 

# basic packages
import numpy as np
import pandas as pd
import os
import pyarrow as pa
import pyarrow.parquet as pq
import bz2file as bz2

# system packages
from progressbar import ProgressBar
from datetime import datetime, date
import datetime
import pickle as pkl
import warnings
warnings.filterwarnings("ignore")
import platform
import time

# data analysi packages
from scipy import optimize
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import joblib

# deep learning packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

#Shared/Utility scripts
import sys
import boto3
import s3fs
sys.path.insert(0, '..') #sys allows for the .ipynb file to connect to the shared folder files

#load access key
HOME = os.path.expanduser('~')
KEYPATH = "NWM_ML/AWSaccessKeys.csv"
ACCESS = pd.read_csv(f"{HOME}/{KEYPATH}")

#start session
SESSION = boto3.Session(
    aws_access_key_id=ACCESS['Access key ID'][0],
    aws_secret_access_key=ACCESS['Secret access key'][0],
)
S3 = SESSION.resource('s3')
#AWS BUCKET information
BUCKET_NAME = 'streamflow-app-data'
BUCKET = S3.Bucket(BUCKET_NAME)

#s3fs
fs = s3fs.S3FileSystem(anon=False, key=ACCESS['Access key ID'][0], secret=ACCESS['Secret access key'][0])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


modelname = "LSTM"

print(f"{modelname} development script")

Device: cuda
LSTM development script


In [32]:
#Get streamstats data 
datapath = f"{HOME}/NWM_ML/Data/input"
file = "Streamstats.csv"
filepath = f"{datapath}/{file}"
try:
    StreamStats = pd.read_csv(filepath)
except:
    print("Data not found, retreiving from AWS S3")
    if not os.path.exists(datapath):
        os.makedirs(datapath, exist_ok=True)
    key = 'Streamstats/Streamstats.csv'      
    S3.meta.client.download_file(BUCKET_NAME, key,filepath)
    StreamStats = pd.read_csv(filepath)

#Get processed training data 
datapath = f"{HOME}/NWM_ML/Data/Processed"
file = "raw_training_data.parquet"
filepath = f"{datapath}/{file}"
try:
    raw_training_data = pd.read_parquet(filepath)
except:
    print("Data not found, retreiving from AWS S3")
    if not os.path.exists(datapath):
        os.makedirs(datapath, exist_ok=True)
    key = "NWM_ML"+datapath.split("NWM_ML",1)[1]+'/'+file       
    S3.meta.client.download_file(BUCKET_NAME, key,filepath)
    raw_training_data = pd.read_parquet(filepath)

raw_training_data.pop('Unnamed: 0')
raw_training_data['station_id'] = raw_training_data['station_id'].astype('str')
raw_training_data.head()

Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Mean_Basin_Elev_ft,Perc_Forest,Perc_Develop,Perc_Imperv,Perc_Herbace,Perc_Slop_30,Mean_Ann_Precip_in,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY
0,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-28,78.55521,-0.891007,-0.453991,0.0,1.2,55.0,301
1,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-29,98.61146,-0.891007,-0.453991,0.0,1.2,55.0,302
2,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-30,97.60208,-0.891007,-0.453991,0.0,1.1,54.0,303
3,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-31,99.33125,-0.891007,-0.453991,0.0,1.2,54.0,304
4,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-11-01,95.76354,-0.99863,0.052336,0.0,1.2,54.0,305


### Dataprocessing
* Editing the features based on the feature importance
* Remove headwater stations from dataset
* make sure dates are in datetime format

In [33]:
# Editing the features based on the feature importance should be done here!!!!!!!!!!!!!!!
Training_DF = raw_training_data.copy()
Training_DF.drop(['Mean_Ann_Precip_in', 'Perc_Herbace', 'Perc_Forest',
                        'Mean_Basin_Elev_ft'], axis=1, inplace=True)

#remove headwater stations
headwater_stations = ['10011500', '10109000', '10113500', '10128500', '10131000', '10146400', '10150500', '10154200',
'10172700', '10172800', '10172952']
Training_DF = Training_DF[~raw_training_data['station_id'].isin(headwater_stations)]

#convert dates to datetime format
Training_DF.datetime = pd.to_datetime(Training_DF.datetime)

#Select training data - testing is going to be done on 2020
x_train_temp = Training_DF[Training_DF.datetime.dt.year != 2020]
x_train_temp.pop('station_id')
x_train_temp.pop('datetime')
y_train_temp = x_train_temp['flow_cfs']
x_train_temp.pop('flow_cfs')

#Convert dataframe to numpy, scale, save scalers
y_train = y_train_temp.to_numpy()
x_train = x_train_temp.to_numpy()

scalername_x = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_x.save"
scalername_y = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_y.save"
modelpath = f"{HOME}/NWM_ML/Model/{modelname}"
if not os.path.exists(modelpath):
    os.makedirs(modelpath, exist_ok=True)

scalerfilepath_x = f"{modelpath}/{scalername_x}"
scalerfilepath_y = f"{modelpath}/{scalername_y}"

scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
joblib.dump(scaler, scalerfilepath_x)

scaler = MinMaxScaler()
y_scaled_train = scaler.fit_transform(y_train.reshape(-1, 1))
joblib.dump(scaler, scalerfilepath_y)  
y_scaled_train.shape

(128879, 1)

### Set up Testing year
* Select year(s) not used in training
* Convert to numpy array
* Load scaler and scale data

In [34]:
#Get water year for testing from larger dataset
x_test_temp = Training_DF[Training_DF.datetime.dt.year == 2020]
x_test_temp_1 = x_test_temp.copy()
station_index_list = x_test_temp_1['station_id']
x_test_temp_1.pop('station_id')
x_test_temp_1.pop('datetime')

#Get target variable (y) and convert to numpy arrays
y_test_temp_1 = x_test_temp_1['flow_cfs']
x_test_temp_1.pop('flow_cfs')
x_test_1_np = x_test_temp_1.reset_index(drop=True).to_numpy()
y_test_1_np = y_test_temp_1.reset_index(drop=True).to_numpy()

#load scalers and scale
scalername_x = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_x.save"
scalername_y = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_y.save"
modelpath = f"{HOME}/NWM_ML/Model/{modelname}"
scalerfilepath_x = f"{modelpath}/{scalername_x}"
scalerfilepath_y = f"{modelpath}/{scalername_y}"

#load scalers
scaler_x = joblib.load(scalerfilepath_x)
scaler_y = joblib.load(scalerfilepath_y)

#scale the testing data
x_test_1_scaled = scaler_x.fit_transform(x_test_1_np)
y_scaled_test_1 = scaler_y.fit_transform(y_test_1_np.reshape(-1, 1))


### Set up model training framework

In [35]:
# %% LSTM

n_targets = 1
tries = 10
#model performance metrics
cri_temp_nse = np.zeros([3, n_targets, tries])
cri_temp_rmse = np.zeros([3, n_targets, tries])
cri_temp_r2 = np.zeros([3, n_targets, tries])
cri_temp_kge = np.zeros([3, n_targets, tries])
cri_temp_lognse = np.zeros([3, n_targets, tries])

# Convert to tensor for PyTorch, Reshape Input for LSTM Model
x_train_scaled_t = torch.Tensor(x_train_scaled).unsqueeze(1)
y_train_scaled_t = torch.Tensor(y_scaled_train).unsqueeze(1)
#Make sure the tensors on are the respective device (cpu/gpu)
x_train_scaled_t = x_train_scaled_t.to(device)
y_train_scaled_t = y_train_scaled_t.to(device)

x_train_scaled_t.shape


torch.Size([128879, 1, 12])

## Train the model

* add lookback
* make the model a .py file and class when finalized. PyTorch only saves the weights of the layer/node, not the overall structure.

In [36]:
#Train the model
model_path = f"{HOME}/NWM_ML/Model/{modelname}"
start_time = time.time()

# Assuming you have your data loaded into NumPy arrays as x_train_scaled, y_train_scaled, x_test_scaled, y_test_scaled, x_scaled, y_scaled
# Hyperparameters
epochs = 5
batch_size = 50
learning_rate = 0.0001
decay = 1e-2
validation_split = 0.2
neurons = 300

# Create PyTorch datasets and dataloaders
train_dataset = TensorDataset(x_train_scaled_t, y_train_scaled_t)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False ) # might shuffle this

# Build the model
model = nn.LSTM(input_size=x_train_scaled_t.shape[2], hidden_size=neurons, bidirectional=True, batch_first=True).to(device)
fc = nn.Linear(neurons * 2, 1).to(device)  # Multiply by 2 for bidirectional LSTM

# Define loss and optimizer
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=decay) #

# Training loop 
for epoch in range(epochs):
    model.train()
    fc.train()
    total_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        

        output, _ = model(batch_x)
        output = fc(output[:, -1, :])
        loss = criterion(output, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

print('finish')
print("Run Time:" + " %s minutes " % ((time.time() - start_time)/60))
#save model - https://pytorch.org/tutorials/beginner/saving_loading_models.html, for a more efficient way to save... need to make the model as a class and we save the class...
if os.path.exists(model_path) == False:
    os.mkdir(model_path)
torch.save(model.state_dict(), f"{model_path}/{modelname}_model.pkl")
torch.save(fc.state_dict(), f"{model_path}/{modelname}_model_fc.pkl")

Epoch 1/5, Loss: 0.009890668032775395
Epoch 2/5, Loss: 0.00969704137502152
Epoch 3/5, Loss: 0.009692728656836194
Epoch 4/5, Loss: 0.009664511577092512
Epoch 5/5, Loss: 0.00967886494779579
finish
Run Time: 0.45086693366368613 minutes 


## Load the model for evaluation

In [37]:
# Build and load the model
model_path = f"{HOME}/NWM_ML/Model/{modelname}"

# Build the model
model_P = nn.LSTM(input_size=x_train_scaled_t.shape[2], hidden_size=neurons, bidirectional=True, batch_first=True).to(device)
fc_P = nn.Linear(neurons * 2, 1).to(device)  # Multiply by 2 for bidirectional LSTM

#this requires the model structure to be preloaded
model_P.load_state_dict(torch.load(f"{model_path}/{modelname}_model.pkl"))
fc_P.load_state_dict(torch.load(f"{model_path}/{modelname}_model_fc.pkl"))

#put the model scores into a dataframe for comparison
#Evaluation columns for prediction time series
cols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{modelname}_rmse", 'NWM_pbias', f"{modelname}_pbias", 
        'NWM_kge', f"{modelname}__kge", 'NWM_mape',  f"{modelname}_mape"]

#Evaluation columns for accumulated supply time series
supcols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{modelname}_rmse", 'NWM_pbias', f"{modelname}_pbias", 
        'NWM_kge', f"{modelname}__kge", 'NWM_mape',  f"{modelname}_mape", 'Obs_vol', 'NWM_vol', f"{modelname}_vol",
        'NWM_vol_err', f"{modelname}_vol_err", 'NWM_vol_Perc_diff', f"{modelname}_vol_Perc_diff"]


EvalDF = pd.DataFrame(columns = cols)
SupplyEvalDF = pd.DataFrame(columns = supcols)

# Make a prediction for each location, save as compressed pkl file, and send predictions to AWS for use in CSES

In [38]:
#get annual supply diffs
cfsday_AFday = 1.983
year = 2020

model_P = model_P.to(device)
fc_P = fc_P.to(device)


Preds_Dict = {}
for station_number in station_index_list.drop_duplicates():
  index = station_index_list == station_number
  X_test = x_test_temp_1[index]
  X_test_scaled_t = torch.Tensor(x_test_1_scaled[index]).unsqueeze(1)
  X_test_scaled_t = X_test_scaled_t.to(device)
  l = len(y_test_temp_1.values)
  y_test = torch.Tensor(np.array(y_test_temp_1.values).reshape(l,1))
  y_test = y_test.to(device)

  # Evaluation
  model_P.eval()
  with torch.no_grad():
    predictions_scaled, _ = model_P(X_test_scaled_t)
    predictions_scaled = fc_P(predictions_scaled[:, -1, :])

  # Invert scaling for actual
  predictions = scaler_y.inverse_transform(predictions_scaled.to('cpu').numpy())
  predictions[predictions<0] = 0
  predictions = pd.DataFrame(predictions, columns=[f"{modelname}_flow"])

  #save predictions, need to convert to NHDPlus reach - Need to add Datetime column and flow predictions
  #make daterange
  dates = pd.date_range(pd.to_datetime("2020-01-01"), periods=len(predictions)).strftime("%Y-%m-%d").tolist()
  predictions['Datetime'] = dates
    
  #get reach id for model eval
  nhdreach = utils.crosswalk(usgs_site_codes=station_number)
  nhdreach = nhdreach['nwm_feature_id'].iloc[0]

  #put columns in correct order
  cols = ['Datetime', f"{modelname}_flow"]
  predictions = predictions[cols]

  #save predictions to AWS so we can use CSES
  state = StreamStats['state_id'][StreamStats['NWIS_site_id'].astype(str)== station_number].values[0].lower()
  csv_key = f"{modelname}/NHD_segments_{state}.h5/{modelname[:3]}_{nhdreach}.csv"
  predictions.to_csv(f"s3://{BUCKET_NAME}/{csv_key}", index = False,  storage_options={'key': ACCESS['Access key ID'][0],
                           'secret': ACCESS['Secret access key'][0]})

  #Concat DFS and put into dictionary
  x_test_temp['nwm_feature_id'] = nhdreach
  Dfs = [predictions.reset_index(drop=True),x_test_temp[x_test_temp['station_id']==station_number].reset_index(drop=True)]
  Preds_Dict[station_number] = pd.concat(Dfs, axis=1)

  #reorganize columns
  Preds_Dict[station_number].pop('datetime')
  Preds_Dict[station_number].insert(1, f"{modelname}_flow", Preds_Dict[station_number].pop(f"{modelname}_flow"))
  Preds_Dict[station_number].insert(1, "NWM_flow", Preds_Dict[station_number].pop("NWM_flow"))
  Preds_Dict[station_number].insert(1, "flow_cfs", Preds_Dict[station_number].pop("flow_cfs"))
  Preds_Dict[station_number].insert(1, "nwm_feature_id", Preds_Dict[station_number].pop("nwm_feature_id"))
  Preds_Dict[station_number].insert(1, "station_id", Preds_Dict[station_number].pop("station_id"))

  #push data to AWS so we can use CSES
  
  
#save predictions as compressed pkl file
pred_path = f"{HOME}/NWM_ML/Predictions/Hindcast/{modelname}/{year}"
file_path = f"{pred_path}/{modelname}_predictions.pkl"
if os.path.exists(pred_path) == False:
  os.makedirs(pred_path)

with open(file_path, 'wb') as handle:
  pkl.dump(Preds_Dict, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [29]:
Preds_Dict['10105900']

Unnamed: 0,Datetime,station_id,nwm_feature_id,flow_cfs,NWM_flow,LSTM_flow,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,s1,s2,storage,swe,DOY
0,2020-01-01,10105900,666170,48.351044,61.0,11.592534,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.438371,0.898794,0.0,6.90,1
1,2020-01-02,10105900,666170,50.033333,61.0,11.503805,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.438371,0.898794,0.0,7.25,2
2,2020-01-03,10105900,666170,48.821877,60.0,11.505720,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.438371,0.898794,0.0,7.30,3
3,2020-01-04,10105900,666170,47.367710,60.0,11.527790,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.438371,0.898794,0.0,7.30,4
4,2020-01-05,10105900,666170,46.633335,60.0,11.549849,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.438371,0.898794,0.0,7.30,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,2020-09-26,10105900,666170,25.114584,71.0,32.627487,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.529919,-0.848048,0.0,0.00,270
270,2020-09-27,10105900,666170,25.642708,71.0,32.649532,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.529919,-0.848048,0.0,0.00,271
271,2020-09-28,10105900,666170,26.336458,71.0,32.671574,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.529919,-0.848048,0.0,0.00,272
272,2020-09-29,10105900,666170,26.250000,70.0,32.689312,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.529919,-0.848048,0.0,0.00,273
