In [1]:
# hydrological packages
import hydroeval as he
from hydrotools.nwm_client import utils 

# basic packages
import numpy as np
import pandas as pd
import os
import pyarrow as pa
import pyarrow.parquet as pq
import bz2file as bz2

# system packages
from progressbar import ProgressBar
from datetime import datetime, date
import datetime
import pickle as pkl
import warnings
warnings.filterwarnings("ignore")
import platform
import time

# data analysi packages
from scipy import optimize
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import joblib

# deep learning packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

#Shared/Utility scripts
import sys
import boto3
import s3fs
sys.path.insert(0, '..') #sys allows for the .ipynb file to connect to the shared folder files

#load access key
HOME = os.path.expanduser('~')
KEYPATH = "NWM_ML/AWSaccessKeys.csv"
ACCESS = pd.read_csv(f"{HOME}/{KEYPATH}")

#start session
SESSION = boto3.Session(
    aws_access_key_id=ACCESS['Access key ID'][0],
    aws_secret_access_key=ACCESS['Secret access key'][0],
)
S3 = SESSION.resource('s3')
#AWS BUCKET information
BUCKET_NAME = 'streamflow-app-data'
BUCKET = S3.Bucket(BUCKET_NAME)

#s3fs
fs = s3fs.S3FileSystem(anon=False, key=ACCESS['Access key ID'][0], secret=ACCESS['Secret access key'][0])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

modelname = "MLP"

print(f"{modelname} development script")

Device: cuda
MLP development script


In [2]:
#Get streamstats data 
datapath = f"{HOME}/NWM_ML/Data/input"
file = "Streamstats.csv"
filepath = f"{datapath}/{file}"
try:
    StreamStats = pd.read_csv(filepath)
except:
    print("Data not found, retreiving from AWS S3")
    if not os.path.exists(datapath):
        os.makedirs(datapath, exist_ok=True)
    key = 'Streamstats/Streamstats.csv'      
    S3.meta.client.download_file(BUCKET_NAME, key,filepath)
    StreamStats = pd.read_csv(filepath)

#Get processed training data 
datapath = f"{HOME}/NWM_ML/Data/Processed"
file = "raw_training_data.parquet"
filepath = f"{datapath}/{file}"
try:
    raw_training_data = pd.read_parquet(filepath)
except:
    print("Data not found, retreiving from AWS S3")
    if not os.path.exists(datapath):
        os.makedirs(datapath, exist_ok=True)
    key = "NWM_ML"+datapath.split("NWM_ML",1)[1]+'/'+file       
    S3.meta.client.download_file(BUCKET_NAME, key,filepath)
    raw_training_data = pd.read_parquet(filepath)

raw_training_data.pop('Unnamed: 0')
raw_training_data['station_id'] = raw_training_data['station_id'].astype('str')
raw_training_data.head()

Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Mean_Basin_Elev_ft,Perc_Forest,Perc_Develop,Perc_Imperv,Perc_Herbace,Perc_Slop_30,Mean_Ann_Precip_in,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY
0,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-28,78.55521,-0.891007,-0.453991,0.0,1.2,55.0,301
1,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-29,98.61146,-0.891007,-0.453991,0.0,1.2,55.0,302
2,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-30,97.60208,-0.891007,-0.453991,0.0,1.1,54.0,303
3,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-31,99.33125,-0.891007,-0.453991,0.0,1.2,54.0,304
4,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-11-01,95.76354,-0.99863,0.052336,0.0,1.2,54.0,305


### Dataprocessing
* Editing the features based on the feature importance
* Remove headwater stations from dataset
* make sure dates are in datetime format

In [8]:
# Editing the features based on the feature importance should be done here!!!!!!!!!!!!!!!
Training_DF = raw_training_data.copy()
Training_DF.drop(['Mean_Ann_Precip_in', 'Perc_Herbace', 'Perc_Forest',
                        'Mean_Basin_Elev_ft'], axis=1, inplace=True)

#remove headwater stations
headwater_stations = ['10011500', # Bear River headwaters before WY state line
                      '10109000', # Logan River above dams
                      '10113500', # HW Blacksmith fork
                      '10128500', # Upper Weber above Oakley
                      '10131000', #Chalk creek before Weber - lots of upstream irrigation, potentially include
                        '10146400', #Currant Creek above Mona Reservoir - lots of upstream irrigation, potentially include
                        '10150500', #Spanish fork after diamond fork - potentially include because of 6th water diversion CUP
                        '10154200', #Upper Provo river after confluence of N/S forks - potentially include because of duchense tunnel water diversion CUP
                        '10172700', #Vernon creek 2 ranges west of Utah Lake, shouldnt be included because not in GSL basin 
                        '10172800', #Willow creek west of Gransville,  shouldnt be included because does not make it to GSL
                          '10172952'] #Dunn creek in Raft River Range, shouldnt be included because drains to bonnevile salt flats 
Training_DF = Training_DF[~raw_training_data['station_id'].isin(headwater_stations)]

#convert dates to datetime format
Training_DF.datetime = pd.to_datetime(Training_DF.datetime)

#Select training data - testing is going to be done on 2020
x_train_temp = Training_DF[Training_DF.datetime.dt.year != 2020]
x_train_temp.pop('station_id')
x_train_temp.pop('datetime')
y_train_temp = x_train_temp['flow_cfs']
x_train_temp.pop('flow_cfs')

#Convert dataframe to numpy, scale, save scalers
y_train = y_train_temp.to_numpy()
x_train = x_train_temp.to_numpy()

scalername_x = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_x.save"
scalername_y = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_y.save"
modelpath = f"{HOME}/NWM_ML/Model/{modelname}"
if not os.path.exists(modelpath):
    os.makedirs(modelpath, exist_ok=True)

scalerfilepath_x = f"{modelpath}/{scalername_x}"
scalerfilepath_y = f"{modelpath}/{scalername_y}"

scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
joblib.dump(scaler, scalerfilepath_x)

scaler = MinMaxScaler()
y_scaled_train = scaler.fit_transform(y_train.reshape(-1, 1))
joblib.dump(scaler, scalerfilepath_y)  
print(y_scaled_train.shape)
print(x_train_scaled.shape)

(128879, 1)
(128879, 12)


In [9]:
x_train_temp

Unnamed: 0,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,s1,s2,storage,swe,NWM_flow,DOY
3079,41.575490,-111.85522,180.0,1.01,0.0653,44.20,-0.891007,-0.453991,0.0,0.0,37.0,275
3080,41.575490,-111.85522,180.0,1.01,0.0653,44.20,-0.891007,-0.453991,0.0,0.0,36.0,276
3081,41.575490,-111.85522,180.0,1.01,0.0653,44.20,-0.891007,-0.453991,0.0,0.0,36.0,277
3082,41.575490,-111.85522,180.0,1.01,0.0653,44.20,-0.891007,-0.453991,0.0,0.0,36.0,278
3083,41.575490,-111.85522,180.0,1.01,0.0653,44.20,-0.891007,-0.453991,0.0,0.0,36.0,279
...,...,...,...,...,...,...,...,...,...,...,...,...
179201,40.733557,-111.92327,3430.0,14.70,4.3700,4.94,-0.829038,0.559193,0.0,0.0,3309.0,361
179202,40.733557,-111.92327,3430.0,14.70,4.3700,4.94,-0.829038,0.559193,0.0,0.0,3311.0,362
179203,40.733557,-111.92327,3430.0,14.70,4.3700,4.94,-0.829038,0.559193,0.0,0.0,3313.0,363
179204,40.733557,-111.92327,3430.0,14.70,4.3700,4.94,-0.829038,0.559193,0.0,0.0,3315.0,364


### Set up Testing year
* Select year(s) not used in training
* Convert to numpy array
* Load scaler and scale data

In [10]:
#Get water year for testing from larger dataset
x_test_temp = Training_DF[Training_DF.datetime.dt.year == 2020]
x_test_temp_1 = x_test_temp.copy()
station_index_list = x_test_temp_1['station_id']
x_test_temp_1.pop('station_id')
x_test_temp_1.pop('datetime')

#Get target variable (y) and convert to numpy arrays
y_test_temp_1 = x_test_temp_1['flow_cfs']
x_test_temp_1.pop('flow_cfs')
x_test_1_np = x_test_temp_1.reset_index(drop=True).to_numpy()
y_test_1_np = y_test_temp_1.reset_index(drop=True).to_numpy()

#load scalers and scale
scalername_x = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_x.save"
scalername_y = "Area_Perc_Seas_stor_swe_NWM_DOY_scaler_y.save"
modelpath = f"{HOME}/NWM_ML/Model/{modelname}"
scalerfilepath_x = f"{modelpath}/{scalername_x}"
scalerfilepath_y = f"{modelpath}/{scalername_y}"

#load scalers
scaler_x = joblib.load(scalerfilepath_x)
scaler_y = joblib.load(scalerfilepath_y)

#scale the testing data
x_test_1_scaled = scaler_x.fit_transform(x_test_1_np)
y_scaled_test_1 = scaler_y.fit_transform(y_test_1_np.reshape(-1, 1))
print(y_scaled_test_1.shape)
print(x_test_1_scaled.shape)

(5473, 1)
(5473, 12)


### Set up model training framework

In [11]:
# %% MLP

n_targets = 1
tries = 10
#model performance metrics
cri_temp_nse = np.zeros([3, n_targets, tries])
cri_temp_rmse = np.zeros([3, n_targets, tries])
cri_temp_r2 = np.zeros([3, n_targets, tries])
cri_temp_kge = np.zeros([3, n_targets, tries])
cri_temp_lognse = np.zeros([3, n_targets, tries])

# Convert to tensor for PyTorch
x_train_scaled_t = torch.Tensor(x_train_scaled)
y_train_scaled_t = torch.Tensor(y_scaled_train)
#Make sure the tensors on are the respective device (cpu/gpu)
x_train_scaled_t = x_train_scaled_t.to(device)
y_train_scaled_t = y_train_scaled_t.to(device)


## Train the model

In [6]:
#Train the model
model_path = f"{HOME}/NWM_ML/Model/{modelname}"

start_time = time.time()

# Hyperparameters
epochs = 20
batch_size = 100
learning_rate = 0.001
decay = 1e-2
validation_split = 0.2
neurons = 150
LD1=128
LD2=128
LD3=64
LD4=64
LD5=32
LD6=16
LD7=5

# Create PyTorch datasets and dataloaders
train_dataset = TensorDataset(x_train_scaled_t, y_train_scaled_t)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False )

# Build the model
model = nn.Sequential(
    nn.Linear(12, LD1),
    nn.ReLU(),
    nn.Linear(LD1, LD2),
    nn.ReLU(),
    nn.Linear(LD2, LD3),
    nn.ReLU(),
    nn.Linear(LD3, LD4),
    nn.ReLU(),
    nn.Linear(LD4, LD5),
    nn.ReLU(),
    nn.Linear(LD5, LD6),
    nn.ReLU(),
    nn.Linear(LD6, 1)
).to(device)


# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop

for epoch in range(epochs):
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

print('finish')
print("Run Time:" + " %s seconds " % (time.time() - start_time))

#save model
if os.path.exists(model_path) == False:
    os.makedirs(model_path)
torch.save(model.state_dict(), f"{model_path}/{modelname}_model.pkl")

Epoch 1/20, Loss: 0.0012856289789382174
Epoch 2/20, Loss: 0.001025355682417852
Epoch 3/20, Loss: 0.0008956932600489864
Epoch 4/20, Loss: 0.000903131655158326
Epoch 5/20, Loss: 0.0006740855669195825
Epoch 6/20, Loss: 0.0005337522594036082
Epoch 7/20, Loss: 0.0005204764505290946
Epoch 8/20, Loss: 0.0006055072047146098


KeyboardInterrupt: 

## Load the model for evaluation

In [None]:
# Build and load the model
model_path = f"{HOME}/NWM_ML/Model/{modelname}"
# Hyperparameters
epochs = 20
batch_size = 100
learning_rate = 0.001
decay = 1e-2
validation_split = 0.2
neurons = 150
LD1=128
LD2=128
LD3=64
LD4=64
LD5=32
LD6=16
LD7=5

#device = torch.device('cpu') # for some reason had to change to cpu
models = nn.Sequential(
    nn.Linear(12, LD1),
    nn.ReLU(),
    nn.Linear(LD1, LD2),
    nn.ReLU(),
    nn.Linear(LD2, LD3),
    nn.ReLU(),
    nn.Linear(LD3, LD4),
    nn.ReLU(),
    nn.Linear(LD4, LD5),
    nn.ReLU(),
    nn.Linear(LD5, LD6),
    nn.ReLU(),
    nn.Linear(LD6, 1)
).to(device)

models.load_state_dict(torch.load(f"{model_path}/{modelname}_model.pkl"))

#put the model scores into a dataframe for comparison
#Evaluation columns for prediction time series
cols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{modelname}_rmse", 'NWM_pbias', f"{modelname}_pbias", 
        'NWM_kge', f"{modelname}__kge", 'NWM_mape',  f"{modelname}_mape"]

#Evaluation columns for accumulated supply time series
supcols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{modelname}_rmse", 'NWM_pbias', f"{modelname}_pbias", 
        'NWM_kge', f"{modelname}__kge", 'NWM_mape',  f"{modelname}_mape", 'Obs_vol', 'NWM_vol', f"{modelname}_vol",
        'NWM_vol_err', f"{modelname}_vol_err", 'NWM_vol_Perc_diff', f"{modelname}_vol_Perc_diff"]


EvalDF = pd.DataFrame(columns = cols)
SupplyEvalDF = pd.DataFrame(columns = supcols)

# Make a prediction for each location, save as compressed pkl file, and send predictions to AWS for use in CSES

In [None]:
#get annual supply diffs
cfsday_AFday = 1.983
year = 2020


Preds_Dict = {}
for station_number in station_index_list.drop_duplicates():
  #print(station_number)
  index = station_index_list == station_number
  X_test = x_test_temp_1[index]
  X_test_scaled_t = torch.Tensor(x_test_1_scaled[index])
  X_test_scaled_t = X_test_scaled_t.to(device)
  l = len(y_test_temp_1.values)
  y_test = torch.Tensor(np.array(y_test_temp_1.values).reshape(l,1))
  y_test = y_test.to(device)

  # Evaluation
  models.eval()
  with torch.no_grad():
      predictions_scaled= models(X_test_scaled_t)

  # Invert scaling for actual
  predictions = scaler_y.inverse_transform(predictions_scaled.to('cpu').numpy())
  predictions[predictions<0] = 0

  #print('Model Predictions complete')

  predictions = pd.DataFrame(predictions, columns=[f"{modelname}_flow"])

  #save predictions, need to convert to NHDPlus reach - Need to add Datetime column and flow predictions
  #make daterange
  dates = pd.date_range(pd.to_datetime("2020-01-01"), periods=len(predictions)).strftime("%Y-%m-%d").tolist()
  predictions['Datetime'] = dates
    
  #get reach id for model eval
  nhdreach = utils.crosswalk(usgs_site_codes=station_number)
  nhdreach = nhdreach['nwm_feature_id'].iloc[0]

  #put columns in correct order
  cols = ['Datetime', f"{modelname}_flow"]
  predictions = predictions[cols]

  #save predictions to AWS so we can use CSES
  state = StreamStats['state_id'][StreamStats['NWIS_site_id'].astype(str)== station_number].values[0].lower()
  csv_key = f"{modelname}/NHD_segments_{state}.h5/{modelname[:3]}_{nhdreach}.csv"
  predictions.to_csv(f"s3://{BUCKET_NAME}/{csv_key}", index = False,  storage_options={'key': ACCESS['Access key ID'][0],
                           'secret': ACCESS['Secret access key'][0]})

  #Concat DFS and put into dictionary
  x_test_temp['nwm_feature_id'] = nhdreach
  Dfs = [predictions.reset_index(drop=True),x_test_temp[x_test_temp['station_id']==station_number].reset_index(drop=True)]
  Preds_Dict[station_number] = pd.concat(Dfs, axis=1)

  #reorganize columns
  Preds_Dict[station_number].pop('datetime')
  Preds_Dict[station_number].insert(1, f"{modelname}_flow", Preds_Dict[station_number].pop(f"{modelname}_flow"))
  Preds_Dict[station_number].insert(1, "NWM_flow", Preds_Dict[station_number].pop("NWM_flow"))
  Preds_Dict[station_number].insert(1, "flow_cfs", Preds_Dict[station_number].pop("flow_cfs"))
  Preds_Dict[station_number].insert(1, "nwm_feature_id", Preds_Dict[station_number].pop("nwm_feature_id"))
  Preds_Dict[station_number].insert(1, "station_id", Preds_Dict[station_number].pop("station_id"))

  #push data to AWS so we can use CSES
  
  
#save predictions as compressed pkl file
pred_path = f"{HOME}/NWM_ML/Predictions/Hindcast/{modelname}/{year}"
file_path = f"{pred_path}/{modelname}_predictions.pkl"
if os.path.exists(pred_path) == False:
    os.makedirs(pred_path)
with open(file_path, 'wb') as handle:
    pkl.dump(Preds_Dict, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [10]:
Preds_Dict['10157500']

Unnamed: 0,Datetime,station_id,nwm_feature_id,flow_cfs,NWM_flow,MLP_flow,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,s1,s2,storage,swe,DOY
0,2020-01-01,10157500,10375690,0.225000,49.0,35.583889,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.438371,0.898794,0.0,0.0,1
1,2020-01-02,10157500,10375690,0.270000,49.0,35.578308,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.438371,0.898794,0.0,0.0,2
2,2020-01-03,10157500,10375690,0.275000,48.0,35.515900,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.438371,0.898794,0.0,0.0,3
3,2020-01-04,10157500,10375690,0.250000,48.0,35.510326,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.438371,0.898794,0.0,0.0,4
4,2020-01-05,10157500,10375690,0.285000,48.0,35.504749,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.438371,0.898794,0.0,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,2020-09-19,10157500,10375690,0.010000,63.0,40.438568,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.529919,-0.848048,0.0,0.0,270
263,2020-09-20,10157500,10375690,0.010000,62.0,40.336639,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.529919,-0.848048,0.0,0.0,271
264,2020-09-21,10157500,10375690,0.010833,62.0,40.302391,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.529919,-0.848048,0.0,0.0,272
265,2020-09-22,10157500,10375690,0.017273,62.0,40.268143,40.460789,-111.472687,49.8,2.37,0.16,49.1,-0.529919,-0.848048,0.0,0.0,273
