In [None]:
# %pip install progressbar xgboost matplotlib boto3 openpyxl tqdm hydroeval hydrotools

In [1]:
# hydrological packages
from hydrotools.nwm_client import utils 

# my packages
from g_evaluation_metric import MAPE, RMSE, KGE, PBias
from s_evalaution_table import evtab
import s_FigureGenerator

# basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import importlib

# system packages
from progressbar import ProgressBar
from datetime import datetime, date
import pickle
import warnings
warnings.filterwarnings("ignore")
import platform
import time

# data analysis packages
from scipy import optimize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# deep learning packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


ModuleNotFoundError: No module named 'evaluation_metric'

In [None]:
# Resolve machine-specific data locations from the operating system.
# Windows and macOS define `onedrive_path` / `box_path`; Linux defines `path_01`.
system_name = platform.system()

if system_name == 'Windows':
    onedrive_path = 'E:/OneDrive/OneDrive - The University of Alabama/10.material/01.data/usgs_data/'
    box_path = 'C:/Users/snaserneisary/Box/NWM-ML/'

elif system_name == 'Darwin':
    onedrive_path = '/Users/savalan/Library/CloudStorage/OneDrive-TheUniversityofAlabama/02.projects/03.ciroh/04.data/'
    box_path = '/Users/savalan/Library/CloudStorage/Box-Box/NWM-ML/Data/NWM/ut/'

elif system_name == 'Linux':
    path_01 = '/home/snaserneisary/01.projects/01.ciroh_p8/NWM-ML/Savalan/'

# The rest of the notebook builds every input/output path from `path_01`,
# which only the Linux branch sets. Fail here with an explicit message rather
# than with a NameError in a later cell.
if 'path_01' not in globals():
    raise RuntimeError(
        f"`path_01` is not configured for platform {system_name!r}; "
        "set the project root for this machine before running the notebook."
    )

In [None]:
# Read the raw training table from the project output folder.
raw_csv_path = path_01 + '03.output/raw_training_data.csv'
raw_training_data = pd.read_csv(raw_csv_path)

# Discard the 'Unnamed: 0' column (presumably a saved index — TODO confirm).
del raw_training_data['Unnamed: 0']

# Station ids are handled as strings throughout the notebook.
station_ids_as_text = raw_training_data['station_id'].astype('str')
raw_training_data['station_id'] = station_ids_as_text

raw_training_data.head()

In [None]:
# Work on a copy so that `raw_training_data` stays untouched by later edits.
Training_DF = raw_training_data.copy()

### Edit the feature set in the next cell, based on the feature importance

In [None]:
# Feature selection: edit this list according to the feature-importance results.
dropped_features = ['precipitation_in', 'temperature_F', 'Mean_Ann_Precip_in', 'Perc_Herbace', 'Perc_Forest',
                    'Mean_Basin_Elev_ft']

# Build the working frame from `raw_training_data` instead of dropping in place,
# so re-running this cell gives the same result rather than raising a KeyError
# for columns that were already removed.
Training_DF = raw_training_data.drop(columns=dropped_features)



### Remove headwater stations

In [None]:
# USGS station ids that are excluded from training and evaluation.
headwater_stations = ['10011500', '10109000', '10113500', '10128500', '10131000', '10146400', '10150500', '10154200',
'10172700', '10172800', '10172952']

# Build the mask from the frame being filtered (not from `raw_training_data`),
# so mask and frame always share the same index, also when the cell is re-run.
is_headwater = Training_DF['station_id'].isin(headwater_stations)

# .copy() gives an independent frame, so later column assignments do not act
# on a slice of another DataFrame.
Training_DF = Training_DF[~is_headwater].copy()

In [None]:
# Parse the 'datetime' column into pandas timestamps so it can be compared against dates.
Training_DF['datetime'] = pd.to_datetime(Training_DF['datetime'])
Training_DF.head()

In [None]:
# Training period: every record dated before the 2015 cut-off.
is_train_period = Training_DF['datetime'] < '01-01-2015'
x_train_temp = Training_DF[is_train_period]

# The target is split off first; popping returns the removed column.
y_train_temp = x_train_temp.pop('flow_cfs')

# Identifier and timestamp columns are not model inputs.
for unused_column in ('station_id', 'datetime'):
    x_train_temp.pop(unused_column)

x_train_temp.head()

In [None]:
# Scale the training inputs and targets of the NN model.
# First convert them from pandas objects to numpy arrays.
y_train = y_train_temp.to_numpy()
x_train = x_train_temp.to_numpy()

# Features and target each get their own scaler. Fitting a single MinMaxScaler
# on the features and then again on the target overwrites the feature min/max,
# which makes it impossible to apply the training feature scaling to new data.
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
# MinMaxScaler expects a 2-D array, hence the reshape to a single column.
y_scaled_train = y_scaler.fit_transform(y_train.reshape(-1, 1))

# `scaler` stays bound to the target scaler so that code which inverse-transforms
# predictions through `scaler` keeps working.
scaler = y_scaler
y_scaled_train.shape

In [None]:
# Test period: every record dated on or after the 2015 cut-off.
is_test_period = Training_DF['datetime'] >= '01-01-2015'
x_test_temp = Training_DF.loc[is_test_period]
x_test_temp.head()

In [None]:
# Scale the test inputs and targets of the NN model.
# Work on a copy so `x_test_temp` keeps its station_id / datetime / flow_cfs
# columns for the evaluation tables.
x_test_temp_1 = x_test_temp.copy()

# Station id of every test row; used later to evaluate station by station.
station_index_list = x_test_temp_1.pop('station_id')
x_test_temp_1.pop('datetime')
y_test_temp_1 = x_test_temp_1.pop('flow_cfs')

# Convert from pandas objects to numpy arrays.
x_test_1_np = x_test_temp_1.reset_index(drop=True).to_numpy()
y_test_1_np = y_test_temp_1.reset_index(drop=True).to_numpy()

# Test data must be scaled with statistics learned from the TRAINING data only.
# Re-fitting on the test set would feed the network inputs on a different scale
# than it was trained on and would make `scaler.inverse_transform` use
# test-target statistics when converting predictions back to cfs.
x_scaler = MinMaxScaler().fit(x_train)
x_test_1_scaled = x_scaler.transform(x_test_1_np)

# `scaler` was last fitted on the training targets in the train-scaling cell,
# so it is only applied here, never re-fitted.
y_scaled_test_1 = scaler.transform(y_test_1_np.reshape(-1, 1))

In [None]:
# Convert the scaled training arrays to torch tensors for the MLP model.
# NOTE: despite the `_test` suffix, both tensors hold TRAINING data.
x_train_scaled_test = torch.Tensor(x_train_scaled)
y_train_scaled_test = torch.Tensor(y_scaled_train)

# Labels name what is actually printed: training features and training targets.
print('x_train shape', x_train_scaled_test.shape)
print('y_train shape', y_train_scaled_test.shape)

In [None]:
# MODEL CLASS
import torch
import torch.nn as nn
import torch.optim as optim

class CustomMLP(nn.Module):
    """Fully connected feed-forward network with ReLU hidden activations.

    One ``nn.Linear`` layer is created for each consecutive pair of entries in
    ``layer_sizes``. ReLU is applied after every layer except the last one,
    whose output is returned as is. The loss function is ``nn.MSELoss``.

    Parameters
    ----------
    layer_sizes : sequence of int
        Layer widths, input size first and output size last.
    optimizer : torch.optim.Optimizer or None
        Optimizer used by ``train_model``. An optimizer needs the model's
        parameters, so ``None`` may be passed here and the optimizer attached
        afterwards by assigning ``model.optim``.
    device : torch.device, str or None, optional
        Device the model and every batch are moved to. ``None`` selects CPU.
    """

    def __init__(self, layer_sizes, optimizer, device=None):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])]
        )
        self.loss_function = nn.MSELoss()
        # Resolve the default once so every later `.to(self.device)` gets a real device.
        self.device = torch.device('cpu') if device is None else device
        self.to(self.device)
        # 1 while evaluate_model is called from inside train_model (silences its print).
        self.validation_indicator = 0
        self.optim = optimizer

    def forward(self, x):
        """Return the network output for ``x`` (ReLU on all layers but the last)."""
        for layer in self.layers[:-1]:
            x = torch.relu(layer(x))
        return self.layers[-1](x)

    def train_model(self, train_loader, epochs, early_stopping_patience=0, save_path=None, val_loader=None):
        """Train the network with ``self.optim``.

        Parameters
        ----------
        train_loader : iterable of (inputs, targets) batches
        epochs : int
            Maximum number of passes over ``train_loader``.
        early_stopping_patience : int, optional
            Stop after this many consecutive epochs without a better validation
            loss. ``0`` disables early stopping. Only used with ``val_loader``.
        save_path : str or None, optional
            Where the state dict is written whenever the validation loss
            improves. ``None`` skips saving.
        val_loader : iterable of (inputs, targets) batches or None, optional
            Validation data evaluated after each epoch.

        Raises
        ------
        RuntimeError
            If no optimizer has been attached to the model.
        """
        if self.optim is None:
            raise RuntimeError('No optimizer attached: pass one to CustomMLP(...) or set `model.optim`.')

        best_val_loss = float('inf')
        epochs_no_improve = 0

        for epoch in range(epochs):
            self.train()  # Set the model to training mode
            loss = None
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(self.device), targets.to(self.device)

                # Use the optimizer owned by this model, not a module-level name.
                self.optim.zero_grad()
                outputs = self(inputs)
                loss = self.loss_function(outputs, targets)
                loss.backward()
                self.optim.step()

            val_loss = 0
            if val_loader is not None:
                self.validation_indicator = 1
                val_loss = self.evaluate_model(val_loader)[1]

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    # torch.save cannot write to None, so saving is optional.
                    if save_path is not None:
                        torch.save(self.state_dict(), save_path)
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1

                if epochs_no_improve == early_stopping_patience and early_stopping_patience > 0:
                    print('Early stopping triggered')
                    break

            # The reported training loss is that of the last batch of the epoch;
            # an empty loader leaves it undefined, reported as NaN.
            train_loss = loss.item() if loss is not None else float('nan')
            print(f'Epoch {epoch+1}/{epochs}, Training Loss: {train_loss}', f'Validation Loss: {val_loss}')
        self.validation_indicator = 0
        print('Training is done!')

    def evaluate_model(self, data_loader):
        """Run the model over ``data_loader`` without gradient tracking.

        Returns
        -------
        outputs : torch.Tensor
            Predictions for every sample, concatenated in loader order.
        avg_loss : float
            Sample-weighted mean of the per-batch MSE losses.

        Raises
        ------
        ValueError
            If ``data_loader`` yields no samples.
        """
        self.eval()  # Set the model to evaluation mode
        total_loss = 0.0
        total = 0
        batch_outputs = []
        with torch.no_grad():
            for inputs, targets in data_loader:
                inputs, targets = inputs.to(self.device), targets.to(self.device)

                outputs = self(inputs)
                loss = self.loss_function(outputs, targets)
                # Weight by batch size so a smaller final batch does not skew the mean.
                total_loss += loss.item() * inputs.size(0)
                total += inputs.size(0)
                batch_outputs.append(outputs)

        if total == 0:
            raise ValueError('data_loader yielded no samples to evaluate.')

        avg_loss = total_loss / total
        if self.validation_indicator == 0:
            print(f'Validation Loss: {avg_loss}')
        # Return predictions for all batches, not only the last one processed.
        return torch.cat(batch_outputs, dim=0), avg_loss

    def save_model(self, file_path):
        """Write the model's state dict to ``file_path``."""
        torch.save(self.state_dict(), file_path)

    def load_model(self, file_path):
        """Load a state dict from ``file_path``, mapping tensors onto ``self.device``."""
        self.load_state_dict(torch.load(file_path, map_location=self.device))


In [None]:
# Create PyTorch datasets and dataloaders

X_train, X_valid, y_train, y_valid = train_test_split(x_train_scaled_test, y_train_scaled_test)
train_dataset = TensorDataset(x_train_scaled_test, y_train_scaled_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
validation_dataset = TensorDataset(X_valid, y_valid)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

In [None]:
mod='MLP'
tries = 1
epochs = 10
batch_size = 50
learning_rate = 1e-4
early_stopping_patience = 5
decay = 0
path_model_save = f"{path_01}/03.output/mlp/best_model.pkl"
layer_sizes = [x_train_scaled_test.shape[1] ,128, 128, 64, 64, 32, 16, 1]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

In [None]:
start_time = time.time()

# Stations present in the test period; each one gets a row in the result tables.
test_stations = station_index_list.drop_duplicates()
n_stations = len(test_stations)

# Create variables
test_best_val = float('inf')
EvalDF = {}          # per-try evaluation table (10 columns per station)
SupplyEvalDF = {}    # per-try supply evaluation table (17 columns per station)
EvalDF_all = np.zeros([n_stations, 10])
SupplyEvalDF_all = np.zeros([n_stations, 17])

# The scaled test tensors do not change between tries or stations: build them once.
temp_x_scaled_test = torch.Tensor(x_test_1_scaled)
temp_y_scaled_test = torch.Tensor(y_scaled_test_1)

# Start running the model several times.
for try_number in range(1, tries+1):

    # Create the variables.
    EvalDF[try_number] = np.zeros([n_stations, 10])
    SupplyEvalDF[try_number] = np.zeros([n_stations, 17])
    SitesDict = {}
    val_loss_all = 0
    print(f'Trial Number {try_number} ==========================================================')

    # Create the model first, then the optimizer (which needs the model's
    # parameters), attach it to the model, and train.
    mlp_model = CustomMLP(layer_sizes, None, device)
    mlp_optimizer = optim.Adam(mlp_model.parameters(), lr=learning_rate, weight_decay=decay)
    mlp_model.optim = mlp_optimizer
    mlp_model.train_model(train_loader, epochs, early_stopping_patience, path_model_save, validation_loader)

    # Evaluate it for different stations.
    for station_index, station_number in enumerate(test_stations):
        index = station_index_list == station_number # Rows that belong to this station.
        index_np = torch.tensor(index.to_numpy())
        test_dataset = TensorDataset(temp_x_scaled_test[index_np], temp_y_scaled_test[index_np])
        # One batch holding every row of this station, so the returned predictions
        # line up row for row with x_test_temp[index].
        test_loader = DataLoader(test_dataset, batch_size=test_dataset.tensors[0].shape[0], shuffle=False)

        # Evaluation
        yhat_test, val_loss = mlp_model.evaluate_model(test_loader)
        # Accumulate per station so the mean below covers all stations.
        val_loss_all += val_loss

        # Invert scaling for the predictions and concat them with the rest of the dataset.
        # NOTE(review): `scaler` must hold the target (flow_cfs) scaling at this
        # point — confirm against the scaling cells.
        # .cpu() is required before .numpy() when the model runs on CUDA.
        inv_yhat_test = scaler.inverse_transform(yhat_test.detach().cpu().numpy())
        # Negative flows are clipped to zero; flagged by the author as a
        # workaround that is not correct.
        inv_yhat_test[inv_yhat_test<0] = 0 # THIS IS NOT CORRECT !!!!!!!!!!!!!!!
        nwm_test = pd.DataFrame(inv_yhat_test, columns=['MLP_flow'])
        Dfs = [nwm_test.reset_index(drop=True), x_test_temp[index].reset_index(drop=True)]
        Eval_DF_mine = pd.concat(Dfs, axis=1)

        # Get reach id for model eval.
        nhdreach = utils.crosswalk(usgs_site_codes=station_number)
        nhdreach = nhdreach['nwm_feature_id'].iloc[0]

        # Store under this station's own reach id, which is only known after the crosswalk.
        SitesDict[nhdreach] = Eval_DF_mine

        # Calculate the results.
        prediction_columns = ['NWM_flow', f"{mod}_flow"]
        observation_column = 'flow_cfs'
        result = evtab(Eval_DF_mine, prediction_columns, nhdreach, observation_column, mod)
        EvalDF[try_number][station_index, :] = result[0]
        SupplyEvalDF[try_number][station_index, :] = result[1]

    # Finding the best model: lowest mean test loss over the stations.
    val_loss_all = val_loss_all / n_stations
    if val_loss_all < test_best_val:
        test_best_val = val_loss_all
        best_model = mlp_model.state_dict()
        best_try = try_number
        best_output = SitesDict
    EvalDF_all = EvalDF[try_number] + EvalDF_all
    print(EvalDF_all.shape)
    SupplyEvalDF_all = SupplyEvalDF[try_number] + SupplyEvalDF_all

# Save the average results for all of the trials.
EvalDF_all = EvalDF_all / tries
SupplyEvalDF_all = SupplyEvalDF_all / tries

# Sort the outputs of the best model based on date.
keys = list(best_output.keys())
for key_number in keys:
    best_output[key_number] = best_output[key_number].sort_values(by='datetime')

print('finish')
print("Run Time:" + " %s seconds " % (time.time() - start_time))


In [None]:
# Put the model scores into a dataframe for comparison
mod = 'MLP'

# Evaluation columns for prediction time series
# NOTE(review): f"{mod}__kge" has a double underscore, unlike the other metric
# columns; kept as is in case downstream code reads that name — confirm.
cols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{mod}_rmse", 'NWM_pbias', f"{mod}_pbias", 
        'NWM_kge', f"{mod}__kge", 'NWM_mape',  f"{mod}_mape"]

# Evaluation columns for accumulated supply time series
supcols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{mod}_rmse", 'NWM_pbias', f"{mod}_pbias", 
        'NWM_kge', f"{mod}__kge", 'NWM_mape',  f"{mod}_mape", 'Obs_vol', 'NWM_vol', f"{mod}_vol",
        'NWM_vol_err', f"{mod}_vol_err", 'NWM_vol_Perc_diff', f"{mod}_vol_Perc_diff"]

# Save model results: write the averaged tables (`*_all`). `EvalDF` and
# `SupplyEvalDF` are per-try dicts of arrays and have no `to_csv`.
EvalDF_all = pd.DataFrame(EvalDF_all, columns=cols)
SupplyEvalDF_all = pd.DataFrame(SupplyEvalDF_all, columns=supcols)
path_save_data = f"{path_01}/03.output/02.mlp/012.data/" 
EvalDF_all.to_csv(f"{path_save_data}/{mod}_Performance.csv")   
SupplyEvalDF_all.to_csv(f"{path_save_data}/{mod}_Supply_Performance.csv")


In [None]:
# Show each averaged evaluation table under its caption.
captioned_tables = (
    ("Model Performance for Daily cfs", EvalDF_all),
    ("Model Performance for Daily Accumulated Supply (Acre-Feet)", SupplyEvalDF_all),
)
for caption, table in captioned_tables:
    print(caption)
    display(table)


In [None]:
# Reload the plotting module so edits made to it are picked up without a kernel restart.
importlib.reload(s_FigureGenerator)

model = 'MLP'
plotname = 'MLP_TS_plot'
freq = 'D'
supply = True
title = 'Observed and Modeled flows for NHDPlus Reaches \n with Upstream Reservoirs in the Great Salt Lake Basin'
path_figures = f"{path_01}/03.output/02.mlp/01.figures/{plotname}.png"

# Pass the figure path built above; no variable named `path` is defined in this notebook.
s_FigureGenerator.TS_plot(best_output, model, path_figures, title, freq, supply)

In [None]:
# Reload the plotting module so edits made to it are picked up without a kernel restart.
importlib.reload(s_FigureGenerator)
plotname = 'MLP_ParityPlot'
# Output location of the parity-plot figure.
path_figures = f"{path_01}/03.output/02.mlp/01.figures/{plotname}.png"
# `model` is the value set in an earlier plotting cell.
s_FigureGenerator.Parity_plot(best_output, model, path_figures)

In [None]:
# Pick up any edits made to the plotting module.
importlib.reload(s_FigureGenerator)

# What to plot: one reach, two series, one colour per series.
model = 'MLP'
reach = 10273232
variables =['NWM_flow', 'flow_cfs']
colors = ['blue', 'green']

# Labels.
units = 'cfs'
y_lab = f"Flow ({units})"
title = f"Daily NWM Estimates \n Reach: {str(reach)}"

# Output file.
plotname = 'NWMFlow'
path_figures = f"{path_01}/03.output/02.mlp/01.figures/{plotname}.png"

s_FigureGenerator.Var_TS_plot(
    best_output, reach, variables, colors, model,
    y_lab, path_figures, title, units,
    supply = False,
)

In [None]:
# Hand the MLP results for Utah to the project's AWS transfer helper.
# NOTE(review): presumably uploads the saved predictions to AWS; the behaviour
# is defined in AWS_transfer, which is not visible here — confirm.
import AWS_transfer
model = 'MLP'
state = 'ut'
AWS_transfer.Predictions2AWS(model, state)