In [1]:
# Hydrological packages
import hydroeval as he
from hydrotools.nwm_client import utils # I had to pip install this
import joblib

# My packages
from g_evaluation_metric import MAPE, RMSE, KGE, PBias
from s_evalaution_table import evtab
import s_FigureGenerator
from g_xgboost import XGBoostRegressorCV

# Basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

# System packages
from progressbar import ProgressBar
from datetime import datetime, date
import pickle
import warnings
warnings.filterwarnings("ignore")
import platform
import time

# Data analysis packages

from xgboost import XGBRegressor
from scipy import optimize
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split




In [2]:
#load access key
HOME = os.path.expanduser('~')

path_general = f'{HOME}/NWM-ML/Savalan/'
path_model_save = f"{path_general}/03.output/05.xgboost/03.model_parameters/"
path_save_data = f"{path_general}/03.output/05.xgboost/02.data/" 
path_save_figure = f"{path_general}/03.output/05.xgboost/01.figures"

## 2. Prepare the data.

In [3]:
raw_training_data = pd.read_csv(path_general + '03.output/01.data_preparation/raw_training_data.csv')
raw_training_data.pop('Unnamed: 0')
raw_training_data['station_id'] = raw_training_data['station_id'].astype('str')
raw_training_data.head()

Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Mean_Basin_Elev_ft,Perc_Forest,Perc_Develop,Perc_Imperv,Perc_Herbace,Perc_Slop_30,Mean_Ann_Precip_in,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY
0,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-28,78.55521,-0.891007,-0.453991,0.0,1.2,55.0,301
1,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-29,98.61146,-0.891007,-0.453991,0.0,1.2,55.0,302
2,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-30,97.60208,-0.891007,-0.453991,0.0,1.1,54.0,303
3,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-10-31,99.33125,-0.891007,-0.453991,0.0,1.2,54.0,304
4,10011500,40.965225,-110.853508,174.0,9720.0,67.7,1.2,0.12,2.94,27.2,34.8,2010-11-01,95.76354,-0.99863,0.052336,0.0,1.2,54.0,305


In [4]:
Training_DF = raw_training_data.copy()

### Editing the features based on the feature importance should be in the next cell!!!!!!!!!!!!!!!

In [5]:
# Editing the features based on the feature importance should be done here!!!!!!!!!!!!!!!

Training_DF.drop(['Mean_Ann_Precip_in', 'Perc_Herbace', 'Perc_Forest',
                        'Mean_Basin_Elev_ft'], axis=1, inplace=True)  # 'precipitation_in', 'temperature_F',


### Remove headwater stations!!!!!!!

In [6]:
headwater_stations = ['10011500', '10109000', '10113500', '10128500', '10131000', '10146400', '10150500', '10154200',
'10172700', '10172800', '10172952']
Training_DF = Training_DF[~raw_training_data['station_id'].isin(headwater_stations)]

In [7]:
Training_DF.datetime = pd.to_datetime(Training_DF.datetime)

Training_DF.head()


Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY
3079,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,1992-10-01,9.25,-0.891007,-0.453991,0.0,0.0,37.0,275
3080,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,1992-10-02,8.654167,-0.891007,-0.453991,0.0,0.0,36.0,276
3081,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,1992-10-03,9.466667,-0.891007,-0.453991,0.0,0.0,36.0,277
3082,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,1992-10-04,11.833333,-0.891007,-0.453991,0.0,0.0,36.0,278
3083,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,1992-10-05,10.195833,-0.891007,-0.453991,0.0,0.0,36.0,279


### 2.1. Create the training dataset.

In [8]:
# Training is from 1980 to the end of 2015.
x_train_temp = Training_DF[Training_DF.datetime < '01-01-2015']
x_train_temp.pop('station_id')
x_train_temp.pop('datetime')
y_train_temp = x_train_temp['flow_cfs']
x_train_temp.pop('flow_cfs')
x_train_temp.head()

Unnamed: 0,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,s1,s2,storage,swe,NWM_flow,DOY
3079,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.891007,-0.453991,0.0,0.0,37.0,275
3080,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.891007,-0.453991,0.0,0.0,36.0,276
3081,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.891007,-0.453991,0.0,0.0,36.0,277
3082,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.891007,-0.453991,0.0,0.0,36.0,278
3083,41.57549,-111.85522,180.0,1.01,0.0653,44.2,-0.891007,-0.453991,0.0,0.0,36.0,279


#### 2.1.1. Scale the train inputs of the NN model

In [9]:
# First we need to convert it from pandas dataframe to a numpy array 
y_train = y_train_temp.to_numpy()
x_train = x_train_temp.to_numpy()
scaler = MinMaxScaler()
x_train_scaled_test = scaler.fit_transform(x_train)
y_train_scaled_test = scaler.fit_transform(y_train.reshape(-1, 1))


In [10]:
# Reshape Input for CNN1D Model

print('train x shape', x_train_scaled_test.shape)
print('train y shape', y_train_scaled_test.shape)

train x shape (93299, 12)
train y shape (93299, 1)


### 2.2. Create the test dataset. 

In [11]:
# Determining the test dataset. 
x_test_temp = Training_DF[Training_DF.datetime >= '01-01-2015']
x_test_temp.head()

Unnamed: 0,station_id,Lat,Long,Drainage_area_mi2,Perc_Develop,Perc_Imperv,Perc_Slop_30,datetime,flow_cfs,s1,s2,storage,swe,NWM_flow,DOY
10567,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,2015-01-01,21.627083,-0.438371,0.898794,0.0,5.75,39.0,1
10568,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,2015-01-02,23.53125,-0.438371,0.898794,0.0,5.75,39.0,2
10569,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,2015-01-03,25.044792,-0.438371,0.898794,0.0,5.75,39.0,3
10570,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,2015-01-04,26.103125,-0.438371,0.898794,0.0,5.75,39.0,4
10571,10105900,41.57549,-111.85522,180.0,1.01,0.0653,44.2,2015-01-05,26.742708,-0.438371,0.898794,0.0,5.75,39.0,5


#### 2.2.1. Scale the test inputs of the NN model

In [12]:
# First we need to convert it from pandas dataframe to a numpy array 
x_test_temp_1 = x_test_temp.copy()
station_index_list = x_test_temp_1['station_id']
x_test_temp_1.pop('station_id')
x_test_temp_1.pop('datetime')
y_test_temp_1 = x_test_temp_1['flow_cfs']
x_test_temp_1.pop('flow_cfs')
x_test_1_np = x_test_temp_1.reset_index(drop=True).to_numpy()
y_test_1_np = y_test_temp_1.reset_index(drop=True).to_numpy()
x_test_1_scaled = scaler.fit_transform(x_test_1_np)
y_scaled_test_1 = scaler.fit_transform(y_test_1_np.reshape(-1, 1))

## 3. XGBOOST Model Preparation.

### 3.1. Create the model variables.

In [13]:
model_name='XGBoost'
tries = 1
hyperparameters = {
    'max_depth': range (2, 11, 1),
    'n_estimators': range(100, 2100, 200),
    'eta': [0.1, 0.01, 0.05]
}
path_model_save_hyperparameters = f"{path_model_save}/best_model_hyperparameters.pkl"
path_model_save = f"{path_model_save}/best_model.pkl"


### 3.2. Create input data.

### 3.3. Run and evaluate the model
Here first we train the model and then we test it.
We do it 30 times so we have firm evaluation. 

In [16]:
start_time = time.time()

# Create variables
test_best_val = float('inf')
EvalDF = {}
SupplyEvalDF = {}
EvalDF_all = np.zeros([len(station_index_list.drop_duplicates()), 10])
SupplyEvalDF_all = np.zeros([len(station_index_list.drop_duplicates()), 17])

# Start running the model several times. 
for try_number in range(tries):

    # Create the variables. 
    EvalDF[try_number] = np.zeros([len(station_index_list.drop_duplicates()), 10])
    SupplyEvalDF[try_number] = np.zeros([len(station_index_list.drop_duplicates()), 17])
    SitesDict = {}
    val_loss_all = 0
    print(f'Trial Number {try_number} ==========================================================')
    
    # # Set the optimizer, create the model, and train it. 
    xgboost_model = XGBoostRegressorCV(hyperparameters, path_model_save_hyperparameters)
    new_data_len = int(len(x_train_scaled_test) * 0.01)
    # xgboost_model.tune_hyperparameters(x_train_scaled_test[:new_data_len, :], y_train_scaled_test[:new_data_len, :])
    # xgboost_model.evaluate(x_train_scaled_test[:new_data_len, :], y_train_scaled_test[:new_data_len, :])
    # xgboost_model.train(x_train_scaled_test, y_train_scaled_test)
    # print('Saving Model')
    
    # #adjust this to match changing models
    # pickle.dump(xgboost_model, open(path_model_save, "wb"))  
    
    # Evaluate it for different stations. 
    for station_index, station_number in enumerate(station_index_list.drop_duplicates()):
        index = station_index_list == station_number # Finind the rows that have this station number.
        
 
        
        # Evaluation
        yhat_test = xgboost_model.predict(x_test_1_scaled[index])
        
        # Invert scaling for actual and concat it with the rest of the dataset. 
        inv_yhat_test = scaler.inverse_transform(yhat_test.reshape(-1, 1))
        inv_yhat_test[inv_yhat_test<0] = 0 # THIS IS NOT CORRECT !!!!!!!!!!!!!!!
        nwm_test = pd.DataFrame(inv_yhat_test, columns=[f"{model_name}_flow"])
        Dfs = [nwm_test.reset_index(drop=True), x_test_temp[index].reset_index(drop=True)]
        Eval_DF_mine = pd.concat(Dfs, axis=1)
    
        # Get reach id for model eval.
        nhdreach = utils.crosswalk(usgs_site_codes=station_number)
        nhdreach = nhdreach['nwm_feature_id'].iloc[0]
        SitesDict[nhdreach] = Eval_DF_mine
        
        # Calculate the results. 
        prediction_columns = ['NWM_flow', f"{model_name}_flow"]
        observation_column = 'flow_cfs'
        result = evtab(Eval_DF_mine, prediction_columns, nhdreach, observation_column, model_name)
        EvalDF[try_number][station_index, :] = result[0]
        SupplyEvalDF[try_number][station_index, :] = result[1]

    # Finding the best model. 
    # val_loss_all += val_loss
    # val_loss_all = val_loss_all / len(station_index_list.drop_duplicates())
    # if val_loss_all < test_best_val:
    #     test_best_val = val_loss_all
    #     best_model_parameters = model_parameters
    #     best_try = try_number
    #     best_output = SitesDict
    EvalDF_all = EvalDF[try_number] + EvalDF_all
    SupplyEvalDF_all = SupplyEvalDF[try_number] + SupplyEvalDF_all
        
# Calculate the average results for all of the trials. 
EvalDF_all = EvalDF_all / tries
SupplyEvalDF_all = SupplyEvalDF_all / tries

# Sort the outputs of the best model based on date. 
keys = list(SitesDict.keys())
for key_number in keys:
    SitesDict[key_number] = SitesDict[key_number].sort_values(by='datetime')
    
print('Run is Done!' + "Run Time:" + " %s seconds " % (time.time() - start_time))


Model is not trained yet. Please train the model first.


AttributeError: 'NoneType' object has no attribute 'reshape'

### 3.4. Create and save the final results.


In [12]:
#Evaluation columns for prediction time series
cols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{model_name}_rmse", 'NWM_pbias', f"{model_name}_pbias", 
        'NWM_kge', f"{model_name}__kge", 'NWM_mape',  f"{model_name}_mape"]

#Evaluation columns for accumulated supply time series
supcols = ['USGSid', 'NHDPlusid', 'NWM_rmse', f"{model_name}_rmse", 'NWM_pbias', f"{model_name}_pbias", 
        'NWM_kge', f"{model_name}__kge", 'NWM_mape',  f"{model_name}_mape", 'Obs_vol', 'NWM_vol', f"{model_name}_vol",
        'NWM_vol_err', f"{model_name}_vol_err", 'NWM_vol_Perc_diff', f"{model_name}_vol_Perc_diff"]
    
#save model results
EvalDF_all = pd.DataFrame(EvalDF_all, columns=cols)
SupplyEvalDF_all = pd.DataFrame(SupplyEvalDF_all, columns=supcols)
EvalDF_all.to_csv(f"{path_save_data}/{model_name}_Performance.csv")   
SupplyEvalDF_all.to_csv(f"{path_save_data}/{model_name}_Supply_Performance.csv")


NameError: name 'EvalDF_all' is not defined

In [13]:
print("Model Performance for Daily cfs")
display(EvalDF_all)   
print("Model Performance for Daily Accumulated Supply (Acre-Feet)")
display(SupplyEvalDF_all)


Model Performance for Daily cfs


NameError: name 'EvalDF_all' is not defined

In [14]:
plotname = 'XGBoost_TS_plot'
freq = 'D'
supply = True
title = 'Observed and Modeled flows for NHDPlus Reaches \n with Upstream Reservoirs in the Great Salt Lake Basin'
path_figure_1 = f"{path_save_figure}/{plotname}.png"
s_FigureGenerator.TS_plot(SitesDict, model_name, path_figure_1, title, freq, supply)

NameError: name 'SitesDict' is not defined

In [None]:
plotname = 'XGBoost_ParityPlot'
path_figure_2 = f"{path_save_figure}/{plotname}.png"
s_FigureGenerator.Parity_plot(SitesDict, model_name, path_figure_2)

In [None]:
reach = 10273232
variables =['NWM_flow', 'flow_cfs']
colors = ['blue', 'green']
plotname = 'NWMFlow'
path_figure_3 = f"{path_save_figure}/{plotname}.png"
units = 'cfs'
y_lab = f"Flow ({units})"
title = f"Daily NWM Estimates \n Reach: {str(reach)}"

s_FigureGenerator.Var_TS_plot(best_output, reach, variables, colors, model_name,y_lab, path_figure_3, title, units, supply = False)

In [None]:
import AWS_transfer
state = 'ut'
AWS_transfer.Predictions2AWS(model_name, state)