In [1]:
#!/usr/bin/env python3
import tensorflow as tf

import os

from ForecastModel.utils.metrics import calculate_nse, calculate_kge, calculate_bias

import numpy as np
import pandas as pd

import json

In [35]:
class Model:
    """Description of one forecasting model compared in this notebook.

    Attributes set on every instance:
        name, color, ls: label and line style passed in by the caller.
        is_external_model: flag passed in by the caller.
        lg_path: folder the notebook reads ``metrics.txt`` from.
        hp_path: folder the notebook reads/writes ``eval_peaks.pckl``,
            ``trial.json``, ``model_fold_*.keras`` and ``forecast_*.pckl`` in.
        target_name, feat_hindcast, feat_forecast: feature description; taken
            from the arguments, or from ``features.txt`` for final models that
            are not external.

    Args:
        name: display name of the model.
        model_folder: base folder of the model.
        n_trial: trial number used to build the ``trial_XX`` sub folders when
            ``is_final_model`` is False.
        target_name: name of the target column.
        feat_hindcast: hindcast feature names (``None`` means an empty list).
        feat_forecast: forecast feature names (``None`` means an empty list).
        is_external_model: if True, no ``features.txt`` is read.
        is_final_model: if True, ``model_folder`` is used directly for both
            paths instead of the ``logs``/``hp`` trial sub folders.
        color: plot color.
        ls: plot line style.
    """

    def __init__(self, name, model_folder, n_trial=-1, target_name="", feat_hindcast=None, feat_forecast=None, is_external_model= False, is_final_model= False, color="r", ls="-"):
        self.name  = name
        self.color = color
        self.ls    = ls
        self.is_external_model = is_external_model

        # Always define the feature attributes so that every model (including
        # final external ones) can be queried uniformly. ``None`` defaults avoid
        # sharing one mutable list between all instances.
        self.target_name   = target_name
        self.feat_hindcast = feat_hindcast if feat_hindcast is not None else []
        self.feat_forecast = feat_forecast if feat_forecast is not None else []

        if is_final_model:
            self.lg_path = model_folder
            self.hp_path = model_folder
            if not is_external_model:
                # final in-house models ship their feature description as JSON
                with open(os.path.join(self.lg_path, "features.txt"), "r") as f:
                    dic = json.load(f)
                self.target_name   = dic["target_name"]
                self.feat_hindcast = dic["feat_hindcast"]
                self.feat_forecast = dic["feat_forecast"]
        else:
            self.lg_path = os.path.join(model_folder, "logs", f"trial_{n_trial:02d}")
            self.hp_path = os.path.join(model_folder,  "hp", f"trial_{n_trial:02d}")

 
    
# Registry of the compared models, keyed by a short identifier.
# Insertion order matters: later cells iterate over this dict in order.
models = {}

# external baseline: no features.txt is read for it
models["arima"] = Model(
    name="ARIMA",
    model_folder=r"C:\GitHub\ForecastModel-Complete\rst\ARIMA",
    is_final_model=True,
    is_external_model=True,
    color="#E69F00",
    ls="--",
)

models["lstm"] = Model(
    name="PBHM-HLSTM",
    model_folder=r"F:\11_EFFORS\python\models\20240617_PBHM-HLSTM-MSE-min_max",
    is_final_model=True,
    color="#56B4E9",
    ls="-",
)

models["elstm"] = Model(
    name="eLSTM",
    model_folder=r"F:\11_EFFORS\python\models\20240618_ELSTM-MSE-min_max",
    is_final_model=True,
    color="#D55E00",
    ls="-",
)

In [36]:
# define paths
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook
# only runs unchanged on a machine with this exact folder layout.

# folder the LaTeX table files are written to
PLOT_PATH = r"C:\GitHub\ForecastModel-Revised-10062024\answer-plots"
# CSV read with pd.read_csv(..., parse_dates=["time"]) and handed to DataModelCV
DATA_PATH = r"C:\GitHub\ForecastModel-Revised-10062024\data\Dataset.csv"
# prefix of the cross-validation index folders; "_{overlap_length}" is appended at use
CROSS_INDICES_PATH = r"C:\GitHub\ForecastModel-Revised-10062024\data\indices"


In [37]:
# helper functions to create latex tables

def load_metrics(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    with open(path, "r") as handle:
        return json.load(handle)

def df2latex(df, file, bold_mask=None, fmt=None):
    r"""Write ``df`` as rows of a LaTeX table body to ``file``.

    The first line is ``index&<col>&<col>...\\``; every following line is the
    row label followed by one ``&``-separated cell per column and ``\\``.

    Args:
        df: DataFrame with numeric cells.
        file: path of the text file to (over)write; it is also printed.
        bold_mask: optional array-like of ``df.shape``; cells whose mask value
            equals 1 are wrapped in ``\textbf{...}``. ``None`` means no bold cells.
        fmt: number format. ``None`` picks a format by magnitude (integer for
            |x| >= 100, then 1, 2 or 3 decimals); a string is a format spec
            applied to all cells; a list/tuple holds one format spec per column.
            Values are converted to ``int`` for specs of integer type ``d``.
    """
    print(file)
    if bold_mask is None:
        bold_mask = np.zeros(df.shape)
    # allow DataFrames / nested lists as mask, not only ndarrays
    bold_mask = np.asarray(bold_mask)
    per_column_fmt = isinstance(fmt, (list, tuple))

    def _cell_text(num, col):
        # default: number of decimals depends on the magnitude of the value
        if fmt is None:
            magnitude = np.abs(num)
            if magnitude >= 100:
                return f"{int(num): 6d}"
            if magnitude >= 10:
                return f"{num: 6.1f}"
            if magnitude >= 1:
                return f"{num: 6.2f}"
            return f"{num: 6.3f}"
        spec = fmt[col] if per_column_fmt else fmt
        # integer presentation type cannot format floats directly
        if spec.endswith("d"):
            num = int(num)
        return format(num, spec)

    with open(file, "w") as f:
        f.write("index&" + "&".join(str(col) for col in df.columns) + r"\\")
        f.write("\n")
        for i, (label, row) in enumerate(df.iterrows()):
            f.write(str(label))
            for j, num in enumerate(row.values):
                text = _cell_text(num, j)
                if bold_mask[i, j] == 1:
                    f.write(r"&\textbf{" + text + r"}")
                else:
                    f.write(r"&" + text)
            f.write(r"\\")
            f.write("\n")

def get_bold_mask(df, fcn=np.argmax, n_multi_cols=3, offset=0):
    """Build a 0/1 mask marking, per row, the selected cell of each column group.

    Columns from position ``offset`` on are treated as interleaved groups:
    group ``n`` consists of the columns ``offset+n, offset+n+n_multi_cols, ...``.
    For every row and every group the selector returns the position of the
    chosen column inside the group, and the matching cell is set to 1.

    Args:
        df: DataFrame providing the values (``df.values`` is used).
        fcn: selector called as ``fcn(values, axis=1)`` (e.g. ``np.argmax``), or
            a list/tuple with one selector per group.
        n_multi_cols: number of interleaved groups.
        offset: number of leading columns that are skipped.

    Returns:
        ``numpy.ndarray`` of shape ``df.shape`` holding 0.0 and 1.0.
    """
    values = df.values
    mask = np.zeros(df.shape)
    row_ids = np.arange(df.shape[0])
    for n in range(n_multi_cols):
        selector = fcn[n] if isinstance(fcn, (list, tuple)) else fcn
        idx = np.asarray(selector(values[:, (offset + n)::n_multi_cols], axis=1))
        # map the position inside the group back to the column position in df
        mask[row_ids, idx * n_multi_cols + n + offset] = 1
    return mask


# Table 1: Statistics of the catchment

In [16]:
# Yearly statistics (2011-2017) of measured discharge, simulated discharge,
# precipitation and temperature, plus KGE/NSE/bias of qsim against qmeasval.
# NOTE(review): the factors 0.25*60*60/1e6 and *4 presumably convert 15-minute
# values to volumes / hourly rates - confirm units against the dataset.

dfh = pd.read_csv(DATA_PATH, parse_dates=["time"])

hyd_metrics = {}
for year in range(2011, 2018):
    dfp = dfh.loc[dfh["time"].dt.year == year]

    # key order defines the row order of the table built from this dict
    year_stats = {}
    year_stats["kge"] = calculate_kge(dfp.qmeasval.values, dfp.qsim.values)
    year_stats["nse"] = calculate_nse(dfp.qmeasval.values, dfp.qsim.values)
    year_stats["bias"] = calculate_bias(dfp.qmeasval.values, dfp.qsim.values)

    # measured discharge
    year_stats["q_mean"] = dfp.qmeasval.mean()
    year_stats["q_std"] = dfp.qmeasval.std()
    year_stats["q_max"] = dfp.qmeasval.max()
    year_stats["q_sum"] = dfp.qmeasval.sum()*0.25*60*60 / 1e6

    # simulated discharge
    year_stats["qs_mean"] = dfp.qsim.mean()
    year_stats["qs_std"] = dfp.qsim.std()
    year_stats["qs_max"] = dfp.qsim.max()
    year_stats["qs_sum"] = dfp.qsim.sum()*0.25*60*60 / 1e6

    # precipitation (pmax / pmean columns)
    year_stats["pmx_max"] = dfp.pmax.max()*4
    year_stats["pmx_sum"] = dfp.pmax.sum()
    year_stats["p_max"] = dfp.pmean.max()*4
    year_stats["p_sum"] = dfp.pmean.sum()

    # temperature
    year_stats["t_mean"] = dfp.tmean.mean()
    year_stats["t_std"] = dfp.tmean.std()

    hyd_metrics[year] = year_stats


In [17]:
# Table layout: one row per statistic, one column per year. The first three
# entries of each yearly dict (kge, nse, bias) are dropped here.
stats_by_year = pd.DataFrame(hyd_metrics).T
catchment_columns = stats_by_year.columns[3:]
df_catchment_stats = stats_by_year.loc[:, catchment_columns].T
# df2latex joins the column labels as strings
df_catchment_stats.columns = list(map(str, df_catchment_stats.columns))
df_catchment_stats

Unnamed: 0,2011,2012,2013,2014,2015,2016,2017
q_mean,0.571149,1.00931,1.208354,1.167666,0.711204,0.829312,0.56743
q_std,0.245171,0.842681,0.681078,0.669319,0.332219,0.682278,0.229892
q_max,9.61,25.2,15.0,7.27,5.85,17.94,9.21
q_sum,18.011757,31.916797,38.106657,36.823518,22.428531,26.224821,17.698878
qs_mean,0.617823,1.103376,1.399735,1.349319,0.755306,0.916051,0.907808
qs_std,0.331923,0.832882,1.016826,0.719722,0.51915,0.70136,0.479473
qs_max,4.195333,8.937861,11.354018,7.681351,4.502031,6.426838,4.611206
qs_sum,19.48366,34.891398,44.142029,42.552124,23.819337,28.967737,28.315708
pmx_max,118.720001,180.520004,100.480003,84.639999,109.919998,231.880005,173.360001
pmx_sum,2159.559998,2877.670002,2777.4,2847.86,2198.09,3222.800001,3076.049996


In [18]:
# Export Table 1 to PLOT_PATH; no mask and no fmt are passed, so df2latex writes
# no bold cells and chooses the number of decimals by magnitude.
df2latex(df_catchment_stats, os.path.join(PLOT_PATH, r"table_1_summary_data.txt"))

C:\GitHub\ForecastModel-Revised-10062024\answer-plots\table_1_summary_data.txt


# Table 2: Average Model Performance

In [19]:
# Per-model, per-fold summary of the stored metrics.
# One row per (model, fold); rows of model number n occupy the labels 5*n .. 5*n+4.
# The column creation order below matters: the summary view at the end of the
# cell selects columns by position.
metric_names = ["kge", "nse", "bias"]
metric_labels = ["KGE", "NSE", "PBIAS"]


# NOTE(review): xx is not used in this cell; kept in case a later cell reads it.
xx = np.arange(1,97)
df = pd.DataFrame(columns=["name", "year"])

# the ARIMA baseline is the same for every model, so load it only once
metrics_baseline = load_metrics(os.path.join(models["arima"].lg_path, "metrics.txt"))

for n, key in enumerate(models.keys()):
    metrics = load_metrics(os.path.join(models[key].lg_path, "metrics.txt"))

    n_row = 5 * n

    for met in metric_names:

        metric_valid    = metrics["valid"][met]
        metric_test     = metrics["test"][met]
        metric_baseline = metrics_baseline["test"][met]

        for i in range(5):
            row = n_row + i
            df.loc[row, ["name", "year"]] = [models[key].name, 2013+i]

            # evaluations --------------------------------------------------------------------------
            # median metrics of validation and test set
            median_valid = np.median(metric_valid[i])
            median_test  = np.median(metric_test[i])
            df.loc[row, [f"{met}_valid"]]  = [median_valid]
            df.loc[row, [f"{met}_test"]]   = [median_test]
            # generalization measure
            df.loc[row, [f"{met}_general"]]= [median_valid - median_test]
            # metric for initial and final step in forecasting window
            df.loc[row, [f"{met}_init"]]   = [metric_test[i][0]]
            df.loc[row, [f"{met}_final"]]  = [metric_test[i][-1]]
            # difference of metric between initial and final step in forecasting window
            df.loc[row, [f"{met}_drop"]]   = [metric_test[i][0] - metric_test[i][-1]]
            # first lead time step at which the model outperforms the baseline (ARIMA);
            # -1 if it never does
            if met != "bias":
                first_better = next(
                    (step for step, (base, cand) in enumerate(zip(metric_baseline[i], metric_test[i]))
                     if cand - base > 0),
                    -1,
                )
                df.loc[row, [f"{met}_out"]] = first_better
                # 0.25 h per lead time step
                df.loc[row, [f"{met}_out_hours"]] = first_better * 0.25 if first_better >= 0 else -1

# summary
cols = df.columns.to_list()
df[cols[:2]+cols[4:9]+cols[12:17]+cols[20:]]

Unnamed: 0,name,year,kge_general,kge_init,kge_final,kge_drop,kge_out,nse_general,nse_init,nse_final,nse_drop,nse_out,bias_general,bias_init,bias_final,bias_drop
0,ARIMA,2013,0.024469,0.993606,0.800486,0.19312,-1.0,0.043853,0.990559,0.608826,0.381733,-1.0,-0.095999,-0.000717,0.053995,-0.054712
1,ARIMA,2014,0.001348,0.995475,0.81155,0.183926,-1.0,-0.035996,0.993396,0.66536,0.328036,-1.0,0.164512,0.00091,0.043052,-0.042143
2,ARIMA,2015,0.033451,0.995947,0.745181,0.250766,-1.0,0.042241,0.994314,0.584369,0.409944,-1.0,-0.012728,0.001628,0.104698,-0.10307
3,ARIMA,2016,-0.040246,0.994068,0.815479,0.178589,-1.0,-0.020547,0.988831,0.637108,0.351723,-1.0,0.076062,-0.000369,-0.02803,0.027661
4,ARIMA,2017,0.318239,0.980389,0.517551,0.462838,-1.0,0.768113,0.969957,-0.103737,1.073694,-1.0,0.366589,-0.007793,-0.51157,0.503777
5,PBHM-HLSTM,2013,-0.01705,0.906953,0.814126,0.092827,7.0,-0.115411,0.941724,0.879041,0.062683,4.0,-0.654377,6.526826,5.261711,1.265116
6,PBHM-HLSTM,2014,0.042751,0.960826,0.876943,0.083882,7.0,-0.032398,0.969128,0.939296,0.029832,4.0,4.938129,-3.628257,-3.973139,0.344883
7,PBHM-HLSTM,2015,0.100423,0.923803,0.847409,0.076394,25.0,0.062722,0.927998,0.867904,0.060095,6.0,6.657013,-4.523778,-9.365205,4.841427
8,PBHM-HLSTM,2016,0.105654,0.930464,0.818376,0.112088,28.0,0.073535,0.948967,0.809219,0.139748,3.0,-6.423446,5.772713,7.605906,-1.833193
9,PBHM-HLSTM,2017,0.103728,0.920198,0.765891,0.154307,3.0,0.29566,0.843563,0.430629,0.412934,3.0,15.628916,-3.72868,-16.31196,12.58328


In [20]:
# KGE / NSE / bias of the hydrologic simulation (qsim vs. qmeasval) per year.
# This overwrites the hyd_metrics dict built for Table 1.
dfh = pd.read_csv(DATA_PATH, parse_dates=["time"])

hyd_metrics = {}
for year in range(2012, 2018):
    dfp = dfh.loc[dfh["time"].dt.year == year]
    observed = dfp.qmeasval.values
    simulated = dfp.qsim.values
    hyd_metrics[year] = dict(
        kge=calculate_kge(observed, simulated),
        nse=calculate_nse(observed, simulated),
        bias=calculate_bias(observed, simulated),
    )


# test metrics of every model, regrouped as evalu[model key][year][metric]
evalu = {}
for key in models:
    metrics = load_metrics(os.path.join(models[key].lg_path, "metrics.txt"))
    test_metrics = metrics["test"]
    evalu[key] = {
        2013 + i: {met: test_metrics[met][i] for met in ("kge", "nse", "bias")}
        for i in range(0, 5)
    }
       

In [21]:
# Table 2 data: one row per test year; three columns (kge, nse, bias) for the
# hydrologic model followed by three columns per entry of evalu.
years = np.arange(2013, 2018)
metric_names = ["kge", "nse", "bias"]

df_avg_model = pd.DataFrame(index=years)

# hydrologic model: the yearly values are taken as they are
for met in metric_names:
    column = f"hyd_{met}"
    for year in years:
        df_avg_model.loc[year, column] = hyd_metrics[year][met]

# forecasting models: mean of the stored test metric of each year
for key in evalu:
    for met in metric_names:
        column = f"{key}_{met}"
        for year in years:
            df_avg_model.loc[year, column] = np.mean(evalu[key][year][met])

df_avg_model

Unnamed: 0,hyd_kge,hyd_nse,hyd_bias,arima_kge,arima_nse,arima_bias,lstm_kge,lstm_nse,lstm_bias,elstm_kge,elstm_nse,elstm_bias
2013,0.631525,0.185013,-10.678816,0.841647,0.686484,-0.020626,0.86953,0.906583,4.508858,0.894369,0.845648,1.084802
2014,0.73864,0.49442,-18.437207,0.851671,0.739703,0.031338,0.916186,0.951527,-4.11502,0.853864,0.902426,-2.061313
2015,0.505049,0.236257,2.510872,0.820567,0.701578,0.069297,0.861502,0.89747,-7.809336,0.877162,0.901531,-1.307625
2016,0.738958,0.512468,-8.241606,0.854126,0.714077,-0.017186,0.850701,0.857629,7.419091,0.679236,0.684377,8.587589
2017,0.191555,-4.243535,-57.659804,0.564865,0.02232,-0.326514,0.821199,0.60933,-13.001763,0.786206,0.638627,0.722015


In [22]:
# Bold the best model per year: highest KGE, highest NSE, smallest |bias|.
# Only the bias columns are compared by absolute value; taking abs() of KGE/NSE
# would let a strongly negative score (e.g. NSE = -4.24) win the argmax.
mask_values = df_avg_model.copy()
bias_columns = [col for col in mask_values.columns if col.endswith("_bias")]
mask_values[bias_columns] = mask_values[bias_columns].abs()
mask = get_bold_mask(mask_values, [np.argmax, np.argmax, np.argmin], 3, 0)
df2latex(df_avg_model[df_avg_model.columns[[0,1,2,3,4,5,6,7,8,9,10,11]]], os.path.join(PLOT_PATH, r"table_2_mean_metrics.txt"), mask, '6.2f')

C:\GitHub\ForecastModel-Revised-10062024\answer-plots\table_2_mean_metrics.txt


# Table 3: Generalization

In [25]:
# Table 3 data: generalization gap (median validation metric minus median test
# metric) per year, with one group of three columns per model.
# `df` must still be the per-fold summary frame of the Table 2 section.
dfgen = df.set_index("name")[["year","kge_general", "nse_general", "bias_general"]]

general_frames = []
# column groups appear in this order in the table
for model_name in ["ARIMA", "PBHM-HLSTM", "eLSTM"]:
    frame = dfgen.loc[model_name,:].set_index("year")
    frame.columns = [x+"_"+model_name for x in frame.columns]
    general_frames.append(frame)

df_general = pd.concat(general_frames, axis=1)
df_general

Unnamed: 0_level_0,kge_general_ARIMA,nse_general_ARIMA,bias_general_ARIMA,kge_general_PBHM-HLSTM,nse_general_PBHM-HLSTM,bias_general_PBHM-HLSTM,kge_general_eLSTM,nse_general_eLSTM,bias_general_eLSTM
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013,0.024469,0.043853,-0.095999,-0.01705,-0.115411,-0.654377,-0.074534,-0.072925,2.484071
2014,0.001348,-0.035996,0.164512,0.042751,-0.032398,4.938129,0.0512,-0.067907,2.309353
2015,0.033451,0.042241,-0.012728,0.100423,0.062722,6.657013,0.076977,0.029542,1.020971
2016,-0.040246,-0.020547,0.076062,0.105654,0.073535,-6.423446,0.283541,0.241287,-9.608155
2017,0.318239,0.768113,0.366589,0.103728,0.29566,15.628916,0.087451,0.20592,1.704293


In [26]:
# Table 3 reports absolute generalization gaps; per year and metric the smallest
# gap among the models is set in bold.
df_general_abs = df_general.abs()
table_3_file = os.path.join(PLOT_PATH, r"table_3_gernalization.txt")
mask = get_bold_mask(df_general_abs, np.argmin, 3, 0)
df2latex(df_general_abs, table_3_file, mask, '6.3f')

C:\GitHub\ForecastModel-Revised-10062024\answer-plots\table_3_gernalization.txt


# Table 4: Peak Performance

In [27]:
def peak_loss(y, y_hat):
    """Mean squared error in which peak samples count twice.

    A sample is treated as a peak when ``y`` is at least 75 % of the maximum
    of ``y``; its squared error is doubled before averaging.
    """
    peak_threshold = tf.math.reduce_max(y)*0.75
    is_peak = y >= peak_threshold

    plain_error = (y - y_hat)**2
    weighted_error = tf.where(is_peak, 2 * plain_error, plain_error)

    return tf.reduce_mean(weighted_error)

def dt(dates_string, format="%d/%m/%Y %H:%M"):
    """Convert date string(s) to pandas datetimes using ``format``
    (default: day/month/year hour:minute)."""
    parsed = pd.to_datetime(dates_string, format=format)
    return parsed

In [38]:
# Settings of the peak evaluation.
# Number of peaks requested from get_n_peaks per fold and evaluated per fold.
num_peaks_per_fold = 2
# If True, existing eval_peaks.pckl caches are ignored and recomputed.
reload = False

In [82]:
# Peak evaluation of every model in `models`.
#
# Stage 1 (only if <hp_path>/eval_peaks.pckl is missing or `reload` is set):
#   build the forecasts of every test fold, extract the peaks with get_n_peaks
#   and cache them as a pickle.
# Stage 2: read the cache and collect, per model / fold / peak, the peak
#   magnitude error (perr) and the peak position offset in rows (poff) of the
#   hydrologic model and of every forecast column q0..q95 in `dfp`.
#
# NOTE(review): DataModelCV and get_n_peaks are neither defined nor imported in
# the visible part of this notebook - stage 1 needs them in scope.
# NOTE(review): this cell rebinds `df`, which the Table 3 cell reads as the
# per-fold summary frame; re-running that cell after this one will not work.
# NOTE(review): pickles are only safe to load when they come from a trusted
# source (here: caches presumably written by this notebook itself).
idx = -10
dfp = pd.DataFrame()
for n, key in enumerate(models.keys()):
    # every model gets its own block of 10 row labels in dfp
    idx += 10
    print(key)
    eval_path = os.path.join(models[key].hp_path, "eval_peaks.pckl")

    if reload or not os.path.exists(eval_path):
        eval_peaks = []

        # load datamodel
        dm = DataModelCV(DATA_PATH,
               target_name       = models[key].target_name,
               hincast_features  = models[key].feat_hindcast,
               forecast_features = models[key].feat_forecast,
             )

        if models[key].is_external_model:
            overlap_length = 0
            hindcast_length = 96
        else:
            # load trial data
            with open(os.path.join(models[key].hp_path, "trial.json")) as f:
                trial = json.load(f)

            hindcast_length = trial['hyperparameters']['values']['hindcast_length']
            # 'osc_length' is optional; a missing key means no overlap
            overlap_length = trial['hyperparameters']['values'].get('osc_length', 0)

        dm.main(os.path.join(CROSS_INDICES_PATH+f"_{overlap_length}", f"cross_indices_{hindcast_length}.pckl"))

        for n_fold in dm.cross_sets.keys():
            year = 2013 + n_fold

            # load dataset
            X, y  = dm.getDataSet(dm.cross_sets[n_fold]["test"], scale=True)

            # get hydrological model
            s = dm.getFeatureSet(n_fold+2, "qsim")[2]
            df = pd.DataFrame({'index':dt(s.index), 'qhyd':s.values}).set_index("index")

            # add ground truth
            s = dm.getFeatureSet(n_fold+2, "qmeasval")[2]
            s.index = dt(s.index)
            df = df.merge(s.rename("qmeas").to_frame(), left_index=True, right_index=True)

            if models[key].is_external_model:
                # external forecasts are read from a pickle with columns fc0..fc95
                ext_df = pd.read_pickle(os.path.join(models[key].hp_path, f"forecast_{year}.pckl"))

                forecasts_df = ext_df[[f"fc{x:d}" for x in range(96)]].copy()
                forecasts_df.columns = [f"q{x:d}" for x in range(96)]

                del ext_df

            else:
                # load model
                tf.keras.backend.clear_session()
                # both custom loss names are mapped to peak_loss so that the
                # saved model can be deserialized
                model  = tf.keras.models.load_model(os.path.join(models[key].hp_path, f"model_fold_{n_fold:d}.keras"),
                                               custom_objects={'peak_loss': peak_loss,
                                                              'kge_nse_loss':peak_loss})

                yp = model.predict(X, batch_size=1000)

                if key == "lstm_residual":
                    _, _, yidx = dm.sets[dm.cross_sets[n_fold]["test"]]
                    simu = dm.getWithIndexArray(["qsim"], yidx)

                    # get real values from residuals
                    yp += simu[:,:,0]

                forecasts_df = pd.DataFrame(data    = yp,
                                        columns = [f"q{x:d}" for x in range(yp.shape[1])],
                                        index   = dt(dm.getTimeSet(n_fold+2, 0)[2]))

                # save dataframe
                df_out = forecasts_df.copy()
                df_out.index = pd.to_datetime(df_out.index, format="%d/%m/%Y %H:%M")
                df_out.to_pickle(os.path.join(models[key].hp_path, f"forecast_{year}.pckl"))


            # get forecasting stats
            # column q<k> is shifted down by k rows (presumably so that every row
            # holds the forecasts valid at the row's own timestamp - confirm)
            for forecast_step in range(1, forecasts_df.shape[1]):
                forecasts_df[f"q{forecast_step:d}"] = forecasts_df[f"q{forecast_step:d}"].shift(forecast_step)

            # merge model predictions
            df = df.merge(forecasts_df, left_index=True, right_index=True)

            # merge precipitation
            s = pd.Series(dm.getFeatureSet(n_fold+2, "pmean", 0)[2].values, dt(dm.getTimeSet(n_fold+2, 0)[2]))
            df = df.merge(s.rename("pmean").to_frame(), left_index=True, right_index=True)

            forecasts_df.dropna(inplace=True)
            stats_df = pd.DataFrame(columns = ["fmin", "fmax", "fmean",
                                               "fq95", "fq90", "fq75",
                                               "fq50",
                                               "fq25", "fq10", "fq5"],
                                   index = forecasts_df.index)

            # row-wise min / max / mean and quantiles; the quantile level is
            # parsed from the column name ("fq95" -> 0.95)
            for i, row in forecasts_df.iterrows():
                stats_df.loc[i] = [row.values.min(), row.values.max(), row.values.mean()] + \
                                        [np.quantile(row.values, float(x[2:])/100) for x in stats_df.columns[3:]]

            # merge stats
            df = df.merge(stats_df, left_index=True, right_index=True)
            df.dropna(inplace=True)

            peaks = get_n_peaks(df, "qmeas", num_peaks_per_fold, 24*4)
            # stored fold number is offset by 2; the year is 2011 + stored value
            peaks["n_fold"] = n_fold + 2

            # add to summary
            eval_peaks.append(peaks)

        # save data to pickle
        df = pd.concat(eval_peaks, axis=0)
        df.to_pickle(eval_path)

    df = pd.read_pickle(eval_path)
    for n_fold in df.n_fold.unique().tolist():
        peaks = df[df.n_fold == n_fold]
        for p in range(num_peaks_per_fold):
            # all rows that belong to peak number p of this fold
            peak_rows = peaks[peaks.n_peak == p]

            dt_index = pd.to_datetime(peak_rows["qmeas"].index, format="%d/%m/%Y %H:%M")
            timestep = dt_index[1] - dt_index[0]

            # position and magnitude of the measured peak
            idx_peak  = peak_rows["qmeas"].argmax()
            peak_flow = peak_rows["qmeas"].max()

            # row label: model block + stored fold number + 5 per peak number
            row_label = idx+n_fold+5*p

            dfp.loc[row_label, ["name", "year", "peak", "peak_flow"]] = [models[key].name, int(2011+n_fold), p, peak_flow]

            # perr: maximum of the series minus measured peak flow
            # poff: row position of the maximum minus row position of the measured peak
            dfp.loc[row_label, ["hyd_perr", "hyd_poff"]] = [peak_rows["qhyd"].max() - peak_flow,
                                                            peak_rows["qhyd"].argmax() - idx_peak,
                                                           ]
            dfp.loc[row_label, [f"perr_{x}" for x in range(96)]] = [peak_rows[f"q{x}"].max() - peak_flow for x in range(96)]
            dfp.loc[row_label, [f"poff_{x}" for x in range(96)]] = [peak_rows[f"q{x}"].argmax() - idx_peak for x in range(96)]


dfp = dfp.astype({"year": int, "peak":int,"hyd_poff":int})
dfp.reset_index(inplace=True, drop=True)

arima


  dfp.loc[idx+n_fold+5*p, ["name", "year", "peak", "peak_flow"]] = [models[key].name, int(2011+n_fold), p, peak_flow]


lstm
elstm


In [83]:
# Table 4 data: per peak, the error of the hydrologic model and the median
# error / offset over all forecast columns of each model.
base_columns = ["name", "year", "peak", "peak_flow", "hyd_perr", "hyd_poff"]
df_peaks = dfp[base_columns].copy()

# medians over the perr_* / poff_* columns
df_peaks["peak_median"] = dfp.filter(regex='^perr').median(axis=1)
df_peaks["off_median"] = dfp.filter(regex='^poff').median(axis=1).astype(int)

# peak errors as percentage of the measured peak flow
for percent_column in ("hyd_perr", "peak_median"):
    df_peaks[percent_column] = 100 * df_peaks[percent_column] / df_peaks["peak_flow"]

shared_columns = df_peaks.columns[1:6]   # year .. hyd_poff
model_columns = df_peaks.columns[6:8]    # peak_median, off_median

# rows 0-9, 10-19 and 20-29 of df_peaks hold the first, second and third model
df = df_peaks.loc[0:9, shared_columns]
# a suffix is only applied when a joined column name already exists in df, so
# the columns of the first block keep their plain names
for (first_row, last_row), suffix in (((0, 9), "arima"),
                                      ((10, 19), "_pbhm-hlstm"),
                                      ((20, 29), "_elstm")):
    model_block = df_peaks.loc[first_row:last_row, model_columns].reset_index(drop=True)
    df = df.join(model_block, how='left', rsuffix=suffix)

df = df.set_index("year", drop=True)
df

Unnamed: 0_level_0,peak,peak_flow,hyd_perr,hyd_poff,peak_median,off_median,peak_median_pbhm-hlstm,off_median_pbhm-hlstm,peak_median_elstm,off_median_elstm
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2013,0,15.0,-90.340227,31,-75.335372,47,-31.628675,5,-77.956767,59
2013,1,10.02,13.313555,20,-3.517414,19,-24.721049,-1,-40.036559,-1
2014,0,7.27,3.475723,18,2.120575,18,-18.595551,-2,-26.956867,0
2014,1,6.23,23.296167,16,21.279349,16,-21.287002,4,-24.817277,2
2015,0,5.85,-62.460701,36,-46.905234,36,-6.590307,1,-66.728345,10
2015,1,3.33,4.708865,49,6.196197,48,-16.932952,3,-32.685514,0
2016,0,17.94,-73.571109,26,-47.397975,47,-67.166613,3,-77.066668,5
2016,1,9.99,-39.413896,-96,-33.925033,-89,-52.351333,70,-65.490134,27
2017,0,9.21,-49.932621,25,-43.083815,37,-4.572971,-1,-54.471552,3
2017,1,7.37,-63.082924,28,-44.686879,29,13.192666,1,-59.942492,6


In [84]:
# Bold mask for Table 4: the first two columns are skipped (offset=2); the
# remaining columns alternate between two groups, and in each group the cell
# with the smallest absolute value per row is marked.
mask = get_bold_mask(df.abs(), np.argmin, 2, 2)
# one format spec per column of df; 'd' columns are written as integers
df2latex(df, os.path.join(PLOT_PATH, r"table_4_peak_compare_percent.txt"), mask, ['d', '5.2f', '+5.1f', 'd', '+5.1f', 'd', '+5.1f', 'd', '+5.1f', 'd'])

C:\GitHub\ForecastModel-Revised-10062024\answer-plots\table_4_peak_compare_percent.txt


In [85]:
# Column-wise median of the absolute values over all peaks of the Table 4 frame.
df.abs().median()

peak                       0.500000
peak_flow                  8.290000
hyd_perr                  44.673259
hyd_poff                  27.000000
peak_median               38.504424
off_median                36.500000
peak_median_pbhm-hlstm    19.941276
off_median_pbhm-hlstm      2.500000
peak_median_elstm         57.207022
off_median_elstm           4.000000
dtype: float64