In [1]:
#!/usr/bin/env python3
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from src.ForecastModel.data.models import DataModelCV
from src.ForecastModel.utils.metrics import calculate_nse, calculate_kge, calculate_bias, calculate_rms, calculate_bias_flv, calculate_bias_fhv
from src.ForecastModel.utils.losses import loss_peak_mse
from src.ForecastModel.utils.postprocessing import ModelHandler, df2latex, get_bold_mask, load_metrics, get_n_peaks, dt

In [2]:
# Locations used throughout the notebook. They are assembled with os.path.join
# so the notebook is not tied to a backslash path separator.
PLOT_PATH          = "plots"                                       # output directory of the generated tables
DATA_PATH          = os.path.join("src", "data", "Dataset.csv")    # input time series (CSV)
CROSS_INDICES_PATH = os.path.join("src", "data", "indices")        # holds the cross_indices_<hindcast_length>.pkl files

In [3]:
# Models compared in this notebook, keyed by a short identifier.
# Each handler receives a display name, its result directory and plot styling.
# "arima" doubles as the baseline and is flagged as an external model: its
# forecasts are read from pre-computed files instead of being predicted with
# a Keras model. Result directories are joined with os.path.join so they do
# not depend on a backslash path separator.
models = {
    "arima": ModelHandler(
        "ARIMA",
        os.path.join("rst", "ARIMA"),
        is_final_model=True,
        is_external_model=True,
        color="#999999",
        ls="--",
    ),
    "elstm": ModelHandler(
        "eLSTM",
        os.path.join("rst", "eLSTM"),
        is_final_model=True,
        color='#984ea3',
        ls="-",
    ),
    "pbhm-hlstm": ModelHandler(
        "PBHM-HLSTM",
        os.path.join("rst", "PBHM-HLSTM"),
        is_final_model=True,
        color="#e41a1c",
        ls="-",
    ),
}

# Table 1: Statistics of the catchment

In [4]:
# Yearly catchment statistics and skill of the hydrologic model.
# The first three entries per year (kge, nse, bias) compare the simulated
# discharge (qsim) with the observed discharge (qmeasval); the remaining
# entries summarise discharge, precipitation and temperature of that year.

dfh = pd.read_csv(DATA_PATH, parse_dates=["time"])


def _flow_volume(series):
    # sum * 0.25 * 60 * 60 / 1e6 -- presumably turns a 15-minute discharge
    # series into a volume in millions of cubic metres; TODO confirm units
    return series.sum()*0.25*60*60 / 1e6


hyd_metrics = {}
for year in range(2011, 2018):
    dfp = dfh.loc[dfh["time"].dt.year == year]
    q_obs, q_sim = dfp.qmeasval, dfp.qsim

    hyd_metrics[year] = {
        # skill of the hydrologic model
        "kge":  calculate_kge(q_obs.values, q_sim.values),
        "nse":  calculate_nse(q_obs.values, q_sim.values),
        "bias": calculate_bias(q_obs.values, q_sim.values),
        # observed discharge
        "q_mean": q_obs.mean(),
        "q_std":  q_obs.std(),
        "q_max":  q_obs.max(),
        "q_sum":  _flow_volume(q_obs),
        # simulated discharge
        "qs_mean": q_sim.mean(),
        "qs_std":  q_sim.std(),
        "qs_max":  q_sim.max(),
        "qs_sum":  _flow_volume(q_sim),
        # precipitation; the factor 4 presumably converts a 15-minute value
        # to an hourly rate -- TODO confirm
        "pmx_max": dfp.pmax.max()*4,
        "pmx_sum": dfp.pmax.sum(),
        "p_max":   dfp.pmean.max()*4,
        "p_sum":   dfp.pmean.sum(),
        # temperature
        "t_mean": dfp.tmean.mean(),
        "t_std":  dfp.tmean.std(),
    }


In [5]:
# One row per statistic, one column per year (labelled as strings).
df_catchment_stats = (
    pd.DataFrame(hyd_metrics)   # rows: statistics, columns: years
    .iloc[3:]                   # skip the leading kge / nse / bias rows
    .rename(columns=str)
)
df_catchment_stats

Unnamed: 0,2011,2012,2013,2014,2015,2016,2017
q_mean,0.571149,1.00931,1.208354,1.167666,0.711204,0.829312,0.56743
q_std,0.245171,0.842681,0.681078,0.669319,0.332219,0.682278,0.229892
q_max,9.61,25.2,15.0,7.27,5.85,17.94,9.21
q_sum,18.011757,31.916797,38.106657,36.823518,22.428531,26.224821,17.698878
qs_mean,0.617823,1.103376,1.399735,1.349319,0.755306,0.916051,0.907808
qs_std,0.331923,0.832882,1.016826,0.719722,0.51915,0.70136,0.479473
qs_max,4.195333,8.937861,11.354018,7.681351,4.502031,6.426838,4.611206
qs_sum,19.48366,34.891398,44.142029,42.552124,23.819337,28.967737,28.315708
pmx_max,118.720001,180.520004,100.480003,84.639999,109.919998,231.880005,173.360001
pmx_sum,2159.559998,2877.670002,2777.4,2847.86,2198.09,3222.800001,3076.049996


In [6]:
# Export the catchment statistics through df2latex to
# table_1_summary_data.txt inside PLOT_PATH.
df2latex(df_catchment_stats, os.path.join(PLOT_PATH, r"table_1_summary_data.txt"))

plots\table_1_summary_data.txt


# Table 2: Average Model Performance

In [7]:
# Per model and test fold: summarise every metric over the forecasting window
# and determine the first lead-time step at which the model beats ARIMA.
metric_names = ["kge", "nse", "bias", "flv", "fhv"]
metric_labels = ["KGE", "NSE", "PBIAS", "FLV", "FHV"]

xx = np.arange(1,97)
df = pd.DataFrame(columns=["name", "year"])

# The ARIMA baseline is identical for every model, so it is read once here
# instead of once per loop iteration.
metrics_baseline = load_metrics(os.path.join(models["arima"].lg_path, "metrics_eval.txt"))

n_row = -5

for key in models:
    metrics = load_metrics(os.path.join(models[key].lg_path, "metrics_eval.txt"))

    # every model fills a block of five consecutive rows, one per test fold
    n_row += 5

    for met in metric_names:

        metric_test     = metrics["test"][met]
        metric_baseline = metrics_baseline["test"][met]

        for i in range(5):
            row = n_row + i
            df.loc[row, ["name", "year"]] = [models[key].name, 2013+i]

            # evaluations --------------------------------------------------------------------------
            # median metric of the test set
            df.loc[row, [f"{met}_test"]]   = [np.median(metric_test[i])]
            # metric for initial and final step in forecasting window
            df.loc[row, [f"{met}_init"]]   = [metric_test[i][0]]
            df.loc[row, [f"{met}_final"]]  = [metric_test[i][-1]]
            # difference of metric between initial and final step in forecasting window
            df.loc[row, [f"{met}_drop"]]   = [metric_test[i][0] - metric_test[i][-1]]
            # first lead-time step at which the model outperforms the baseline (ARIMA)
            # NOTE(review): "outperforms" is tested as "larger than the baseline";
            # this is also applied to flv and fhv -- confirm that larger is
            # really better for these two metrics.
            if met != "bias":
                out_perform_idxs = [step for step, (x, y) in enumerate(zip(metric_baseline[i], metric_test[i])) if y-x > 0]
                if len(out_perform_idxs) > 0:
                    df.loc[row, [f"{met}_out"]] = out_perform_idxs[0]
                    # one step is 0.25 h
                    df.loc[row, [f"{met}_out_hours"]] = out_perform_idxs[0] * 0.25
                else:
                    # -1 marks "never outperforms the baseline"
                    df.loc[row, [f"{met}_out"]] = -1
                    df.loc[row, [f"{met}_out_hours"]] = -1

# summary
cols = df.columns.to_list()
df[cols[:2]+cols[4:9]+cols[12:17]+cols[20:]]

Unnamed: 0,name,year,kge_final,kge_drop,kge_out,kge_out_hours,nse_test,nse_out,nse_out_hours,bias_test,...,flv_final,flv_drop,flv_out,flv_out_hours,fhv_test,fhv_init,fhv_final,fhv_drop,fhv_out,fhv_out_hours
0,ARIMA,2013,0.883965,0.108399,-1.0,-1.0,0.769669,-1.0,-1.0,0.41929,...,50.730574,-64.122792,-1.0,-1.0,-0.77536,0.237162,-0.011318,0.24848,-1.0,-1.0
1,ARIMA,2014,0.869098,0.126521,-1.0,-1.0,0.76363,-1.0,-1.0,0.866919,...,10.829539,-34.604911,-1.0,-1.0,6.403875,0.344575,5.640614,-5.296039,-1.0,-1.0
2,ARIMA,2015,0.726017,0.269026,-1.0,-1.0,0.670728,-1.0,-1.0,0.71048,...,46.954967,-52.858248,-1.0,-1.0,24.910782,0.564424,34.700103,-34.135679,-1.0,-1.0
3,ARIMA,2016,0.80667,0.186046,-1.0,-1.0,0.683637,-1.0,-1.0,0.303081,...,17.893949,-21.574498,-1.0,-1.0,-6.162203,0.025213,-7.501014,7.526227,-1.0,-1.0
4,ARIMA,2017,0.594178,0.37445,-1.0,-1.0,0.232782,-1.0,-1.0,0.559718,...,17.399872,-22.109133,-1.0,-1.0,7.066954,1.193207,5.340759,-4.147552,-1.0,-1.0
5,eLSTM,2013,0.897098,0.050535,9.0,2.25,0.841174,6.0,1.5,-0.91899,...,25.436694,-41.311113,2.0,0.5,2.679881,-5.643594,0.937166,-6.58076,27.0,6.75
6,eLSTM,2014,0.787249,0.112874,43.0,10.75,0.905239,8.0,2.0,2.037583,...,-106.734302,89.260422,0.0,0.0,-12.417576,-10.600086,-18.912759,8.312673,-1.0,-1.0
7,eLSTM,2015,0.704111,0.181462,13.0,3.25,0.90564,6.0,1.5,1.221196,...,65.030179,-26.562638,0.0,0.0,-5.346377,-3.703567,-7.830905,4.127338,-1.0,-1.0
8,eLSTM,2016,0.614212,0.237413,-1.0,-1.0,0.671255,17.0,4.25,-8.79085,...,23.965108,-137.176208,78.0,19.5,-39.579754,-21.777544,-42.981597,21.204052,-1.0,-1.0
9,eLSTM,2017,0.647341,0.203271,5.0,1.25,0.656459,4.0,1.0,-0.2351,...,-106.52129,148.75785,0.0,0.0,0.945907,-6.837054,-0.875328,-5.961725,87.0,21.75


In [8]:
# Skill of the hydrologic model per year (simulated vs. observed discharge).
dfh = pd.read_csv(DATA_PATH, parse_dates=["time"])

hyd_scores = {
    "kge":  calculate_kge,
    "nse":  calculate_nse,
    "bias": calculate_bias,
    "flv":  calculate_bias_flv,
    "fhv":  calculate_bias_fhv,
}

hyd_metrics = {}
for year in range(2012, 2018):
    dfp = dfh.loc[dfh["time"].dt.year == year]
    q_obs, q_sim = dfp.qmeasval.values, dfp.qsim.values
    hyd_metrics[year] = {name: score(q_obs, q_sim) for name, score in hyd_scores.items()}


# Test-set metrics of every data model, arranged as evalu[model][year][metric].
evalu = {}
for key, handler in models.items():
    metrics = load_metrics(os.path.join(handler.lg_path, "metrics_eval.txt"))
    evalu[key] = {
        2013 + i: {met: metrics["test"][met][i] for met in ["kge", "nse", "bias", "flv", "fhv"]}
        for i in range(0, 5)
    }
       

In [9]:
# Collect one row per test year with the metrics of the hydrologic model
# ("hyd_<metric>") followed by the median metric of every data model
# ("<model>_<metric>").
years = np.arange(2013, 2018)
metric_names = ["kge", "nse", "bias", "flv", "fhv"]

df_avg_model = pd.DataFrame(index=years)

# hydrologic model
for met in metric_names:
    for year in years:
        df_avg_model.loc[year, f"hyd_{met}"] = hyd_metrics[year][met]

# data models: median of the stored values per fold
for key, per_year in evalu.items():
    for met in metric_names:
        column = f"{key}_{met}"
        for year in years:
            df_avg_model.loc[year, column] = np.median(per_year[year][met])

df_avg_model

Unnamed: 0,hyd_kge,hyd_nse,hyd_bias,hyd_flv,hyd_fhv,arima_kge,arima_nse,arima_bias,arima_flv,arima_fhv,elstm_kge,elstm_nse,elstm_bias,elstm_flv,elstm_fhv,pbhm-hlstm_kge,pbhm-hlstm_nse,pbhm-hlstm_bias,pbhm-hlstm_flv,pbhm-hlstm_fhv
2013,0.631525,0.185013,15.838105,29.212907,32.441052,0.885991,0.769669,0.41929,48.702402,-0.77536,0.891144,0.841174,-0.91899,8.924938,2.679881,0.873886,0.915637,-4.399931,-129.74142,1.986731
2014,0.73864,0.49442,15.556921,-130.621654,6.862243,0.869743,0.76363,0.866919,6.087115,6.403875,0.858158,0.905239,2.037583,-141.644974,-12.417576,0.939515,0.9491,-3.055125,-136.649814,-8.349699
2015,0.505049,0.236257,6.201059,16.722689,35.47451,0.800245,0.670728,0.71048,43.057975,24.910782,0.887542,0.90564,1.221196,38.667841,-5.346377,0.869994,0.878783,7.679015,-18.462809,4.522243
2016,0.738958,0.512468,10.459236,-46.968768,-5.176013,0.828989,0.683637,0.303081,18.348574,-6.162203,0.66889,0.671255,-8.79085,-119.893203,-39.579754,0.892467,0.876417,-2.539703,-1.47171,-10.270599
2017,0.191555,-4.243535,59.985891,-151.860726,74.533865,0.580995,0.232782,0.559718,10.826032,7.066954,0.798162,0.656459,-0.2351,-194.327971,0.945907,0.830826,0.69721,12.175733,-306.760369,11.077637


In [10]:
# Table 2: KGE, NSE, PBIAS and FHV of every model; the mask marks the entries
# df2latex prints in bold (selected on absolute values).
table_2_metrics = ("kge", "nse", "bias", "fhv")
columns = [col for col in df_avg_model.columns if col.split("_")[1] in table_2_metrics]
table_2 = df_avg_model[columns]
mask = get_bold_mask(table_2.abs(), [np.argmax, np.argmax, np.argmin, np.argmin], 4, 0)
table_2_formats = ['6.2f', '6.2f', '+6.1f', '+6.1f']*4
df2latex(table_2, os.path.join(PLOT_PATH, r"table_2_mean_metrics.txt"), mask, table_2_formats)

# Table 3: the three bias-type metrics only.
table_3_metrics = ("bias", "flv", "fhv")
columns = [col for col in df_avg_model.columns if col.split("_")[1] in table_3_metrics]
table_3 = df_avg_model[columns]
mask = get_bold_mask(table_3.abs(), [np.argmin, np.argmin, np.argmin], 3, 2)
df2latex(table_3, os.path.join(PLOT_PATH, r"table_3_mean_metrics-bias.txt"), mask, '6.1f')

plots\table_2_mean_metrics.txt
plots\table_3_mean_metrics-bias.txt


# Table 4: Peak Performance

In [11]:
def dt(dates, format="%d/%m/%Y %H:%M"):
    """Return ``dates`` as a timezone-aware ``DatetimeIndex`` in UTC.

    Timestamps without timezone are interpreted as ``Europe/London`` local
    time and then converted to UTC; timezone-aware input is converted to UTC
    directly.

    Parameters
    ----------
    dates : index-like
        A ``DatetimeIndex`` or an index/list of date strings.
    format : str
        ``strptime`` pattern used when ``dates`` holds strings.

    Returns
    -------
    pandas.DatetimeIndex
        The timestamps expressed in UTC.

    Notes
    -----
    This definition replaces the ``dt`` imported from
    ``src.ForecastModel.utils.postprocessing`` in the import cell.
    """
    # Parse first: an index of strings has no ``tz`` attribute, so the
    # timezone can only be inspected on the parsed result.
    parsed = pd.to_datetime(dates, format=format)
    if parsed.tz is None:
        # make TZ aware
        return parsed.tz_localize("Europe/London").tz_convert("UTC")
    return parsed.tz_convert("UTC")

In [12]:
# Settings of the peak analysis
num_peaks_per_fold = 2    # number of peaks per fold to analyze
load_predictions   = False # True: reuse a cached eval_peaks.pkl if present; False: always recompute the peak evaluation

In [None]:
# Row offset into dfp; it is advanced by 10 before each model is processed,
# so the first model starts at offset 0.
idx = -10

# Column name -> dtype of the (initially empty) peak evaluation table.
peak_dtypes = {
    "name":       "str",
    "year":       "int32",
    "peak":       "int32",
    "peak_flow":  "float64",
    "total_flow": "float64",
    "hyd_perr":   "float64",
    "hyd_poff":   "float64",
    "rms_hyd":    "float64",
    "flow_hyd":   "float64",
    "rms_0":      "float64",
    "rms_m":      "float64",
    "rms_95":     "float64",
    "flow_0":     "float64",
    "flow_m":     "float64",
    "flow_95":    "float64",
}
dfp = pd.DataFrame(columns=list(peak_dtypes)).astype(dtype=peak_dtypes)

# Evaluate every model on the largest discharge peaks of each test fold.
#
# Step 1 (skipped when eval_peaks.pkl exists and load_predictions is True):
#   per fold, assemble observation, hydrologic simulation, all forecast lead
#   times and per-timestamp forecast statistics, cut out the peak windows
#   with get_n_peaks and cache the result as eval_peaks.pkl.
# Step 2: read the cached peaks and write one row per (model, fold, peak)
#   into dfp.
for key in models:
    idx += 10      # every model owns a block of ten rows in dfp
    print(key)
    eval_path = os.path.join(models[key].hp_path, "eval_peaks.pkl")

    if not os.path.exists(eval_path) or not load_predictions:
        eval_peaks = []

        # load datamodel
        dm = DataModelCV(DATA_PATH,
               target_name       = models[key].target_name,
               hincast_features  = models[key].feat_hindcast,
               forecast_features = models[key].feat_forecast,
             )

        if models[key].is_external_model:
            overlap_length = 0
            hindcast_length = 96
        else:
            # load trial data
            with open(os.path.join(models[key].hp_path, "trial.json")) as f:
                trial = json.load(f)

            hp_values = trial['hyperparameters']['values']
            hindcast_length = hp_values['hindcast_length']
            # 'osc_length' is optional: only a missing key falls back to 0,
            # other errors are no longer swallowed silently
            overlap_length = hp_values.get('osc_length', 0)

        dm.main(os.path.join(CROSS_INDICES_PATH, f"cross_indices_{hindcast_length}.pkl"))

        for n_fold in dm.cross_sets.keys():
            year = 2013 + n_fold

            # load dataset
            X, y  = dm.getDataSet(dm.cross_sets[n_fold]["test"], scale=True)

            # get hydrological model
            s = dm.getFeatureSet(n_fold+2, "qsim")[2]
            df = pd.DataFrame({'index':dt(s.index), 'qhyd':s.values}).set_index("index")

            # add ground truth
            s = dm.getFeatureSet(n_fold+2, "qmeasval")[2]
            s.index = dt(s.index)
            df = df.merge(s.rename("qmeas").to_frame(), left_index=True, right_index=True)

            if models[key].is_external_model:
                # external models ship their forecasts as pickled frames
                ext_df = pd.read_pickle(os.path.join(models[key].hp_path, f"forecast_{year}.pkl"))

                ext_df.index = pd.date_range(ext_df.index[0], ext_df.index[-1], freq="15min", tz="UTC")

                forecasts_df = ext_df[[f"fc{x:d}" for x in range(96)]].copy()
                forecasts_df.columns = [f"q{x:d}" for x in range(96)]

                del ext_df

            else:
                # load model
                tf.keras.backend.clear_session()
                model  = tf.keras.models.load_model(os.path.join(models[key].hp_path, f"model_fold_{n_fold:d}.keras"))

                yp = model.predict(X, batch_size=1000)

                if key == "lstm_residual":
                    _, _, yidx = dm.sets[dm.cross_sets[n_fold]["test"]]
                    simu = dm.getWithIndexArray(["qsim"], yidx)

                    # get real values from residuals
                    yp += simu[:,:,0]

                forecasts_df = pd.DataFrame(data    = yp,
                                        columns = [f"q{x:d}" for x in range(yp.shape[1])],
                                        index   = dt(dm.getTimeSet(n_fold+2, 0)[2]))

                # save dataframe
                df_out = forecasts_df.copy()
                df_out.index = pd.to_datetime(df_out.index, format="%d/%m/%Y %H:%M", utc=True)
                df_out.to_pickle(os.path.join(models[key].hp_path, f"forecast_{year}.pkl"))

            # shift lead step k down by k rows -- presumably so that every row
            # holds all forecasts valid for its timestamp; TODO confirm
            for forecast_step in range(1, forecasts_df.shape[1]):
                forecasts_df[f"q{forecast_step:d}"] = forecasts_df[f"q{forecast_step:d}"].shift(forecast_step)

            # merge model predictions
            df = df.merge(forecasts_df, left_index=True, right_index=True)

            # merge precipitation
            s = pd.Series(dm.getFeatureSet(n_fold+2, "pmean", 0)[2].values, dt(dm.getTimeSet(n_fold+2, 0)[2]))
            df = df.merge(s.rename("pmean").to_frame(), left_index=True, right_index=True)

            # statistics across all lead steps of a row, computed for the
            # whole array at once instead of row by row
            forecasts_df.dropna(inplace=True)
            fc_values = forecasts_df.to_numpy()
            stats_df = pd.DataFrame({"fmin":  fc_values.min(axis=1),
                                     "fmax":  fc_values.max(axis=1),
                                     "fmean": fc_values.mean(axis=1)},
                                    index = forecasts_df.index)
            for q in (95, 90, 75, 50, 25, 10, 5):
                stats_df[f"fq{q:d}"] = np.quantile(fc_values, q/100, axis=1)

            # merge stats
            df = df.merge(stats_df, left_index=True, right_index=True)
            df.dropna(inplace=True)

            peaks = get_n_peaks(df, "qmeas", num_peaks_per_fold, 24*4)
            # the fold is stored with an offset of 2
            peaks["n_fold"] = n_fold + 2

            # add to summary
            eval_peaks.append(peaks)

        # save data to pickle
        df = pd.concat(eval_peaks, axis=0)
        df.to_pickle(eval_path)

    df = pd.read_pickle(eval_path)
    for n_fold in df.n_fold.unique().tolist():
        peaks = df[df.n_fold == n_fold]
        for p in range(num_peaks_per_fold):
            peak = peaks[peaks.n_peak == p]     # rows belonging to the p-th peak
            row  = idx + n_fold + 5*p           # target row in dfp

            # eval peaks: error in peak magnitude (perr) and position (poff)
            idx_peak  = peak["qmeas"].argmax()
            peak_flow = peak["qmeas"].max()
            dfp.loc[row, ["hyd_perr", "hyd_poff"]] = [peak["qhyd"].max() - peak_flow,
                                                      peak["qhyd"].argmax() - idx_peak,
                                                     ]
            dfp.loc[row, [f"perr_{x}" for x in range(96)]] = [peak[f"q{x}"].max() - peak_flow for x in range(96)]
            dfp.loc[row, [f"poff_{x}" for x in range(96)]] = [peak[f"q{x}"].argmax() - idx_peak for x in range(96)]

            # eval section
            # note: "q0" / "q95" are the forecast columns of lead step 0 / 95,
            # not the "fq95" quantile column
            q_meas  = peak["qmeas"].values
            rms_q0  = calculate_rms(q_meas, peak["q0"].values)
            rms_qm  = calculate_rms(q_meas, peak["fmean"].values)
            rms_q95 = calculate_rms(q_meas, peak["q95"].values)
            rms_hyd = calculate_rms(q_meas, peak["qhyd"].values)

            # n_fold carries the offset of 2 added when the peaks were stored,
            # hence 2011 + n_fold
            dfp.loc[row, ["name", "year", "peak"]] = [models[key].name, np.int32(2011+n_fold), np.int32(p)]
            dfp.loc[row, ["peak_flow", "total_flow"]] = [peak_flow, peak["qmeas"].sum()]
            dfp.loc[row, ["rms_hyd", "flow_hyd"]] = [rms_hyd, peak["qhyd"].sum()]
            dfp.loc[row, ["rms_0", "rms_m", "rms_95"]] = [rms_q0, rms_qm, rms_q95]
            dfp.loc[row, ["flow_0", "flow_m", "flow_95"]] = [peak["q0"].sum(),
                                                             peak["fmean"].sum(),
                                                             peak["q95"].sum(),
                                                            ]

arima
dictonary loaded


In [None]:
# Condense dfp to one row per (model, year, peak): median peak error and
# median peak offset over all forecast steps, errors relative to the peak flow.
dfout = dfp[["name", "year", "peak", "peak_flow", "hyd_perr", "hyd_poff"]].copy()
dfout["peak_median"] = dfp.filter(regex='^perr').median(axis=1)
dfout["off_median"]  = dfp.filter(regex='^poff').median(axis=1)

# express magnitude errors in percent of the observed peak flow
dfout["hyd_perr"] = 100 * dfout["hyd_perr"] /  dfout["peak_flow"]
dfout["peak_median"] = 100 * dfout["peak_median"] /  dfout["peak_flow"]

dfout = dfout.reset_index()
# columns shared by all models (year, peak, peak_flow, hyd_perr, hyd_poff),
# taken from the first model's rows
df = dfout.loc[0:9, dfout.columns[2:7]]

# Append the two model-specific columns (peak_median, off_median) of every
# model side by side. Each model owns ten consecutive rows; the positional
# half-open slice selects exactly those ten, whereas an inclusive label slice
# up to n*10+10 also picks up the first row of the following model.
rows_per_model = 10
for n, key in enumerate(models.keys()):
    model_cols = dfout.iloc[n*rows_per_model:(n+1)*rows_per_model, 7:9].reset_index(drop=True)
    df = df.join(model_cols, how='left', rsuffix="_" + models[key].name)

df.index = pd.MultiIndex.from_arrays([df["year"].map(int), df["peak"].map(int)])
df = df.drop(["peak", "year"], axis=1)

In [None]:
# Append three "all folds" rows: medians over every first peak (0), every
# second peak (1) and both peaks together (99).

# hydrologic model
for column in ("hyd_perr", "hyd_poff"):
    hyd_values = dfout[column]
    df.loc[("all folds", 0), column]  = hyd_values.iloc[::2].median()
    df.loc[("all folds", 1), column]  = hyd_values.iloc[1::2].median()
    df.loc[("all folds", 99), column] = hyd_values.median()

# data models: magnitude errors in percent of the peak flow, offsets as stored
model_errors = {
    "peak_median": dfp.filter(regex='^perr').reset_index().drop("index", axis=1)
                      .multiply(100/dfout["peak_flow"].values, axis=0),
    "off_median":  dfp.filter(regex='^poff').reset_index().drop("index", axis=1),
}
for column, dfall in model_errors.items():
    for n, key in enumerate(models.keys()):
        # fix as suffix is not applied on first appearance of columns
        name = "" if n == 0 else "_" + models[key].name
        first, last = n*10, (n+1)*10-1      # inclusive label range of this model
        df.loc[("all folds", 0), column+name]  = np.median(dfall.loc[first:last:2, :].values)
        df.loc[("all folds", 1), column+name]  = np.median(dfall.loc[first+1:last+1:2, :].values)
        df.loc[("all folds", 99), column+name] = np.median(dfall.loc[first:last, :].values)

df = df.fillna(0)

In [None]:
# Turn the (year, peak) index back into columns; only "peak" is kept.
df = df.reset_index()
df = df.drop(["year"], axis=1)

# make latex table and bold best values per row and metric
mask = get_bold_mask(df.abs(), np.argmin, 2, 2)

# apply table format: show the year on the first row of each pair only;
# the positions after the last year (2018, 2019) are the summary rows
df.index = [str(2013+x//2) if x%2==0 else " " for x in df.index]
df = df.rename(index={'2018':"all folds", '2019':" "})
# Replace the column as a whole: writing strings into the integer column
# through .loc[:, "peak"] is an incompatible-dtype assignment that pandas
# deprecates.
df["peak"] = df["peak"].map(str).replace({'0':'1st', '1':'2nd', '99':"both"})


# one format per column: peak, peak_flow, hyd_perr, hyd_poff, then an
# (error, offset) pair per model
df2latex(df,
         os.path.join(PLOT_PATH, r"table_4_peak_compare_percent.txt"),
         mask,
         ['s', '5.2f', '+5.1f', 'd'] + ['+5.1f', 'd']*len(models.keys()),
        )

In [None]:
# display the peak comparison table exported by the previous cell
df

# Table 5: Fold Sensitivity - all samples

In [None]:
# load data frame (path joined with os.path.join so it does not depend on a
# backslash separator)
df = pd.read_pickle(os.path.join("rst", "ig_all_folds-final_model.pkl"))
df.index = df.index + 2013      # presumably fold number -> test year
df.index = df.index.map(str)
df.loc['all folds', df.columns] = df.sum()

# normalise every row by its row total, so the entries of a row add up to 1
df = df.div(df.sum(axis=1), axis=0)

# bold the largest absolute value
mask = get_bold_mask(df.abs(), np.argmax, 1, 0)
df2latex(df,
         os.path.join(PLOT_PATH, r"table_5_sensitivity-all.txt"),
         mask,
         ['5.2f']*df.shape[1],
        )



In [None]:
# display the sensitivity table exported by the previous cell
df

# Table 6: Fold Sensitivity - 2 major peaks

In [None]:
# load data frame (path joined with os.path.join so it does not depend on a
# backslash separator)
df = pd.read_pickle(os.path.join("rst", "ig_peaks-final-model.pkl"))

# Summary rows: totals over all first peaks (0), all second peaks (1) and
# over everything (99). The grand total is taken before any row is added.
sum_all = df.sum()
df.loc[('sum', 0), df.columns] = df.xs(0,level=1).sum()
df.loc[('sum', 1), df.columns] = df.xs(1,level=1).sum()
df.loc[('sum', 99), df.columns]= sum_all

# normalise every row by its row total, so the entries of a row add up to 1
df = df.div(df.sum(axis=1), axis=0)

# apply table format
df = df.reset_index()
df = df.drop(["level_0"], axis=1)

mask = get_bold_mask(df.abs(), np.argmax, 1, 1)

# show the year on the first row of each pair only; the positions after the
# last year (2018, 2019) are the summary rows
df.index = [str(2013+x//2) if x%2==0 else " " for x in df.index]
df = df.rename(index={'2018':"all folds", '2019':" "})
# Replace the column as a whole: writing strings into the integer column
# through .loc[:, "level_1"] is an incompatible-dtype assignment that pandas
# deprecates.
df["level_1"] = df["level_1"].map(str).replace({'0':'1st', '1':'2nd', '99':"both"})


# one format per column: 's' for the peak label, '5.2f' for each value column
df2latex(df,
         os.path.join(PLOT_PATH, r"table_6_sensitivity-peaks.txt"),
         mask,
         ['s'] + ['5.2f']*(df.shape[1] - 1),
        )

In [None]:
# display the peak sensitivity table exported by the previous cell
df