# Evaluation Notebook

In [26]:
import os
import pickle
import pandas as pd

from utils import tools

In [27]:
# Name of the dataset under analysis; also the sub-directory holding its results.
data = 'pvod'

config = tools.load_config('config.yaml')
results_dir = os.path.join(config['eval']['results_path'], data)
# Sorted so that the row order/index of the summary tables does not depend on
# the arbitrary order in which os.listdir returns the entries.
result_files = sorted(os.listdir(results_dir))
pkl_files = [f for f in result_files if f.endswith('.pkl')]
# Result files are classified by their file-name prefix ('cl_' / 'fl_') rather
# than by a substring test, so a dataset or model name that happens to contain
# "cl" or "fl" is not misclassified.
# pickle files where training was performed on concatenated datasets (centralized learning)
cl_sims = [f for f in pkl_files if f.startswith('cl_')]
# pickle files for federated learning simulations
fl_sims = [f for f in pkl_files if f.startswith('fl_')]
# pickle files where training was performed on single dataset sequentially
sims = [f for f in pkl_files if not f.startswith(('cl_', 'fl_'))]
# File with the persistence results; fail with a clear message if it is missing.
persistence_candidates = [f for f in result_files if 'persistence' in f]
if not persistence_candidates:
    raise FileNotFoundError(f"No file containing 'persistence' found in {results_dir!r}")
persistence_file = persistence_candidates[0]

In [28]:
def concatenate_results(results_dir: str,
                        results: list,
                        get_skill: bool = False,
                        pers: pd.DataFrame = pd.DataFrame(),
                        sort_skill: bool = False) -> pd.DataFrame:
    """Build one summary row per result file by averaging its evaluation table.

    Each file in ``results`` is a pickle holding a dict whose ``'evaluation'``
    entry is a DataFrame. After resetting its index, the frame must contain the
    columns ``'Models'``, ``'output_dim'``, ``'freq'``, ``'t_0'`` and ``'key'``.
    The first four are taken from the first row and used as identifiers of the
    file; ``'key'`` is dropped; all remaining columns are averaged over rows.

    Args:
        results_dir: Directory containing the result files.
        results: File names (relative to ``results_dir``) to summarise.
        get_skill: If True, (re)compute a ``'Skill'`` column for every row as
            ``1 - RMSE / RMSE_p`` before averaging, where ``RMSE_p`` is the
            ``'RMSE'`` of ``pers`` matched on ``'key'``, ``'output_dim'`` and
            ``'freq'`` (left join).
        pers: Frame providing the reference RMSE; only used when ``get_skill``
            is True. The default frame is never modified.
        sort_skill: If True, order the final table by ``'Skill'`` descending
            instead of by ``'output_dim'``, ``'freq'``, ``'Models'``.

    Returns:
        DataFrame with the averaged metric columns followed by the identifier
        columns, one row per file. Metric columns are the union over all
        files (missing values are NaN). An empty frame with only the
        identifier columns is returned when ``results`` is empty.

    Raises:
        KeyError: If ``get_skill`` is True and ``pers`` lacks a required
            column, or if an evaluation table lacks an expected column.
    """
    index_cols = ['Models', 'output_dim', 'freq', 't_0']
    merge_keys = ['key', 'output_dim', 'freq']

    # Nothing to summarise: return an empty table instead of failing on an
    # unbound variable further down.
    if not results:
        return pd.DataFrame(columns=index_cols)

    if get_skill:
        required = ['RMSE'] + merge_keys
        missing = [c for c in required if pers is None or c not in pers.columns]
        if missing:
            raise KeyError(
                f"`pers` is missing columns required to compute the skill score: {missing}"
            )

    metrics = []
    identifiers = []
    for file in results:  # Iterate through all result files
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(os.path.join(results_dir, file), 'rb') as f:
            pkl = pickle.load(f)
        evaluation = pkl['evaluation'].reset_index()
        if get_skill:
            # The empty suffix keeps the model's own RMSE under its name; the
            # reference RMSE arrives as 'RMSE_p'.
            evaluation = pd.merge(evaluation, pers[['RMSE'] + merge_keys],
                                  on=merge_keys,
                                  how='left',
                                  suffixes=('', '_p'))
            evaluation['Skill'] = 1 - evaluation['RMSE'] / evaluation['RMSE_p']
            evaluation = evaluation.drop(columns='RMSE_p')
        # Identifier values are read from the first row of each file.
        identifiers.append({col: evaluation[col].iloc[0] for col in index_cols})
        metrics.append(evaluation.drop(columns=index_cols + ['key']).mean(axis=0))

    # Let pandas take the union of metric columns across files, so a metric
    # absent from the last file is not silently discarded.
    summary = pd.concat([pd.DataFrame(metrics), pd.DataFrame(identifiers)], axis=1)
    summary = summary.sort_values(['output_dim', 'freq', 'Models'])
    if sort_skill:
        summary = summary.sort_values(['Skill'], ascending=False)
    return summary

def read_sim(results_dir, sim):
    """Load one pickled simulation result, returning None if it cannot be read.

    Args:
        results_dir: Directory containing the result files.
        sim: File name of the pickle inside ``results_dir``.

    Returns:
        The unpickled object, or None when the file is missing or cannot be
        unpickled. In that case a warning describing the failure is emitted
        instead of silently hiding it.
    """
    path = os.path.join(results_dir, sim)
    try:
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(path, 'rb') as f:
            return pickle.load(f)
    except Exception as exc:
        # Best-effort by design (the caller gets None), but catch Exception
        # rather than everything so KeyboardInterrupt/SystemExit still propagate.
        import warnings
        warnings.warn(f'Could not load simulation results from {path!r}: {exc!r}',
                      stacklevel=2)
        return None

In [29]:
# Load the persistence results (CSV) stored next to the simulation pickles;
# they can be passed to concatenate_results as the reference RMSE for the skill score.
persistence_path = os.path.join(results_dir, persistence_file)
pers = pd.read_csv(persistence_path)

Local simulation analysis - 1 dataset : 1 model 

In [18]:
# One summary row per locally trained simulation file.
df_sep = concatenate_results(results_dir, sims)
df_sep

2025-04-23 10:49:20.928559: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-04-23 10:49:20.928807: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2025-04-23 10:49:20.928826: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2025-04-23 10:49:20.928907: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-04-23 10:49:20.928926: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Unnamed: 0,R^2,RMSE,MAE,Skill,retrain_interval,Models,output_dim,freq,t_0
1,0.879178,0.092808,0.051545,0.390635,,bilstm,1,1h,
2,0.877271,0.09348,0.051344,0.387123,,lstm,1,1h,
4,0.865613,0.096584,0.05834,0.332225,,tft,1,1h,
5,0.866132,0.097908,0.059619,0.357072,,bilstm,48,1h,
6,0.837694,0.107862,0.064739,0.293578,,lstm,48,1h,
0,0.819761,0.113709,0.069235,0.257288,,tcn-gru,48,1h,
3,0.811786,0.115398,0.067755,0.208857,,tft,48,1h,


Centralized simulation analysis - N datasets : 1 model

In [29]:
# One summary row per centralized-learning file; the skill score is computed
# here against the RMSE values in `pers`.
df_cl = concatenate_results(results_dir, cl_sims, get_skill=True, pers=pers)
df_cl

Unnamed: 0,R^2,RMSE,MAE,Skill,Models,output_dim,freq,t_0
0,0.725657,0.130368,0.072616,0.110185,tft,48,1h,


Federated simulation analysis - N datasets : 1 model

In [13]:
# One summary row per federated-learning simulation file.
df_fl = concatenate_results(results_dir, fl_sims)
df_fl

  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))


Unnamed: 0,R^2,RMSE,MAE,Skill,Models,output_dim,freq,t_0
4,0.835251,0.107536,0.058138,0.295496,tcn-gru,1,1h,
3,0.850806,0.102455,0.058093,0.334009,bilstm,48,1h,
2,0.831738,0.108809,0.061448,0.290704,lstm,48,1h,
1,0.840405,0.106631,0.061371,0.308183,tcn-gru,48,1h,
0,0.856006,0.101952,0.056313,0.315888,tft,48,1h,


Individual simulation analysis

In [30]:
# Configuration of the single simulation to inspect in detail.
model = 'tft'
output_dim = 48
freq = '1h'

# File names of the local, centralized ('cl_') and federated ('fl_') results for
# that configuration. All three use `output_dim`, so the compared runs always
# share the same output dimension.
sim = f'd-{data}_m-{model}_out-{output_dim}_freq-{freq}.pkl'
cl_sim = f'cl_d-{data}_m-{model}_out-{output_dim}_freq-{freq}.pkl'
fl_sim = f'fl_d-{data}_m-{model}_out-{output_dim}_freq-{freq}.pkl'

In [31]:
# Load the three result files selected above; an entry is None when its file
# could not be read.
sim_results, cl_results, fl_results = (
    read_sim(results_dir, file_name) for file_name in (sim, cl_sim, fl_sim)
)

In [32]:
# Evaluation table of the selected local simulation (one row per `key`).
sim_results['evaluation']

Unnamed: 0_level_0,R^2,RMSE,MAE,Skill,retrain_interval,output_dim,freq,key,t_0
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
tft,0.771039,0.110582,0.06024,0.114765,,48,1h,station00.csv,
tft,0.802007,0.13649,0.082453,0.133968,,48,1h,station01.csv,
tft,0.816905,0.110721,0.063542,0.221176,,48,1h,station02.csv,
tft,0.864706,0.141438,0.075544,0.409657,,48,1h,station04.csv,
tft,0.826771,0.086967,0.048764,0.148813,,48,1h,station06.csv,
tft,0.751962,0.122653,0.087092,0.124555,,48,1h,station07.csv,
tft,0.849109,0.098935,0.056649,0.309067,,48,1h,station08.csv,


In [33]:
# Evaluation table of the selected federated simulation (one row per `key`).
fl_results['evaluation']

Unnamed: 0_level_0,R^2,RMSE,MAE,Skill,output_dim,freq,key,t_0
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tft,0.856645,0.0875,0.044142,0.299539,48,1h,station00.csv,
tft,0.892889,0.100391,0.055701,0.363021,48,1h,station01.csv,
tft,0.917931,0.074128,0.042151,0.478577,48,1h,station02.csv,
tft,0.781501,0.179743,0.102426,0.249779,48,1h,station04.csv,
tft,0.792202,0.09525,0.055419,0.067742,48,1h,station06.csv,
tft,0.868394,0.089342,0.046508,0.362313,48,1h,station07.csv,
tft,0.882482,0.087311,0.047841,0.390243,48,1h,station08.csv,


In [12]:
# Evaluation table of the selected centralized simulation (one row per `key`).
cl_results['evaluation']

Unnamed: 0_level_0,R^2,RMSE,MAE,Skill,output_dim,freq,key,t_0
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tft,0.754981,0.114394,0.059101,0.084248,48,1h,station00.csv,
tft,0.856837,0.116063,0.066582,0.263583,48,1h,station01.csv,
tft,0.771421,0.123712,0.070206,0.1298,48,1h,station02.csv,
tft,0.892132,0.126291,0.06796,0.472879,48,1h,station04.csv,
tft,0.396252,0.162357,0.091801,-0.589069,48,1h,station06.csv,
tft,0.797792,0.110744,0.060261,0.209558,48,1h,station07.csv,
tft,0.610182,0.159019,0.092399,-0.110543,48,1h,station08.csv,
