In [None]:
import pandas as pd
from pyprojroot import here
import numpy as np
from pathlib import Path

from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import kruskal, ranksums
import scikit_posthocs as sp

from nutils import interval_score, bootstrap, name_mask, pairwise

In [None]:
# Name of the target series under evaluation; it selects which ground-truth
# CSV (data/processed/true_matrices/<TARGET>.csv) is read by the notebook.
TARGET = 'occ'

In [None]:
# Ground-truth matrix for TARGET, read from the project root and indexed by
# its parsed 'datetime' column.
true_matrix = pd.read_csv(
    here().joinpath(f'data/processed/true_matrices/{TARGET}.csv'),
    parse_dates=True,
    index_col='datetime',
)

In [None]:
def flatten(matrix):
    """Collapse a table into a single one-dimensional array.

    Parameters
    ----------
    matrix : object exposing ``.values`` (e.g. a pandas DataFrame)
        The table whose cells should be laid out in one flat sequence.

    Returns
    -------
    numpy.ndarray
        A new 1-D array holding every cell of ``matrix`` in row-major order.
    """
    cell_values = matrix.values
    flat = cell_values.flatten()
    return flat

In [None]:
# Collect one record per prediction file: model name, feature set and the
# RMSE for every calendar month (keys 1..12).
result_list = list()
error_list = list()
data_path = Path('data/processed/prediction_matrices/')

# Complete rows of the reference matrix. Computed once, and never written back
# to `true_matrix`, so every model is aligned against the same full reference
# instead of a reference that shrinks with each processed file.
true_complete_index = true_matrix.dropna().index

# sorted() makes the processing order independent of the filesystem.
for p in sorted(here(data_path / '50').glob('*1.csv')):

    result = dict()

    # The file stem is split on '-'; fields 1 and 2 are used as the model
    # name and the feature set respectively.
    parts = p.stem.split('-')

    result['Model'] = parts[1]
    result['FS'] = parts[2]
    name = parts[1] + '-' + parts[2]

    pred_matrix = pd.read_csv(
        p,
        index_col='datetime',
        parse_dates=True)

    # Evaluate only on timestamps where both matrices have complete rows.
    idx = pred_matrix.dropna().index.intersection(true_complete_index)

    pred_matrix = pred_matrix.loc[idx]
    true_aligned = true_matrix.loc[idx]

    # Month number -> flat array of all values observed in that month.
    true = true_aligned.groupby(true_aligned.index.month).apply(flatten).to_dict()
    pred = pred_matrix.groupby(pred_matrix.index.month).apply(flatten).to_dict()

    for month in range(1, 13):
        # mean_squared_error returns the MSE; take the square root so the
        # stored value really is the RMSE that the table reports.
        rmse = mean_squared_error(true[month], pred[month]) ** 0.5
        result[month] = rmse

    result_list.append(result)

In [None]:
# Build the results table: one row per record in result_list, with the
# 'Model' and 'FS' columns first, followed by one column per month.
df = pd.DataFrame(result_list)

df = df.round(0)
# Assign the replaced column back explicitly. A chained
# `df.Model.replace(..., inplace=True)` operates on an intermediate Series and
# is not guaranteed to update `df` (it is a no-op under pandas Copy-on-Write).
# name_mask comes from nutils; presumably it maps raw model identifiers to
# display names - TODO confirm.
df['Model'] = df['Model'].replace(name_mask)
df['FS'] = df['FS'].str.upper()

# Order rows by their mean monthly error, largest first; the helper column is
# dropped again so it does not end up in the exported table.
df['Mean'] = df.iloc[:, 2:].mean(axis=1)
df = df.sort_values(by='Mean', ascending=False)
df = df.drop(columns='Mean')

In [None]:
# Cast every column after the first two (the monthly values) to int.
# `df.iloc[:, 2:] = ...` writes into the existing float columns in place on
# pandas >= 2.0, which silently keeps the float dtype; astype with a
# per-column mapping actually changes the dtype.
df = df.astype({column: int for column in df.columns[2:]})

In [None]:
# Display the monthly error table (one row per model / feature-set record).
df

In [None]:
# Median of every numeric column, taken across all rows of the table.
df.median(numeric_only=True)

In [None]:
# Export the table as a LaTeX file under output/tables/ in the project root.
# The row index is omitted and missing values are written as '-'.
df.to_latex(
    buf = here() / 'output/tables/monthly_performance.tex',
    #float_format='%.0f',
    # One right-aligned column followed by fixed-width p{5pt} columns.
    column_format='rp{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}p{5pt}',
    label='tab:monthly_performance',
    # NOTE: the trailing backslash continues the string literal, so the
    # leading spaces of the following line are part of the caption text.
    caption='Monthly performance of the tested models in RMSE.\
    FS = feature set',
    position='H',
    index=False,
    na_rep='-'
)

In [None]:
# Plot
# Merge model name and feature set into a single "<Model>-<FS>" label, drop
# the now redundant FS column and use the label as the row index.
df = (
    df
    .assign(Model=df['Model'] + '-' + df['FS'])
    .drop(columns='FS')
    .set_index('Model')
)

In [None]:
# Grouped bar chart: the table is transposed so its columns run along the
# x-axis, with one bar per row label in each group.
df.transpose().plot.bar(width=0.8)