In [1]:
import pandas as pd
from pyprojroot import here
import numpy as np
from pathlib import Path

from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import kruskal, ranksums
import scikit_posthocs as sp

from nutils import interval_score, bootstrap, name_mask, pairwise

In [2]:
def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

In [3]:
TARGET = 'occ'

In [4]:
true_matrix = pd.read_csv(
    here() / f'data/processed/true_matrices/{TARGET}.csv',
    index_col='datetime',
    parse_dates=True
)

In [17]:
result_list = list()
error_list = list()
data_path = Path('data/processed/prediction_matrices/')

for p in here(data_path / '50').glob('*512.csv'):
    
    result = dict()
    
    parts = p.stem.split('-')
    
    result['Model'] = parts[1]
    result['FS'] = parts[2]
    name = parts[1] + '-' + parts[2]
    result['name'] = name
    
    pred_matrix = pd.read_csv(
        p, 
        index_col='datetime', 
        parse_dates=True)
    
    idx = pred_matrix.dropna().index.intersection(true_matrix.dropna().index)

    pred_vector = pred_matrix.loc[idx].values.flatten()
    true_vector = true_matrix.loc[idx].values.flatten()
    
    # error for tests
    error = np.abs(pred_vector - true_vector)
    error = pd.Series(error, name=name)
    error_list.append(error)

    # mae
    mae = mean_absolute_error(true_vector, pred_vector)
    lb, ub = bootstrap(true_vector, pred_vector, mean_absolute_error)
    result['MAE'] = f'{mae:.2f} ({lb:.2f}-{ub:.2f})'
    result['_mae'] = mae
    print(mae)
    
    # rmse 
    rmse = root_mean_squared_error(true_vector, pred_vector)
    lb, ub = bootstrap(true_vector, pred_vector, root_mean_squared_error)
    result['RMSE'] = f'{rmse:.2f} ({lb:.2f}-{ub:.2f})'
    result['_rmse'] = rmse
    """
    # msis
    if result['Model'] != 'sn':
        lb = pd.read_csv(here() / data_path / '05' / f'{p.stem}.csv', 
                         parse_dates=True, 
                         index_col='datetime').loc[idx].values.flatten()
        ub = pd.read_csv(here() / data_path / '95' / f'{p.stem}.csv',
                         parse_dates=True, 
                         index_col='datetime').loc[idx].values.flatten()
        
        result['MSIS'] = interval_score(
            true_vector, 
            0.05, 
            q_left=lb, 
            q_right=ub, 
            mean=True, 
            scaled=True, 
            seasonality=24)
    """
    
    result_list.append(result)

7.047825091575092
6.8590293040293036
7.293817536630038
6.6877186355311355
5.799358974358974


In [7]:
df = pd.DataFrame(result_list)

In [8]:
# Statistical tests
errors = pd.concat(error_list, axis=1)
s, p = kruskal(*errors.values, nan_policy='omit')
errors = errors.melt()
table = sp.posthoc_dunn(errors, val_col='value', group_col='variable', p_adjust='holm')
values = table.iloc[:,0].round(3)

In [9]:
# Process test results
values = values.replace(0.000, '<.001')
values = values.replace(1.0, '1.000')
values['arimax-u'] = np.nan
values.name = 'p'
df = df.join(values, on='name')

In [10]:
# Calculate delta
baseline = df[df.Model=='arimax']._mae.values[0]
df['Delta (%)'] = ((baseline - df._mae) / baseline) * 100
df['Delta (%)'] = df['Delta (%)'].round(0)
df['Delta (%)'] = df['Delta (%)'].astype(int)
df = df.drop(columns='_mae')

IndexError: index 0 is out of bounds for axis 0 with size 0

In [11]:
# Sort
df = df.sort_values(by='_rmse', ascending=False)
df = df.drop(columns='_rmse')

df.Model.replace(name_mask, inplace=True)
df = df.set_index('Model')

In [13]:
# Reorganize columns
df = df[['FS', 'MAE', 'p', 'RMSE']]

In [12]:
# Reorganize columns
df = df[['FS', 'MAE', 'Delta (%)', 'p', 'RMSE', 'MSIS']]

KeyError: "['Delta (%)', 'MSIS'] not in index"

In [14]:
df.MSIS = df.MSIS.round(0)
df.MSIS = df.MSIS.fillna(200)
df.MSIS = df.MSIS.astype(int)
df.MSIS = df.MSIS.replace(200, '-')

AttributeError: 'DataFrame' object has no attribute 'MSIS'

In [15]:
df.FS = df.FS.str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.FS = df.FS.str.upper()


In [16]:
df.to_latex(
    buf = here() / 'output/tables/performance.tex',
    #float_format='%.0f',
    column_format='rcccccc',
    label='tab:performance',
    caption='Continuous performance of the tested models.\
    FS = feature set, MAE = mean absolute error, \
    RMSE = mean squared error, \
    MSIS = mean scaled interval score. \
    95\% confidence intervals in parenthesis.',
    position='H',
    index=True,
    na_rep='-'
)

  df.to_latex(
