In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from panda.utils.eval_utils import get_summary_metrics_dict
from panda.utils.plot_utils import (
    apply_custom_style,
    make_box_plot,
    plot_all_metrics_by_prediction_length,
)

# Apply the plotting style described by the YAML file (path is relative to the notebook's working directory).
# NOTE(review): presumably this updates matplotlib rcParams — confirm in panda.utils.plot_utils.
apply_custom_style("../../config/plotting.yaml")

In [None]:
# Snapshot of the colors in matplotlib's currently configured property cycle, as a plain list.
DEFAULT_COLORS = list(plt.rcParams["axes.prop_cycle"].by_key()["color"])

In [None]:
# Output locations, relative to the notebook's working directory.
figs_save_dir = os.path.join("../../figures", "eval_metrics")  # figures are saved here
outputs_save_dir = os.path.join("../../outputs", "eval_metrics")  # CSV tables are saved here

# Create both directories up front; directories that already exist are left as they are.
os.makedirs(figs_save_dir, exist_ok=True)
os.makedirs(outputs_save_dir, exist_ok=True)

In [None]:
# Root of the working area, read from the WORK environment variable.
# NOTE(review): when WORK is unset this falls back to "", so every path built from WORK_DIR
# becomes relative to the current working directory — confirm that is intended.
WORK_DIR = os.getenv("WORK", "")
DATA_DIR = os.path.join(WORK_DIR, "data")

In [None]:
# Evaluation split whose metric directories are read below.
data_split = "test_zeroshot"

# Display name -> (family directory, run directory) under <WORK_DIR>/eval_results.
# Insertion order is preserved in run_metrics_dir_dict.
# For the three Chronos entries, "chronos_nondeterministic" is a disabled alternative to the
# "chronos" family directory.
_eval_runs = {
    "Panda": ("patchtst", "pft_chattn_emb_w_poly-0"),
    "Chronos 20M SFT": ("chronos", "chronos_t5_mini_ft-0"),
    "Chronos 20M": ("chronos", "chronos_mini_zeroshot"),
    "Chronos 200M": ("chronos", "chronos_base_zeroshot"),
    "Time MOE 50M": ("timemoe", "timemoe-50m"),
    "TimesFM 200M": ("timesfm", "timesfm-200m"),
}

# Display name -> directory holding that run's metric files for `data_split`.
run_metrics_dir_dict = {
    label: os.path.join(WORK_DIR, "eval_results", family, run_name, data_split)
    for label, (family, run_name) in _eval_runs.items()
}
# Disabled entry:
# "Dynamix": os.path.join(WORK_DIR, "eval_results", "dynamix", data_split),

In [None]:
import re

# Matches either a NumPy scalar repr such as ``np.float64(1.5)`` (group 1 captures the text
# inside the parentheses) or a bare numeric token / nan / inf / -inf (group 2).
# The numeric branch accepts a mantissa with a trailing dot ("2.", "1.e-05"). Requiring a
# digit after the dot would tokenize "1.e-05" as "1" followed by "-05".
_VALUE_PATTERN = re.compile(
    r"np\.float\d*\(([^)]+)\)"
    r"|([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf|-inf)",
    re.IGNORECASE,
)


def _parse_float(s: str) -> float:
    """Convert one matched token to a float.

    Surrounding whitespace and letter case are ignored. The tokens ``nan``, ``inf``,
    ``+inf`` and ``-inf`` map to the corresponding NumPy constants; anything else is
    passed to ``float`` (which raises ``ValueError`` on non-numeric text).
    """
    s = s.strip().lower()
    if s == "nan":
        return np.nan
    elif s in {"inf", "+inf"}:
        return np.inf
    elif s == "-inf":
        return -np.inf
    return float(s)


def parse_metric_lists(metrics_df: pd.DataFrame) -> pd.DataFrame:
    """Turn stringified lists in the metric columns into Python lists of floats.

    Args:
        metrics_df: Table whose cells may hold strings such as
            ``"[np.float64(0.1), nan]"`` or ``"[1.e-05 2.]"``.

    Returns:
        A copy of ``metrics_df``. In every column except ``"system"``, each string cell
        whose stripped text starts with ``"["`` is replaced by the list of floats found
        in it; all other cells are kept unchanged. The input frame is not modified.
    """

    def parse_value(value):
        # Only strings that look like a list are parsed; scalars pass through untouched.
        if isinstance(value, str) and value.strip().startswith("["):
            return [_parse_float(m.group(1) or m.group(2)) for m in _VALUE_PATTERN.finditer(value)]
        return value

    parsed = metrics_df.copy()
    for col in parsed.columns:
        if col != "system":
            parsed[col] = parsed[col].apply(parse_value)
    return parsed


def aggregate_system_metrics(metrics_df: pd.DataFrame) -> pd.DataFrame:
    """Collapse list-valued metric cells to a single number per cell.

    Args:
        metrics_df: Table whose non-``"system"`` columns may contain Python lists.

    Returns:
        A copy of ``metrics_df`` where, outside the ``"system"`` column, every non-empty
        list is replaced by its NaN-ignoring mean, every empty list by NaN, and every
        other value is kept as is. The input frame is not modified.
    """

    def _collapse(cell):
        if not isinstance(cell, list):
            return cell  # scalars (and non-list containers) pass through
        if not cell:
            return np.nan  # nothing to average
        return float(np.nanmean(cell))

    metric_columns = [name for name in metrics_df.columns if name != "system"]
    result = metrics_df.copy()
    for name in metric_columns:
        result[name] = result[name].apply(_collapse)
    return result


In [None]:
def _prediction_length_from_filename(file_name):
    """Return N for a file named ``<anything>_pred<N>.csv``, or None for any other name."""
    if not file_name.endswith(".csv") or "_pred" not in file_name:
        return None
    token = file_name.split("_pred")[1].split(".csv")[0]
    return int(token) if token.isdecimal() else None


# Load every model's metric CSVs, one file per prediction length.
#   metrics_all_runs[model][pred_len]          -> aggregated table as DataFrame.to_dict()
#   instance_metrics_all_runs[model][pred_len] -> parsed, un-aggregated DataFrame
metrics_all_runs = defaultdict(dict)
instance_metrics_all_runs = defaultdict(dict)
for model_name, run_metrics_dir in run_metrics_dir_dict.items():
    print(model_name)
    # isdir (rather than exists) so that a stray file at this path is skipped instead of
    # crashing os.listdir below.
    if not os.path.isdir(run_metrics_dir):
        print(f"Run metrics dir does not exist: {run_metrics_dir}")
        continue

    # Collect (prediction_length, file) pairs. CSVs that do not follow the
    # "_pred<N>.csv" naming are reported and skipped instead of aborting the whole load.
    metric_files = []
    for file in os.listdir(run_metrics_dir):
        prediction_length = _prediction_length_from_filename(file)
        if prediction_length is None:
            if file.endswith(".csv"):
                print(f"Skipping CSV without a '_pred<N>.csv' name: {file}")
            continue
        metric_files.append((prediction_length, file))

    # Ascending prediction length, so the per-model dicts are filled in that order.
    for prediction_length, file in sorted(metric_files, key=lambda item: item[0]):
        metrics_df = pd.read_csv(os.path.join(run_metrics_dir, file))
        parsed_metrics = parse_metric_lists(metrics_df)
        aggregated_metrics = aggregate_system_metrics(parsed_metrics)
        metrics_all_runs[model_name][prediction_length] = aggregated_metrics.to_dict()
        instance_metrics_all_runs[model_name][prediction_length] = parsed_metrics.copy()

In [None]:
# Sanity check: number of rows in the aggregated "smape" column for this model at prediction length 64.
# NOTE: metrics_all_runs is a defaultdict, so looking up a model that was not loaded inserts an
# empty entry for it before the [64] lookup raises KeyError.
len(metrics_all_runs["Chronos 20M SFT"][64]["smape"])

In [None]:
# Flatten each column mapping produced by DataFrame.to_dict() ({row_index: value}) into a plain
# list of values, and leave out the "system" label column.
unrolled_metrics = {
    model: {
        pred_len: {column: list(by_row.values()) for column, by_row in columns.items() if column != "system"}
        for pred_len, columns in model_metrics.items()
    }
    for model, model_metrics in metrics_all_runs.items()
}


def median_smape_128(m):
    """NaN-ignoring median of model ``m``'s sMAPE values at prediction length 128."""
    return float(np.nanmedian(unrolled_metrics[m][128]["smape"]))


# Only models that have sMAPE values at prediction length 128 are kept; they are ordered by
# ascending median sMAPE, and unrolled_metrics is rebuilt in that order.
models_with_smape_128 = [m for m in unrolled_metrics if "smape" in unrolled_metrics[m].get(128, {})]
sorted_models = sorted(models_with_smape_128, key=median_smape_128)
unrolled_metrics = {m: unrolled_metrics[m] for m in sorted_models}
n_runs = len(unrolled_metrics)

In [None]:
# Peek at the first ten sMAPE values of this model at prediction length 128.
unrolled_metrics["Chronos 20M SFT"][128]["smape"][:10]

In [None]:
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

METRICS = ["mae", "mse", "smape"]
PRED_LENGTHS = [128, 256, 512]

# Paired Wilcoxon signed-rank test of Panda against every other model, for each prediction
# length and each metric in METRICS.
#   p_values[pred_len]["<metric>_pvalue"][baseline]    -> p-value
#   p_values[pred_len]["<metric>_statistic"][baseline] -> test statistic
# NOTE(review): the two value lists are paired by position, which assumes both models list
# the same systems in the same order — confirm.
p_values = defaultdict(lambda: defaultdict(dict))
for baseline, metrics_by_len in unrolled_metrics.items():
    if baseline == "Panda":
        continue
    for pred_len, metrics in metrics_by_len.items():
        for metric in METRICS:
            baseline_vals = np.array(metrics[metric])
            panda_vals = np.array(unrolled_metrics["Panda"][pred_len][metric])
            # A pair is usable only when BOTH sides are non-NaN. Masking the baseline alone
            # lets a NaN from Panda through, which yields a NaN p-value and gets the whole
            # baseline row dropped by .dropna() below.
            valid = ~(np.isnan(baseline_vals) | np.isnan(panda_vals))
            stat, pval = wilcoxon(panda_vals[valid], baseline_vals[valid], correction=True)
            p_values[pred_len][f"{metric}_pvalue"][baseline] = pval
            p_values[pred_len][f"{metric}_statistic"][baseline] = stat

# For each reported prediction length: one row per baseline, one column per
# metric/statistic; rows containing any NaN are dropped. Each p-value column is then
# adjusted across baselines with multipletests' default settings and the table is saved as CSV.
for pred_len in PRED_LENGTHS:
    df = pd.DataFrame(p_values[pred_len]).dropna()
    # df.columns is read once here, so the columns added inside the loop are not revisited.
    for col in df.columns:
        if "pvalue" in col and len(df[col]) > 0:
            rejected, pvals_adj, *_ = multipletests(df[col])
            df[f"{col}_pval_adj"], df[f"{col}_reject"] = pvals_adj, rejected
    df.to_csv(f"{outputs_save_dir}/pvals_{pred_len}.csv")

In [None]:
# Shows the table left over from the last iteration of the loop above, i.e. prediction length 512.
df

In [None]:
# Take the first n_runs + 1 colors of the cycle, then drop the one at index 3 and anything
# past index 6, leaving at most six colors.
# NOTE(review): the reason for skipping index 3 is not visible in this notebook — confirm.
default_colors = DEFAULT_COLORS[: n_runs + 1]
default_colors = default_colors[:3] + default_colors[4:7]
print(default_colors)

In [None]:
# Models kept after the filter/sort above, in ascending order of median sMAPE at prediction length 128.
unrolled_metrics.keys()

In [None]:
# Box plot of sMAPE across models at prediction length 128, saved as "<figs_save_dir>/smape_128.pdf".
# The returned handles are reused by the legend-only figures in the following cells.
selected_metric = "smape"
legend_handles = make_box_plot(
    unrolled_metrics,
    prediction_length=128,
    metric_to_plot=selected_metric,  # Specify which metric to plot
    sort_runs=True,  # Optionally sort runs by their metric values
    colors=default_colors,
    title=None,
    title_kwargs={"fontsize": 10},
    use_inv_spearman=True,
    order_by_metric="smape",
    save_path=f"{figs_save_dir}/{selected_metric}_128.pdf",
    ylabel_fontsize=12,
    show_xlabel=False,
    box_percentile_range=(25, 75),
    whisker_percentile_range=(5, 95),
    alpha_val=0.8,
    fig_kwargs={"figsize": (2, 4)},
    box_width=1.0,
)

In [None]:
# Box plot of MAE across models at prediction length 128, saved as "<figs_save_dir>/mae_128.pdf".
# Runs are still ordered by sMAPE (order_by_metric="smape").
# NOTE(review): the whisker range here is (5, 90) while the sMAPE box plot uses (5, 95) —
# confirm the difference is intentional.
selected_metric = "mae"
legend_handles = make_box_plot(
    unrolled_metrics,
    prediction_length=128,
    metric_to_plot=selected_metric,  # Specify which metric to plot
    sort_runs=True,  # Optionally sort runs by their metric values
    colors=default_colors,
    title=None,
    title_kwargs={"fontsize": 10},
    use_inv_spearman=True,
    order_by_metric="smape",
    save_path=f"{figs_save_dir}/{selected_metric}_128.pdf",
    ylabel_fontsize=12,
    show_xlabel=False,
    box_percentile_range=(25, 75),
    whisker_percentile_range=(5, 90),
    alpha_val=0.8,
    fig_kwargs={"figsize": (2, 4)},
    box_width=1.0,
)

In [None]:
# Legend-only figure: a small figure with the ticks removed that holds just the legend,
# laid out in up to six columns and saved as its own PDF.
plt.figure(figsize=(6, 1))

# Build the legend from the handles returned by the most recent make_box_plot call.
# The `legend` variable itself is not used afterwards.
legend = plt.legend(
    handles=legend_handles,
    loc="upper center",
    frameon=True,
    ncol=6,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_horizontal_patches.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Legend-only figure, single-column variant: same handles as the horizontal version,
# stacked in one column and saved as its own PDF.
plt.figure(figsize=(3, 2))

# Build the legend from the handles returned by the most recent make_box_plot call.
# The `legend` variable itself is not used afterwards.
legend = plt.legend(
    handles=legend_handles,
    loc="upper center",
    frameon=True,
    ncol=1,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_vertical_patches.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Summary for sMAPE only; get_summary_metrics_dict returns a (summary, nan-flag) pair.
# NOTE: `has_nans` is rebound by the next cell.
smape_metrics_dict, has_nans = get_summary_metrics_dict(unrolled_metrics, "smape")

In [None]:
metrics = ["mse", "mae", "smape", "spearman"]

# One (summary, nan-flag) pair per metric, transposed into a tuple of summaries and a tuple
# of flags that both follow the order of `metrics`.
metrics_dicts, has_nans = zip(*(get_summary_metrics_dict(unrolled_metrics, name) for name in metrics))

# Metric name -> summary, and metric name -> nan-flag.
all_metrics_dict = dict(zip(metrics, metrics_dicts))
has_nans_dict = dict(zip(metrics, has_nans))

In [None]:
# Count the number of NaNs for each metric and model.
# Result: nan_counts[metric_name][model_name] -> number of NaN entries.
nan_counts = {}
for metric_name, metric_data in all_metrics_dict.items():
    nan_counts[metric_name] = {}
    for model_name, model_data in metric_data.items():
        # all_vals = np.concatenate(model_data["all_vals"])
        # NOTE(review): position 1 of "all_vals" is treated as prediction length 128 (per the
        # variable name); that holds only if 128 is the second prediction length in each
        # model's summary — confirm against get_summary_metrics_dict.
        all_vals_pred128 = model_data["all_vals"][1]
        nan_count = np.isnan(all_vals_pred128).sum()
        nan_counts[metric_name][model_name] = nan_count
        if nan_count > 0:
            print(f"Found {nan_count} NaNs in {model_name} for {metric_name}")

print(nan_counts)

In [None]:
# Metric name -> nan-flag returned by get_summary_metrics_dict for that metric.
has_nans_dict

Order the models by their median sMAPE at prediction length 128, and reorder every metric's summary to match.

In [None]:
# Rank models by the sMAPE median stored at position 1 of each summary's "medians" list
# (ascending).
# NOTE(review): position 1 is taken to be prediction length 128 — confirm against
# get_summary_metrics_dict.
smape_median_by_model = {name: summary["medians"][1] for name, summary in all_metrics_dict["smape"].items()}
model_names_ordering = sorted(smape_median_by_model, key=smape_median_by_model.get)
print(model_names_ordering)

# Rebuild each metric's model mapping in that order. A model missing from any metric is an
# error rather than being silently skipped.
reordered_metrics_dict = {}
for metric_name, metric_data in all_metrics_dict.items():
    for model_name in model_names_ordering:
        if model_name not in metric_data:
            raise ValueError(f"Model {model_name} not found in {metric_name}")
    reordered_metrics_dict[metric_name] = {name: metric_data[name] for name in model_names_ordering}
all_metrics_dict = reordered_metrics_dict

In [None]:
# Prediction lengths to report in the printout below (same values as PRED_LENGTHS above).
selected_pred_lengths = [128, 256, 512]
# Every prediction length loaded for Panda, in the order the files were read.
all_pred_lengths = list(unrolled_metrics["Panda"].keys())
print(all_pred_lengths)

In [None]:
# Print the median, p25 and p75 of MAE for every model at the selected prediction lengths.
# NOTE(review): the position `i` comes from Panda's prediction lengths and is used to index
# every model's summary lists — this assumes all models share those lengths in the same
# order; confirm.
_printed_stats = (("median", "medians"), ("p25", "p25"), ("p75", "p75"))  # (label, summary key)
for model_name in unrolled_metrics:
    print(f"========= model_name: {model_name}")
    for i, pred_length in enumerate(all_pred_lengths):
        if pred_length in selected_pred_lengths:
            print(f"pred_length: {pred_length}")
            for metric in ["mae"]:
                model_summary = all_metrics_dict[metric][model_name]
                for label, key in _printed_stats:
                    print(f"{metric} {label}: {model_summary[key][i]:.2f}")
            print("--------------------------------")

In [None]:
# One-row, four-panel figure of all four metrics, saved as a single PDF.
# An envelope is requested for mae, smape and spearman but not for mse.
# NOTE: legend_handles is rebound here; the legend cell below calls .values() on it, so this
# function's return value is used as a mapping (the box-plot handles were passed as-is).
legend_handles = plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    ["mse", "mae", "smape", "spearman"],
    metrics_to_show_envelope=["mae", "smape", "spearman"],
    n_cols=4,
    n_rows=1,
    save_path=f"{figs_save_dir}/zeroshot_metrics_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"loc": "upper left", "frameon": True, "fontsize": 10},
    colors=default_colors,
    use_inv_spearman=True,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
# Legend-only figure for the line plots: a small figure with the ticks removed that holds
# just the legend, laid out in up to six columns and saved as its own PDF.
plt.figure(figsize=(6, 1))

# Build the legend from the values of the handle mapping returned by the plot call above.
# The `legend` variable itself is not used afterwards.
legend = plt.legend(
    handles=legend_handles.values(),
    loc="upper center",
    frameon=True,
    ncol=6,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_horizontal.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Single-panel sMAPE figure with an envelope, saved as its own PDF.
# The returned handle mapping is kept for the vertical legend figure in the next cell.
metric_to_plot = "smape"
legend_handles = plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"frameon": True, "fontsize": 10, "loc": "lower right"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
# Legend-only figure for the line plots, single-column variant, saved as its own PDF.
plt.figure(figsize=(3, 2))

# Build the legend from the values of the handle mapping returned by the sMAPE plot call above.
# The `legend` variable itself is not used afterwards.
legend = plt.legend(
    handles=legend_handles.values(),
    loc="upper center",
    frameon=True,
    ncol=1,
    framealpha=1.0,
    fontsize=16,
)

plt.xticks([])
plt.yticks([])
plt.tight_layout(pad=0)
plt.savefig(f"{figs_save_dir}/baselines_legend_vertical.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Single-panel MAE figure with an envelope, saved as its own PDF.
# NOTE: the call is the cell's last expression and its result is not assigned, so the
# notebook echoes the returned value as cell output.
metric_to_plot = "mae"
plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"frameon": True, "fontsize": 10, "loc": "lower right"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
# Single-panel MSE figure, saved as its own PDF. Unlike the other single-metric cells, no
# envelope or legend options are passed (both are commented out below).
# NOTE: the call is the cell's last expression and its result is not assigned, so the
# notebook echoes the returned value as cell output.
metric_to_plot = "mse"
plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    # metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    # legend_kwargs={"frameon": True, "fontsize": 12, "loc": "upper left"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
)

In [None]:
# Single-panel Spearman figure with an envelope, saved as its own PDF.
# use_inv_spearman=True is passed here, as in the four-panel figure.
# NOTE: the call is the cell's last expression and its result is not assigned, so the
# notebook echoes the returned value as cell output.
metric_to_plot = "spearman"
plot_all_metrics_by_prediction_length(
    all_metrics_dict,
    [metric_to_plot],
    metrics_to_show_envelope=[metric_to_plot],
    n_cols=1,
    n_rows=1,
    individual_figsize=(4, 4.5),
    save_path=f"{figs_save_dir}/zeroshot_{metric_to_plot}_autoregressive_rollout_metrics.pdf",
    show_legend=False,
    legend_kwargs={"frameon": True, "fontsize": 10, "loc": "lower right"},
    colors=default_colors,
    percentile_range=(40, 60),
    has_nans=has_nans_dict,
    use_inv_spearman=True,
)