## Plot computed GP Dims

The GP dimensions plotted below are computed by our script in `scripts/analysis/compute_gpdims.py`.

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression

from panda.utils.plot_utils import apply_custom_style

In [None]:
# Output directory for figures (path is relative to the current working
# directory); it is created here if it does not exist yet.
fig_save_dir = os.path.join("../../figures", "eval_metrics_mlm")
os.makedirs(fig_save_dir, exist_ok=True)

# Project helper called with the repo's plotting config file.
# NOTE(review): presumably sets matplotlib defaults from the YAML -- confirm in
# panda.utils.plot_utils.
apply_custom_style("../../config/plotting.yaml")

In [None]:
# Evaluation split being analysed. It is also the leaf directory of the results
# path, so it is defined once here and reused below instead of being repeated
# as a second string literal that could drift out of sync.
data_split = "test_zeroshot"

# Workspace root, read from the WORK environment variable. When WORK is unset
# this is "" and every path below is relative to the current directory.
WORK_DIR = os.getenv("WORK", "")
DATA_DIR = os.path.join(WORK_DIR, "data")

# Alternative checkpoint, kept for quick switching:
# eval_results_dir = os.path.join(WORK_DIR, "eval_results_mlm", "panda", "panda_mlm-66M", data_split)
eval_results_dir = os.path.join(WORK_DIR, "eval_results_mlm", "panda_mlm", "panda_mlm-21M", data_split)

In [None]:
# Maps a display name to the directory holding that model's evaluation outputs.
# Only the uncommented entries are picked up by the loading cells below; the
# commented ones are alternatives that can be toggled back on.
results_dict_paths = {
    # "Panda MLM": eval_results_dir,
    # "Polynomial Degree 3": os.path.join(eval_results_dir, "polynomial3"),
    "Linear": os.path.join(eval_results_dir, "linear"),
}

In [None]:
# For every model, gather the full paths of the gpdim JSON files found directly
# inside its results directory.
gpdims_filepaths = {}
for model_name, results_dict_path in results_dict_paths.items():
    print(f"Loading gpdim files for {model_name} from {results_dict_path}")
    # os.listdir returns entries in arbitrary order; sort so the load order and
    # the printed listing are the same on every run and every machine.
    gpdims_fnames = sorted(f for f in os.listdir(results_dict_path) if f.endswith(".json") and "gpdim" in f)
    print(f"Found {len(gpdims_fnames)} gpdim files for {model_name}: {gpdims_fnames}")
    gpdims_filepaths[model_name] = [os.path.join(results_dict_path, f) for f in gpdims_fnames]

In [None]:
# System name -> list of GP-dimension values, one entry per loaded file.
# The dicts are keyed by system name only, so values from every model listed in
# gpdims_filepaths are pooled into the same lists.
gpdims_completions_all_runs = {}
gpdims_groundtruth_all_runs = {}

for model_name, gpdims_fnames in gpdims_filepaths.items():
    print(f"Processing {model_name} with {len(gpdims_fnames)} files")
    for gpdims_fname in gpdims_fnames:
        # load gpdims
        with open(gpdims_fname, "r", encoding="utf-8") as f:
            gp_dims = json.load(f)
        print(f"number of systems in {gpdims_fname}: {len(gp_dims)}")

        # Preview whichever system actually comes first in this file. A
        # hard-coded system name raises KeyError for any file that lacks it.
        if gp_dims:
            first_sys_name = next(iter(gp_dims))
            first_entry = gp_dims[first_sys_name]
            print(f"gpdim of completions of first system in {gpdims_fname}: {first_entry['completions']}")
            print(f"gpdim of groundtruth of first system in {gpdims_fname}: {first_entry['groundtruth']}")

        # Append this file's values to the per-system lists, creating the list
        # the first time a system is seen.
        for sys_name, gp_dim_val in gp_dims.items():
            gpdims_completions_all_runs.setdefault(sys_name, []).append(gp_dim_val["completions"])
            gpdims_groundtruth_all_runs.setdefault(sys_name, []).append(gp_dim_val["groundtruth"])

In [None]:
# Number of distinct systems collected (displayed as the cell's output).
len(gpdims_completions_all_runs)

In [None]:
# Sanity check: pick the first collected system and look up its per-run values.
test_system_name = next(iter(gpdims_completions_all_runs))
test_gpdim_vals = gpdims_completions_all_runs[test_system_name]

# Show the system's name, then how many runs contributed a value for it.
print(test_system_name)
print(len(test_gpdim_vals))

In [None]:
# Collapse each system's per-run values into their mean, overwriting the dict
# entries in place (keys are unchanged, so iterating over items() is safe).
for sys_name, completion_runs in gpdims_completions_all_runs.items():
    gpdims_completions_all_runs[sys_name] = np.mean(completion_runs)

for sys_name, groundtruth_runs in gpdims_groundtruth_all_runs.items():
    gpdims_groundtruth_all_runs[sys_name] = np.mean(groundtruth_runs)

# Flatten to lists in dict order; both dicts were filled together, so index i
# refers to the same system in each list.
groundtruth_gp_dims = [*gpdims_groundtruth_all_runs.values()]
completions_gp_dims = [*gpdims_completions_all_runs.values()]

print(len(groundtruth_gp_dims))

In [None]:
# Convert to numpy arrays (x: ground truth, y: completions; one entry per system)
x = np.array(groundtruth_gp_dims)
y = np.array(completions_gp_dims)

# Remove outliers using z-score: a system is dropped when either of its two
# coordinates lies more than 3 standard deviations from that column's mean.
z_scores = np.abs(stats.zscore(np.column_stack([x, y]), axis=0))
outliers = np.any(z_scores > 3, axis=1)
x_clean = x[~outliers]
y_clean = y[~outliers]
print(f"Removed {np.sum(outliers)} outliers")

# Fit linear regression of completions on ground truth
model = LinearRegression(fit_intercept=True)
model.fit(x_clean.reshape(-1, 1), y_clean)
slope, intercept = model.coef_[0], model.intercept_
r_squared = model.score(x_clean.reshape(-1, 1), y_clean)

# Plot
plt.figure(figsize=(4, 4))
plt.scatter(x_clean, y_clean, color="black", s=5, alpha=0.1)

# Regression line, drawn over the observed ground-truth range
line_x = np.linspace(x_clean.min(), x_clean.max(), 100)
plt.plot(
    line_x,
    slope * line_x + intercept,
    "r-",
    alpha=0.9,
    zorder=10,
    label=f"y = {slope:.2f}x + {intercept:.2f} (R² = {r_squared:.2f})",
)

# Identity line; the same bounds are used for both axes so y = x is the diagonal
bounds = [min(x_clean.min(), y_clean.min()), max(x_clean.max(), y_clean.max())]
plt.plot(bounds, bounds, "r--", alpha=0.9, zorder=9, label="y = x")

plt.xlim(bounds)
plt.ylim(bounds)
plt.xlabel("Ground Truth", fontweight="bold")
plt.ylabel("Completions", fontweight="bold")
# NOTE(review): the title is hard-coded to "MLM-66M" while eval_results_dir
# points at the panda_mlm-21M results -- confirm which label is intended.
plt.title("Correlation Dimension (Panda MLM-66M)", fontweight="bold", fontsize=10)
plt.legend(loc="lower right", frameon=True)
plt.tight_layout()

# Save into the notebook's configured figure directory instead of an ad-hoc
# "../figures" folder next to the notebook.
os.makedirs(fig_save_dir, exist_ok=True)
plt.savefig(os.path.join(fig_save_dir, "gpdims.pdf"), bbox_inches="tight")
plt.show()

In [None]:
import numpy as np
from scipy import stats
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Agreement metrics between ground truth (x_clean) and completions (y_clean).
# Requires x_clean and y_clean from the outlier-removal cell above.

# 1. Pearson Correlation Coefficient (measures linear correlation)
pearson_r, pearson_p = stats.pearsonr(x_clean, y_clean)
print(f"Pearson r: {pearson_r:.3f} (p-value: {pearson_p:.3e})")

# 2. Spearman Correlation Coefficient (measures monotonic relationship, rank-based)
spearman_rho, spearman_p = stats.spearmanr(x_clean, y_clean)
print(f"Spearman ρ: {spearman_rho:.3f} (p-value: {spearman_p:.3e})")

# 3. Mean Absolute Error (average absolute difference)
mae = mean_absolute_error(x_clean, y_clean)
print(f"MAE: {mae:.3f}")

# 4. Root Mean Squared Error (penalizes larger errors more)
rmse = np.sqrt(mean_squared_error(x_clean, y_clean))
print(f"RMSE: {rmse:.3f}")

# 5. Mean Absolute Percentage Error (relative to the ground-truth value)
mape = np.mean(np.abs((x_clean - y_clean) / x_clean)) * 100
print(f"MAPE: {mape:.2f}%")

# 6. Concordance Correlation Coefficient (measures agreement, not just correlation)
# This is particularly good for assessing agreement between measurements.
# All second moments must use the same normalisation: np.var defaults to the
# population form (ddof=0) whereas np.cov defaults to the sample form (ddof=1),
# so ddof=0 is requested explicitly to keep numerator and denominator consistent.
mean_x = np.mean(x_clean)
mean_y = np.mean(y_clean)
var_x = np.var(x_clean)
var_y = np.var(y_clean)
covariance = np.cov(x_clean, y_clean, ddof=0)[0, 1]
ccc = (2 * covariance) / (var_x + var_y + (mean_x - mean_y) ** 2)
print(f"Concordance Correlation: {ccc:.3f}")

# 7. Kendall's Tau (another rank-based correlation)
kendall_tau, kendall_p = stats.kendalltau(x_clean, y_clean)
print(f"Kendall's τ: {kendall_tau:.3f} (p-value: {kendall_p:.3e})")