In [None]:
import os
import sys
import numpy as np
import polars as pl
import plotly.express as px
import polars.selectors as cs
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


# Project root is two directory levels above the current working directory;
# the pipeline and data folders are resolved relative to it.
PROJECT_PATH = os.path.abspath(os.path.join(os.pardir, os.pardir))
PIPELINE_PATH, DATA_DIRECTORY = (
    os.path.join(PROJECT_PATH, folder) for folder in ("pipeline", "data")
)

# Put the pipeline folder on the import path; the guard keeps cell re-runs
# from appending the same entry again.
if sys.path.count(PIPELINE_PATH) == 0:
    sys.path.append(PIPELINE_PATH)

In [None]:
# Read the side-by-side parquet file, add the GMP343-minus-Picarro difference
# columns, then aggregate into 1-hour windows per system.
sbs_file = os.path.join(
    DATA_DIRECTORY, "output", "side-by-side", "2024_L1_1_min_sbs_period_acropolis.parquet"
)
picarro = pl.col("picarro_corrected")
gmp343 = pl.col("gmp343_corrected")

df = (
    pl.read_parquet(sbs_file)
    .sort("datetime")
    .with_columns(
        (gmp343 - picarro).alias("diff"),
        (pl.col("gmp343_corrected_one_point_low") - picarro).alias("diff_1p_low"),
        (pl.col("gmp343_corrected_one_point_high") - picarro).alias("diff_1p_high"),
        (pl.col("offset_low") - pl.col("offset_high")).alias("diff_offset"),
    )
    .group_by_dynamic("datetime", every="1h", group_by=["system_id", "system_name"])
    .agg(
        # Every numeric column becomes its window mean ...
        cs.numeric().mean(),
        # ... plus a few extra statistics under explicit names.
        gmp343_corrected_std=gmp343.std(),
        gmp343_corrected_var=gmp343.var(),
        gmp343_corrected_count=gmp343.count(),
        gmp343_temperature_max=pl.col("gmp343_temperature").max(),
    )
    .sort("system_id")
    # Overwrite the display name with one derived from the numeric id.
    .with_columns(("ACROPOLIS-" + pl.col("system_id").cast(pl.String)).alias("system_name"))
)

In [None]:
# Scatter of the gmp343_corrected_count column over time, one colour per system.
fig = px.scatter(
    data_frame=df,
    x="datetime",
    y="gmp343_corrected_count",
    color="system_name",
)
fig.show()

In [None]:
def _error_metrics(column: str, suffix: str = ""):
    """Build the MEAN / MAE / RMSE aggregation expressions for one difference column.

    Args:
        column: Name of the column holding the differences.
        suffix: Text appended to the output names ("MEAN", "MAE", "RMSE").

    Returns:
        A list of three polars expressions, in the order MEAN, MAE, RMSE.
    """
    err = pl.col(column)
    return [
        err.mean().alias(f"MEAN{suffix}"),
        # Native Expr.abs() instead of calling the NumPy ufunc on the expression.
        err.abs().mean().alias(f"MAE{suffix}"),
        err.pow(2).mean().sqrt().alias(f"RMSE{suffix}"),
    ]


# Per-system summary statistics, computed on rows whose `diff` is not NaN.
df_sbs = (
    df.filter(pl.col("diff").is_not_nan())
    .group_by(["system_id", "system_name"])
    .agg(
        *_error_metrics("diff"),
        *_error_metrics("diff_1p_low", "_1p_low"),
        *_error_metrics("diff_1p_high", "_1p_high"),
        # NOTE(review): `df` was built with cs.numeric().mean(), so this column
        # presumably holds hourly means here and this is the max of those means;
        # use "gmp343_temperature_max" if the true maximum is wanted — confirm.
        pl.col("gmp343_temperature").max(),
        pl.col("slope_interpolated").mean().alias("slope_interpolated_mean"),
        pl.col("slope_interpolated").std().alias("slope_interpolated_std"),
        pl.col("gmp343_corrected").mean().alias("gmp343_corrected_mean"),
        pl.col("gmp343_corrected").median().alias("gmp343_corrected_median"),
        pl.col("gmp343_corrected").std().alias("gmp343_corrected_std"),
    )
    .sort("system_id")
)

In [None]:
# Time-series plots for one system; change `system_id` to inspect another one.
system_id = 4
df_plot = df.filter(pl.col("system_id") == system_id)

# One figure per entry: the first overlays four series, the rest are single columns.
panels = [
    [
        "picarro_corrected",
        "gmp343_corrected",
        "gmp343_corrected_one_point_low",
        "gmp343_corrected_one_point_high",
    ],
    "offset_high",
    "offset_low",
    "bottle_median_low",
    "bottle_median_high",
]
for y_columns in panels:
    fig = px.scatter(df_plot, x="datetime", y=y_columns)
    fig.show()

In [None]:
# One figure per column across all systems, coloured by system name.
for column in ("gmp343_temperature_max", "diff", "diff_1p_low", "diff_1p_high"):
    fig = px.scatter(df, x="datetime", y=column, color="system_name")
    fig.show()

# NOTE(review): this one reads `df_plot` (the single-system subset) rather than
# `df`, so it depends on the cell that defines `df_plot` — confirm that is intended.
fig = px.scatter(df_plot, x="datetime", y="diff_offset")
fig.show()

In [None]:
# Per-system comparison of each error metric with its two "_1p_" variants:
# one figure for MAE, then one for RMSE.
for metric in ("MAE", "RMSE"):
    fig = px.scatter(
        df_sbs,
        x="system_name",
        y=[metric, f"{metric}_1p_low", f"{metric}_1p_high"],
        labels={
            "value": f"{metric} Picarro - System: CO₂ (ppm)",
            "system_name": "System",
        },
    )
    fig.show()

In [None]:
# For each metric, add columns holding the "_1p_" variant minus the base
# metric (named diff_<metric>_1p_<low|high>).
df_sbs = df_sbs.with_columns(
    [
        (pl.col(f"{metric}_1p_{point}") - pl.col(metric)).alias(f"diff_{metric}_1p_{point}")
        for metric in ("MAE", "RMSE")
        for point in ("low", "high")
    ]
)

df_sbs

In [None]:
# diff_MAE_1p_* columns plotted against the mean interpolated slope per system.
# Labels are attached to the scatter artists themselves, so the legend does not
# depend on the order in which artists were added to the axes.
plt.scatter(
    df_sbs["slope_interpolated_mean"], df_sbs["diff_MAE_1p_low"],
    color="#0072B2", alpha=0.7, label="1 Point (400 ppm)",
)
plt.scatter(
    df_sbs["slope_interpolated_mean"], df_sbs["diff_MAE_1p_high"],
    color="#D55E00", alpha=0.7, label="1 Point (520 ppm)",
)
# Dashed guide lines at y = 0 and x = 1; they carry no label, so legend() skips them.
plt.plot([0.96, 1.04], [0, 0], color='black', linestyle='--', linewidth=0.7)
plt.plot([1, 1], [2.5, -0.8], color='black', linestyle='--', linewidth=0.7)
plt.grid(True, linestyle='--', alpha=0.5)
plt.xlabel("Slope: 2 Point Calibration Correction")
plt.ylabel("MAE: 1P. - 2P. Calibration Correction (ppm)")
plt.legend()
# Render explicitly so the cell output is the figure, not the Legend repr.
plt.show()

In [None]:
# diff_RMSE_1p_* columns plotted against the mean interpolated slope per system.
# Labels are attached to the scatter artists themselves, so the legend does not
# depend on the order in which artists were added to the axes.
plt.scatter(
    df_sbs["slope_interpolated_mean"], df_sbs["diff_RMSE_1p_low"],
    color="#0072B2", alpha=0.7, label="1 Point (400 ppm)",
)
plt.scatter(
    df_sbs["slope_interpolated_mean"], df_sbs["diff_RMSE_1p_high"],
    color="#D55E00", alpha=0.7, label="1 Point (520 ppm)",
)
# Dashed guide lines at y = 0 and x = 1; they carry no label, so legend() skips them.
plt.plot([0.96, 1.04], [0, 0], color='black', linestyle='--', linewidth=0.7)
plt.plot([1, 1], [2.5, -0.8], color='black', linestyle='--', linewidth=0.7)
plt.grid(True, linestyle='--', alpha=0.5)
plt.xlabel("Slope: 2 Point Calibration Correction")
plt.ylabel("RMSE: 1P. - 2P. Calibration Correction (ppm)")
plt.legend()
# Render explicitly so the cell output is the figure, not the Legend repr.
plt.show()

In [None]:
# Same RMSE-difference view as a scatter, with horizontal error bars taken from
# the per-system standard deviation of the interpolated slope.
x = df_sbs["slope_interpolated_mean"]
xerr = df_sbs["slope_interpolated_std"]
y_low = df_sbs["diff_RMSE_1p_low"]
y_high = df_sbs["diff_RMSE_1p_high"]

# Marker / error-bar styling shared by both series.
marker_style = dict(fmt='o', alpha=0.7, elinewidth=0.8, capsize=2, capthick=0.6)
for values, colour, name in (
    (y_low, "#0072B2", "1 Point (400 ppm)"),
    (y_high, "#D55E00", "1 Point (520 ppm)"),
):
    plt.errorbar(x, values, xerr=xerr, color=colour, ecolor=colour, label=name, **marker_style)

# Dashed guide lines at y = 0 and x = 1.
guide_style = dict(color='black', linestyle='--', linewidth=0.7)
plt.plot([0.96, 1.04], [0, 0], **guide_style)
plt.plot([1, 1], [2.5, -0.8], **guide_style)

# Axes cosmetics and legend.
plt.grid(True, linestyle='--', alpha=0.5)
plt.xlabel("Slope: 2 Point Calibration Correction")
plt.ylabel("RMSE: 1P. - 2P. Calibration Correction (ppm)")
plt.legend()

In [None]:
# Per-system mean and median of gmp343_corrected, one figure each.
for statistic in ("mean", "median"):
    fig = px.scatter(df_sbs, x="system_id", y=f"gmp343_corrected_{statistic}")
    fig.show()