In [1]:
from datetime import datetime
import plotly.express as px
import polars as pl
import numpy as np
import os
import glob

# Input locations. The defaults are the original workstation paths; set the
# environment variables to run the notebook elsewhere without editing this cell.
DATA_DIR = os.environ.get(
    "ACROPOLIS_DATA_DIR",
    "/Users/patrickaigner/Documents/PROJECTS/ACROPOLIS-edge/ThingsBoard-Downloader/data",
)
# NOTE: despite the name, this is the path of a single parquet file (it is
# passed straight to `pl.read_parquet`), not a directory.
PICARRO_DIR = os.environ.get(
    "ACROPOLIS_PICARRO_PATH",
    "/Users/patrickaigner/Documents/PROJECTS/acropolis-visualisation/data/processed/picarro/Calibrated_1_h_ICOS_Picarro_G2401_529.parquet",
)

# Analysis window; both data sources are restricted to it with `is_between`.
start_date = datetime(2025, 1, 26, 3, 0, 0)
end_date = datetime(2025, 1, 30, 8, 0, 0)

# Aggregation interval, passed as `every=` to `group_by_dynamic`.
# NOTE: this shadows the builtin `filter`; the name is kept because the
# aggregation cells read it.
filter = '1h'

# Systems to analyse. Use `list(range(1, 21))` to select systems 1-20 instead.
sensor_id = [3, 6]

In [2]:
# Every parquet file in the 2025 sub-folder of DATA_DIR.
data_paths = glob.glob(os.path.join(DATA_DIR, "2025", "*.parquet"))

# One frame per file, tagged with the file name up to its first dot.
dfs = [
    pl.read_parquet(path).with_columns(
        system_name=pl.lit(os.path.basename(path).split(".")[0])
    )
    for path in data_paths
]

# Stack all files, keep the analysis window, derive a numeric id from the
# first run of digits in `system_name`, then keep only the selected systems.
df = (
    pl.concat(dfs, how="diagonal")
    .filter(pl.col("datetime").is_between(start_date, end_date))
    .with_columns(
        pl.col("system_name").str.extract(r'(\d+)', 1).str.to_integer().alias("system_id")
    )
    .filter(pl.col("system_id").is_in(sensor_id))
)

In [3]:
# Read the Picarro parquet file at PICARRO_DIR unchanged; renaming, time-window
# filtering and tagging are done in the following cell.
df_p_icos = pl.read_parquet(PICARRO_DIR)

In [4]:
# Bring the Picarro frame into the same layout as the sensor data:
# `datetime` + `CO2`, restricted to the analysis window, tagged with a fixed
# system name and id.
# NOTE: this rebinds `df_p_icos`, so the result no longer has a
# `creation_timestamp` column; re-run the load cell before re-running this one.
df_p_icos = (
    df_p_icos
    .rename({"creation_timestamp": "datetime", "picarro_corrected": "CO2"})
    .select(["datetime", "CO2"])
    # millisecond precision, presumably to match the sensor frame's
    # `datetime` dtype for the later concat/join (TODO confirm)
    .with_columns(pl.col("datetime").cast(pl.Datetime("ms")))
    .filter(pl.col("datetime").is_between(start_date, end_date))
    .with_columns(
        system_name=pl.lit("Picarro_ICOS"),
        system_id=pl.lit(529).cast(pl.Int64),
    )
)

In [5]:
column_name = "CO2"

# Per-system means over `filter`-sized windows of the edge-corrected signal
# (renamed to "CO2"), with the Picarro rows stacked underneath so both appear
# in one figure.
df_plot = pl.concat(
    [
        (
            df.rename({"gmp343_edge_corrected": column_name})
            .select(["datetime", "system_id", column_name])
            .group_by_dynamic("datetime", every=filter, group_by="system_id")
            .agg(pl.col(column_name).mean())
        ),
        df_p_icos,
    ],
    how="diagonal",
)

# One line per system_id (the Picarro rows carry id 529).
fig = px.line(df_plot, x="datetime", y=column_name, color="system_id", markers=True)
fig.show()

In [6]:
column_name = "gmp343_edge_corrected"

# Per-system means of the edge-corrected signal over `filter`-sized windows.
df_plot = (
    df.select(["datetime", "system_id", column_name])
    .group_by_dynamic("datetime", every=filter, group_by="system_id")
    .agg(pl.col(column_name).mean())
)

# Pair every sensor window with the Picarro row that has the same timestamp
# (inner join: unmatched timestamps are dropped) and take sensor - Picarro.
df_plot = df_plot.join(df_p_icos, on="datetime", how="inner").with_columns(
    diff=pl.col(column_name) - pl.col("CO2")
)

# Error statistics of system 3 against the Picarro: mean difference (bias),
# MAE, MSE and RMSE, returned as a single summary row. Aggregating directly in
# `select` with native expression methods replaces the NumPy ufuncs and the
# with_columns/head(1) round trip; `diff * diff` is the same computation as
# squaring.
df_plot.filter(pl.col("system_id") == 3).select(
    mean_full_deployment=pl.col("diff").mean(),
    mae_full_deployment=pl.col("diff").abs().mean(),
    mse_full_deployment=(pl.col("diff") * pl.col("diff")).mean(),
    rmse_full_deployment=(pl.col("diff") * pl.col("diff")).mean().sqrt(),
)

mean_full_deployment,mae_full_deployment,mse_full_deployment,rmse_full_deployment
f64,f64,f64,f64
0.733505,0.734794,0.683832,0.826941


In [7]:
# Sensor-minus-Picarro difference over time, one line per system, with the
# y-axis fixed to [-3, 3].
fig = px.line(
    df_plot, x="datetime", y="diff", color="system_id", markers=True
).update_layout(yaxis={"range": [-3, 3]})
fig.show()