In [15]:
import os
import polars as pl
import plotly.express as px
from datetime import datetime, timezone


# Root folders are taken from the environment; each value is None when the
# corresponding variable is not set.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
# Not referenced by any of the cells visible in this part of the notebook.
PICARRO_DATA_DIRECTORY = os.getenv("PICARRO_DATA_DIRECTORY")

# Consecutive ids 1 through 20.
sensor_id = list(range(1, 21))

In [16]:
# Sub-folders of DATA_DIRECTORY that the loads below read from.
pipeline_dir = os.path.join(DATA_DIRECTORY, "processed", "pipeline")
picarro_dir = os.path.join(DATA_DIRECTORY, "processed", "picarro")

# 1-minute records from the flagged pipeline file; only rows whose Flag
# column equals 'O' are kept.
df_1_min = (
    pl.read_parquet(os.path.join(pipeline_dir, "flagged_1_min_acropolis.parquet"))
    .filter(pl.col("Flag") == "O")
)

# DWD Picarro file, restricted to timestamps strictly before
# 2024-05-07 00:00 UTC.
# NOTE(review): presumably the point where the ICOS file takes over - confirm.
dwd_end = datetime(2024, 5, 7, tzinfo=timezone.utc)
df_p_dwd = (
    pl.read_parquet(os.path.join(picarro_dir, "Calibrated_1_min_DWD_Picarro_G2301_413.parquet"))
    .filter(pl.col("creation_timestamp") < dwd_end)
)

# ICOS Picarro file, loaded without any time filter.
df_p_icos = pl.read_parquet(
    os.path.join(picarro_dir, "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet")
)

# Stack both Picarro frames ("diagonal" takes the union of the two schemas and
# fills columns missing on one side with nulls), then add two constant columns.
df_p_1_min = pl.concat([df_p_dwd, df_p_icos], how="diagonal").with_columns(
    pl.lit("Picarro").alias("sys_name_short"),
    pl.lit(0.0).alias("diff"),
)

In [17]:
# Preview the first five rows of the filtered 1-minute frame.
df_1_min.head(5)

creation_timestamp,system_id,sys_name_short,co2,h2o,ws,wd,OriginalFlag,Flag
"datetime[μs, UTC]",i64,str,f32,f64,f64,f64,i32,str
2024-01-02 12:36:00 UTC,1,"""acropolis-1""",426.914825,0.750941,6.6,221.0,0,"""O"""
2024-01-02 12:37:00 UTC,1,"""acropolis-1""",429.886505,0.756998,6.6,221.0,0,"""O"""
2024-01-02 12:38:00 UTC,1,"""acropolis-1""",430.034119,0.756623,10.8,232.0,0,"""O"""
2024-01-02 12:39:00 UTC,1,"""acropolis-1""",429.856812,0.761341,10.8,232.0,0,"""O"""
2024-01-02 12:40:00 UTC,1,"""acropolis-1""",428.979248,0.765358,7.3,244.0,0,"""O"""


In [18]:
# Only the join key and the reference value are needed from the Picarro frame.
picarro_ref = df_p_1_min.select("creation_timestamp", "picarro_corrected")

# Left join keeps every row of df_1_min; rows without a matching timestamp get
# a null picarro_corrected and therefore a null diff.
# NOTE(review): the join key is the timestamp alone - if the concatenated
# Picarro frame ever holds the same minute twice, rows are duplicated here.
# Confirm the two source files do not overlap in time.
df_1_min_vs_picarro = (
    df_1_min
    .join(picarro_ref, on="creation_timestamp", how="left")
    # diff = co2 minus picarro_corrected, computed per 1-minute row
    .with_columns((pl.col("co2") - pl.col("picarro_corrected")).alias("diff"))
    # group_by_dynamic below expects the time column to be sorted
    .sort("creation_timestamp")
)

# Hourly windows per (system_id, sys_name_short): mean of every remaining
# column plus the standard deviation of co2 as "std".
# The mean is also applied to the string column Flag, which comes out as null
# in the displayed result.
# NOTE(review): wd is averaged arithmetically; if it is a direction in degrees
# this is wrong across the 0/360 wrap-around - confirm whether that matters.
df_1_h = (
    df_1_min_vs_picarro
    .group_by_dynamic(
        "creation_timestamp",
        every="1h",
        by=["system_id", "sys_name_short"],
    )
    .agg(
        pl.all().exclude(["creation_timestamp", "sys_name_short"]).mean(),
        pl.col("co2").std().alias("std"),
    )
)

In [19]:
# Preview the first five rows of the hourly aggregate.
df_1_h.head(5)

system_id,sys_name_short,creation_timestamp,co2,h2o,ws,wd,OriginalFlag,Flag,picarro_corrected,diff,std
i64,str,"datetime[μs, UTC]",f32,f64,f64,f64,f64,str,f64,f64,f32
1,"""acropolis-1""",2024-01-02 12:00:00 UTC,428.685425,0.813515,11.170833,236.333333,0.0,,428.527514,0.157952,0.861973
1,"""acropolis-1""",2024-01-02 13:00:00 UTC,429.014984,0.977683,10.825,233.616667,0.0,,428.018959,0.996026,0.835584
1,"""acropolis-1""",2024-01-02 14:00:00 UTC,428.934998,1.068206,10.853333,234.566667,0.0,,428.275579,0.659386,1.00751
1,"""acropolis-1""",2024-01-02 15:00:00 UTC,428.74649,1.168354,8.905085,233.355932,0.0,,427.9045,0.841957,0.926748
1,"""acropolis-1""",2024-01-02 16:00:00 UTC,428.376892,1.275909,9.388333,233.183333,0.0,,428.635911,-0.258985,0.701865


In [20]:
# Persist the hourly aggregate for later use.
# The target folder is created first so the write does not fail on a fresh
# data directory where "processed/side-by-side" does not exist yet;
# exist_ok=True makes re-running this cell safe.
side_by_side_dir = os.path.join(DATA_DIRECTORY, "processed", "side-by-side")
os.makedirs(side_by_side_dir, exist_ok=True)
df_1_h.write_parquet(os.path.join(side_by_side_dir, "1_h_diff_with_picarro.parquet"))

In [21]:
# Scatter of the hourly diff (co2 minus picarro_corrected) over time for the
# rows with system_id == 2.
system_2_hourly = df_1_h.filter(pl.col("system_id") == 2)
fig = px.scatter(system_2_hourly, x="creation_timestamp", y="diff")
# Fix the visible y-range to [-20, 20]; points outside it are off-screen.
fig.update_layout(yaxis=dict(range=[-20, 20]))
fig.show()