In [1]:
import os
import polars as pl
import plotly.express as px
from datetime import datetime, timezone


# Root folders come from the environment; an unset variable yields None.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.getenv("PICARRO_DATA_DIRECTORY")

# Sensor ids 1 through 20 (inclusive).
sensor_id = list(range(1, 21))

In [2]:
# Acropolis 1-minute records; keep only rows whose "Flag" column equals 'O'.
df_1_min = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "pipeline", "flagged_1_min_acropolis.parquet")
).filter(pl.col("Flag") == 'O')

# DWD Picarro file, restricted to timestamps strictly before 2024-05-07 00:00 UTC.
dwd_cutoff = datetime(2024, 5, 7, tzinfo=timezone.utc)
df_p_dwd = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_min_DWD_Picarro_G2301_413.parquet")
).filter(pl.col("creation_timestamp") < dwd_cutoff)

# ICOS Picarro file is read without any time restriction.
df_p_icos = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet")
)

# Stack both Picarro frames (diagonal concat) and tag every row with a
# constant system name and a zero "diff".
# NOTE(review): assumes the DWD and ICOS files do not share timestamps;
# overlapping rows would be duplicated by a later join on creation_timestamp.
df_p_1_min = pl.concat([df_p_dwd, df_p_icos], how="diagonal").with_columns(
    pl.lit("Picarro").alias("sys_name_short"),
    pl.lit(0.0).alias("diff"),
)

In [3]:
# Preview the first five filtered 1-minute rows.
df_1_min.head(n=5)

creation_timestamp,system_id,sys_name_short,co2,sensor_temperature,h2o,ws,wd,OriginalFlag,Flag
"datetime[μs, UTC]",i64,str,f32,f64,f64,f64,f64,i32,str
2024-01-02 12:36:00 UTC,1,"""acropolis-1""",426.914825,30.4,0.750941,6.6,221.0,0,"""O"""
2024-01-02 12:37:00 UTC,1,"""acropolis-1""",429.886505,30.5,0.756998,6.6,221.0,0,"""O"""
2024-01-02 12:38:00 UTC,1,"""acropolis-1""",430.034119,30.5,0.756623,10.8,232.0,0,"""O"""
2024-01-02 12:39:00 UTC,1,"""acropolis-1""",429.856812,30.533333,0.761341,10.8,232.0,0,"""O"""
2024-01-02 12:40:00 UTC,1,"""acropolis-1""",428.979248,30.6,0.765358,7.3,244.0,0,"""O"""


In [4]:
# Hourly side-by-side comparison:
#   1. left-join the Picarro reference value onto every sensor row by timestamp,
#   2. compute diff = co2 - picarro_corrected per row,
#   3. average every remaining column in 1 h windows per system, and add the
#      standard deviation of co2 within each window as "std".
# Non-numeric columns are averaged too: in the displayed head of df_1_h the
# string column "Flag" comes out null after .mean().
df_1_h = (
    df_1_min
    .join(
        df_p_1_min.select("creation_timestamp", "picarro_corrected"),
        on="creation_timestamp",
        how="left",
    )
    .with_columns((pl.col("co2") - pl.col("picarro_corrected")).alias("diff"))
    # group_by_dynamic needs the index column sorted.
    .sort("creation_timestamp")
    # NOTE(review): newer polars releases name this keyword `group_by` instead
    # of `by` - confirm against the installed version before renaming.
    .group_by_dynamic("creation_timestamp", every='1h', by=["system_id", "sys_name_short"])
    .agg(
        pl.exclude(["creation_timestamp", "sys_name_short"]).mean(),
        pl.col("co2").std().alias("std"),
    )
)

In [5]:
# Preview the first five hourly aggregates.
df_1_h.head(n=5)

system_id,sys_name_short,creation_timestamp,co2,sensor_temperature,h2o,ws,wd,OriginalFlag,Flag,picarro_corrected,diff,std
i64,str,"datetime[μs, UTC]",f32,f64,f64,f64,f64,f64,str,f64,f64,f32
1,"""acropolis-1""",2024-01-02 12:00:00 UTC,428.685425,30.804861,0.813515,11.170833,236.333333,0.0,,428.527514,0.157952,0.861973
1,"""acropolis-1""",2024-01-02 13:00:00 UTC,428.981689,30.566667,0.976711,10.869492,233.559322,0.0,,428.026503,0.955189,0.801614
1,"""acropolis-1""",2024-01-02 14:00:00 UTC,429.049683,30.211273,1.067129,10.856364,233.636364,0.0,,428.317156,0.732495,0.7999
1,"""acropolis-1""",2024-01-02 15:00:00 UTC,428.835785,30.05297,1.166373,8.978182,233.436364,0.0,,427.893969,0.941812,0.727168
1,"""acropolis-1""",2024-01-02 16:00:00 UTC,428.400146,29.928531,1.275964,9.39661,233.050847,0.0,,428.649326,-0.249158,0.684205


In [6]:
# Persist the hourly comparison table. The target folder is created first so
# the cell also succeeds on a fresh data directory where "side-by-side" does
# not exist yet (write_parquet does not create parent folders).
side_by_side_dir = os.path.join(DATA_DIRECTORY, "processed", "side-by-side")
os.makedirs(side_by_side_dir, exist_ok=True)
df_1_h.write_parquet(os.path.join(side_by_side_dir, "1_h_diff_with_picarro.parquet"))

In [7]:
# Scatter of the hourly mean difference (co2 - picarro_corrected) for a single
# system. The system id is named once so the filter and the title stay in sync.
plotted_system_id = 2
fig = px.scatter(
    df_1_h.filter(pl.col("system_id") == plotted_system_id),
    x="creation_timestamp",
    y="diff",
    title=f"Hourly mean co2 - picarro_corrected (system_id {plotted_system_id})",
    labels={
        "creation_timestamp": "creation_timestamp (1 h bins)",
        "diff": "mean(co2 - picarro_corrected)",
    },
)
# Fixed y-range; points outside [-20, 20] are off-screen until the view is rescaled.
fig.update_layout(yaxis_range=[-20, 20])
fig.show()