In [2]:
from datetime import datetime, timedelta, timezone
import polars as pl
import pandas as pd
import os
import glob
import plotly.express as px
import numpy as np

# --- Configuration -----------------------------------------------------------
# Analysis window; both bounds are timezone-aware UTC datetimes and are used
# with `is_between` on the "creation_timestamp" column.
start_date = datetime(2024, 11, 21, 15, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 12, 20, 23, 59, 59, tzinfo=timezone.utc)

# NOTE(review): this name shadows the builtin `filter` and is not referenced by
# any cell visible in this notebook; kept unchanged in case other code reads it.
filter = '10m'

# system_id values of the systems to keep when building `df_filtered`.
sensor_id = [2, 6]

# Root folder of the data files, taken from the environment.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if DATA_DIRECTORY is None:
    # Fail early with a readable message instead of the TypeError that
    # os.path.join would raise when handed None.
    raise RuntimeError(
        "Environment variable DATA_DIRECTORY is not set; "
        "it must point to the folder containing the 'processed' data directory."
    )

# --- Load --------------------------------------------------------------------
# System measurements: restricted lazily to the analysis window and to rows
# whose "Flag" column equals 'O', then materialised.
df = (
    pl.scan_parquet(
        os.path.join(DATA_DIRECTORY, "processed", "pipeline", "flagged_L2_1_min_acropolis.parquet")
    )
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .filter(pl.col("Flag") == 'O')
    .collect()
)

# Picarro measurements restricted to the same window.
df_p_1min = (
    pl.scan_parquet(
        os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet")
    )
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
)


In [3]:
# Display the hourly flagged file for inspection; the result is not assigned.
(
    pl.scan_parquet(
        os.path.join(
            DATA_DIRECTORY,
            "processed",
            "pipeline",
            "flagged_L2_1_h_acropolis.parquet",
        )
    )
    .collect()
)

system_id,sys_name_short,creation_timestamp,gmp343_corrected,gmp343_temperature,h2o_v%,bme280_pressure,enclosure_bme280_pressure,wxt532_speed_avg,wxt532_direction_avg,Stdev,NbPoints,Flag
i64,str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,u32,str
1,"""acropolis-1""",2024-01-02 12:30:00 UTC,428.685467,30.804861,0.813515,924.885278,943.747083,11.170833,236.333333,0.861973,24,"""K"""
1,"""acropolis-1""",2024-01-02 13:30:00 UTC,429.014985,30.563611,0.977683,924.944139,943.571,10.825,233.616667,0.835584,60,"""O"""
1,"""acropolis-1""",2024-01-02 14:30:00 UTC,428.934966,30.199222,1.068206,924.2424,943.065333,10.853333,234.566667,1.00751,60,"""O"""
1,"""acropolis-1""",2024-01-02 15:30:00 UTC,428.746457,30.053898,1.168354,924.225864,942.917288,8.905085,233.355932,0.926748,59,"""O"""
1,"""acropolis-1""",2024-01-02 16:30:00 UTC,428.376926,29.929444,1.275909,924.556278,942.878,9.388333,233.183333,0.701865,60,"""O"""
…,…,…,…,…,…,…,…,…,…,…,…,…
17,"""acropolis-17""",2024-12-11 06:30:00 UTC,456.006675,29.333889,1.103975,938.202739,961.186333,1.665,68.836667,1.358553,60,"""O"""
17,"""acropolis-17""",2024-12-11 07:30:00 UTC,458.365671,29.28373,1.106043,938.441123,961.590833,1.336667,77.975,1.622808,60,"""O"""
17,"""acropolis-17""",2024-12-11 08:30:00 UTC,462.264243,29.230029,1.113397,938.790287,962.001379,1.208621,88.851724,1.930526,58,"""O"""
17,"""acropolis-17""",2024-12-11 09:30:00 UTC,460.895891,29.203056,1.119273,939.598383,962.4425,1.065,94.698333,3.067914,60,"""O"""


In [4]:
# Show the first row of the loaded system data to check its columns and dtypes.
df.head(n=1)

creation_timestamp,system_id,sys_name_short,gmp343_corrected,gmp343_temperature,h2o_v%,bme280_pressure,enclosure_bme280_pressure,wxt532_speed_avg,wxt532_direction_avg,Flag
"datetime[μs, UTC]",i64,str,f64,f64,f64,f64,f64,f64,f64,str
2024-11-21 15:00:00 UTC,1,"""acropolis-1""",439.336761,30.3,0.549736,919.291667,933.12,1.2,132.1,"""O"""


In [5]:
# Show the first row of the loaded Picarro data to check its columns and dtypes.
df_p_1min.head(n=1)

creation_timestamp,picarro_corrected,h2o_reported,std
"datetime[μs, UTC]",f64,f64,f64
2024-11-21 15:00:00 UTC,439.894139,0.429035,0.598067


In [6]:
# Bring the Picarro frame into the same layout as the system frame:
# a constant "Picarro" label in sys_name_short and the CO2 column renamed to
# the shared name "co2_corrected".
df_p_filtered = (
    df_p_1min
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.lit("Picarro").alias("sys_name_short"))
    .rename({"picarro_corrected": "co2_corrected"})
    .select(["creation_timestamp", "sys_name_short", "co2_corrected"])
)

In [7]:
# Keep only the selected systems inside the analysis window and reduce the
# frame to the columns used downstream.
df_filtered = (
    df
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .filter(pl.col("system_id").is_in(sensor_id))
    # Drop every sample whose timestamp hour equals 1.
    # NOTE(review): the reason for excluding this hour is not stated here.
    .filter(pl.col("creation_timestamp").dt.hour() != 1)
    .rename({"gmp343_corrected": "co2_corrected"})
    .select(["creation_timestamp", "sys_name_short", "co2_corrected", "gmp343_temperature"])
)

In [8]:
# Join the Picarro CO2 onto each system sample (matched on timestamp) and
# compute the system-minus-Picarro difference.
# The join names the Picarro column "co2_corrected_right" because
# "co2_corrected" already exists on the left side.
df_filtered = (
    df_filtered
    # This cell overwrites its own input. Dropping the columns it adds makes it
    # safe to re-run: without this, a second run would try to add
    # "co2_corrected_right" to a frame that already has it.
    .select(pl.all().exclude(["co2_corrected_right", "diff"]))
    .join(
        df_p_filtered.select("creation_timestamp", "co2_corrected"),
        on=["creation_timestamp"],
        how="left",
    )
    .with_columns(diff=pl.col("co2_corrected") - pl.col("co2_corrected_right"))
)

  df_filtered = df_filtered.join(df_p_filtered.select("creation_timestamp", "co2_corrected"), on = ["creation_timestamp"], how= "left") \


In [9]:
# Stack system and Picarro rows (diagonal concat: columns missing on one side
# are filled with nulls) and aggregate to 1-hour windows per system.
# Every value column is averaged; "std" is the standard deviation of
# co2_corrected within each window.
df_plot = (
    pl.concat([df_filtered, df_p_filtered], how="diagonal")
    # `group_by=` replaces the deprecated `by=` keyword, which made this call
    # emit a deprecation warning.
    .group_by_dynamic("creation_timestamp", every='1h', group_by=["sys_name_short"])
    .agg(
        pl.all().exclude(["creation_timestamp", "sys_name_short"]).mean(),
        pl.col("co2_corrected").std().alias("std"),
    )
)

  .group_by_dynamic("creation_timestamp", every='1h', by=["sys_name_short"]) \


In [10]:
# Hourly CO2 time series, one line per system (including the Picarro).
fig = px.line(
    df_plot,
    x="creation_timestamp",
    y="co2_corrected",
    color="sys_name_short",
).update_layout(yaxis_title='CO2 (ppm)', xaxis_title='', title='')
fig.show()

In [11]:
# Sensor temperature time series from the un-aggregated system frame,
# one line per system.
fig = px.line(
    df_filtered,
    x="creation_timestamp",
    y="gmp343_temperature",
    color="sys_name_short",
).update_layout(yaxis_title='Sensor Temperature (°C)', xaxis_title='', title='')
fig.show()

In [12]:
# Hourly mean of the system-minus-Picarro CO2 difference, one line per system.
fig = px.line(
    df_plot,
    x="creation_timestamp",
    y="diff",
    color="sys_name_short",
).update_layout(yaxis_title='System - PICARRO: CO2 (ppm)', xaxis_title='', title='')
fig.show()

In [13]:
import numpy as np

# MSE / RMSE of the hourly mean (system - Picarro) difference for acropolis-3
# over the whole analysis window.
# NOTE(review): sensor_id is [2, 6], so df_plot holds no "acropolis-3" rows and
# both values come out null; add 3 to sensor_id or point this at a loaded system.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-3")
    # Native polars power instead of a numpy ufunc applied to an expression.
    .select(pl.col("diff").pow(2).mean().alias("mse_full_deployment"))
    .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"))
)

mse_full_deployment,rmse_full_deployment
f64,f64
,


In [14]:
# MSE / RMSE of the hourly mean (system - Picarro) difference for acropolis-2
# over the whole analysis window.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-2")
    # Native polars power instead of a numpy ufunc applied to an expression.
    .select(pl.col("diff").pow(2).mean().alias("mse_full_deployment"))
    .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"))
)

mse_full_deployment,rmse_full_deployment
f64,f64
0.413488,0.64303


In [15]:
# MSE / RMSE of the hourly mean (system - Picarro) difference for acropolis-6
# over the whole analysis window.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-6")
    # Native polars power instead of a numpy ufunc applied to an expression.
    .select(pl.col("diff").pow(2).mean().alias("mse_full_deployment"))
    .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"))
)

mse_full_deployment,rmse_full_deployment
f64,f64
1.170365,1.081834
