In [142]:
from datetime import datetime, timedelta, timezone
import polars as pl
import pandas as pd
import os
import glob
import plotly.express as px
import numpy as np

# --- Analysis window (UTC, both bounds inclusive via `is_between`) -----------
start_date = datetime(2024, 11, 26, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 11, 27, 23, 59, 59, tzinfo=timezone.utc)

# Averaging-window label.
# NOTE(review): this name shadows the built-in `filter` and is not read by any
# cell visible in this notebook; it is kept unchanged for compatibility.
filter = '10m'

# `system_id` values of the systems that are compared against the Picarro.
sensor_id = [2,3,6]

# Root folder of the datasets; it has to be supplied through the environment.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if DATA_DIRECTORY is None:
    # Fail early with a readable message instead of the TypeError that
    # os.path.join(None, ...) would raise further down.
    raise RuntimeError(
        "Environment variable DATA_DIRECTORY is not set; "
        "it must point to the folder that contains 'processed/'."
    )

# Calibrated 1-minute ACROPOLIS measurements (per the file name), restricted to
# the analysis window. The predicate is applied on the lazy scan, before the
# frame is collected.
df = (
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "pipeline", "calibrated_1_min_acropolis.parquet"))
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
)

# Calibrated 1-minute Picarro G2401 reference measurements (per the file name),
# restricted to the same window.
df_p_1min = (
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet"))
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
)


In [143]:
# Preview of the hourly flagged dataset. The row limit is applied on the lazy
# frame so that only the first rows are materialised instead of the whole file.
pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "pipeline", "flagged_1_h_acropolis.parquet")).head(10).collect()

system_id,sys_name_short,creation_timestamp,co2,sensor_temperature,h2o,ws,wd,Stdev,NbPoints,OriginalFlag,Flag
i64,str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,u32,i32,str
1,"""acropolis-1""",2024-01-02 12:30:00 UTC,428.685467,30.804861,0.813515,11.170833,236.333333,0.861973,24,389,"""K"""
1,"""acropolis-1""",2024-01-02 13:30:00 UTC,428.981692,30.566667,0.976711,10.869492,233.559322,0.801614,59,0,"""O"""
1,"""acropolis-1""",2024-01-02 14:30:00 UTC,429.049651,30.211273,1.067129,10.856364,233.636364,0.7999,55,0,"""O"""
1,"""acropolis-1""",2024-01-02 15:30:00 UTC,428.835781,30.05297,1.166373,8.978182,233.436364,0.727168,55,0,"""O"""
1,"""acropolis-1""",2024-01-02 16:30:00 UTC,428.400168,29.928531,1.275964,9.39661,233.050847,0.684206,59,0,"""O"""
1,"""acropolis-1""",2024-01-02 17:30:00 UTC,428.303157,29.964583,1.292704,10.128571,230.267857,0.691555,56,0,"""O"""
1,"""acropolis-1""",2024-01-02 18:30:00 UTC,428.266314,29.894545,1.300241,8.809091,229.854545,0.652361,55,0,"""O"""
1,"""acropolis-1""",2024-01-02 19:30:00 UTC,428.41601,29.959152,1.296685,8.874545,224.8,0.627287,55,0,"""O"""
1,"""acropolis-1""",2024-01-02 20:30:00 UTC,428.472649,29.935714,1.254004,8.633929,228.75,0.658198,56,0,"""O"""
1,"""acropolis-1""",2024-01-02 21:30:00 UTC,428.054063,29.960508,1.163936,10.611864,232.305085,0.856411,59,0,"""O"""


In [144]:
# Show a single row of the ACROPOLIS frame to inspect the available columns.
df.head(n=1)

system_id,creation_timestamp,system_name,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,sht45_humidity,sht45_temperature,bme280_humidity,bme280_temperature,bme280_pressure,revision,receipt_timestamp,h2o_ah,h2o_v%,bme280_h2o_v%,gmp343_dry,slope,intercept,wxt532_speed_avg,wxt532_speed_min,wxt532_speed_max,wxt532_direction_avg,wxt532_direction_min,wxt532_direction_max,wxt532_last_update_time,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,enclosure_bme280_humidity,enclosure_bme280_pressure,enclosure_bme280_temperature,raspi_cpu_usage,raspi_cpu_temperature,raspi_disk_usage,raspi_memory_usage,ups_battery_error_detected,ups_battery_above_voltage_threshold,ups_battery_is_fully_charged,ups_powered_by_grid,slope_interpolated,intercept_interpolated,gmp343_corrected,date,sys_name_short
i64,"datetime[μs, UTC]",str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,"datetime[ns, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,date,str
1,2024-11-26 00:00:00 UTC,,423.383333,484.65,485.8,29.7,25.361667,25.701667,21.653333,25.766667,938.195,34.0,2024-11-26 00:00:28.745780480 UTC,7.574485,1.12842,0.963425,491.344429,,,1.6,0.5,2.8,200.1,145.0,245.0,1732600000.0,,,,,16.93,953.16,32.69,0.02,52.5,0.557,0.128,0.0,1.0,1.0,1.0,0.98804,-41.735626,443.732277,2024-11-26,"""acropolis-1"""


In [145]:
# Show a single row of the Picarro frame to inspect the available columns.
df_p_1min.head(n=1)

creation_timestamp,picarro_corrected,h2o_reported,std
"datetime[μs, UTC]",f64,f64,f64
2024-11-26 00:00:00 UTC,452.56423,0.880752,1.110101


In [146]:
# Bring the Picarro reference into the same layout as the ACROPOLIS frame:
# restrict it to the analysis window, label every row "Picarro" and expose the
# reference CO2 under the shared column name "co2_corrected".
df_p_filtered = (
    df_p_1min
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.lit("Picarro").alias("sys_name_short"))
    .rename({"picarro_corrected": "co2_corrected"})
    .select(["creation_timestamp", "sys_name_short", "co2_corrected"])
)

In [147]:
# Select the ACROPOLIS systems of interest inside the analysis window and keep
# only the columns needed for the comparison.
df_filtered = (
    df
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .filter(pl.col("system_id").is_in(sensor_id))
    # Drop every sample whose timestamp falls into hour 1 of the day.
    # NOTE(review): presumably the daily calibration window - confirm.
    .filter(pl.col("creation_timestamp").dt.hour() != 1)
    .rename({"gmp343_corrected": "co2_corrected"})
    .select(["creation_timestamp", "sys_name_short", "co2_corrected", "gmp343_temperature"])
)

In [148]:
# Join the Picarro reading taken at the same timestamp onto every ACROPOLIS
# sample and compute the difference "system - Picarro".
#
# The join stores the Picarro value as "co2_corrected_right" (default suffix for
# a clashing column name). Because this cell rebinds `df_filtered`, the base
# columns are selected first: on a re-run this discards the previously joined
# "co2_corrected_right" / "diff" columns, so the cell can be executed repeatedly
# without a duplicate-column error. On the first run the select is a no-op.
df_filtered = (
    df_filtered
    .select("creation_timestamp", "sys_name_short", "co2_corrected", "gmp343_temperature")
    .join(
        df_p_filtered.select("creation_timestamp", "co2_corrected"),
        on=["creation_timestamp"],
        how="left",
    )
    .with_columns(diff=pl.col("co2_corrected") - pl.col("co2_corrected_right"))
)

In [149]:
# Stack the ACROPOLIS rows and the Picarro rows into one long frame for
# plotting. "diagonal" concatenation takes the union of the columns and fills
# the ones missing in either frame with nulls (e.g. "diff" for Picarro rows).
# No resampling is applied here; the previously commented-out 10-minute
# `group_by_dynamic` aggregation and the line-continuation backslash that
# dangled into it were removed.
df_plot = pl.concat([df_filtered, df_p_filtered], how="diagonal")

In [150]:
# CO2 time series, one line per system name (Picarro rows included).
fig = px.line(
    df_plot,
    x="creation_timestamp",
    y="co2_corrected",
    color="sys_name_short",
).update_layout(yaxis_title='CO2 (ppm)', xaxis_title='', title='')
fig.show()

In [151]:
# GMP343 sensor temperature over time, one line per ACROPOLIS system.
fig = px.line(
    df_filtered,
    x="creation_timestamp",
    y="gmp343_temperature",
    color="sys_name_short",
).update_layout(yaxis_title='Sensor Temperature (°C)', xaxis_title='', title='')
fig.show()

In [152]:
# Difference "system - Picarro" over time, one line per system name.
# The Picarro rows of `df_plot` carry no "diff" value (null after the diagonal
# concatenation).
fig = px.line(
    df_plot,
    x="creation_timestamp",
    y="diff",
    color="sys_name_short",
).update_layout(yaxis_title='System - PICARRO: CO2 (ppm)', xaxis_title='', title='')
fig.show()

In [153]:
# MSE / RMSE of acropolis-3 against the Picarro over the rows in `df_plot`.
# The square is computed with the native polars expression instead of routing
# the expression through a NumPy ufunc; numpy is already imported in the first
# cell, so no import is needed here.
# NOTE(review): the recorded RMSE for this system (about 58 ppm) is far above
# the other two systems (about 1.1-1.4 ppm) - check the diff plot for outliers.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-3")
    .select(pl.col("diff").pow(2).mean().alias("mse_full_deployment"))
    .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"))
)

mse_full_deployment,rmse_full_deployment
f64,f64
3374.992382,58.094685


In [154]:
# MSE / RMSE of acropolis-2 against the Picarro over the rows in `df_plot`.
# The square is computed with the native polars expression instead of routing
# the expression through a NumPy ufunc.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-2")
    .select(pl.col("diff").pow(2).mean().alias("mse_full_deployment"))
    .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"))
)

mse_full_deployment,rmse_full_deployment
f64,f64
1.281813,1.132172


In [155]:
# MSE / RMSE of acropolis-6 against the Picarro over the rows in `df_plot`.
# The square is computed with the native polars expression instead of routing
# the expression through a NumPy ufunc.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-6")
    .select(pl.col("diff").pow(2).mean().alias("mse_full_deployment"))
    .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"))
)

mse_full_deployment,rmse_full_deployment
f64,f64
2.086868,1.4446
