In [1]:
from datetime import datetime, timedelta, timezone
import polars as pl
import pandas as pd
import os
import glob
import plotly.express as px
import numpy as np

# Analysis window in UTC; used as the is_between bounds for both loads below.
start_date = datetime(2024, 11, 26, 16, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 11, 28, 23, 59, 59, tzinfo=timezone.utc)

# NOTE(review): this name shadows the built-in `filter` and is not referenced in
# the cells visible here; kept unchanged in case other cells rely on it.
filter = '10m'

# system_id values of the sensor systems to keep.
sensor_id = [2, 3, 6]

# Root of the data tree, taken from the environment. Fail early with a clear
# message instead of the opaque TypeError os.path.join raises when given None.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if DATA_DIRECTORY is None:
    raise RuntimeError(
        "Environment variable DATA_DIRECTORY is not set; it must point to the "
        "directory that contains the 'processed' folder."
    )

# Calibrated 1-minute sensor-system measurements, restricted to the window.
df = (
    pl.scan_parquet(
        os.path.join(DATA_DIRECTORY, "processed", "pipeline", "calibrated_1_min_acropolis.parquet")
    )
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
)

# Calibrated 1-minute Picarro measurements, restricted to the same window.
df_p_1min = (
    pl.scan_parquet(
        os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet")
    )
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
)


In [2]:
# Load the complete flagged 1-hour dataset (no time-window filter) and display it.
(
    pl.scan_parquet(
        os.path.join(
            DATA_DIRECTORY,
            "processed",
            "pipeline",
            "flagged_1_h_acropolis.parquet",
        )
    )
    .collect()
)

system_id,sys_name_short,creation_timestamp,gmp343_corrected,gmp343_temperature,h2o_v%,bme280_pressure,wxt532_speed_avg,wxt532_direction_avg,Stdev,NbPoints,OriginalFlag,Flag
i64,str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,u32,i32,str
1,"""acropolis-1""",2024-01-02 12:30:00 UTC,428.685467,30.804861,0.813515,924.885278,11.170833,236.333333,0.861973,24,389,"""K"""
1,"""acropolis-1""",2024-01-02 13:30:00 UTC,429.014985,30.563611,0.977683,924.944139,10.825,233.616667,0.835584,60,0,"""O"""
1,"""acropolis-1""",2024-01-02 14:30:00 UTC,428.934966,30.199222,1.068206,924.2424,10.853333,234.566667,1.00751,60,0,"""O"""
1,"""acropolis-1""",2024-01-02 15:30:00 UTC,428.746457,30.053898,1.168354,924.225864,8.905085,233.355932,0.926748,59,0,"""O"""
1,"""acropolis-1""",2024-01-02 16:30:00 UTC,428.376926,29.929444,1.275909,924.556278,9.388333,233.183333,0.701865,60,0,"""O"""
1,"""acropolis-1""",2024-01-02 17:30:00 UTC,428.370337,29.965556,1.292902,923.798694,9.941667,230.75,0.81971,60,0,"""O"""
1,"""acropolis-1""",2024-01-02 18:30:00 UTC,428.332008,29.89661,1.300252,922.949576,8.955932,229.322034,0.779326,59,0,"""O"""
1,"""acropolis-1""",2024-01-02 19:30:00 UTC,428.305491,29.961638,1.296882,922.146435,8.786441,225.135593,0.734057,59,0,"""O"""
1,"""acropolis-1""",2024-01-02 20:30:00 UTC,428.377033,29.938983,1.254298,921.550904,8.645763,228.0,0.765081,59,0,"""O"""
1,"""acropolis-1""",2024-01-02 21:30:00 UTC,428.096726,29.960333,1.163668,920.872761,10.635,232.433333,0.911163,60,0,"""O"""


In [3]:
# Peek at the first row of the sensor-system frame to inspect its columns.
df.head(n=1)

system_id,creation_timestamp,system_name,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,sht45_humidity,sht45_temperature,bme280_humidity,bme280_temperature,bme280_pressure,revision,receipt_timestamp,h2o_ah,h2o_v%,bme280_h2o_v%,gmp343_dry,slope,intercept,wxt532_speed_avg,wxt532_speed_min,wxt532_speed_max,wxt532_direction_avg,wxt532_direction_min,wxt532_direction_max,wxt532_last_update_time,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,enclosure_bme280_humidity,enclosure_bme280_pressure,enclosure_bme280_temperature,raspi_cpu_usage,raspi_cpu_temperature,raspi_disk_usage,raspi_memory_usage,ups_battery_error_detected,ups_battery_above_voltage_threshold,ups_battery_is_fully_charged,ups_powered_by_grid,slope_interpolated,intercept_interpolated,gmp343_corrected,date,sys_name_short
i64,"datetime[μs, UTC]",str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,"datetime[ns, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,date,str
1,2024-11-26 16:00:00 UTC,,421.183333,479.9,479.083333,29.916667,24.21,25.998333,20.58,26.023333,942.296667,34.0,2024-11-26 16:00:31.221265152 UTC,7.315924,1.085933,0.92311,484.342972,,,2.7,0.3,4.7,197.9,62.0,277.0,1732600000.0,,,,,18.48,960.24,32.85,0.02,52.5,0.557,0.127,0.0,1.0,1.0,1.0,0.986265,-41.165745,436.524628,2024-11-26,"""acropolis-1"""


In [4]:
# Peek at the first row of the Picarro frame to inspect its columns.
df_p_1min.head(n=1)

creation_timestamp,picarro_corrected,h2o_reported,std
"datetime[μs, UTC]",f64,f64,f64
2024-11-26 16:00:00 UTC,449.750437,1.053934,1.600025


In [5]:
# Bring the Picarro series into the schema shared with the sensor systems:
# a constant "Picarro" label in sys_name_short and the CO2 column renamed to
# co2_corrected, keeping only the three columns needed downstream.
df_p_filtered = (
    df_p_1min
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.lit("Picarro").alias("sys_name_short"))
    .rename({"picarro_corrected": "co2_corrected"})
    .select(["creation_timestamp", "sys_name_short", "co2_corrected"])
)

In [6]:
# Sensor-system rows inside the analysis window for the selected system ids,
# reduced to the columns used for plotting and comparison.
# NOTE(review): every sample whose timestamp hour equals 1 is discarded; the
# reason is not stated in this notebook - confirm the intent.
df_filtered = (
    df
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .filter(pl.col("system_id").is_in(sensor_id))
    .filter(pl.col("creation_timestamp").dt.hour() != 1)
    .rename({"gmp343_corrected": "co2_corrected"})
    .select(["creation_timestamp", "sys_name_short", "co2_corrected", "gmp343_temperature"])
)

In [7]:
# Join the Picarro series onto the sensor rows by timestamp and compute the
# per-sample difference (sensor system minus Picarro).
#
# This cell reassigns df_filtered from itself. To keep it re-runnable, the
# columns it adds are removed first; otherwise a second run would clash with
# the already-present "co2_corrected_right" column produced by the join suffix.
_join_outputs = {"co2_corrected_right", "diff"}
df_filtered = (
    df_filtered
    .select([name for name in df_filtered.columns if name not in _join_outputs])
    .join(
        df_p_filtered.select("creation_timestamp", "co2_corrected"),
        on="creation_timestamp",
        how="left",
    )
    .with_columns(diff=pl.col("co2_corrected") - pl.col("co2_corrected_right"))
)

In [8]:
# Stack the sensor rows and the Picarro rows into one long frame for plotting.
# The "diagonal" strategy keeps the union of columns, so columns that exist in
# only one input are null for the rows of the other.
# An optional 10-minute resampling (group_by_dynamic on creation_timestamp per
# sys_name_short: mean of the value columns plus std of co2_corrected) is
# currently disabled.
df_plot = pl.concat([df_filtered, df_p_filtered], how="diagonal")

In [9]:
# CO2 time series, one line per system (sensor systems and Picarro).
fig = (
    px.line(df_plot, x="creation_timestamp", y="co2_corrected", color="sys_name_short")
    .update_layout(yaxis_title="CO2 (ppm)", xaxis_title="", title="")
)
fig.show()

In [10]:
# Sensor temperature time series, one line per sensor system.
fig = (
    px.line(df_filtered, x="creation_timestamp", y="gmp343_temperature", color="sys_name_short")
    .update_layout(yaxis_title="Sensor Temperature (°C)", xaxis_title="", title="")
)
fig.show()

In [11]:
# Difference between each sensor system and the Picarro over time.
# The rows labelled "Picarro" in df_plot come from a frame without a diff
# column, so they contribute no values to this figure.
fig = (
    px.line(df_plot, x="creation_timestamp", y="diff", color="sys_name_short")
    .update_layout(yaxis_title="System - PICARRO: CO2 (ppm)", xaxis_title="", title="")
)
fig.show()

In [12]:
# MSE and RMSE of the sensor-minus-Picarro difference for acropolis-3.
# numpy is already imported as np in the notebook's first cell, so it is not
# imported again here.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-3")
    .select(np.square(pl.col("diff")).mean().alias("mse_full_deployment"))
    .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"))
)

mse_full_deployment,rmse_full_deployment
f64,f64
3.150893,1.775076


In [13]:
# MSE and RMSE of the sensor-minus-Picarro difference for acropolis-2.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-2")
    .select(np.square(pl.col("diff")).mean().alias("mse_full_deployment"))
    .with_columns(rmse_full_deployment=pl.col("mse_full_deployment").sqrt())
)

mse_full_deployment,rmse_full_deployment
f64,f64
1.382525,1.175808


In [14]:
# MSE and RMSE of the sensor-minus-Picarro difference for acropolis-6.
(
    df_plot
    .filter(pl.col("sys_name_short") == "acropolis-6")
    .select(np.square(pl.col("diff")).mean().alias("mse_full_deployment"))
    .with_columns(rmse_full_deployment=pl.col("mse_full_deployment").sqrt())
)

mse_full_deployment,rmse_full_deployment
f64,f64
1.516696,1.231542
