In [165]:
import polars as pl
import polars.selectors as cs
import os
import sys
from datetime import datetime, timezone
import plotly.express as px
import numpy as np

# Project layout, resolved two levels above the current working directory.
PROJECT_PATH = os.path.abspath(os.path.join(os.pardir, os.pardir))
PIPELINE_PATH = os.path.join(PROJECT_PATH, "pipeline")
DATA_DIRECTORY = os.path.join(PROJECT_PATH, "data")

# Make the pipeline's `utils` package importable; the membership check keeps
# re-running this cell from appending the same entry again.
if PIPELINE_PATH not in sys.path:
    sys.path.append(PIPELINE_PATH)

from utils.paths import PROCESSED_PICARRO_DATA_DIRECTORY, POSTPROCESSED_DATA_DIRECTORY
from utils.import_data import import_acropolis_system_data

# Fail early when the expected data directories are missing.
assert os.path.exists(POSTPROCESSED_DATA_DIRECTORY)
assert os.path.exists(PROCESSED_PICARRO_DATA_DIRECTORY)

In [206]:
# Filters
# Time window used to restrict the Picarro data when it is loaded.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 4, 20)

# NOTE(review): `filter` shadows the Python built-in of the same name.
filter = "1h"
# System ids 1 through 20.
ids = [*range(1, 21)]

In [208]:
# Load Picarro Data
# Each calibrated 1-minute file is scanned lazily, restricted to
# [start_date, end_date] and only then collected; the per-file frames are
# stacked with a diagonal concat.
df_p = pl.concat(
    [
        pl.scan_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, picarro_file))
        .filter(pl.col("datetime").is_between(start_date, end_date))
        .collect()
        for picarro_file in (
            "Calibrated_1_min_DWD_Picarro_G2301_413.parquet",  # DWD Picarro
            "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet",  # ICOS Picarro
        )
    ],
    how="diagonal",
)
# Show the first and last row of the combined frame.
df_p.head(1).vstack(df_p.tail(1))

system_name,system_id,datetime,picarro_corrected,h2o_reported
str,i32,datetime[ms],f64,f64
"""Picarro_G2301""",413,2024-01-01 00:00:00,443.76065,0.933111
"""Picarro_G2301""",413,2024-04-17 11:04:00,469.973389,0.979418


In [209]:
# 1-minute side-by-side period data of the ACROPOLIS systems.
df = (
    pl.read_parquet(
        os.path.join(DATA_DIRECTORY, "output", "side-by-side", "1_min_sbs_period_acropolis.parquet")
    )
    # Day of year; `reduce_calibration_days` uses it to pick calibration days.
    .with_columns(pl.col("datetime").dt.ordinal_day().alias("ordinal_day"))
)
# Show the first and last row of the loaded frame.
df.head(1).vstack(df.tail(1))

system_id,system_name,datetime,ts,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,bme280_temperature,bme280_humidity,bme280_pressure,sht45_temperature,sht45_humidity,gmp343_edge_corrected,gmp343_edge_dry,h2o_ah,h2o_v%,bme280_h2o_v%,gmp343_dry,slope,intercept,slope_interpolated,intercept_interpolated,gmp343_corrected,wxt532_direction_min,wxt532_direction_avg,wxt532_direction_max,wxt532_speed_min,wxt532_speed_avg,wxt532_speed_max,wxt532_last_update_time,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,enclosure_bme280_temperature,enclosure_bme280_humidity,enclosure_bme280_pressure,raspi_cpu_temperature,raspi_disk_usage,raspi_cpu_usage,raspi_memory_usage,ups_powered_by_grid,ups_battery_is_fully_charged,ups_battery_error_detected,ups_battery_above_voltage_threshold,cal_gmp343_slope,cal_gmp343_intercept,cal_sht_45_offset,cal_bottle_id,cal_gmp343_raw,cal_gmp343_compensated,cal_gmp343_filtered,cal_gmp343_temperature,cal_bme280_temperature,cal_bme280_humidity,cal_bme280_pressure,cal_sht45_temperature,cal_sht45_humidity,picarro_corrected,ordinal_day
i32,str,datetime[ms],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i16
1,"""tum-esm-midcost-raspi-1""",2024-02-07 00:00:00,1707300000000.0,411.6,464.15,464.933333,27.433333,24.266667,19.165,942.153333,24.253333,22.316667,,,5.887648,0.866899,0.744471,468.99908,0.953834,-1.882813,0.981017,-26.299756,433.796346,,,,,,,,,,,,30.73,13.24,953.31,48.7,0.488,0.019,0.111,1.0,1.0,0.0,1.0,,,,,,,,,,,,,,434.600735,38
20,"""tum-esm-midcost-raspi-20""",2024-02-11 23:59:00,1707700000000.0,389.0,447.2,445.166667,28.083333,24.403333,19.875,928.88,25.103333,24.531667,,,6.707844,1.003944,0.813373,449.681215,0.995614,-9.092189,0.995446,-9.057748,438.575679,,,,,,,,,,,,31.44,12.55,940.26,53.0,0.359,0.032,0.111,1.0,1.0,0.0,1.0,,,,,,,,,,,,,,440.078157,42


In [210]:
# `gmp343_dry` over time, one trace per selected system.
# The filter previously compared against `id`, which is Python's built-in
# function unless a loop variable of that name happens to be left over in the
# kernel; select the systems configured in `ids` explicitly instead.
fig = px.line(
    df.filter(pl.col("system_id").is_in(ids)),
    x="datetime",
    y="gmp343_dry",
    color="system_id",
)
fig.show()

In [217]:
def reduce_calibration_days(df, frequency_days, df_reference=None, aggregation_interval="1h"):
    """Per-system RMSE against the reference when calibrating only every N days.

    Calibration parameters (`slope_interpolated`, `intercept_interpolated`) are
    kept only for days whose `ordinal_day` is a multiple of `frequency_days`,
    averaged per day, and interpolated over the remaining timestamps. The
    resulting linear correction is applied to `gmp343_dry` and compared with
    the reference column `picarro_corrected`.

    Args:
        df: Frame with the columns `datetime`, `system_id`, `ordinal_day`,
            `gmp343_dry`, `slope_interpolated` and `intercept_interpolated`.
        frequency_days: Positive integer; calibration days are those where
            `ordinal_day % frequency_days == 0`.
        df_reference: Frame with `datetime` and `picarro_corrected`. When
            omitted, the notebook-level `df_p` is used.
        aggregation_interval: Window passed to `group_by_dynamic` for
            averaging the differences before the RMSE is computed.

    Returns:
        A frame with one row per `system_id` and the column
        `rmse_{frequency_days}days`, sorted by `system_id`.

    Raises:
        ValueError: If `frequency_days` is smaller than 1.
    """
    if frequency_days < 1:
        raise ValueError(f"frequency_days must be >= 1, got {frequency_days}")

    # Fall back to the notebook-level Picarro frame when no reference is given.
    reference = df_p if df_reference is None else df_reference

    calibration_columns = ["slope_interpolated", "intercept_interpolated"]

    # Daily mean calibration parameters on the retained calibration days.
    # `group_by_dynamic` needs the index column sorted within each group, so
    # sort explicitly instead of relying on the input order.
    df_cal = (
        df.filter(pl.col("ordinal_day") % frequency_days == 0)
        .select("datetime", "system_id", *calibration_columns)
        .sort("system_id", "datetime")
        .group_by_dynamic("datetime", every="1d", group_by=["system_id"])
        .agg(pl.col(calibration_columns).mean())
    )

    return (
        df.select("datetime", "system_id", "gmp343_dry")
        # The daily values only match rows whose timestamp equals the start of
        # the daily window; all other rows are null until interpolated.
        .join(df_cal, on=["datetime", "system_id"], how="left")
        .sort("system_id", "datetime")
        # Fill the gaps per system: interpolate between calibration points,
        # then extend the first/last known value to the edges.
        .with_columns(
            pl.col(calibration_columns)
            .interpolate()
            .backward_fill()
            .forward_fill()
            .over("system_id")
        )
        .with_columns(
            gmp343_corrected=pl.col("gmp343_dry") * pl.col("slope_interpolated")
            + pl.col("intercept_interpolated")
        )
        # NOTE(review): the join key is `datetime` only; if the reference frame
        # holds several instruments with overlapping timestamps, rows are
        # duplicated per instrument - confirm this is intended.
        .join(reference.select("datetime", "picarro_corrected"), on="datetime")
        .with_columns(diff=pl.col("gmp343_corrected") - pl.col("picarro_corrected"))
        .filter(pl.col("diff").is_not_nan())
        # Row order after the join is not relied upon; re-establish the order
        # required by `group_by_dynamic`.
        .sort("system_id", "datetime")
        .group_by_dynamic("datetime", every=aggregation_interval, group_by=["system_id"])
        .agg(pl.col("diff").mean())
        .group_by("system_id")
        .agg(pl.col("diff").pow(2).mean().sqrt().alias(f"rmse_{frequency_days}days"))
        # Deterministic row order for display and downstream joins.
        .sort("system_id")
    )


In [238]:
# Start with the RMSE for calibration on every day ...
df_plot = reduce_calibration_days(df, 1)

# ... and append one RMSE column per sparser calibration interval (2-10 days).
for frequency_days in range(2, 11):
    df_plot = df_plot.join(
        reduce_calibration_days(df, frequency_days),
        on="system_id",
        how="left",
    )

# Drop any duplicated join columns and add a display label per system.
df_plot = df_plot.drop("^.*_right$").with_columns(
    system_name=pl.lit("MidCost-") + pl.col("system_id").cast(pl.Utf8)
)
df_plot

system_id,rmse_1days,rmse_2days,rmse_3days,rmse_4days,rmse_5days,rmse_6days,rmse_7days,rmse_8days,rmse_9days,rmse_10days,system_name
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,1.161558,1.089143,1.232485,1.207102,1.413161,1.3785,1.30204,1.22299,1.277064,1.854005,"""MidCost-1"""
2,0.686395,0.686683,0.693489,0.659796,0.660523,0.690581,0.690577,0.678873,0.6861,0.696813,"""MidCost-2"""
3,0.956303,0.959455,0.95262,1.485386,1.168172,1.558875,1.13258,1.584861,1.483032,1.58502,"""MidCost-3"""
4,1.333771,1.322549,1.333136,1.528663,1.408008,1.522662,1.461939,1.373686,1.345774,1.665624,"""MidCost-4"""
5,0.869575,0.850806,0.825871,0.782229,0.864786,0.746583,0.848205,0.76184,0.760171,0.948962,"""MidCost-5"""
…,…,…,…,…,…,…,…,…,…,…,…
16,0.629466,0.627574,0.67477,0.782574,0.808371,0.849557,0.832872,0.778456,0.818413,0.823055,"""MidCost-16"""
17,2.619437,2.613859,2.63212,2.577859,2.512725,2.663092,2.673878,2.469792,2.800106,2.170858,"""MidCost-17"""
18,0.709635,0.698733,0.680928,0.737825,0.769657,0.778151,0.838875,1.018302,0.72175,0.792492,"""MidCost-18"""
19,1.477602,1.471886,1.625043,1.85709,1.531241,2.547677,1.405453,2.011817,2.607853,1.947021,"""MidCost-19"""


In [239]:
# RMSE per system for the 1-day calibration interval.
fig = px.scatter(
    df_plot,
    x="system_name",
    y="rmse_1days",
)
fig.show()

In [216]:
# One figure per system: RMSE as a function of the calibration interval.
# The loop variables no longer shadow the built-ins `id` and `list`, which
# would otherwise leak into every cell executed afterwards.
for system_id in ids:
    # Keep only the RMSE columns; other columns such as `system_name` would
    # otherwise end up among the plotted values after the transpose.
    rmse_row = df_plot.filter(pl.col("system_id") == system_id).select(cs.starts_with("rmse_"))
    if rmse_row.is_empty():
        # No RMSE row exists for this system; nothing to plot.
        continue
    rmse_values = rmse_row.transpose().to_series().to_list()

    # Column order in `df_plot` is rmse_1days, rmse_2days, ...; the x axis is
    # therefore the 1-based position of the column.
    fig = px.line(x=range(1, len(rmse_values) + 1), y=rmse_values)
    fig.update_layout(
        title=f"{system_id}",
        xaxis_title="calibration interval (days)",
        yaxis_title="RMSE",
        yaxis_range=[0.5, 3],
    )
    fig.show()