In [15]:
import polars as pl
import polars.selectors as cs
import os
import sys
from datetime import datetime, timezone
import plotly.express as px
import numpy as np

# Repository layout, resolved relative to the current working directory
# (the project root is taken to be two levels up).
PROJECT_PATH = os.path.abspath(os.path.join("..", ".."))
PIPELINE_PATH = os.path.join(PROJECT_PATH, "pipeline")
DATA_DIRECTORY = os.path.join(PROJECT_PATH, "data")

# Put the pipeline folder on the import path; the membership check keeps
# repeated runs of this cell from appending duplicates.
if PIPELINE_PATH not in sys.path:
    sys.path.append(PIPELINE_PATH)

from utils.paths import POSTPROCESSED_DATA_DIRECTORY, PROCESSED_PICARRO_DATA_DIRECTORY

# Stop right away when one of the data directories does not exist.
assert os.path.exists(POSTPROCESSED_DATA_DIRECTORY)
assert os.path.exists(PROCESSED_PICARRO_DATA_DIRECTORY)

In [81]:
# Filters 2024
ids = [10]

# Daily table for the selected systems: the last `slope` / `intercept` value
# inside every 1-day window, plus the day of the year and a running row index.
df = (
    pl.read_parquet(
        os.path.join(DATA_DIRECTORY, "output", "side-by-side", "2024_L1_1_min_sbs_period_acropolis.parquet")
    )
    .filter(pl.col("system_id").is_in(ids))
    .select("datetime", "system_id", "slope", "intercept")
    .group_by_dynamic("datetime", every="1d", group_by=["system_id"])
    .agg(cs.numeric().last())
    .sort("datetime")
    .with_columns(pl.col("datetime").dt.ordinal_day().alias("ordinal_day"))
    .with_row_index()
)

# One point per day showing the retained slope.
fig = px.scatter(df, x="datetime", y="slope")
fig.show()

In [80]:
# Filters
ids = [6]

# Daily table for the selected systems: the last `cal_gmp343_slope` /
# `cal_gmp343_intercept` value inside every 1-day window, plus the day of the
# year and a running row index.
df = (
    pl.read_parquet(
        os.path.join(DATA_DIRECTORY, "output", "side-by-side", "2025_L1_1_min_sbs_period_acropolis.parquet")
    )
    .filter(pl.col("system_id").is_in(ids))
    .select("datetime", "system_id", "cal_gmp343_slope", "cal_gmp343_intercept")
    .group_by_dynamic("datetime", every="1d", group_by=["system_id"])
    .agg(cs.numeric().last())
    .sort("datetime")
    .with_columns(pl.col("datetime").dt.ordinal_day().alias("ordinal_day"))
    .with_row_index()
)

# One point per day showing the retained slope.
fig = px.scatter(df, x="datetime", y="cal_gmp343_slope")
fig.show()

In [69]:
# Display the current contents of `df` as the cell output.
# NOTE(review): `df` is reassigned by several cells in this notebook, so what
# is shown here depends on which of those cells was executed last.
df

index,system_id,datetime,cal_gmp343_slope,cal_gmp343_intercept,ordinal_day
u32,i32,datetime[ms],f64,f64,i16
0,3,2025-02-22 00:00:00,,,53
1,3,2025-04-11 00:00:00,0.9911,-17.48,101
2,6,2025-04-05 00:00:00,1.0392,-2.24,95
3,6,2025-03-28 00:00:00,1.0419,-3.82,87
4,6,2025-02-18 00:00:00,1.03,1.39,49
…,…,…,…,…,…
41,6,2025-04-23 00:00:00,1.0338,0.07,113
42,6,2025-04-03 00:00:00,1.0311,0.81,93
43,6,2025-02-28 00:00:00,1.0356,-1.56,59
44,6,2025-05-10 00:00:00,1.0275,-1.92,130


In [8]:
# Filters
# Analysis window; a datetime built from year/month/day alone is midnight.
start_date = datetime(2025, 2, 18)
end_date = datetime(2025, 5, 31)

# NOTE(review): this assignment shadows the built-in `filter` from here on.
filter = "1h"
ids = [3, 6]

In [9]:
# Load Picarro Data
# Each reference file is scanned lazily, cut to [start_date, end_date] and only
# then collected; the two frames are combined with a diagonal concat.
df_p = pl.concat(
    [
        pl.scan_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, picarro_file))
        .filter(pl.col("datetime").is_between(start_date, end_date))
        .collect()
        for picarro_file in (
            "Calibrated_1_min_DWD_Picarro_G2301_413.parquet",   # DWD Picarro
            "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet",  # ICOS Picarro
        )
    ],
    how="diagonal",
)
# Preview the first and the last row of the combined frame.
df_p.head(1).vstack(df_p.tail(1))

datetime,system_name,system_id,picarro_corrected,h2o_reported
datetime[ms],str,i32,f64,f64
2025-02-18 00:00:00,"""Picarro_G2401""",529,470.619119,0.503802
2025-05-31 00:00:00,"""Picarro_G2401""",529,478.524646,1.737918


In [10]:
# 2025 side-by-side data, tagged with the day of the year and limited to
# [start_date, end_date].
df = (
    pl.read_parquet(
        os.path.join(DATA_DIRECTORY, "output", "side-by-side", "2025_L1_1_min_sbs_period_acropolis.parquet")
    )
    .with_columns(pl.col("datetime").dt.ordinal_day().alias("ordinal_day"))
    .filter(pl.col("datetime").is_between(start_date, end_date))
)
# Preview the first and the last row.
df.head(1).vstack(df.tail(1))

datetime,system_id,system_name,gmp343_corrected,gmp343_edge_corrected,gmp343_temperature,sht45_humidity,h2o_v%,bme280_pressure,enclosure_bme280_pressure,wxt532_speed_avg,wxt532_direction_avg,gmp343_dry,slope_interpolated,intercept_interpolated,Flag,picarro_corrected,Picarro_Flag,ordinal_day
datetime[ms],i32,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str,i16
2025-02-18 00:00:00,3,"""acropolis-3""",470.18,470.35,42.633333,6.858333,0.6179,942.005,959.4,,,496.970783,0.986107,-19.888754,"""U""",470.62,"""U""",49
2025-05-31 00:00:00,6,"""acropolis-6""",480.77,478.516667,31.0,36.381333,1.73532,942.783333,958.8,1.1,256.2,464.57859,1.029701,2.395426,"""U""",478.52,"""U""",151


In [5]:
# Line plot of `gmp343_corrected` over time, restricted to system 3.
fig = px.line(
    df.filter(pl.col("system_id").eq(3)),
    x="datetime",
    y="gmp343_corrected",
    color="system_id",
)
fig.show()

In [64]:
def reduce_calibration_days(df, frequency_days):
    """Per-system RMSE against the Picarro when only every n-th day is calibrated.

    Steps:
      1. Keep the rows whose ``ordinal_day`` is a multiple of ``frequency_days``
         and take the daily mean of ``slope_interpolated`` and
         ``intercept_interpolated`` per system.
      2. Left-join those daily values onto the full series (they land on the
         rows whose timestamp equals the start of a retained day), then fill
         the gaps per system: linear interpolation across rows, followed by a
         backward and a forward fill for the edges.
      3. Recompute ``gmp343_corrected = gmp343_dry * slope + intercept``.
      4. Inner-join ``picarro_corrected`` from the module-level ``df_p`` on
         ``datetime``, average all numeric columns per hour and system, and
         return the root mean square of the hourly mean difference.

    Parameters
    ----------
    df : pl.DataFrame
        Must provide ``datetime``, ``system_id``, ``ordinal_day``,
        ``gmp343_dry``, ``slope_interpolated`` and ``intercept_interpolated``.
    frequency_days : int
        Spacing of the retained calibration days; must be >= 1.

    Returns
    -------
    pl.DataFrame
        One row per ``system_id`` (sorted by it) with the column
        ``rmse_{frequency_days}days``.

    Raises
    ------
    ValueError
        If ``frequency_days`` is smaller than 1.

    Notes
    -----
    Reads the global ``df_p``, so the Picarro loading cell has to run first.
    NOTE(review): ``df_p`` is joined on ``datetime`` only. It is concatenated
    from two Picarro files; if both contribute rows for the same timestamp the
    sensor rows are duplicated by the join - confirm this is intended.
    """
    if frequency_days < 1:
        raise ValueError("frequency_days must be a positive integer")

    calibration_columns = ["slope_interpolated", "intercept_interpolated"]

    # group_by_dynamic needs the index column sorted within every group, so the
    # order is made explicit instead of relying on how the caller arranged `df`.
    df_cal = (
        df.filter(pl.col("ordinal_day") % frequency_days == 0)
        .select("datetime", "system_id", *calibration_columns)
        .sort(["system_id", "datetime"])
        .group_by_dynamic("datetime", every="1d", group_by=["system_id"])
        .agg(cs.numeric().mean())
    )

    # A window expression fills the gaps per system without the
    # group_by / agg / explode round-trip and keeps the sorted row order.
    corrected = (
        df.select("datetime", "system_id", "gmp343_dry")
        .join(df_cal, on=["datetime", "system_id"], how="left")
        .sort(["system_id", "datetime"])
        .with_columns(
            pl.col(calibration_columns)
            .interpolate()
            .backward_fill()
            .forward_fill()
            .over("system_id")
        )
        .with_columns(
            gmp343_corrected=(
                pl.col("gmp343_dry") * pl.col("slope_interpolated")
                + pl.col("intercept_interpolated")
            )
        )
    )

    # Row order after a join is not guaranteed, so sort again before the
    # hourly windows are formed.
    hourly = (
        corrected.join(df_p.select("datetime", "picarro_corrected"), on="datetime")
        .with_columns(diff=pl.col("gmp343_corrected") - pl.col("picarro_corrected"))
        .filter(pl.col("diff").is_not_nan())
        .sort(["system_id", "datetime"])
        .group_by_dynamic("datetime", every="1h", group_by=["system_id"])
        .agg(cs.numeric().mean())
    )

    # Sorting makes the row order of the result reproducible between runs.
    return (
        hourly.group_by("system_id")
        .agg(pl.col("diff").pow(2).mean().sqrt().alias(f"rmse_{frequency_days}days"))
        .sort("system_id")
    )


In [65]:
# Start with the 2-day spacing; every wider spacing adds one more
# `rmse_<n>days` column through a left join on the system id.
df_plot = reduce_calibration_days(df, 2)

for frequency_days in range(3, 21):
    df_plot = df_plot.join(
        reduce_calibration_days(df, frequency_days),
        on="system_id",
        how="left",
    )

# The drop pattern is presumably meant to discard `_right`-suffixed duplicates
# left behind by the joins - TODO confirm it is treated as a regex here.
df_plot = df_plot.drop("^.*_right$").with_columns(
    system_name=pl.concat_str(
        [pl.lit("MidCost-"), pl.col("system_id").cast(pl.Utf8)],
        separator="",
    )
)
df_plot

system_id,rmse_2days,rmse_3days,rmse_4days,rmse_5days,rmse_6days,rmse_7days,rmse_8days,rmse_9days,rmse_10days,rmse_11days,rmse_12days,rmse_13days,rmse_14days,rmse_15days,rmse_16days,rmse_17days,rmse_18days,rmse_19days,rmse_20days,system_name
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
3,0.625746,0.622111,0.598867,0.681106,0.593036,0.590728,0.654239,0.624891,0.698438,0.770031,0.603108,0.758639,0.713996,0.826474,0.849096,0.613453,0.66924,0.826221,0.919888,"""MidCost-3"""
6,0.939251,0.935474,0.967721,0.9516,1.022359,0.984867,1.060463,1.063121,0.994988,1.096648,0.985607,1.153839,1.148424,1.048012,1.174443,1.138127,1.141812,1.226322,1.136927,"""MidCost-6"""


In [66]:
# One RMSE-versus-calibration-spacing curve per system.
# The loop and value names deliberately avoid `id` and `list`: binding those at
# cell level would shadow the built-ins for every later cell in the kernel.
for system_id in ids:
    # Dropping the id/name columns leaves only the rmse_* columns; transposing
    # the single remaining row turns them into one value column.
    rmse_values = (
        df_plot.filter(pl.col("system_id") == system_id)
        .drop(["system_name", "system_id"])
        .transpose()
        .get_column("column_0")
    )

    # The first rmse column belongs to the 2-day spacing, hence the x offset.
    fig = px.line(x=range(2, len(rmse_values) + 2), y=rmse_values)
    fig.update_layout(
        title=f"MidCost-{system_id}",
        xaxis_title="Calibration interval (days)",
        yaxis_title="RMSE",
    )
    fig.show()

In [6]:
# Display the current contents of `df` as the cell output.
# NOTE(review): `df` is reassigned by several cells in this notebook, so what
# is shown here depends on which of those cells was executed last.
df

datetime,system_id,system_name,gmp343_corrected,gmp343_edge_corrected,gmp343_temperature,sht45_humidity,h2o_v%,bme280_pressure,enclosure_bme280_pressure,wxt532_speed_avg,wxt532_direction_avg,gmp343_dry,slope_interpolated,intercept_interpolated,Flag,picarro_corrected,Picarro_Flag,ordinal_day
datetime[ms],i32,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str,i16
2025-02-18 00:00:00,3,"""acropolis-3""",470.18,470.35,42.633333,6.858333,0.6179,942.005,959.4,,,496.970783,0.986107,-19.888754,"""U""",470.62,"""U""",49
2025-02-18 00:01:00,3,"""acropolis-3""",466.63,466.8,42.65,6.9,0.62231,941.833333,959.41,,,493.370291,0.986108,-19.889271,"""U""",467.38,"""U""",49
2025-02-18 00:02:00,3,"""acropolis-3""",458.89,458.983333,42.633333,6.833333,0.61531,942.521667,959.41,,,485.520793,0.986109,-19.889787,"""U""",462.71,"""U""",49
2025-02-18 00:03:00,3,"""acropolis-3""",458.46,458.55,42.6,6.723333,0.604897,941.67,959.38,,,485.084261,0.98611,-19.890304,"""U""",459.32,"""U""",49
2025-02-18 00:04:00,3,"""acropolis-3""",464.85,465.0,42.6,6.728333,0.604643,942.766667,959.38,,,491.572256,0.986112,-19.890821,"""U""",464.31,"""U""",49
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-05-30 23:56:00,6,"""acropolis-6""",480.96,478.7,31.0,36.5231,1.74308,942.243333,958.8,0.9,249.8,464.767944,1.029697,2.394304,"""U""",477.92,"""U""",150
2025-05-30 23:57:00,6,"""acropolis-6""",478.64,476.4,31.016667,36.409,1.736343,943.84,958.8,0.9,249.8,462.514166,1.029698,2.394584,"""U""",477.97,"""U""",150
2025-05-30 23:58:00,6,"""acropolis-6""",480.88,478.616667,31.0,36.41535,1.735491,943.571667,958.78,1.0,256.2,464.681166,1.029699,2.394865,"""U""",478.19,"""U""",150
2025-05-30 23:59:00,6,"""acropolis-6""",480.59,478.35,31.0,36.399783,1.733947,944.008333,958.78,1.0,256.2,464.402492,1.0297,2.395146,"""U""",478.76,"""U""",150


In [None]:
from typing import List

def create_list_of_calibration_days(df, frequency_days: int) -> List[List[int]]:
    """Build every possible calibration-day schedule for a given spacing.

    For each start offset ``y`` in ``0 .. frequency_days - 1`` the schedule
    holds the ordinal days ``1 + y, 1 + y + frequency_days, ...`` up to the
    largest ``ordinal_day`` found in ``df``.

    NOTE(review): this follows the formula sketched in the original placeholder
    (``1+y, 1+y+frequency_days*n``); confirm that counting from ordinal day 1
    is the intended convention.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with an ``ordinal_day`` column; only its maximum is used.
    frequency_days : int
        Number of days between two calibrations; must be >= 1.

    Returns
    -------
    List[List[int]]
        ``frequency_days`` lists, one per start offset. All lists are empty
        when ``ordinal_day`` has no maximum (for example an empty frame).

    Raises
    ------
    ValueError
        If ``frequency_days`` is smaller than 1.
    """
    if frequency_days < 1:
        raise ValueError("frequency_days must be a positive integer")

    last_day = df["ordinal_day"].max()
    if last_day is None:
        # No data, so there is no day that could be scheduled.
        return [[] for _ in range(frequency_days)]

    return [
        list(range(1 + offset, int(last_day) + 1, frequency_days))
        for offset in range(frequency_days)
    ]


def reduce_calibration_days(df, selected_days, rmse_column="rmse"):
    """Per-system RMSE against the Picarro when only selected days are calibrated.

    Steps:
      1. Keep the rows whose ``ordinal_day`` is contained in ``selected_days``
         and take, per system and day, the last ``cal_gmp343_slope`` and
         ``cal_gmp343_intercept`` value.
      2. Left-join those daily values onto the full series (they land on the
         rows whose timestamp equals the start of a selected day), then fill
         the gaps per system: linear interpolation across rows, followed by a
         backward and a forward fill for the edges.
      3. Compute ``gmp343_corrected = gmp343_dry * slope + intercept``.
      4. Inner-join ``picarro_corrected`` from the module-level ``df_p`` on
         ``datetime``, average all numeric columns per hour and system, and
         return the root mean square of the hourly mean difference.

    Parameters
    ----------
    df : pl.DataFrame
        Must provide ``datetime``, ``system_id``, ``ordinal_day``,
        ``gmp343_dry``, ``cal_gmp343_slope`` and ``cal_gmp343_intercept``.
    selected_days : sequence of int
        Ordinal days whose calibration values are kept.
    rmse_column : str, default "rmse"
        Name of the result column. The previous version built the name from
        ``frequency_days``, which is not a parameter of this function and would
        have been resolved from whatever global happened to exist.

    Returns
    -------
    pl.DataFrame
        One row per ``system_id`` (sorted by it) with the column ``rmse_column``.

    Notes
    -----
    Reads the global ``df_p``, so the Picarro loading cell has to run first.
    Running this cell replaces the earlier
    ``reduce_calibration_days(df, frequency_days)`` defined in this notebook;
    calls written for that signature must be adapted.
    NOTE(review): ``df_p`` is joined on ``datetime`` only. It is concatenated
    from two Picarro files; if both contribute rows for the same timestamp the
    sensor rows are duplicated by the join - confirm this is intended.
    """
    # The same two columns are selected, interpolated and applied below; the
    # previous version interpolated columns that were never selected.
    slope_column = "cal_gmp343_slope"
    intercept_column = "cal_gmp343_intercept"

    # group_by_dynamic needs the index column sorted within every group, so the
    # order is made explicit instead of relying on how the caller arranged `df`.
    df_cal = (
        df.filter(pl.col("ordinal_day").is_in(selected_days))
        .select("datetime", "system_id", slope_column, intercept_column)
        .sort(["system_id", "datetime"])
        .group_by_dynamic("datetime", every="1d", group_by=["system_id"])
        .agg(cs.numeric().last())
    )

    # A window expression fills the gaps per system and keeps the sorted order.
    corrected = (
        df.select("datetime", "system_id", "gmp343_dry")
        .join(df_cal, on=["datetime", "system_id"], how="left")
        .sort(["system_id", "datetime"])
        .with_columns(
            pl.col([slope_column, intercept_column])
            .interpolate()
            .backward_fill()
            .forward_fill()
            .over("system_id")
        )
        .with_columns(
            gmp343_corrected=(
                pl.col("gmp343_dry") * pl.col(slope_column) + pl.col(intercept_column)
            )
        )
    )

    # Row order after a join is not guaranteed, so sort again before the
    # hourly windows are formed.
    hourly = (
        corrected.join(df_p.select("datetime", "picarro_corrected"), on="datetime")
        .with_columns(diff=pl.col("gmp343_corrected") - pl.col("picarro_corrected"))
        .filter(pl.col("diff").is_not_nan())
        .sort(["system_id", "datetime"])
        .group_by_dynamic("datetime", every="1h", group_by=["system_id"])
        .agg(cs.numeric().mean())
    )

    # Sorting makes the row order of the result reproducible between runs.
    return (
        hourly.group_by("system_id")
        .agg(pl.col("diff").pow(2).mean().sqrt().alias(rmse_column))
        .sort("system_id")
    )
