In [139]:
import polars as pl
import polars.selectors as cs
import os
import sys
from datetime import datetime, timezone
import plotly.express as px
import numpy as np

# Project root is resolved two directory levels above the current working directory.
PROJECT_PATH = os.path.abspath(os.path.join(os.pardir, os.pardir))
PIPELINE_PATH = os.path.join(PROJECT_PATH, "pipeline")
DATA_DIRECTORY = os.path.join(PROJECT_PATH, "data")

# Make the pipeline folder importable; the membership check keeps re-runs of
# this cell from appending the same entry again.
if PIPELINE_PATH not in sys.path:
    sys.path.append(PIPELINE_PATH)

from utils.paths import (
    PROCESSED_PICARRO_DATA_DIRECTORY,
    POSTPROCESSED_DATA_DIRECTORY,
)
from utils.import_data import import_acropolis_system_data

# Stop right away if one of the expected data folders does not exist.
assert os.path.exists(POSTPROCESSED_DATA_DIRECTORY)
assert os.path.exists(PROCESSED_PICARRO_DATA_DIRECTORY)

In [140]:
# Analysis window (naive datetimes) and selection settings
start_date = datetime(year=2023, month=12, day=1, hour=12)
end_date = datetime(year=2024, month=4, day=17)

# NOTE(review): `filter` shadows the Python builtin and is not read by any
# cell visible here -- presumably the aggregation interval; confirm before renaming.
filter = '1h'
# System ids 1 through 20
ids = [*range(1, 21)]

In [141]:
# Load the hourly Picarro files, restricted to [start_date, end_date]

# DWD Picarro (file name noted alongside: "Calibrated_1_min_DWD_Picarro_G2301_413.parquet")
df_p_dwd = (
    pl.scan_parquet(
        os.path.join(
            PROCESSED_PICARRO_DATA_DIRECTORY,
            "Calibrated_1_h_DWD_Picarro_G2301_413.parquet",
        )
    )
    .filter(pl.col("datetime").is_between(start_date, end_date))
    .collect()
)

# ICOS Picarro (file name noted alongside: "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet")
df_p_icos = (
    pl.scan_parquet(
        os.path.join(
            PROCESSED_PICARRO_DATA_DIRECTORY,
            "Calibrated_1_h_ICOS_Picarro_G2401_529.parquet",
        )
    )
    .filter(pl.col("datetime").is_between(start_date, end_date))
    .collect()
)

# Diagonal concat: columns are matched by name across both frames.
df_p = pl.concat([df_p_dwd, df_p_icos], how="diagonal")
# The per-instrument frames are no longer needed once combined.
del df_p_dwd, df_p_icos
# Show the first and last row as a quick sanity check.
df_p.head(1).vstack(df_p.tail(1))

system_name,system_id,datetime,picarro_corrected,h2o_reported
str,i32,datetime[ms],f64,f64
"""Picarro_G2301""",413,2023-12-01 12:00:00,521.012591,0.852042
"""Picarro_G2301""",413,2024-04-17 00:00:00,433.216178,0.888293


In [142]:
# Hourly side-by-side data, with the day of the year of each row added as `ordinal_day`
df = (
    pl.read_parquet(
        os.path.join(DATA_DIRECTORY, "output", "side-by-side", "1_h_sbs_period_acropolis.parquet")
    )
    .with_columns(pl.col("datetime").dt.ordinal_day().alias("ordinal_day"))
)
# Show the first and last row as a quick sanity check.
df.head(1).vstack(df.tail(1))

system_id,system_name,datetime,ts,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,bme280_temperature,bme280_humidity,bme280_pressure,sht45_temperature,sht45_humidity,gmp343_edge_corrected,gmp343_edge_dry,h2o_ah,h2o_v%,bme280_h2o_v%,gmp343_dry,slope,intercept,slope_interpolated,intercept_interpolated,gmp343_corrected,wxt532_direction_min,wxt532_direction_avg,wxt532_direction_max,wxt532_speed_min,wxt532_speed_avg,wxt532_speed_max,wxt532_last_update_time,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,enclosure_bme280_temperature,enclosure_bme280_humidity,enclosure_bme280_pressure,raspi_cpu_temperature,raspi_disk_usage,raspi_cpu_usage,raspi_memory_usage,ups_powered_by_grid,ups_battery_is_fully_charged,ups_battery_error_detected,ups_battery_above_voltage_threshold,cal_gmp343_slope,cal_gmp343_intercept,cal_sht_45_offset,cal_bottle_id,cal_gmp343_raw,cal_gmp343_compensated,cal_gmp343_filtered,cal_gmp343_temperature,cal_bme280_temperature,cal_bme280_humidity,cal_bme280_pressure,cal_sht45_temperature,cal_sht45_humidity,picarro_corrected,gmp343_corrected_std,gmp343_corrected_var,ordinal_day
i32,str,datetime[ms],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i16
1,"""tum-esm-midcost-raspi-1""",2024-02-07 00:00:00,1707300000000.0,412.266444,465.421278,465.512389,27.360667,24.225806,19.246044,941.474433,24.202383,22.383206,,,5.881546,0.866416,0.744981,469.580913,0.953834,-1.882813,0.981128,-26.399673,434.319482,,,,,,,,,,,,30.675833,13.263333,952.984833,49.46,0.488,0.0194,0.111217,1.0,1.0,0.0,1.0,,,,,,,,,,,,,,434.108444,1.094123,1.197104,38
20,"""tum-esm-midcost-raspi-20""",2024-02-11 23:00:00,1707700000000.0,386.372278,444.066444,444.097778,28.05,24.448167,19.764822,928.680989,25.149928,24.427328,,,6.667104,0.997951,0.807469,448.574308,0.995614,-9.092189,0.99545,-9.058583,437.474802,,,,,,,,,,,,31.427333,12.469167,940.1655,51.201667,0.359,0.018733,0.11015,1.0,1.0,0.0,1.0,,,,,,,,,,,,,,437.743706,1.547816,2.395734,42


In [143]:
# System to inspect.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = 1

# One time-series figure per quantity for the selected system.
for y_column in ("gmp343_dry", "slope", "intercept"):
    fig = px.line(
        df.filter(pl.col("system_id") == id),
        x="datetime",
        y=y_column,
        color="system_id",
    )
    fig.show()

In [144]:
def reduce_calibration_days(df, frequency_days, df_reference=None):
    """Compute the per-system RMSE obtained when calibrating only every N-th day.

    The calibration coefficients (``slope_interpolated`` and
    ``intercept_interpolated``) are kept only on rows whose ``ordinal_day`` is
    a multiple of ``frequency_days`` and whose ``datetime`` falls on hour 0.
    For every system the gaps in between are filled with
    ``interpolate().forward_fill().backward_fill()``, the coefficients are
    applied to ``gmp343_dry`` and the result is compared with the reference
    column ``picarro_corrected``.

    Args:
        df: Polars DataFrame providing the columns ``datetime``, ``system_id``,
            ``ordinal_day``, ``gmp343_dry``, ``slope_interpolated`` and
            ``intercept_interpolated``.
        frequency_days: Calibration interval in days; must be >= 1.
        df_reference: Optional Polars DataFrame with the columns ``datetime``
            and ``picarro_corrected``. When omitted, the notebook-level
            ``df_p`` frame is used (the original behaviour).

    Returns:
        A DataFrame with one row per ``system_id`` and one column named
        ``rmse_{frequency_days}days``.

    Raises:
        ValueError: If ``frequency_days`` is smaller than 1 (it is used as a
            modulo divisor, so zero or negative values are meaningless).
    """
    if frequency_days < 1:
        raise ValueError(f"frequency_days must be >= 1, got {frequency_days!r}")

    if df_reference is None:
        # Fall back to the Picarro frame loaded earlier in the notebook.
        df_reference = df_p

    # Calibration coefficients that survive the reduced calibration schedule.
    df_cal = (
        df.filter(pl.col("ordinal_day") % frequency_days == 0)
        .select("datetime", "system_id", "slope_interpolated", "intercept_interpolated")
        .filter(pl.col("datetime").dt.hour() == 0)
    )

    return (
        df.select("datetime", "system_id", "gmp343_dry")
        # Left join: rows without a retained calibration get null coefficients.
        .join(df_cal, on=["datetime", "system_id"], how="left")
        # Chronological order within each system is required for the gap filling;
        # a sort before the join would be superseded by this one.
        .sort(["system_id", "datetime"])
        .group_by("system_id")
        .agg(
            [
                pl.col("datetime"),
                pl.col("gmp343_dry"),
                # Interpolate between retained calibrations, then extend the
                # first/last known value to the edges of the series.
                pl.col("slope_interpolated").interpolate().forward_fill().backward_fill(),
                pl.col("intercept_interpolated").interpolate().forward_fill().backward_fill(),
            ]
        )
        .explode(["datetime", "gmp343_dry", "slope_interpolated", "intercept_interpolated"])
        .with_columns(
            (
                pl.col("gmp343_dry") * pl.col("slope_interpolated")
                + pl.col("intercept_interpolated")
            ).alias("gmp343_corrected")
        )
        # NOTE(review): the reference is matched on `datetime` only; if it holds
        # several rows for the same timestamp, each one is compared -- confirm intended.
        .join(df_reference.select("datetime", "picarro_corrected"), on="datetime")
        .with_columns(diff=pl.col("gmp343_corrected") - pl.col("picarro_corrected"))
        .filter(pl.col("diff").is_not_nan())
        .group_by("system_id")
        .agg(pl.col("diff").pow(2).mean().sqrt().alias(f"rmse_{frequency_days}days"))
    )


In [153]:
# Start from the daily-calibration RMSE, then attach one RMSE column per longer interval.
df_plot = reduce_calibration_days(df, 1)

for frequency_days in range(2, 11):
    df_plot = df_plot.join(
        reduce_calibration_days(df, frequency_days),
        on="system_id",
        how="left",
    )

# Displayed only: the result of this drop is not assigned back to `df_plot`.
df_plot.drop("^.*_right$")

system_id,rmse_1days,rmse_2days,rmse_3days,rmse_4days,rmse_5days,rmse_6days,rmse_7days,rmse_8days,rmse_9days,rmse_10days
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,1.166176,1.173007,1.22438,1.239178,1.556781,1.6262,1.448779,1.217673,1.532256,1.918074
2,0.740304,0.740353,0.751035,0.709596,0.709881,0.749318,0.712802,0.727604,0.735461,0.758777
3,1.079102,1.084546,1.087423,1.550435,1.25141,1.61451,1.220746,1.707966,1.480763,1.658091
4,1.456854,1.457112,1.482068,1.61481,1.55078,1.600512,1.523013,1.448363,1.43595,1.799648
5,0.941905,0.941205,0.920918,0.851133,0.950095,0.888202,0.924302,0.851454,0.877563,1.014651
…,…,…,…,…,…,…,…,…,…,…
16,0.80897,0.805721,0.830826,0.947656,0.941448,1.00464,1.039397,0.882862,1.001949,0.956192
17,2.555628,2.553402,2.59705,2.522746,2.487326,2.609535,2.696051,2.360337,2.87258,2.228878
18,0.83001,0.824506,0.80173,0.83876,0.860378,0.875026,0.936379,1.030801,0.917843,0.865645
19,1.391989,1.404377,1.720951,1.88168,1.476265,2.702226,1.403005,2.200668,2.574636,1.850568


In [154]:
# One figure per system: RMSE as a function of the calibration interval.
# The loop variables deliberately avoid the names `id` and `list`, which would
# rebind the Python builtins for every later cell in the kernel.
for system_id in ids:
    system_rmse = df_plot.filter(pl.col("system_id") == system_id).drop("system_id")
    if system_rmse.height == 0:
        # No RMSE row for this system; transposing an empty frame yields no
        # "column_0", so skip it instead of failing the whole loop.
        continue

    # Single row -> after transposing, "column_0" holds one RMSE value per interval.
    rmse_values = system_rmse.transpose()["column_0"].to_list()

    fig = px.line(
        x=range(1, len(rmse_values) + 1),
        y=rmse_values,
        labels={"x": "calibration interval (days)", "y": "RMSE"},
    )
    fig.update_layout(title=f"{system_id}", yaxis_range=[0.5, 2.5])
    fig.show()