In [158]:
import polars as pl
import os
from datetime import datetime
from datetime import timezone
import plotly.express as px
import numpy as np

# Root directories of the input data, read from environment variables.
# `os.environ.get` yields None when a variable is unset.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Backward-compatible alias for the original, mis-cased constant name.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [159]:
# Both inputs live in the "processed" sub-folder of DATA_DIRECTORY and are
# opened lazily; nothing is read until `.collect()` is called.
processed_dir = os.path.join(DATA_DIRECTORY, "processed")

# 10m preprocessed acropolis data
df_acropolis = pl.scan_parquet(
    os.path.join(processed_dir, "10m_cal_corr_acropolis.parquet")
)

# 10m preprocessed picarro data
df_p_10m = pl.scan_parquet(
    os.path.join(processed_dir, "10m_cal_corr_picarro.parquet")
)

In [160]:
# Materialise only the first row to inspect the column names and dtypes.
(
    df_acropolis
    .head(n=1)
    .collect()
)

system_name,creation_timestamp,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,wxt532_speed_avg,wxt532_speed_min,wxt532_speed_max,wxt532_direction_avg,wxt532_direction_min,wxt532_direction_max,wxt532_last_update_time,raspi_cpu_usage,raspi_cpu_temperature,raspi_disk_usage,enclosure_bme280_humidity,enclosure_bme280_pressure,enclosure_bme280_temperature,sht45_humidity,sht45_temperature,bme280_humidity,bme280_temperature,bme280_pressure,cal_bottle_id,cal_gmp343_raw,cal_gmp343_compensated,cal_gmp343_filtered,cal_gmp343_temperature,cal_bme280_temperature,cal_bme280_humidity,cal_bme280_pressure,cal_sht45_temperature,cal_sht45_humidity,revision,receipt_timestamp,raspi_memory_usage,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,ups_battery_error_detected,ups_battery_above_voltage_threshold,ups_battery_is_fully_charged,ups_powered_by_grid,date,slope,intercept,gmp343_dry,CO2_corr,system_id,gmp343_corrected,diff
str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,"datetime[ns, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,date,f64,f64,f64,f64,i64,f64,f64
"""tum-esm-midcos…",2023-06-01 00:00:00 UTC,,,,,0.94,0.26,1.66,104.2,123.2,294.2,1685600000.0,0.0312,53.52,0.426,15.154,956.434,33.662,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01,,,,,1,,


In [161]:
# System to analyse; compared against the `system_id` column below.
# NOTE(review): this name shadows the builtin `id` — kept as-is because
# other cells may read it.
id = 12

# Analysis window, timezone-aware (UTC), both ends inclusive.
start_date = datetime(2023, 12, 23, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)  # in the field

# Row predicates: inside the window, positive CO2_corr, selected system.
in_window = pl.col("creation_timestamp").is_between(start_date, end_date)
positive_co2_corr = pl.col("CO2_corr") > 0
is_selected_system = pl.col("system_id") == id

df_filtered = (
    df_acropolis
    .filter(in_window & positive_co2_corr & is_selected_system)
    .collect()
)

In [162]:
# Time series of the `diff` column for the filtered rows.
fig = px.line(
    data_frame=df_filtered,
    x="creation_timestamp",
    y="diff",
)
fig.show()

In [163]:
# Distribution of the `slope` column for the filtered rows.
fig = px.histogram(
    data_frame=df_filtered,
    x="slope",
)
fig.show()

In [164]:
# Mean and standard deviation of the `slope` column over the filtered rows.
mean = df_filtered["slope"].mean()
std = df_filtered["slope"].std()

print(mean, std)

# Flag rows whose slope lies strictly inside mean ± 1·std and mean ± 2·std.
slope = pl.col("slope")
df_std = df_filtered.with_columns([
    ((slope > (mean - std)) & (slope < (mean + std))).alias("slope_1_std_selector"),
    ((slope > (mean - 2*std)) & (slope < (mean + 2*std))).alias("slope_2_std_selector"),
])

1.045201370085878 0.03468350188779616


In [165]:
# One slope histogram per selector column, coloured by whether the row's
# slope falls inside the corresponding band.
for selector_column in ("slope_1_std_selector", "slope_2_std_selector"):
    fig = px.histogram(df_std, x="slope", color=selector_column)
    fig.show()

In [166]:
# Slope/intercept pairs of the rows whose slope lies inside the 1·std band.
calibrations_1_std = (
    df_std
    .filter(pl.col("slope_1_std_selector"))
    .select("creation_timestamp", "slope", "intercept")
    .rename({"slope": "slope_1_std", "intercept": "intercept_1_std"})
)

# Same for the 2·std band.
calibrations_2_std = (
    df_std
    .filter(pl.col("slope_2_std_selector"))
    .select("creation_timestamp", "slope", "intercept")
    .rename({"slope": "slope_2_std", "intercept": "intercept_2_std"})
)

# Columns this cell adds to `df_filtered`.
calibration_columns = [
    "slope_1_std",
    "intercept_1_std",
    "slope_2_std",
    "intercept_2_std",
]

df_filtered = (
    df_filtered
    # Drop results of a previous run of this cell so re-running it does not
    # create duplicated `*_right` columns in the joins below.
    .select(pl.exclude(calibration_columns))
    .join(calibrations_1_std, on="creation_timestamp", how="left")
    .join(calibrations_2_std, on="creation_timestamp", how="left")
    # Sort after joining so the fill below always runs in time order.
    .sort("creation_timestamp")
    # Rows outside a band have no match and are null after the left join:
    # carry the last accepted values forward, then back-fill the leading gap.
    # Only the calibration columns are filled; nulls in every other column
    # are left untouched instead of being replaced by neighbouring values.
    .with_columns(
        pl.col(calibration_columns)
        .fill_null(strategy="forward")
        .fill_null(strategy="backward")
    )
)

In [167]:
# Distribution of the filled slope columns for both bands.
for slope_column in ("slope_1_std", "slope_2_std"):
    fig = px.histogram(df_filtered, x=slope_column)
    fig.show()

In [168]:
# Apply the filtered calibrations and compare against `CO2_corr`.
# The error metrics use native polars arithmetic rather than NumPy ufuncs on
# expressions, so everything stays inside the polars engine.
df_filtered = (
    df_filtered
    # Linear correction: gmp343_dry * slope + intercept, once per band.
    .with_columns([
        (pl.col("gmp343_dry") * pl.col("slope_1_std") + pl.col("intercept_1_std")).alias("gmp343_corrected_1_std"),
        (pl.col("gmp343_dry") * pl.col("slope_2_std") + pl.col("intercept_2_std")).alias("gmp343_corrected_2_std"),
    ])
    # Per-row differences, plus the mean squared difference over all rows of
    # the frame (a single value, repeated on every row).
    .with_columns([
        (pl.col("CO2_corr") - pl.col("gmp343_corrected_1_std")).alias("diff_1_std"),
        (pl.col("CO2_corr") - pl.col("gmp343_corrected_2_std")).alias("diff_2_std"),
        ((pl.col("CO2_corr") - pl.col("gmp343_corrected")) ** 2).mean().alias("mse_full_deployment"),
        ((pl.col("CO2_corr") - pl.col("gmp343_corrected_1_std")) ** 2).mean().alias("mse_1_std_full_deployment"),
        ((pl.col("CO2_corr") - pl.col("gmp343_corrected_2_std")) ** 2).mean().alias("mse_2_std_full_deployment"),
    ])
    # Root of each mean squared difference.
    .with_columns([
        pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"),
        pl.col("mse_1_std_full_deployment").sqrt().alias("rmse_1_std_full_deployment"),
        pl.col("mse_2_std_full_deployment").sqrt().alias("rmse_2_std_full_deployment"),
    ])
)

In [171]:
# Differences over time: original column next to the 1·std and 2·std variants.
diff_columns = ["diff", "diff_1_std", "diff_2_std"]
fig = px.line(df_filtered, x="creation_timestamp", y=diff_columns)
fig.show()

# RMSE columns over time for the same three variants.
rmse_columns = [
    "rmse_full_deployment",
    "rmse_1_std_full_deployment",
    "rmse_2_std_full_deployment",
]
fig = px.line(df_filtered, x="creation_timestamp", y=rmse_columns)
fig.show()