In [1]:
import os
from datetime import datetime
from datetime import timezone

import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import seaborn as sns
from plotly.subplots import make_subplots

# Root directory of the project data, taken from the DATA_DIRECTORY environment
# variable (None when the variable is not set).
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
# Directory taken from the PICARRO_DATA_DIRECTORY environment variable
# (None when the variable is not set).
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Backward-compatible alias: the constant used to be spelled with a lower-case
# final "y"; keep that name working for any code that still refers to it.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [2]:
# Lazily open the two 10m preprocessed inputs from the "processed" folder:
# first the acropolis data, then the picarro data.
df_acropolis, df_p_10m = (
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", parquet_name))
    for parquet_name in (
        "test_10m_cal_corr_acropolis.parquet",
        "10m_cal_corr_picarro.parquet",
    )
)

In [3]:
# Materialise only the first five rows of the lazy frame as a quick schema/content check.
df_acropolis.limit(5).collect()

system_name,creation_timestamp,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,wxt532_speed_avg,wxt532_speed_min,wxt532_speed_max,wxt532_direction_avg,wxt532_direction_min,wxt532_direction_max,wxt532_last_update_time,raspi_cpu_usage,raspi_cpu_temperature,raspi_disk_usage,enclosure_bme280_humidity,enclosure_bme280_pressure,enclosure_bme280_temperature,sht45_humidity,sht45_temperature,bme280_humidity,bme280_temperature,bme280_pressure,cal_bottle_id,cal_gmp343_raw,cal_gmp343_compensated,cal_gmp343_filtered,cal_gmp343_temperature,cal_bme280_temperature,cal_bme280_humidity,cal_bme280_pressure,cal_sht45_temperature,cal_sht45_humidity,revision,receipt_timestamp,raspi_memory_usage,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,ups_battery_error_detected,ups_battery_above_voltage_threshold,ups_battery_is_fully_charged,ups_powered_by_grid,date,gmp343_dry,h2o_ah,h2o_ppm,CO2_corr,system_id,diff,sys_name_short,slope,intercept,gmp343_corrected
str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,"datetime[ns, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,date,f64,f64,f64,f64,i64,struct[4],str,f64,f64,f64
"""tum-esm-midcos…",2023-06-01 00:00:00 UTC,,,,,0.94,0.26,1.66,104.2,123.2,294.2,1685600000.0,0.0312,53.52,0.426,15.154,956.434,33.662,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01,,,,,1,"{null,null,null,null}","""mid-cost-1""",,,
"""tum-esm-midcos…",2023-06-01 00:10:00 UTC,,,,,1.12,0.44,1.74,47.0,87.8,218.2,1685600000.0,0.0438,53.88,0.426,15.228,956.448,33.5,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01,,,,,1,"{null,null,null,null}","""mid-cost-1""",,,
"""tum-esm-midcos…",2023-06-01 00:20:00 UTC,,,,,0.675,0.275,1.35,57.75,107.0,262.0,1685600000.0,0.0335,53.875,0.426,15.2375,956.4725,33.585,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01,,,,,1,"{null,null,null,null}","""mid-cost-1""",,,
"""tum-esm-midcos…",2023-06-01 00:30:00 UTC,,,,,0.66,0.22,1.26,168.2,81.6,174.8,1685600000.0,0.0418,53.4,0.426,15.024,956.406,34.104,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01,,,,,1,"{null,null,null,null}","""mid-cost-1""",,,
"""tum-esm-midcos…",2023-06-01 00:40:00 UTC,,,,,1.04,0.38,1.82,62.4,89.0,287.6,1685600000.0,0.0436,54.0,0.426,14.796,956.312,34.432,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01,,,,,1,"{null,null,null,null}","""mid-cost-1""",,,


In [4]:
def _utc(*fields):
    """Return a timezone-aware UTC datetime built from ``datetime`` positional fields."""
    return datetime(*fields, tzinfo=timezone.utc)


def _day_start(year, month, day):
    """Return 00:00:00 UTC of the given calendar day."""
    return _utc(year, month, day)


def _day_end(year, month, day):
    """Return 23:59:59 UTC of the given calendar day."""
    return _utc(year, month, day, 23, 59, 59)


# End timestamp shared by every system whose side-by-side (sbs) run is still ongoing.
ongoing_sbs = _day_end(2024, 3, 11)

# One entry per system: (system id, sbs start, sbs end).
sbs_times = [
    (1, _day_start(2024, 2, 7), ongoing_sbs),
    # 2 is still deployed
    (3, _day_start(2024, 1, 13), _day_end(2024, 2, 18)),
    (4, _day_start(2024, 2, 14), ongoing_sbs),
    (5, _day_start(2024, 2, 7), _day_end(2024, 2, 25)),
    (6, _day_start(2024, 2, 20), ongoing_sbs),
    (7, _day_start(2024, 2, 21), ongoing_sbs),
    (8, _day_start(2024, 2, 13), ongoing_sbs),
    (9, _day_start(2024, 2, 12), ongoing_sbs),
    (10, _day_start(2024, 1, 13), ongoing_sbs),
    (11, _day_start(2024, 1, 12), ongoing_sbs),
    (12, _day_start(2023, 12, 23), _day_end(2024, 2, 11)),
    (13, _day_start(2024, 1, 13), _day_end(2024, 1, 30)),
    # 14 is currently disabled: (14, _day_start(2024, 2, 29), ongoing_sbs),
    (15, _day_start(2024, 2, 21), ongoing_sbs),
    (16, _day_start(2023, 12, 23), _day_end(2024, 2, 5)),
    # 17 needs assembly
    (18, _day_start(2023, 12, 23), _day_end(2024, 2, 5)),
    # 19 needs assembly
    (20, _day_start(2023, 12, 23), _day_end(2024, 2, 11)),
]

In [5]:
# Per-system result frames; reset on every run of this cell.
all_systems = []

# Timestamp column that every cut and every side-by-side window is evaluated against.
_timestamp = pl.col("creation_timestamp")


def _drop_window(frame, start, end):
    """Return `frame` without the rows whose timestamp lies in the closed window [start, end].

    A single filter keeps exactly the rows that the "filter before / filter after /
    concat" sequence kept (strictly before `start` or strictly after `end`), without
    concatenating two copies of the query plan.
    """
    return frame.filter((_timestamp < start) | (_timestamp > end))


#--- events during sbs that affected all systems: (start, end) of the window to discard
sbs_event_windows = [
    # roof-top power out
    (datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc), datetime(2024, 1, 17, 0, 0, 0, tzinfo=timezone.utc)),
    # inlet change
    (datetime(2024, 2, 5, 13, 0, 0, tzinfo=timezone.utc), datetime(2024, 2, 5, 15, 30, 0, tzinfo=timezone.utc)),
    # inlet maintenance
    (datetime(2024, 2, 13, 10, 30, 0, tzinfo=timezone.utc), datetime(2024, 2, 13, 12, 0, 0, tzinfo=timezone.utc)),
]

# maintenance / power outage from 23.01.2024, only applied to the systems listed here
maintenance_system_ids = {10, 11, 13}
maintenance_window = (
    datetime(2024, 1, 23, 14, 0, 0, tzinfo=timezone.utc),
    datetime(2024, 1, 25, 23, 59, 59, tzinfo=timezone.utc),
)

# The shared cuts are applied to the common frame. Re-running the cell only repeats
# filters that select the same rows, so the result does not change.
for window_start, window_end in sbs_event_windows:
    df_acropolis = _drop_window(df_acropolis, window_start, window_end)

#---

# NOTE(review): the recorded run of this cell panicked with a duplicate struct field
# name ('mean_cal_low'). No column of that name is referenced here; if the panic
# persists, inspect the struct column(s) of the input parquet (e.g. `diff`).
for system_id, start_date, end_date in sbs_times:
    print(f"processing {system_id}")

    # Work on a per-system frame so the maintenance cut of systems 10/11/13 does not
    # leak into the shared frame and thereby into every system processed afterwards.
    df_system = df_acropolis
    if system_id in maintenance_system_ids:
        df_system = _drop_window(df_system, *maintenance_window)

    #-------

    df_filtered = (
        df_system
        .filter(_timestamp.is_between(start_date, end_date))
        .filter(pl.col("CO2_corr") > 0)
        .filter(pl.col("gmp343_filtered") > 0)
        .filter(pl.col("system_id") == system_id)
        .collect()
    )

    mean = df_filtered["slope"].mean()
    std = df_filtered["slope"].std()

    # mean is None without any slope value and std is None with fewer than two;
    # the selection bands below cannot be computed in either case.
    if mean is None or std is None:
        print(f"skipping {system_id}: not enough slope values in the sbs window")
        continue

    # Slopes strictly inside mean +/- 1 std and mean +/- 2 std respectively.
    slope = pl.col("slope")
    within_1_std = (slope > (mean - std)) & (slope < (mean + std))
    within_2_std = (slope > (mean - 2 * std)) & (slope < (mean + 2 * std))

    # Kept as its own frame because the plotting cell colours histograms by the selectors.
    df_std = df_filtered.with_columns([
        within_1_std.alias("slope_1_std_selector"),
        within_2_std.alias("slope_2_std_selector"),
    ])

    # Keep slope/intercept only where the slope passed the selection, then carry the
    # nearest accepted calibration forward (and backward for leading gaps). Deriving
    # the columns row-wise avoids a self-join on the timestamp, which would duplicate
    # rows if a timestamp ever occurred twice.
    # NOTE(review): fill_null is applied to the whole frame, i.e. gaps in every other
    # column are filled as well - confirm that this is intended.
    df_filtered = (
        df_filtered
        .sort("creation_timestamp")
        .with_columns([
            pl.when(within_1_std).then(pl.col("slope")).otherwise(None).alias("slope_1_std"),
            pl.when(within_1_std).then(pl.col("intercept")).otherwise(None).alias("intercept_1_std"),
            pl.when(within_2_std).then(pl.col("slope")).otherwise(None).alias("slope_2_std"),
            pl.when(within_2_std).then(pl.col("intercept")).otherwise(None).alias("intercept_2_std"),
        ])
        .fill_null(strategy="forward")
        .fill_null(strategy="backward")
    )

    co2_corr = pl.col("CO2_corr")
    df_filtered = (
        df_filtered
        # linear correction of the dry reading with the selected calibrations
        .with_columns([
            (pl.col("gmp343_dry") * pl.col("slope_1_std") + pl.col("intercept_1_std")).alias("gmp343_corrected_1_std"),
            (pl.col("gmp343_dry") * pl.col("slope_2_std") + pl.col("intercept_2_std")).alias("gmp343_corrected_2_std"),
        ])
        # per-row differences plus the mean squared error over the whole frame
        # (the scalar mean is broadcast to every row)
        .with_columns([
            (co2_corr - pl.col("gmp343_corrected_1_std")).alias("diff_1_std"),
            (co2_corr - pl.col("gmp343_corrected_2_std")).alias("diff_2_std"),
            (co2_corr - pl.col("gmp343_corrected")).pow(2).mean().alias("mse_full_deployment"),
            (co2_corr - pl.col("gmp343_corrected_1_std")).pow(2).mean().alias("mse_1_std_full_deployment"),
            (co2_corr - pl.col("gmp343_corrected_2_std")).pow(2).mean().alias("mse_2_std_full_deployment"),
        ])
        .with_columns([
            pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment"),
            pl.col("mse_1_std_full_deployment").sqrt().alias("rmse_1_std_full_deployment"),
            pl.col("mse_2_std_full_deployment").sqrt().alias("rmse_2_std_full_deployment"),
        ])
    )

    all_systems.append(df_filtered)
    

processing 1


thread '<unnamed>' panicked at crates/polars-core/src/series/mod.rs:462:67:
called `Result::unwrap()` on an `Err` value: Duplicate(ErrString("multiple fields with name 'mean_cal_low' found"))
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: called `Result::unwrap()` on an `Err` value: Duplicate(ErrString("multiple fields with name 'mean_cal_low' found"))

In [None]:
# Stack the per-system frames (diagonal: columns are matched by name) and persist
# the result before it is used anywhere else.
df_all_sbs_filtered = pl.concat(all_systems, how="diagonal")
df_all_sbs_filtered.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "filtered_1_std_acropolis.parquet"))

# Preview one row. This has to come after the frame is built, otherwise the
# notebook fails with a NameError on a fresh kernel.
df_all_sbs_filtered.head(1)

In [None]:
# Constant label column; the seaborn plot at the end of the notebook uses it as x axis.
network_label = pl.lit("Munich").alias("Mid-Cost Network")
df_all_sbs_filtered = df_all_sbs_filtered.with_columns(network_label)

In [None]:
# Slope histograms. `df_filtered` and `df_std` are the frames left over from the last
# iteration of the processing loop, so these three figures describe that system only.
histogram_specs = [
    (df_filtered, {}),
    (df_std, {"color": "slope_1_std_selector"}),
    (df_std, {"color": "slope_2_std_selector"}),
]
for histogram_frame, histogram_options in histogram_specs:
    fig = px.histogram(histogram_frame, x="slope", **histogram_options)
    fig.show()

# Time series over all systems, one figure per plotted column.
for line_column in ("diff", "diff_1_std"):
    fig = px.line(df_all_sbs_filtered, x="creation_timestamp", y=line_column, color="system_name")
    fig.show()

# RMSE of the three correction variants per system.
rmse_columns = ["rmse_full_deployment", "rmse_1_std_full_deployment", "rmse_2_std_full_deployment"]
fig = px.scatter(df_all_sbs_filtered, x="system_name", y=rmse_columns)
fig.show()

In [None]:
# 144 * 10 minute intervals in 1 day - 3 * 10 min intervals in one calibration
INTERVALS_PER_DAY = 144
CALIBRATION_INTERVALS = 3
MEASUREMENT_INTERVALS_PER_DAY = INTERVALS_PER_DAY - CALIBRATION_INTERVALS
# Systems with more sbs days than this are flagged in `days_threshold`.
MIN_SBS_DAYS = 14

# One row per system: the number of non-null corrected readings expressed in days.
# `system_id` is reduced to a single value per system instead of a list with one
# entry per measurement, which the join in the next cell would otherwise copy onto
# every row (assumes each system_name maps to exactly one system_id - TODO confirm).
df_plot = (
    df_all_sbs_filtered
    .group_by("system_name")
    .agg([
        (pl.col("gmp343_corrected").count() / MEASUREMENT_INTERVALS_PER_DAY).alias("sbs_days"),
        pl.col("system_id").first(),
    ])
    .with_columns((pl.col("sbs_days") > MIN_SBS_DAYS).alias("days_threshold"))
    .sort("sbs_days")
)

fig = px.scatter(df_plot, x="system_name", y="sbs_days", color="days_threshold")
fig.show()

In [None]:
# Attach the per-system summary columns to every measurement row and expose the
# 2-std RMSE under its plot label.
# NOTE: `df_plot` is both an input and the output of this cell, so the cell is not
# idempotent - re-run the previous cell before running this one again.
df_plot = (
    df_all_sbs_filtered
    .rename({"rmse_2_std_full_deployment": "RMSE (ppm)"})
    .join(df_plot, how="left", on="system_name")
)

In [None]:
# seaborn (sns) and matplotlib.pyplot (plt) are imported with the other
# dependencies in the first cell of the notebook.
# RMSE of every row against the network label, coloured by the length of the sbs run.
ax = sns.scatterplot(data=df_plot, x='Mid-Cost Network', y='RMSE (ppm)', hue="sbs_days", alpha=0.5)
ax.set(title='Side-by-side performance')
plt.show()