In [7]:
from datetime import datetime
from datetime import timezone
import polars as pl
import os
import math
import numpy as np
from sklearn.metrics import r2_score
from plot_data import plot_sensor_measurement, plot_wind_rose, rmse, calc_r2


# Analysis window: 2023-11-06 00:00:00 through 23:59:59, both in UTC.
start_date = datetime(2023, 11, 6, tzinfo=timezone.utc)
end_date = datetime(2023, 11, 6, 23, 59, 59, tzinfo=timezone.utc)

# Window string handed to the aggregation and plotting cells.
# NOTE(review): this name shadows the built-in `filter`; it is kept because
# the later cells refer to it by this name.
filter = "1h"

# Numeric suffixes of the systems to process (1-16, 18 and 20).
sensor_id = [*range(1, 17), 18, 20]

# Root directory of the parquet inputs; None when the variable is not set.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")


In [117]:
# Lazy query over the raw measurements (parquet written by
# download_from_hermes.ipynb): timestamps are cast to microsecond precision,
# rows are ordered chronologically and restricted to [start_date, end_date].
df = (
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "measurements.parquet"))
    .with_columns(pl.col("creation_timestamp").dt.cast_time_unit("us"))
    .sort("creation_timestamp")
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
)

# Lazy handle on the calibration table (parquet written by
# processing_pipeline.ipynb).
df_cal = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet")
)

# PICARRO data

In [118]:
# Calibration-corrected Picarro series.
# Read eagerly: the cells below join and concatenate this frame with
# collected (eager) DataFrames, which polars does not accept for a LazyFrame,
# and `tail` on a LazyFrame displays the query plan rather than rows.
df_p = pl.read_parquet(os.path.join(DATA_DIRECTORY, "processed", "cal_corr_picarro.parquet"))
df_p.tail(3)

creation_timestamp,CO2_corr,slope,intercept,system_name,diff
"datetime[μs, UTC]",f64,f64,f64,str,f64


# Broadcast for multiple systems

In [119]:
def broadcast_cal_corr(system: str) -> "pl.DataFrame":
    """Return calibration-corrected CO2 readings for one system.

    Every measurement row of ``system`` (from the notebook-level ``df``) is
    matched with the ``slope``/``intercept`` stored in ``df_cal`` for the same
    calendar date. Rows whose date has no calibration take the next available
    values (backward fill) and, failing that, the previous ones (forward
    fill). The corrected value is ``gmp343_filtered * slope + intercept``.

    Args:
        system: Value of the ``system_name`` column to process.

    Returns:
        An eager ``pl.DataFrame`` ordered by ``creation_timestamp`` with the
        columns ``creation_timestamp``, ``system_name``, ``CO2_corr``,
        ``slope`` and ``intercept``. Only rows with ``gmp343_filtered > 0``
        are kept.
    """
    # `.lazy()` accepts `df` both before and after the notebook has collected
    # it, so this function works regardless of cell execution order.
    measurements = (
        df.lazy()
        .filter(pl.col("system_name") == system)
        .select(["creation_timestamp", "system_name", "gmp343_filtered"])
        .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    )

    # NOTE(review): the join key on the measurement side is a calendar date;
    # this assumes `df_cal.creation_timestamp` is stored as a date as well --
    # confirm against processing_pipeline.ipynb.
    calibration = (
        df_cal.lazy()
        .filter(pl.col("system_name") == system)
        .select(["creation_timestamp", "slope", "intercept"])
        .rename({"creation_timestamp": "date"})
    )

    return (
        measurements.join(calibration, on="date", how="left")
        # The fills below depend on chronological row order.
        .sort("creation_timestamp")
        # Fill only the calibration columns so that missing sensor readings
        # stay missing and are removed by the `> 0` filter.
        .with_columns(
            pl.col(["slope", "intercept"])
            .fill_null(strategy="backward")
            .fill_null(strategy="forward")
        )
        .filter(pl.col("gmp343_filtered") > 0)
        .with_columns(
            (pl.col("gmp343_filtered") * pl.col("slope") + pl.col("intercept"))
            .alias("CO2_corr")
        )
        .select(["creation_timestamp", "system_name", "CO2_corr", "slope", "intercept"])
        .collect()
    )

# Eager copy of the Picarro frame: `.lazy().collect()` accepts `df_p` whether
# it was scanned lazily or read eagerly, and the eager joins / concat below
# require DataFrames on both sides.
picarro = df_p.lazy().collect()

# Loop-invariant lookup of the Picarro CO2 value per timestamp; renamed so it
# does not clash with each system's own `CO2_corr` column.
picarro_ref = picarro.select(["creation_timestamp", "CO2_corr"]).rename({"CO2_corr": "temp"})

l_df_corr = [picarro]

for sensor in sensor_id:
    df_corr = (
        broadcast_cal_corr(f"tum-esm-midcost-raspi-{sensor}")
        # NOTE: newer polars releases spell this `group_by_dynamic`.
        .groupby_dynamic("creation_timestamp", every=filter)
        .agg([
            # Window mean of every numeric column ...
            pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
            # ... and the system name carried through as a plain string.
            pl.col("system_name").last(),
        ])
        .join(picarro_ref, on="creation_timestamp", how="left")
        # diff = Picarro value minus this system's corrected value.
        .with_columns((pl.col("temp") - pl.col("CO2_corr")).alias("diff"))
        .drop("temp")
    )
    l_df_corr.append(df_corr)

df_corr_agg = pl.concat(l_df_corr, how="vertical")
df_corr_agg.tail(3)
                    

creation_timestamp,CO2_corr,slope,intercept,system_name,diff
"datetime[μs, UTC]",f64,f64,f64,str,f64
2023-11-06 21:00:00 UTC,,,,"""tum-esm-midcos…",
2023-11-06 22:00:00 UTC,,,,"""tum-esm-midcos…",
2023-11-06 23:00:00 UTC,,,,"""tum-esm-midcos…",


In [120]:
import plotly.express as px

# One line chart per column, coloured by system_name. `diff` is the Picarro
# value minus the system's CO2_corr, so it gets its own title instead of
# repeating "CO2".
for column, title in [
    ("CO2_corr", "CO2"),
    ("diff", "CO2 difference (Picarro - sensor)"),
    ("slope", "slope"),
    ("intercept", "intercept"),
]:
    fig = px.line(
        df_corr_agg,
        x="creation_timestamp",
        y=column,
        markers=True,
        title=title,
        color="system_name",
    )
    fig.show()


In [121]:
# Disabled cell: offset / RMSE / R2 of one system's CO2_corr against the
# "Picarro" rows of df_corr_agg, after adding the system's mean `diff`.
# NOTE(review): the two lists come from different row selections; confirm
# they have matching lengths and timestamps before re-enabling.

# temp_id = 6

# mean_offset = df_corr_agg.filter(pl.col("system_name")==f"tum-esm-midcost-raspi-{temp_id}").select("diff").mean().item()

# l_acropolis = df_corr_agg.filter(pl.col("system_name")==f"tum-esm-midcost-raspi-{temp_id}").select("CO2_corr").with_columns(pl.col("CO2_corr")+ mean_offset).to_series().to_list()

# l_picarro = df_corr_agg.filter(pl.col("system_name")=="Picarro").select("CO2_corr").to_series().to_list()

# print("\nOffset:")
# print(mean_offset)

# print("\nRMSE:")
# print(rmse(l_picarro,l_acropolis))

# print("\nR2:")
# print(calc_r2(l_picarro,l_acropolis))

# Plot other data directly from database parquet

In [9]:
# Materialise the lazy measurement query for the plotting cells below.
# The type guard keeps this cell re-runnable: once `df` is an eager
# DataFrame it no longer has a `.collect()` method.
if isinstance(df, pl.LazyFrame):
    df = df.collect()

In [10]:
# Column gmp343_raw; `filter` is the notebook-wide window from the config cell.
plot_sensor_measurement(
    df,
    sensor_id=sensor_id,
    col_name="gmp343_raw",
    filter=filter,
)

In [126]:
# Column gmp343_compensated, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="gmp343_compensated",
    filter=filter,
)

In [127]:
# Column gmp343_filtered, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="gmp343_filtered",
    filter=filter,
)

In [128]:

# One wind rose per (id, site name) pair, in the same order as before.
wind_rose_sites = [
    (1, "Maisach"),
    (2, "Rechts-der-Isar"),
    (3, "Finsing"),
    (4, "Großhadern"),
    (5, "Pasing"),
    (6, "TUM"),  # wind at the 3 m pole inlet
    (7, "Feldkirchen"),
    (8, "Taufkirchen"),
]
for site_id, site_name in wind_rose_sites:
    plot_wind_rose(df, site_id, site_name)

In [129]:
# Column wxt532_speed_avg, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="wxt532_speed_avg",
    filter=filter,
)

In [130]:
# Column wxt532_direction_avg, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="wxt532_direction_avg",
    filter=filter,
)

In [131]:
# Plot auxiliary data: column sht45_humidity, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="sht45_humidity",
    filter=filter,
)

In [132]:
# Column bme280_humidity, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="bme280_humidity",
    filter=filter,
)

In [133]:
# Column sht45_temperature, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="sht45_temperature",
    filter=filter,
)

In [134]:
# Column bme280_temperature, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="bme280_temperature",
    filter=filter,
)

In [135]:
# Column bme280_pressure, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="bme280_pressure",
    filter=filter,
)

In [136]:
# Column gmp343_temperature, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="gmp343_temperature",
    filter=filter,
)

In [137]:
# Column raspi_cpu_usage, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="raspi_cpu_usage",
    filter=filter,
)

In [138]:
# Column raspi_disk_usage, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="raspi_disk_usage",
    filter=filter,
)


In [139]:
# Column raspi_cpu_temperature, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="raspi_cpu_temperature",
    filter=filter,
)

In [140]:
# Column enclosure_bme280_humidity, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="enclosure_bme280_humidity",
    filter=filter,
)

In [141]:
# Column enclosure_bme280_pressure, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="enclosure_bme280_pressure",
    filter=filter,
)

In [142]:
# Column enclosure_bme280_temperature, notebook-wide `filter` window.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="enclosure_bme280_temperature",
    filter=filter,
)

# Calibration Data

In [143]:
# Column cal_bottle_id; uses a fixed "30s" filter instead of the notebook-wide one.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bottle_id",
    filter="30s",
)

In [144]:
# Column cal_gmp343_filtered with a fixed "30s" filter.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_filtered",
    filter="30s",
)

In [145]:
# Column cal_sht45_humidity with a fixed "30s" filter.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_sht45_humidity",
    filter="30s",
)

In [146]:
# Column cal_sht45_temperature with a fixed "30s" filter.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_sht45_temperature",
    filter="30s",
)

In [147]:
# Column cal_gmp343_temperature with a fixed "30s" filter.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_temperature",
    filter="30s",
)

In [148]:
# Column cal_bme280_temperature with a fixed "30s" filter.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bme280_temperature",
    filter="30s",
)

In [149]:
# Column cal_bme280_humidity with a fixed "30s" filter.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bme280_humidity",
    filter="30s",
)

In [150]:
# Column cal_bme280_pressure with a fixed "30s" filter.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bme280_pressure",
    filter="30s",
)

In [151]:
# Plot 400ppm Calibration Bottle: cal_gmp343_filtered with a "1m" filter and
# cut_below=350 / cut_above=450 bracketing the 400 ppm level.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_filtered",
    filter="1m",
    cut_above=450,
    cut_below=350,
)

In [152]:
# Plot 800ppm Calibration Bottle: cal_gmp343_filtered with a "30s" filter and
# only a lower cut (cut_below=700).
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_filtered",
    filter="30s",
    cut_below=700,
)