In [162]:
import math
import os
from datetime import datetime
from datetime import timezone

import numpy as np
import plotly.express as px
import polars as pl
from sklearn.metrics import r2_score

from plot_data import plot_sensor_measurement, plot_wind_rose, rmse, calc_r2


# Analysis window in UTC; used as the bounds of the `is_between` filters
# in the dataset-loading cell.
start_date = datetime(2023, 11, 1, tzinfo=timezone.utc)
end_date = datetime(2023, 11, 6, 23, 59, 59, tzinfo=timezone.utc)

# Aggregation interval handed to `groupby_dynamic` and to the plotting helpers.
# NOTE: this name shadows the builtin `filter`; it is kept because later cells
# reference it by this name.
filter = "1h"

# Numeric suffixes of the systems to evaluate: 1-16 plus 18 and 20.
sensor_id = [*range(1, 17), 18, 20]

# Root folder of the parquet files. `os.environ.get` yields None when the
# variable is unset, in which case the `os.path.join` calls in the loading
# cell raise a TypeError.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")


# Load Datasets

In [163]:
# Shared building blocks for the measurement scans below.
measurements_path = os.path.join(DATA_DIRECTORY, "download", "measurements.parquet")
as_microseconds = pl.col("creation_timestamp").dt.cast_time_unit("us")
in_window = pl.col("creation_timestamp").is_between(start_date, end_date)

# Local network measurements dataset (all columns, materialised eagerly)
# parquet created by notebook: download_from_hermes.ipynb
df = (
    pl.scan_parquet(measurements_path)
    .with_columns(as_microseconds)
    .sort("creation_timestamp")
    .filter(in_window)
    .collect()
)

# Reduced version for the calibration correction: only the three columns it
# needs. No `.collect()` here, so this stays a LazyFrame.
df_reduced = (
    pl.scan_parquet(measurements_path)
    .select(["creation_timestamp", "system_name", "gmp343_filtered"])
    .with_columns(as_microseconds)
    .sort("creation_timestamp")
    .filter(in_window)
)

# Slope / intercept dataset (LazyFrame, restricted to the analysis window)
# parquet created by notebook: processing_pipeline.ipynb
df_cal = (
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet"))
    .filter(pl.col("date").is_between(start_date, end_date))
)

# PICARRO reference dataset (materialised eagerly)
# parquet created by notebook: processing_pipeline.ipynb
df_p = (
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_picarro.parquet"))
    .sort("creation_timestamp")
    .filter(in_window)
    .collect()
)

# Broadcast calibration parameters and apply the correction for multiple systems

In [195]:
# Attach the calibration parameters to every measurement: reduce each timestamp
# to its calendar date and left-join slope/intercept from df_cal on
# (date, system_name). Rows without a matching calibration entry get nulls.
df_date = (
    df_reduced
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
)

# PICARRO reference series for the offset calculation. It does not depend on
# the system, so it is built once instead of on every loop iteration.
picarro_ref = df_p.select("creation_timestamp", "CO2_corr").rename({"CO2_corr": "temp"})

# The PICARRO frame goes first so it ends up in the combined frame as well.
l_df_corr = [df_p]

for system_number in sensor_id:
    df_corr = (
        df_date
        .filter(pl.col("system_name") == f"tum-esm-midcost-raspi-{system_number}")
        # Keep only positive readings.
        .filter(pl.col("gmp343_filtered") > 0)
        # Broadcast: fill nulls (e.g. slope/intercept on dates without a
        # df_cal match) backward first, then forward.
        .fill_null(strategy="backward")
        .fill_null(strategy="forward")
        # Calibration correction: linear column operation.
        .with_columns(
            (pl.col("gmp343_filtered") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
        )
        # Aggregation: window length defined by the notebook-wide `filter`.
        # NOTE(review): newer polars releases name this `group_by_dynamic`;
        # keep in sync with the polars version this notebook runs on.
        .groupby_dynamic("creation_timestamp", every=filter)
        .agg(
            [
                pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
                # One label per window; every row has the same system here.
                pl.col("system_name").last(),
            ]
        )
        .collect()
    )

    # Offset to the reference instrument PICARRO at matching timestamps.
    df_corr = (
        df_corr
        .join(picarro_ref, on="creation_timestamp", how="left")
        .with_columns((pl.col("CO2_corr") - pl.col("temp")).alias("diff"))
        .drop("temp")
    )

    l_df_corr.append(df_corr)

# Diagonal concat: columns missing in a frame (e.g. "diff" for PICARRO) become null.
df_corr_agg = pl.concat(l_df_corr, how="diagonal")


In [196]:
# One interactive line chart per quantity in df_corr_agg, coloured by system.
# Each entry is (column to plot, figure title).
line_plots = [
    ("CO2_corr", "CO2"),
    # "diff" is CO2_corr minus the PICARRO CO2_corr at the same timestamp.
    ("diff", "CO2 offset to Picarro"),
    ("slope", "slope"),
    ("intercept", "intercept"),
]

for column, title in line_plots:
    fig = px.line(
        df_corr_agg,
        x="creation_timestamp",
        y=column,
        markers=True,
        title=title,
        color="system_name",
    )
    fig.show()


In [127]:
# temp_id = 6

# mean_offset = df_corr_agg.filter(pl.col("system_name")==f"tum-esm-midcost-raspi-{temp_id}").select("diff").mean().item()

# l_acropolis = df_corr_agg.filter(pl.col("system_name")==f"tum-esm-midcost-raspi-{temp_id}").select("CO2_corr").with_columns(pl.col("CO2_corr")+ mean_offset).to_series().to_list()

# l_picarro = df_corr_agg.filter(pl.col("system_name")=="Picarro").select("CO2_corr").to_series().to_list()

# print("\nOffset:")
# print(mean_offset)

# print("\nRMSE:")
# print(rmse(l_picarro,l_acropolis))

# print("\nR2:")
# print(calc_r2(l_picarro,l_acropolis))

# Plot other data directly from local database measurements.parquet

In [167]:
# Column `gmp343_raw` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id=sensor_id,
    col_name="gmp343_raw",
    filter=filter,
)

In [168]:
# Column `gmp343_compensated` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="gmp343_compensated",
    filter=filter,
)

In [169]:
# Column `gmp343_filtered` (the input of the calibration correction) for the systems in `sensor_id`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="gmp343_filtered",
    filter=filter,
)

In [170]:

# One wind rose per site. The second argument is presumably the system number
# (cf. `sensor_id`) and the third the site label — confirm against
# plot_data.plot_wind_rose.
# Maisach
plot_wind_rose(df,1, 'Maisach')
# Rechts-der-Isar
plot_wind_rose(df,2, 'Rechts-der-Isar')
# Finsing
plot_wind_rose(df,3, 'Finsing')
# Großhadern
plot_wind_rose(df,4, 'Großhadern')
# Pasing
plot_wind_rose(df,5, 'Pasing')
# TUM: plot wind at 3m pole inlet
plot_wind_rose(df,6, 'TUM')
# Feldkirchen
plot_wind_rose(df,7, 'Feldkirchen')
# Taufkirchen
plot_wind_rose(df,8, 'Taufkirchen')

In [171]:
# Column `wxt532_speed_avg` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="wxt532_speed_avg",
    filter=filter,
)

In [172]:
# Column `wxt532_direction_avg` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="wxt532_direction_avg",
    filter=filter,
)

In [173]:
# Plot auxiliary data: column `sht45_humidity` for the systems in `sensor_id`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="sht45_humidity",
    filter=filter,
)

In [174]:
# Column `bme280_humidity` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="bme280_humidity",
    filter=filter,
)

In [175]:
# Column `sht45_temperature` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="sht45_temperature",
    filter=filter,
)

In [176]:
# Column `bme280_temperature` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="bme280_temperature",
    filter=filter,
)

In [177]:
# Column `bme280_pressure` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="bme280_pressure",
    filter=filter,
)

In [178]:
# Column `gmp343_temperature` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="gmp343_temperature",
    filter=filter,
)

In [179]:
# Column `raspi_cpu_usage` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="raspi_cpu_usage",
    filter=filter,
)

In [180]:
# Column `raspi_disk_usage` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="raspi_disk_usage",
    filter=filter,
)


In [181]:
# Column `raspi_cpu_temperature` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="raspi_cpu_temperature",
    filter=filter,
)

In [182]:
# Column `enclosure_bme280_humidity` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="enclosure_bme280_humidity",
    filter=filter,
)

In [183]:
# Column `enclosure_bme280_pressure` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="enclosure_bme280_pressure",
    filter=filter,
)

In [184]:
# Column `enclosure_bme280_temperature` for the systems in `sensor_id`, with the notebook-wide `filter`.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="enclosure_bme280_temperature",
    filter=filter,
)

# Calibration Data

In [185]:
# Column `cal_bottle_id`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bottle_id",
    filter="30s",
)

In [186]:
# Column `cal_gmp343_filtered`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_filtered",
    filter="30s",
)

In [187]:
# Column `cal_sht45_humidity`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_sht45_humidity",
    filter="30s",
)

In [188]:
# Column `cal_sht45_temperature`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_sht45_temperature",
    filter="30s",
)

In [189]:
# Column `cal_gmp343_temperature`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_temperature",
    filter="30s",
)

In [190]:
# Column `cal_bme280_temperature`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bme280_temperature",
    filter="30s",
)

In [191]:
# Column `cal_bme280_humidity`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bme280_humidity",
    filter="30s",
)

In [192]:
# Column `cal_bme280_pressure`; passes "30s" as `filter` instead of the notebook-wide setting.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_bme280_pressure",
    filter="30s",
)

In [193]:
# Plot 400ppm calibration bottle.
# `cut_above` / `cut_below` presumably restrict the plotted values to the
# 350-450 band around the bottle concentration — confirm in plot_data.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_filtered",
    filter="1m",
    cut_above=450,
    cut_below=350,
)

In [194]:
# Plot 800ppm calibration bottle.
# `cut_below` presumably drops values under 700 so only the high bottle
# remains — confirm in plot_data.
plot_sensor_measurement(
    df,
    sensor_id,
    col_name="cal_gmp343_filtered",
    filter="30s",
    cut_below=700,
)