In [1]:
import math
import os
from datetime import datetime
from datetime import timezone

import numpy as np
import plotly.express as px
import polars as pl
from sklearn.metrics import r2_score

from utils import plot_sensor_measurement

# --- Analysis window (UTC, both bounds are passed to `is_between` below) ---
start_date = datetime(2023, 10, 23, 15, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2023, 10, 23, 23, 59, 59, tzinfo=timezone.utc)

# Numeric suffix of the system name "tum-esm-midcost-raspi-<sensor_id>".
sensor_id = 8
# Aggregation interval handed to `groupby_dynamic(every=...)`.
# NOTE: this shadows the builtin `filter`; the name is kept because later
# cells reference it.
filter = '1h'

# --- Data locations (taken from the environment) ---
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if not DATA_DIRECTORY:
    # Fail early with a readable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise further down.
    raise RuntimeError(
        "Environment variable DATA_DIRECTORY is not set; "
        "it must point to the directory containing 'processed/' and 'download/'."
    )

PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Misspelled legacy name, kept so that any existing reference keeps working.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY


# Eagerly loaded hourly Picarro reference data restricted to the analysis window.
df_p_1h = (
    pl.read_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_picarro.parquet"))
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
)

# --- Calibration coefficients (valid after 23.10) ---
# Applied as: corrected = raw * slope + intercept.
picarro_slope = 1.006374633215469
picarro_intercept = 0.0709482571842841

acropolis_slope = 1.017498765879016
acropolis_intercept = -16.974317238766673


In [2]:
def rmse(y_true, y_meas):
    """Return the root-mean-square error between reference and measured values.

    Args:
        y_true: Reference values (any array-like NumPy can subtract element-wise).
        y_meas: Measured values, compatible in shape with ``y_true``.

    Returns:
        The square root of the mean squared difference, as a Python float.
    """
    residuals = np.subtract(y_true, y_meas)
    mean_squared_error = np.square(residuals).mean()
    return math.sqrt(mean_squared_error)

def calc_r2(y_true, y_meas):
    """Return the R^2 score of ``y_meas`` with respect to ``y_true``.

    Thin wrapper that forwards both arguments, in this order, to
    ``sklearn.metrics.r2_score`` and returns its result unchanged.
    """
    score = r2_score(y_true, y_meas)
    return score

In [3]:
# Lazily build the aggregated, calibrated ACROPOLIS CO2 series for the selected system.
acropolis_dry_path = os.path.join(DATA_DIRECTORY, "processed", "acropolis_dry.parquet")
in_analysis_window = pl.col("creation_timestamp").is_between(start_date, end_date)
# Linear calibration: raw * slope + intercept.
calibrated_co2 = pl.col("gmp343_dry") * acropolis_slope + acropolis_intercept
not_averaged = ["creation_timestamp", "DATE", "TIME", "datetime", "variant"]

df_a = (
    pl.scan_parquet(acropolis_dry_path)
    .filter(pl.col("system_name") == f"tum-esm-midcost-raspi-{sensor_id}")
    .sort("creation_timestamp")
    .filter(in_analysis_window)
    # Keep only strictly positive raw readings.
    .filter(pl.col("gmp343_dry") > 0.0)
    .with_columns(calibrated_co2.alias("CO2_corr"))
    # One row per `filter` interval holding the mean of every remaining column.
    .groupby_dynamic("creation_timestamp", every=filter)
    .agg(pl.all().exclude(not_averaged).mean())
    # Replace the (averaged) system_name column with a fixed label.
    .with_columns(pl.lit("ACROPOLIS").alias("system_name"))
)

df_a.head(3).collect()

creation_timestamp,system_name,gmp343_dry,CO2_corr
"datetime[μs, UTC]",str,f64,f64
2023-10-23 15:00:00 UTC,"""ACROPOLIS""",445.863648,436.691395
2023-10-23 16:00:00 UTC,"""ACROPOLIS""",451.54363,442.470769
2023-10-23 17:00:00 UTC,"""ACROPOLIS""",450.84699,441.761938


In [4]:
# Lazily load the hourly Picarro data, time-ordered and limited to the analysis window.
picarro_hourly_path = os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_picarro.parquet")

df_p = (
    pl.scan_parquet(picarro_hourly_path)
    .sort("creation_timestamp")
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
)

df_p.head(3).collect()

creation_timestamp,CO2_corr,slope,intercept,system_name,diff
"datetime[μs, UTC]",f64,f64,f64,str,f64
2023-10-23 15:00:00 UTC,434.308993,1.006375,0.070948,"""Picarro""",0.0
2023-10-23 16:00:00 UTC,440.057925,1.006375,0.070948,"""Picarro""",0.0
2023-10-23 17:00:00 UTC,438.825828,1.006375,0.070948,"""Picarro""",0.0


In [5]:
# Stack both systems into one long frame and plot the corrected CO2 column.
col_name = "CO2_corr"
shared_columns = ["creation_timestamp", "system_name", col_name]

df_all = pl.concat(
    [df_a.select(shared_columns), df_p.select(shared_columns)],
    how="diagonal",
)

plot_sensor_measurement(df_all.collect(), col_name=col_name)


# Scatter Plot, RMSE, R^2

In [6]:
print(f"\nAnalysis for System {sensor_id}:")

# Materialise both series with distinct column names so they can be joined.
df_acropolis = (
    df_a.select(["creation_timestamp", "CO2_corr"])
    .rename({"CO2_corr": "CO2_acropolis"})
    .collect()
)

df_picarro = (
    df_p.select(["creation_timestamp", "CO2_corr"])
    .rename({"CO2_corr": "CO2_picarro"})
    .collect()
)

# Join on the timestamp (only timestamps present in both frames survive) and
# keep rows where both readings are strictly positive.
df_analysis = (
    df_picarro.join(df_acropolis, on="creation_timestamp")
    .filter(pl.col("CO2_picarro") > 0)
    .filter(pl.col("CO2_acropolis") > 0)
)

l_picarro = df_analysis.get_column("CO2_picarro").to_list()
l_acropolis = df_analysis.get_column("CO2_acropolis").to_list()

# The Picarro series is passed as the reference (y_true) to both metrics.
if len(l_acropolis) > 0:
    rmse_result = rmse(l_picarro, l_acropolis)
    print(f"RMSE: {rmse_result}")

    r2_result = calc_r2(l_picarro, l_acropolis)
    print(f"R2: {r2_result}")
else:
    # Previously the cell printed only the header in this case, which made an
    # empty join indistinguishable from a cell that had not finished running.
    print("No overlapping Picarro/ACROPOLIS samples in the selected window; RMSE and R2 not computed.")


Analysis for System 8:
RMSE: 2.756137188640951
R2: 0.8713160498463219


# Plot in-flow and enclosure differences

In [7]:
# Reload the raw (downloaded) ACROPOLIS data for the selected system and
# aggregate it per `filter` interval. NOTE: this rebinds `df_a`, which earlier
# cells used for the processed/calibrated series.
raw_acropolis_path = os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet")
# Chained on the expression level: backward fill first, then forward fill for
# whatever is still null (same order as before).
gap_filled = (
    pl.all()
    .exclude("creation_timestamp")
    .fill_null(strategy="backward")
    .fill_null(strategy="forward")
)

df_a = (
    pl.scan_parquet(raw_acropolis_path)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .filter(pl.col("system_name") == f"tum-esm-midcost-raspi-{sensor_id}")
    # Sort BEFORE filling: backward/forward fill copies values from neighbouring
    # rows, so the rows must be in time order for the fill to be meaningful.
    # (The original filled in file order and only sorted afterwards.)
    .sort("creation_timestamp")
    # The timestamp column is left out of the fill: rows with a null timestamp
    # cannot pass the is_between filter, and leaving the sort key untouched
    # keeps it exactly as sorted for the dynamic grouping below.
    .with_columns(gap_filled)
    .groupby_dynamic("creation_timestamp", every=filter)
    .agg(pl.all().exclude(["creation_timestamp", "DATE", "TIME", "datetime", "variant"]).mean())
)


In [8]:
# Pressure Delta
# Difference between the `bme280_pressure` and `enclosure_bme280_pressure`
# columns of the aggregated frame, plotted over time.
# `px` (plotly.express) is imported once in the notebook's import cell.

col_name = "pressure: flow - enclosure"

pressure_delta = pl.col("bme280_pressure") - pl.col("enclosure_bme280_pressure")
df_temp = (
    df_a.with_columns(pressure_delta.alias(col_name))
    .select("creation_timestamp", "system_name", col_name)
    .collect()
)

fig = px.line(df_temp, x="creation_timestamp", y=col_name, markers=True, title=col_name)
fig.show()

In [9]:
# Humidity Delta
# Difference between the `sht45_humidity` and `enclosure_bme280_humidity`
# columns of the aggregated frame, plotted over time.
# `px` (plotly.express) is imported once in the notebook's import cell.
col_name = "humidity: flow - enclosure"

humidity_delta = pl.col("sht45_humidity") - pl.col("enclosure_bme280_humidity")
df_temp = (
    df_a.with_columns(humidity_delta.alias(col_name))
    .select("creation_timestamp", "system_name", col_name)
    .collect()
)

fig = px.line(df_temp, x="creation_timestamp", y=col_name, markers=True, title=col_name)
fig.show()

In [10]:
# Temperature Delta
# Difference between the `sht45_temperature` and `enclosure_bme280_temperature`
# columns of the aggregated frame, plotted over time.
# `px` (plotly.express) is imported once in the notebook's import cell.
col_name = "temperature: flow - enclosure"

temperature_delta = pl.col("sht45_temperature") - pl.col("enclosure_bme280_temperature")
df_temp = (
    df_a.with_columns(temperature_delta.alias(col_name))
    .select("creation_timestamp", "system_name", col_name)
    .collect()
)

fig = px.line(df_temp, x="creation_timestamp", y=col_name, markers=True, title=col_name)
fig.show()