In [178]:
import os
import glob
from datetime import datetime
from datetime import timezone
import polars as pl
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
import math
from sklearn.metrics import r2_score
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Root directories for the input data, taken from the environment.
# `os.environ.get` yields None when a variable is unset, so a missing
# DATA_DIRECTORY only surfaces later when the paths are built.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")

# Backward-compatible alias for the previously misspelled constant name.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [179]:
# Inputs for the analysis. The three `scan_parquet` frames are lazy and are
# only materialised when `.collect()` is called; the Picarro frame is read
# eagerly.

# Raw measurements
df_raw = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet")
)

# Water-corrected ("dry") measurements
df_dry = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "acropolis_dry.parquet")
)

# Calibration slope/intercept table. The extra "timestamp" column holds the
# creation time expressed in hours (milliseconds / 3.6e6 ms per hour).
df_cal = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet")
).with_columns(
    (pl.col("creation_timestamp").dt.timestamp("ms") / 3.6e6).alias("timestamp")
)

# Preprocessed 10-minute Picarro data
df_p_10m = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_picarro.parquet")
)

## System 6

In [187]:
# Calibration-corrected CO2 of one system compared against the Picarro:
# 10-minute means -> difference to Picarro -> daily mean difference -> flag,
# then the calibration slope/intercept are plotted coloured by that flag.

system_id = 6  # not called `id`, which would shadow the builtin id()
system_name = f"tum-esm-midcost-raspi-{system_id}"

# Analysis window (UTC)
start_date = datetime(2024, 1, 12, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

# A day is flagged accurate when |daily mean diff| is below this value.
# Same units as `diff` (presumably ppm CO2 -- TODO confirm).
max_abs_daily_diff = 1.3

# 10-minute means of the calibration-corrected signal
df_cal_corr_10m = (
    df_dry
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("gmp343_dry") > 0)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
    # Sort before filling so the forward/backward fills (applied to every
    # column, including slope/intercept from the left join) follow time
    # order instead of the storage order of the parquet file.
    .sort("creation_timestamp")
    .fill_null(strategy="forward")
    .fill_null(strategy="backward")
    .with_columns(
        (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
    )
    # Sort again directly before the dynamic grouping, as the original cell
    # did, so the grouping key is sorted regardless of the steps in between.
    .sort("creation_timestamp")
    # NOTE(review): polars >= 0.19 names this `group_by_dynamic`; the older
    # spelling is kept to match the polars version this notebook targets.
    .groupby_dynamic("creation_timestamp", every="10m")
    .agg([
        pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
        pl.col("system_name").last(),
    ])
    .collect()
)

# Difference to the Picarro per 10-minute bin (system minus Picarro)
picarro_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "CO2_picarro"})
)
df_cal_corr_diff = (
    df_cal_corr_10m
    .join(picarro_10m, on="creation_timestamp", how="left")
    .with_columns((pl.col("CO2_corr") - pl.col("CO2_picarro")).alias("diff"))
    .drop("CO2_picarro")
)

# Daily means and accuracy flag
df_cal_corr = (
    df_cal_corr_diff
    .groupby_dynamic("creation_timestamp", every="1d")
    .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
    .with_columns(
        (pl.col("diff").abs() < max_abs_daily_diff).alias("calibration_accuracy")
    )
)

fig = px.scatter(
    df_cal_corr,
    x="creation_timestamp",
    y="diff",
    color="calibration_accuracy",
    title=f"Difference System {system_id} - Picarro",
)
fig.show()

# Join the daily accuracy flag onto this system's calibration table
daily_accuracy = (
    df_cal_corr
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .select("date", "calibration_accuracy")
)
dataframe = (
    df_cal
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
    .join(daily_accuracy, on=["date"], how="left")
)

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="slope",
    color="calibration_accuracy",
    title=f"Slope (System {system_id})",
)
fig.show()

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="intercept",
    color="calibration_accuracy",
    title=f"Intercept (System {system_id})",
)
fig.show()

## System 10

In [189]:
# Calibration-corrected CO2 of one system compared against the Picarro:
# 10-minute means -> difference to Picarro -> daily mean difference -> flag,
# then the calibration slope/intercept are plotted coloured by that flag.

system_id = 10  # not called `id`, which would shadow the builtin id()
system_name = f"tum-esm-midcost-raspi-{system_id}"

# Analysis window (UTC)
start_date = datetime(2024, 1, 12, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

# A day is flagged accurate when |daily mean diff| is below this value.
# Same units as `diff` (presumably ppm CO2 -- TODO confirm).
max_abs_daily_diff = 1.3

# 10-minute means of the calibration-corrected signal
df_cal_corr_10m = (
    df_dry
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("gmp343_dry") > 0)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
    # Sort before filling so the forward/backward fills (applied to every
    # column, including slope/intercept from the left join) follow time
    # order instead of the storage order of the parquet file.
    .sort("creation_timestamp")
    .fill_null(strategy="forward")
    .fill_null(strategy="backward")
    .with_columns(
        (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
    )
    # Sort again directly before the dynamic grouping, as the original cell
    # did, so the grouping key is sorted regardless of the steps in between.
    .sort("creation_timestamp")
    # NOTE(review): polars >= 0.19 names this `group_by_dynamic`; the older
    # spelling is kept to match the polars version this notebook targets.
    .groupby_dynamic("creation_timestamp", every="10m")
    .agg([
        pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
        pl.col("system_name").last(),
    ])
    .collect()
)

# Difference to the Picarro per 10-minute bin (system minus Picarro)
picarro_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "CO2_picarro"})
)
df_cal_corr_diff = (
    df_cal_corr_10m
    .join(picarro_10m, on="creation_timestamp", how="left")
    .with_columns((pl.col("CO2_corr") - pl.col("CO2_picarro")).alias("diff"))
    .drop("CO2_picarro")
)

# Daily means and accuracy flag
df_cal_corr = (
    df_cal_corr_diff
    .groupby_dynamic("creation_timestamp", every="1d")
    .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
    .with_columns(
        (pl.col("diff").abs() < max_abs_daily_diff).alias("calibration_accuracy")
    )
)

fig = px.scatter(
    df_cal_corr,
    x="creation_timestamp",
    y="diff",
    color="calibration_accuracy",
    title=f"Difference System {system_id} - Picarro",
)
fig.show()

# Join the daily accuracy flag onto this system's calibration table
daily_accuracy = (
    df_cal_corr
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .select("date", "calibration_accuracy")
)
dataframe = (
    df_cal
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
    .join(daily_accuracy, on=["date"], how="left")
)

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="slope",
    color="calibration_accuracy",
    title=f"Slope (System {system_id})",
)
fig.show()

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="intercept",
    color="calibration_accuracy",
    title=f"Intercept (System {system_id})",
)
fig.show()

## System 12

In [183]:
# Calibration-corrected CO2 of one system compared against the Picarro:
# 10-minute means -> difference to Picarro -> daily mean difference -> flag,
# then the calibration slope/intercept are plotted coloured by that flag.

system_id = 12  # not called `id`, which would shadow the builtin id()
system_name = f"tum-esm-midcost-raspi-{system_id}"

# Analysis window (UTC)
start_date = datetime(2023, 12, 23, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

# A day is flagged accurate when |daily mean diff| is below this value.
# Same units as `diff` (presumably ppm CO2 -- TODO confirm).
max_abs_daily_diff = 1.3

# 10-minute means of the calibration-corrected signal
df_cal_corr_10m = (
    df_dry
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("gmp343_dry") > 0)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
    # Sort before filling so the forward/backward fills (applied to every
    # column, including slope/intercept from the left join) follow time
    # order instead of the storage order of the parquet file.
    .sort("creation_timestamp")
    .fill_null(strategy="forward")
    .fill_null(strategy="backward")
    .with_columns(
        (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
    )
    # Sort again directly before the dynamic grouping, as the original cell
    # did, so the grouping key is sorted regardless of the steps in between.
    .sort("creation_timestamp")
    # NOTE(review): polars >= 0.19 names this `group_by_dynamic`; the older
    # spelling is kept to match the polars version this notebook targets.
    .groupby_dynamic("creation_timestamp", every="10m")
    .agg([
        pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
        pl.col("system_name").last(),
    ])
    .collect()
)

# Difference to the Picarro per 10-minute bin (system minus Picarro)
picarro_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "CO2_picarro"})
)
df_cal_corr_diff = (
    df_cal_corr_10m
    .join(picarro_10m, on="creation_timestamp", how="left")
    .with_columns((pl.col("CO2_corr") - pl.col("CO2_picarro")).alias("diff"))
    .drop("CO2_picarro")
)

# Daily means and accuracy flag
df_cal_corr = (
    df_cal_corr_diff
    .groupby_dynamic("creation_timestamp", every="1d")
    .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
    .with_columns(
        (pl.col("diff").abs() < max_abs_daily_diff).alias("calibration_accuracy")
    )
)

fig = px.scatter(
    df_cal_corr,
    x="creation_timestamp",
    y="diff",
    color="calibration_accuracy",
    title=f"Difference System {system_id} - Picarro",
)
fig.show()

# Join the daily accuracy flag onto this system's calibration table
daily_accuracy = (
    df_cal_corr
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .select("date", "calibration_accuracy")
)
dataframe = (
    df_cal
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
    .join(daily_accuracy, on=["date"], how="left")
)

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="slope",
    color="calibration_accuracy",
    title=f"Slope (System {system_id})",
)
fig.show()

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="intercept",
    color="calibration_accuracy",
    title=f"Intercept (System {system_id})",
)
fig.show()

## System 13

In [193]:
# Calibration-corrected CO2 of one system compared against the Picarro:
# 10-minute means -> difference to Picarro -> daily mean difference -> flag,
# then the calibration slope/intercept are plotted coloured by that flag.

system_id = 13  # not called `id`, which would shadow the builtin id()
system_name = f"tum-esm-midcost-raspi-{system_id}"

# Analysis window (UTC)
start_date = datetime(2024, 1, 12, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 5, 23, 59, 59, tzinfo=timezone.utc)

# A day is flagged accurate when |daily mean diff| is below this value.
# Same units as `diff` (presumably ppm CO2 -- TODO confirm).
max_abs_daily_diff = 1.3

# 10-minute means of the calibration-corrected signal
df_cal_corr_10m = (
    df_dry
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("gmp343_dry") > 0)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
    # Sort before filling so the forward/backward fills (applied to every
    # column, including slope/intercept from the left join) follow time
    # order instead of the storage order of the parquet file.
    .sort("creation_timestamp")
    .fill_null(strategy="forward")
    .fill_null(strategy="backward")
    .with_columns(
        (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
    )
    # Sort again directly before the dynamic grouping, as the original cell
    # did, so the grouping key is sorted regardless of the steps in between.
    .sort("creation_timestamp")
    # NOTE(review): polars >= 0.19 names this `group_by_dynamic`; the older
    # spelling is kept to match the polars version this notebook targets.
    .groupby_dynamic("creation_timestamp", every="10m")
    .agg([
        pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
        pl.col("system_name").last(),
    ])
    .collect()
)

# Difference to the Picarro per 10-minute bin (system minus Picarro)
picarro_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "CO2_picarro"})
)
df_cal_corr_diff = (
    df_cal_corr_10m
    .join(picarro_10m, on="creation_timestamp", how="left")
    .with_columns((pl.col("CO2_corr") - pl.col("CO2_picarro")).alias("diff"))
    .drop("CO2_picarro")
)

# Daily means and accuracy flag
df_cal_corr = (
    df_cal_corr_diff
    .groupby_dynamic("creation_timestamp", every="1d")
    .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
    .with_columns(
        (pl.col("diff").abs() < max_abs_daily_diff).alias("calibration_accuracy")
    )
)

fig = px.scatter(
    df_cal_corr,
    x="creation_timestamp",
    y="diff",
    color="calibration_accuracy",
    title=f"Difference System {system_id} - Picarro",
)
fig.show()

# Join the daily accuracy flag onto this system's calibration table
daily_accuracy = (
    df_cal_corr
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .select("date", "calibration_accuracy")
)
dataframe = (
    df_cal
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
    .join(daily_accuracy, on=["date"], how="left")
)

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="slope",
    color="calibration_accuracy",
    title=f"Slope (System {system_id})",
)
fig.show()

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="intercept",
    color="calibration_accuracy",
    title=f"Intercept (System {system_id})",
)
fig.show()

## System 16

In [180]:
# Calibration-corrected CO2 of one system compared against the Picarro:
# 10-minute means -> difference to Picarro -> daily mean difference -> flag,
# then the calibration slope/intercept are plotted coloured by that flag.

system_id = 16  # not called `id`, which would shadow the builtin id()
system_name = f"tum-esm-midcost-raspi-{system_id}"

# Analysis window (UTC)
start_date = datetime(2023, 12, 23, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 5, 23, 59, 59, tzinfo=timezone.utc)

# A day is flagged accurate when |daily mean diff| is below this value.
# Same units as `diff` (presumably ppm CO2 -- TODO confirm).
max_abs_daily_diff = 1.3

# 10-minute means of the calibration-corrected signal
df_cal_corr_10m = (
    df_dry
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("gmp343_dry") > 0)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
    # Sort before filling so the forward/backward fills (applied to every
    # column, including slope/intercept from the left join) follow time
    # order instead of the storage order of the parquet file.
    .sort("creation_timestamp")
    .fill_null(strategy="forward")
    .fill_null(strategy="backward")
    .with_columns(
        (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
    )
    # Sort again directly before the dynamic grouping, as the original cell
    # did, so the grouping key is sorted regardless of the steps in between.
    .sort("creation_timestamp")
    # NOTE(review): polars >= 0.19 names this `group_by_dynamic`; the older
    # spelling is kept to match the polars version this notebook targets.
    .groupby_dynamic("creation_timestamp", every="10m")
    .agg([
        pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
        pl.col("system_name").last(),
    ])
    .collect()
)

# Difference to the Picarro per 10-minute bin (system minus Picarro)
picarro_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "CO2_picarro"})
)
df_cal_corr_diff = (
    df_cal_corr_10m
    .join(picarro_10m, on="creation_timestamp", how="left")
    .with_columns((pl.col("CO2_corr") - pl.col("CO2_picarro")).alias("diff"))
    .drop("CO2_picarro")
)

# Daily means and accuracy flag
df_cal_corr = (
    df_cal_corr_diff
    .groupby_dynamic("creation_timestamp", every="1d")
    .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
    .with_columns(
        (pl.col("diff").abs() < max_abs_daily_diff).alias("calibration_accuracy")
    )
)

fig = px.scatter(
    df_cal_corr,
    x="creation_timestamp",
    y="diff",
    color="calibration_accuracy",
    title=f"Difference System {system_id} - Picarro",
)
fig.show()

# Join the daily accuracy flag onto this system's calibration table
daily_accuracy = (
    df_cal_corr
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .select("date", "calibration_accuracy")
)
dataframe = (
    df_cal
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
    .join(daily_accuracy, on=["date"], how="left")
)

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="slope",
    color="calibration_accuracy",
    title=f"Slope (System {system_id})",
)
fig.show()

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="intercept",
    color="calibration_accuracy",
    title=f"Intercept (System {system_id})",
)
fig.show()

## System 18

In [181]:
# Calibration-corrected CO2 of one system compared against the Picarro:
# 10-minute means -> difference to Picarro -> daily mean difference -> flag,
# then the calibration slope/intercept are plotted coloured by that flag.

system_id = 18  # not called `id`, which would shadow the builtin id()
system_name = f"tum-esm-midcost-raspi-{system_id}"

# Analysis window (UTC)
start_date = datetime(2023, 12, 23, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 5, 23, 59, 59, tzinfo=timezone.utc)

# A day is flagged accurate when |daily mean diff| is below this value.
# Same units as `diff` (presumably ppm CO2 -- TODO confirm).
max_abs_daily_diff = 1.3

# 10-minute means of the calibration-corrected signal
df_cal_corr_10m = (
    df_dry
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("gmp343_dry") > 0)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
    # Sort before filling so the forward/backward fills (applied to every
    # column, including slope/intercept from the left join) follow time
    # order instead of the storage order of the parquet file.
    .sort("creation_timestamp")
    .fill_null(strategy="forward")
    .fill_null(strategy="backward")
    .with_columns(
        (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
    )
    # Sort again directly before the dynamic grouping, as the original cell
    # did, so the grouping key is sorted regardless of the steps in between.
    .sort("creation_timestamp")
    # NOTE(review): polars >= 0.19 names this `group_by_dynamic`; the older
    # spelling is kept to match the polars version this notebook targets.
    .groupby_dynamic("creation_timestamp", every="10m")
    .agg([
        pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
        pl.col("system_name").last(),
    ])
    .collect()
)

# Difference to the Picarro per 10-minute bin (system minus Picarro)
picarro_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "CO2_picarro"})
)
df_cal_corr_diff = (
    df_cal_corr_10m
    .join(picarro_10m, on="creation_timestamp", how="left")
    .with_columns((pl.col("CO2_corr") - pl.col("CO2_picarro")).alias("diff"))
    .drop("CO2_picarro")
)

# Daily means and accuracy flag
df_cal_corr = (
    df_cal_corr_diff
    .groupby_dynamic("creation_timestamp", every="1d")
    .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
    .with_columns(
        (pl.col("diff").abs() < max_abs_daily_diff).alias("calibration_accuracy")
    )
)

fig = px.scatter(
    df_cal_corr,
    x="creation_timestamp",
    y="diff",
    color="calibration_accuracy",
    title=f"Difference System {system_id} - Picarro",
)
fig.show()

# Join the daily accuracy flag onto this system's calibration table
daily_accuracy = (
    df_cal_corr
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .select("date", "calibration_accuracy")
)
dataframe = (
    df_cal
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
    .join(daily_accuracy, on=["date"], how="left")
)

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="slope",
    color="calibration_accuracy",
    title=f"Slope (System {system_id})",
)
fig.show()

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="intercept",
    color="calibration_accuracy",
    title=f"Intercept (System {system_id})",
)
fig.show()

## System 20

In [184]:
# Calibration-corrected CO2 of one system compared against the Picarro:
# 10-minute means -> difference to Picarro -> daily mean difference -> flag,
# then the calibration slope/intercept are plotted coloured by that flag.

system_id = 20  # not called `id`, which would shadow the builtin id()
system_name = f"tum-esm-midcost-raspi-{system_id}"

# Analysis window (UTC)
start_date = datetime(2023, 12, 23, 0, 0, 0, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

# A day is flagged accurate when |daily mean diff| is below this value.
# Same units as `diff` (presumably ppm CO2 -- TODO confirm).
max_abs_daily_diff = 1.3

# 10-minute means of the calibration-corrected signal
df_cal_corr_10m = (
    df_dry
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("gmp343_dry") > 0)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
    # Sort before filling so the forward/backward fills (applied to every
    # column, including slope/intercept from the left join) follow time
    # order instead of the storage order of the parquet file.
    .sort("creation_timestamp")
    .fill_null(strategy="forward")
    .fill_null(strategy="backward")
    .with_columns(
        (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
    )
    # Sort again directly before the dynamic grouping, as the original cell
    # did, so the grouping key is sorted regardless of the steps in between.
    .sort("creation_timestamp")
    # NOTE(review): polars >= 0.19 names this `group_by_dynamic`; the older
    # spelling is kept to match the polars version this notebook targets.
    .groupby_dynamic("creation_timestamp", every="10m")
    .agg([
        pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
        pl.col("system_name").last(),
    ])
    .collect()
)

# Difference to the Picarro per 10-minute bin (system minus Picarro)
picarro_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "CO2_picarro"})
)
df_cal_corr_diff = (
    df_cal_corr_10m
    .join(picarro_10m, on="creation_timestamp", how="left")
    .with_columns((pl.col("CO2_corr") - pl.col("CO2_picarro")).alias("diff"))
    .drop("CO2_picarro")
)

# Daily means and accuracy flag
df_cal_corr = (
    df_cal_corr_diff
    .groupby_dynamic("creation_timestamp", every="1d")
    .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
    .with_columns(
        (pl.col("diff").abs() < max_abs_daily_diff).alias("calibration_accuracy")
    )
)

fig = px.scatter(
    df_cal_corr,
    x="creation_timestamp",
    y="diff",
    color="calibration_accuracy",
    title=f"Difference System {system_id} - Picarro",
)
fig.show()

# Join the daily accuracy flag onto this system's calibration table
daily_accuracy = (
    df_cal_corr
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .select("date", "calibration_accuracy")
)
dataframe = (
    df_cal
    .filter(pl.col("system_name") == system_name)
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    .collect()
    .join(daily_accuracy, on=["date"], how="left")
)

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="slope",
    color="calibration_accuracy",
    title=f"Slope (System {system_id})",
)
fig.show()

fig = px.scatter(
    dataframe,
    x="creation_timestamp",
    y="intercept",
    color="calibration_accuracy",
    title=f"Intercept (System {system_id})",
)
fig.show()