In [None]:
import os
import glob
from datetime import datetime
from datetime import timezone
import polars as pl
import polars.selectors as cs
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from utils.plot_data import plot_sensor_measurement
from lmu_meteo_api import interface

# Root directories of the measurement data, configured through environment variables.
# Both values are None when the corresponding variable is not set.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Backward-compatible alias for the original, misspelled constant name.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [None]:
# Per-system result tables returned by analyse_system are collected here.
calibration_accuracy_masks = []

# Raw measurement data (lazy scan, nothing is read until .collect()).
df_raw = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet")
)

# Water corrected measurement data (lazy scan).
df_dry = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "acropolis_dry.parquet")
)

# Slope and intercept data derived from calibrations (lazy scan).
# "timestamp" is the creation timestamp in hours: milliseconds / 3.6e6 (3.6e6 ms are in 1h).
df_cal = (
    pl.scan_parquet(
        os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet")
    )
    .with_columns(
        (pl.col("creation_timestamp").dt.timestamp("ms") / 3.6e6).alias("timestamp")
    )
)

# 10m preprocessed picarro data (read eagerly into memory).
df_p_10m = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_picarro.parquet")
)

In [None]:
def analyse_system(id, start_date, end_date, accuracy_threshold=1.3):
    """Compare one system against the Picarro and plot its calibration behaviour.

    Reads the module-level frames ``df_cal``, ``df_dry``, ``df_p_10m`` and ``df_raw``.
    The dry measurements of the system are corrected with the slope/intercept of the
    calibration joined on the same date, averaged to 10 minutes, subtracted from the
    Picarro ``CO2_corr`` and averaged per day. A day is flagged as accurate when its
    mean difference lies strictly inside ``(-accuracy_threshold, accuracy_threshold)``.
    Scatter plots of the daily difference, the slope/intercept and the auxiliary
    measurements taken during each calibration are shown as a side effect.

    Args:
        id: Numeric system id; the system name is ``tum-esm-midcost-raspi-{id}``.
            (The name shadows the builtin ``id``; it is kept for caller compatibility.)
        start_date: Lower bound passed to ``is_between`` on ``creation_timestamp``.
        end_date: Upper bound passed to ``is_between`` on ``creation_timestamp``.
        accuracy_threshold: Absolute limit for the daily mean difference to the Picarro.

    Returns:
        The calibration records of the system in the period, reduced to the columns
        ``date``, ``system_name``, ``calibration_accuracy`` and ``diff``.
    """
    in_period = pl.col("creation_timestamp").is_between(start_date, end_date)
    is_system = pl.col("system_name") == f"tum-esm-midcost-raspi-{id}"

    df_cal_temp = df_cal.filter(in_period).filter(is_system).collect()

    # calculate calibration corrected dataset
    # The system filter is applied inside the lazy query so that only the rows of
    # this system are materialised by collect().
    # NOTE(review): the forward/backward fill runs in stored row order, before the
    # sort by creation_timestamp - confirm the parquet file is time ordered.
    df_cal_corr = (
        df_dry.filter(is_system)
        .filter(pl.col("gmp343_dry") > 0)
        .filter(in_period)
        .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
        .collect()
        .join(df_cal_temp, on=["date", "system_name"], how="left")
        # days without a calibration take over the neighbouring calibration values
        .fill_null(strategy="forward")
        .fill_null(strategy="backward")
        .with_columns(
            (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
        )
        .sort("creation_timestamp")
        .groupby_dynamic("creation_timestamp", every="10m")
        .agg([
            pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
            pl.col("system_name"),
        ])
        # system_name was aggregated into a list; keep a single value per window
        .with_columns(pl.col("system_name").list.last())
    )

    # calculate the difference to the picarro
    df_picarro = df_p_10m.select("creation_timestamp", "CO2_corr").rename({"CO2_corr": "temp"})
    df_cal_corr = (
        df_cal_corr.join(df_picarro, on="creation_timestamp", how="left")
        .with_columns((pl.col("CO2_corr") - pl.col("temp")).alias("diff"))
        .drop("temp")
    )

    # quantify the performance based on the calibration (daily means)
    df_cal_corr = (
        df_cal_corr.groupby_dynamic("creation_timestamp", every="1d")
        .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
        .with_columns(
            (
                (pl.col("diff") < accuracy_threshold)
                & (pl.col("diff") > -accuracy_threshold)
            ).alias("calibration_accuracy")
        )
    )

    # plot results from before
    fig = px.scatter(df_cal_corr, x="creation_timestamp", y="diff", title="Difference System - Picarro", color="calibration_accuracy")
    fig.show()

    #--------------

    # extract date and calibration accuracy information
    df_cal_acc = (
        df_cal_corr.with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
        .select("date", "calibration_accuracy", "diff")
    )

    # join calibration accuracy into df_cal matched on date
    df_cal_temp = df_cal_temp.join(df_cal_acc, on=["date"], how="left")

    for column, title in (("slope", "Slope"), ("intercept", "Intercept")):
        fig = px.scatter(df_cal_temp, x="creation_timestamp", y=column, title=title, color="calibration_accuracy")
        fig.show()

    #--------------

    # select calibration periods from raw auxiliary measurement data
    df_raw_temp = (
        df_raw.select(cs.starts_with("cal_"), "creation_timestamp", "system_name")
        .filter(is_system)
        .filter(in_period)
        .filter(pl.col("cal_bottle_id") > 0)
        .collect()
        .sort("creation_timestamp")
        .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    )

    # select the first timestamp of each calibration procedure (one per date)
    df_calibration_start = (
        df_raw_temp.select("creation_timestamp", "date")
        .sort(pl.col("creation_timestamp"))
        .groupby("date")
        .first()
    )

    # normalize all calibrations to start at same time: the elapsed time since the
    # first sample of the day is added to 1970-01-01 so all days share one time axis
    df_raw_temp = (
        df_raw_temp.join(df_calibration_start, on=["date"], how="left")
        .with_columns(
            (pl.col("creation_timestamp") - pl.col("creation_timestamp_right")).alias("time_since_calibration_start")
        )
        .with_columns(
            (datetime(1970, 1, 1, 0, 0, 0) + pl.col("time_since_calibration_start")).alias("normalised_time")
        )
    )

    # join calibration accuracy into df_raw_temp matched on date
    df_raw_temp = df_raw_temp.join(df_cal_acc, on=["date"], how="left")

    calibration_plots = (
        ("cal_sht45_humidity", "SHT45 Calibration Humidity"),
        ("cal_bme280_pressure", "BME280 Calibration Pressure"),
        ("cal_gmp343_temperature", "GMP343 Calibration Temperature"),
        ("cal_gmp343_filtered", "GMP343 Filtered Measurement Output"),
        # title corrected: this plot previously reused the GMP343 title
        ("cal_sht45_temperature", "SHT45 Calibration Temperature"),
    )
    for column, title in calibration_plots:
        fig = px.scatter(df_raw_temp, x="normalised_time", y=column, title=title, color="calibration_accuracy")
        fig.show()

    return df_cal_temp.select("date", "system_name", "calibration_accuracy", "diff")

## System 1

In [None]:
# System 1: analysis period 2024-02-07 to 2024-02-10 (UTC)
start_date = datetime(2024, 2, 7, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 10, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=1, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 3

In [None]:
# System 3: analysis period 2024-01-12 to 2024-02-11 (UTC)
start_date = datetime(2024, 1, 12, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=3, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 4

In [None]:
# System 4
# NOTE(review): start_date (2024-02-12) is later than end_date (2024-02-09), so the
# is_between(start_date, end_date) filters in analyse_system cannot match any row.
# One of the two bounds looks mistyped - confirm the intended period.
start_date = datetime(2024, 2, 12, 0, 0, 0).replace(tzinfo=timezone.utc)
end_date = datetime(2024, 2, 9, 23, 59, 59).replace(tzinfo=timezone.utc)

df = analyse_system(4, start_date, end_date)
calibration_accuracy_masks.append(df)

## System 5

In [None]:
# System 5: analysis period 2024-02-06 to 2024-02-11 (UTC)
start_date = datetime(2024, 2, 6, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=5, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 6

In [None]:
# System 6: analysis period 2024-01-12 to 2024-02-11 (UTC)
start_date = datetime(2024, 1, 12, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=6, start_date=start_date, end_date=end_date)
# NOTE(review): the append below is disabled, so system 6 is not part of
# calibration_accuracy_masks - confirm this exclusion is intended.
#calibration_accuracy_masks.append(df)

## System 8

In [None]:
# System 8
# NOTE(review): start_date (2024-02-13) is later than end_date (2024-02-09), so the
# is_between(start_date, end_date) filters in analyse_system cannot match any row.
# One of the two bounds looks mistyped - confirm the intended period.
start_date = datetime(2024, 2, 13, 0, 0, 0).replace(tzinfo=timezone.utc)
end_date = datetime(2024, 2, 9, 23, 59, 59).replace(tzinfo=timezone.utc)

df = analyse_system(8, start_date, end_date)
calibration_accuracy_masks.append(df)

## System 9

In [None]:
# System 9: analysis period is the single day 2024-02-12 (UTC)
start_date = datetime(2024, 2, 12, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 12, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=9, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 10

In [None]:
# System 10: analysis period 2024-01-12 to 2024-02-11 (UTC)
start_date = datetime(2024, 1, 12, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=10, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 11

In [None]:
# System 11: analysis period 2023-12-23 to 2024-02-11 (UTC)
start_date = datetime(2023, 12, 23, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=11, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 12

In [None]:
# System 12: analysis period 2023-12-23 to 2024-02-11 (UTC)
# The unused module-level `id = 12` was removed: it shadowed the builtin id()
# and the system id is passed to analyse_system directly.
start_date = datetime(2023, 12, 23, 0, 0, 0).replace(tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59).replace(tzinfo=timezone.utc)

df = analyse_system(12, start_date, end_date)
calibration_accuracy_masks.append(df)

## System 13

In [None]:
# System 13: analysis period 2024-01-12 to 2024-02-05 (UTC)
start_date = datetime(2024, 1, 12, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 5, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=13, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 16

In [None]:
# System 16: analysis period 2023-12-23 to 2024-02-05 (UTC)
start_date = datetime(2023, 12, 23, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 5, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=16, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

## System 18

In [None]:
# System 18: analysis period 2023-12-23 to 2024-02-05 (UTC)
# The unused module-level `id = 18` was removed: it shadowed the builtin id()
# and the system id is passed to analyse_system directly.
start_date = datetime(2023, 12, 23, 0, 0, 0).replace(tzinfo=timezone.utc)
end_date = datetime(2024, 2, 5, 23, 59, 59).replace(tzinfo=timezone.utc)

df = analyse_system(18, start_date, end_date)
calibration_accuracy_masks.append(df)

## System 20

In [None]:
# System 20: analysis period 2023-12-23 to 2024-02-11 (UTC)
start_date = datetime(2023, 12, 23, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

df = analyse_system(id=20, start_date=start_date, end_date=end_date)
calibration_accuracy_masks.append(df)

# Cluster Analysis

In [None]:
def get_lmu_data(start_time="2024-01-01T00-00-00", end_time="2024-01-02T00-00-00", station_id='MIM01'):
    """Download LMU meteo data and return it as a polars DataFrame.

    Args:
        start_time: Start of the requested period, forwarded to the API unchanged.
        end_time: End of the requested period, forwarded to the API unchanged.
        station_id: Station identifier, forwarded to the API unchanged.

    Returns:
        A DataFrame in which the pandas index column ``time`` is renamed to
        ``creation_timestamp`` (microsecond precision, time zone set to UTC),
        the 30 m temperature, 30 m humidity and the air pressure are renamed to
        ``temperature``, ``humidity`` and ``pressure``, and a constant
        ``system_name`` column with the value ``"lmu meteo"`` is added.
    """
    requested_parameters = [
        "air_temperature_2m",
        "air_temperature_30m",
        "relative_humidity_2m",
        "relative_humidity_30m",
        'wind_speed_30m',
        'wind_from_direction_30m',
        "air_pressure",
    ]

    # download from the API
    api_client = interface.meteo_data()
    raw = api_client.get_meteo_data(
        parameters=requested_parameters,
        station_id=station_id,
        start_time=start_time,
        end_time=end_time,
    )

    # the pandas index is kept as a regular column named "time"
    frame = pl.from_pandas(raw, include_index=True)
    frame = frame.rename({"time": "creation_timestamp"})
    frame = frame.with_columns(
        pl.col("creation_timestamp").dt.cast_time_unit("us").dt.replace_time_zone("UTC")
    )

    # unit conversions (presumably Kelvin -> degree Celsius and Pa -> hPa; confirm with the API docs)
    frame = frame.with_columns(pl.col("air_temperature_30m") - 273.15)
    frame = frame.with_columns(pl.col("air_temperature_2m") - 273.15)
    frame = frame.with_columns(pl.col("air_pressure") / 100)

    frame = frame.rename({
        "air_temperature_30m": "temperature",
        "relative_humidity_30m": "humidity",
        "air_pressure": "pressure",
    })

    return frame.with_columns(pl.lit("lmu meteo").alias("system_name"))

In [None]:
def plot_diff_all_systems(start_date, end_date, system_ids, df_lmu, accuracy_threshold=1.3):
    """Plot the 10-minute difference to the Picarro for several systems.

    Reads the module-level frames ``df_cal``, ``df_dry`` and ``df_p_10m``. For every
    system the dry measurements are corrected with the slope/intercept of the
    calibration joined on the same date, averaged to 10 minutes and subtracted from
    the Picarro ``CO2_corr``. The difference is shown as a line plot; pressure,
    humidity and temperature of all systems are shown together with ``df_lmu``.

    Args:
        start_date: Lower bound passed to ``is_between`` on ``creation_timestamp``.
        end_date: Upper bound passed to ``is_between`` on ``creation_timestamp``.
        system_ids: Iterable of numeric system ids
            (system name ``tum-esm-midcost-raspi-{id}``).
        df_lmu: Reference meteo frame that is concatenated diagonally with the system
            data; it is plotted via the columns ``creation_timestamp``, ``pressure``,
            ``humidity``, ``temperature`` and ``system_name``.
        accuracy_threshold: Absolute limit used for the ``calibration_accuracy`` column.

    Returns:
        None. The figures are shown as a side effect.

    Raises:
        ValueError: If ``system_ids`` is empty.
    """
    system_ids = list(system_ids)
    if not system_ids:
        raise ValueError("system_ids must contain at least one system id")

    # loop-invariant pieces, built once
    in_period = pl.col("creation_timestamp").is_between(start_date, end_date)
    df_picarro = df_p_10m.select("creation_timestamp", "CO2_corr").rename({"CO2_corr": "temp"})

    df_plots = []

    for system_id in system_ids:
        is_system = pl.col("system_name") == f"tum-esm-midcost-raspi-{system_id}"

        df_cal_temp = df_cal.filter(in_period).filter(is_system).collect()

        # calculate calibration corrected dataset
        # The system filter is applied inside the lazy query so that only the rows
        # of this system are materialised by collect().
        # NOTE(review): the forward/backward fill runs in stored row order, before
        # the sort by creation_timestamp - confirm the parquet file is time ordered.
        df_cal_corr = (
            df_dry.filter(is_system)
            .filter(pl.col("gmp343_dry") > 0)
            .filter(in_period)
            .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
            .collect()
            .join(df_cal_temp, on=["date", "system_name"], how="left")
            .fill_null(strategy="forward")
            .fill_null(strategy="backward")
            .with_columns(
                (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
            )
            .sort("creation_timestamp")
            .groupby_dynamic("creation_timestamp", every="10m")
            .agg([
                pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
                pl.col("system_name"),
            ])
        )

        # calculate the difference to the picarro
        df_cal_corr = (
            df_cal_corr.join(df_picarro, on="creation_timestamp", how="left")
            .with_columns((pl.col("CO2_corr") - pl.col("temp")).alias("diff"))
            .drop("temp")
        )

        # Aggregate again on the same 10-minute grid; this aggregation excludes
        # system_name, which is re-added below as a short literal label.
        df_cal_corr = (
            df_cal_corr.groupby_dynamic("creation_timestamp", every="10m")
            .agg([pl.all().exclude(["creation_timestamp", "system_name"]).mean()])
            .with_columns(
                (
                    (pl.col("diff") < accuracy_threshold)
                    & (pl.col("diff") > -accuracy_threshold)
                ).alias("calibration_accuracy")
            )
            .with_columns(pl.lit(f"system-{system_id}").alias("system_name"))
        )

        df_plots.append(df_cal_corr)

    # use the same column names as df_lmu so both sources share the plot axes
    df_plot = pl.concat(df_plots, how="diagonal").rename(
        {"gmp343_temperature": "temperature", "sht45_humidity": "humidity", "bme280_pressure": "pressure"}
    )

    # plot results from before
    fig = px.line(df_plot, x="creation_timestamp", y="diff", color="system_name", title="Difference System - Picarro")
    fig.show()

    df_plot = pl.concat([df_lmu, df_plot], how="diagonal")

    for column in ("pressure", "humidity", "temperature"):
        fig = px.scatter(df_plot, x="creation_timestamp", y=column, color="system_name")
        fig.show()
    
    

In [72]:
# Stack the per-system tables returned by analyse_system into one frame.
df_cluster_analysis = pl.concat(calibration_accuracy_masks, how="diagonal")

# Which system was flagged on which date.
fig = px.scatter(
    df_cluster_analysis,
    x="date",
    y="system_name",
    color="calibration_accuracy",
    title="Cluster Analysis",
)
fig.show()

# Value of the "diff" column per date, coloured by system.
fig = px.scatter(
    df_cluster_analysis,
    x="date",
    y="diff",
    color="system_name",
    title="Cluster Analysis",
)
fig.show()

Let's analyse the dates 31.12, 03.01, 08.01, 28.01, 05.02, 11.02

Maintenance dates:

12.01 Integration of new calibration bottles 3,6,11,13 + check on already installed bottles

05.02 New inlets in the afternoon

13.02 Upgrade of inlets (10:30 - 12:00 local time)

## Good Day

In [None]:
# 2024-01-21 (UTC): systems compared against the Picarro and the LMU meteo data
start_date = datetime(2024, 1, 21, tzinfo=timezone.utc)
end_date = datetime(2024, 1, 21, 23, 59, 59, tzinfo=timezone.utc)

system_ids = [3, 10, 11, 12, 16, 18, 20]

df_lmu = get_lmu_data(
    start_time="2024-01-21T00-00-00",
    end_time="2024-01-21T23-59-59",
)
plot_diff_all_systems(start_date, end_date, system_ids, df_lmu)

## Bad Days

## 11.02

In [None]:
# 2024-02-11 (UTC): systems compared against the Picarro and the LMU meteo data
start_date = datetime(2024, 2, 11, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 11, 23, 59, 59, tzinfo=timezone.utc)

system_ids = [3, 5, 10, 11, 12, 20]

df_lmu = get_lmu_data(
    start_time="2024-02-11T00-00-00",
    end_time="2024-02-11T23-59-59",
)
plot_diff_all_systems(start_date, end_date, system_ids, df_lmu)


## 05.02

In [None]:
# 2024-02-05 (UTC): systems compared against the Picarro and the LMU meteo data
start_date = datetime(2024, 2, 5, tzinfo=timezone.utc)
end_date = datetime(2024, 2, 5, 23, 59, 59, tzinfo=timezone.utc)

system_ids = [3, 10, 11, 12, 16, 18, 20]

df_lmu = get_lmu_data(
    start_time="2024-02-05T00-00-00",
    end_time="2024-02-05T23-59-59",
)
plot_diff_all_systems(start_date, end_date, system_ids, df_lmu)


## 28.01

In [None]:
# 2024-01-28 (UTC): systems compared against the Picarro and the LMU meteo data
start_date = datetime(2024, 1, 28, tzinfo=timezone.utc)
end_date = datetime(2024, 1, 28, 23, 59, 59, tzinfo=timezone.utc)

system_ids = [10, 11, 13, 16, 18, 20]

df_lmu = get_lmu_data(
    start_time="2024-01-28T00-00-00",
    end_time="2024-01-28T23-59-59",
)
plot_diff_all_systems(start_date, end_date, system_ids, df_lmu)

## 08.01

In [None]:
# 2024-01-08 (UTC): systems compared against the Picarro and the LMU meteo data
start_date = datetime(2024, 1, 8, 0, 0, 0).replace(tzinfo=timezone.utc)
end_date = datetime(2024, 1, 8, 23, 59, 59).replace(tzinfo=timezone.utc)

system_ids = [12,16,20]

# The day is zero-padded ("01-08") so the strings use the same
# YYYY-MM-DDTHH-MM-SS layout as every other get_lmu_data call in this notebook.
df_lmu = get_lmu_data(start_time="2024-01-08T00-00-00", end_time= "2024-01-08T23-59-59")
plot_diff_all_systems(start_date, end_date, system_ids, df_lmu)

## 03.01

In [None]:
# 2024-01-03 (UTC): systems compared against the Picarro and the LMU meteo data
start_date = datetime(2024, 1, 3, tzinfo=timezone.utc)
end_date = datetime(2024, 1, 3, 23, 59, 59, tzinfo=timezone.utc)

system_ids = [12, 16, 18, 20]

df_lmu = get_lmu_data(
    start_time="2024-01-03T00-00-00",
    end_time="2024-01-03T23-59-59",
)
plot_diff_all_systems(start_date, end_date, system_ids, df_lmu)

## 31.12

In [None]:
# 2023-12-31 (UTC): systems compared against the Picarro and the LMU meteo data
start_date = datetime(2023, 12, 31, tzinfo=timezone.utc)
end_date = datetime(2023, 12, 31, 23, 59, 59, tzinfo=timezone.utc)

system_ids = [12, 16, 18, 20]

df_lmu = get_lmu_data(
    start_time="2023-12-31T00-00-00",
    end_time="2023-12-31T23-59-59",
)
plot_diff_all_systems(start_date, end_date, system_ids, df_lmu)