In [23]:
import polars as pl
import os
from datetime import datetime, timezone
import plotly.express as px
import numpy as np

# Root folder of the input data, read from the DATA_DIRECTORY environment
# variable. The value is None when the variable is not set.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")

In [24]:
# Slope/intercept table, ordered in time and tagged with the day-of-year of
# every row (used further down to thin out the calibrations).
df_slope_intercept = (
    pl.read_parquet(
        os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet")
    )
    .sort("creation_timestamp")
    .with_columns(ordinal_day=pl.col("creation_timestamp").dt.ordinal_day())
)

# 1-minute data; the system_name column is not needed downstream.
df_1min = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1m_acropolis_dry.parquet")
).drop("system_name")

# Hourly Picarro data, limited to [start, end] (both bounds included) and to
# the three columns used later.
start = datetime(2023, 12, 1, 12, 0, 0, tzinfo=timezone.utc)
end = datetime(2024, 4, 17, 0, 0, 0, tzinfo=timezone.utc)
df_p_1h = (
    pl.read_parquet(
        os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_picarro.parquet")
    )
    .filter(pl.col("creation_timestamp").is_between(start, end))
    .select("creation_timestamp", "sys_name_short", "picarro_corrected")
)

In [25]:
def _utc(year, month, day, hour=0, minute=0, second=0):
    """Return a timezone-aware UTC datetime for the given calendar fields."""
    return datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)


# One entry per system: (system_id, window start, window end), all in UTC.
# Only rows of a system that fall inside its window (bounds included) are kept.
sbs_times = [
    (1, _utc(2024, 2, 7), _utc(2024, 2, 26, 23, 59, 59)),
    (2, _utc(2024, 3, 13), _utc(2024, 4, 17, 9, 59, 59)),
    (3, _utc(2024, 1, 13), _utc(2024, 2, 18, 23, 59, 59)),
    (4, _utc(2024, 2, 14), _utc(2024, 3, 20)),
    (5, _utc(2024, 2, 7), _utc(2024, 2, 25, 23, 59, 59)),
    (6, _utc(2024, 2, 20), _utc(2024, 4, 17, 9, 59, 59)),
    (7, _utc(2024, 2, 21), _utc(2024, 4, 17, 9, 59, 59)),
    (8, _utc(2024, 2, 13), _utc(2024, 3, 11, 23, 59, 59)),
    (9, _utc(2024, 2, 12), _utc(2024, 4, 5, 9, 59, 59)),
    (10, _utc(2024, 1, 13), _utc(2024, 4, 7, 23, 59, 59)),
    (11, _utc(2024, 1, 12), _utc(2024, 4, 7, 23, 59, 59)),
    (12, _utc(2023, 12, 23), _utc(2024, 2, 11, 23, 59, 59)),
    (13, _utc(2024, 1, 13), _utc(2024, 1, 30, 23, 59, 59)),
    (14, _utc(2024, 3, 3), _utc(2024, 4, 17, 9, 59, 59)),
    (15, _utc(2024, 2, 26), _utc(2024, 4, 17, 9, 59, 59)),
    (16, _utc(2023, 12, 23), _utc(2024, 2, 5, 23, 59, 59)),
    (17, _utc(2024, 3, 28), _utc(2024, 4, 17, 9, 59, 59)),
    (18, _utc(2023, 12, 23), _utc(2024, 2, 5, 23, 59, 59)),
    (19, _utc(2024, 3, 23), _utc(2024, 4, 17, 9, 59, 59)),
    (20, _utc(2023, 12, 23), _utc(2024, 2, 11, 23, 59, 59)),
]

def _drop_interval(df, start, end):
    """Return ``df`` without the rows whose ``creation_timestamp`` is in [start, end].

    Both bounds are removed as well, i.e. only rows strictly before ``start``
    or strictly after ``end`` survive. The row order of ``df`` is preserved.
    """
    return df.filter(
        ~pl.col("creation_timestamp").is_between(start, end, closed="both")
    )


def extract_timeframes(df_input):
    """Keep, for every system in ``sbs_times``, only the rows inside its window.

    Before the per-system windows are applied, a few known disturbance
    intervals are removed: three that apply to all systems and two that apply
    to selected systems only.

    Parameters
    ----------
    df_input : pl.DataFrame
        Frame with at least the columns ``creation_timestamp`` and
        ``system_id``.

    Returns
    -------
    pl.DataFrame
        Diagonal concatenation of the filtered rows of all systems listed in
        ``sbs_times``, in the order of that list. Rows of systems that are not
        listed are dropped.
    """
    utc = timezone.utc

    # --- events during sbs that affect every system
    shared_cuts = [
        # roof-top power out + warm up period
        (datetime(2024, 1, 15, 18, 0, 0, tzinfo=utc), datetime(2024, 1, 17, 0, 0, 0, tzinfo=utc)),
        # inlet change
        (datetime(2024, 2, 5, 13, 0, 0, tzinfo=utc), datetime(2024, 2, 5, 15, 30, 0, tzinfo=utc)),
        # inlet maintenance
        (datetime(2024, 2, 13, 10, 30, 0, tzinfo=utc), datetime(2024, 2, 13, 12, 0, 0, tzinfo=utc)),
    ]

    # --- events that affect single systems only
    # maintenance / power outage from 23.01.2024
    outage_cut = (
        datetime(2024, 1, 23, 14, 0, 0, tzinfo=utc),
        datetime(2024, 1, 24, 3, 30, 0, tzinfo=utc),
    )
    # wrong configuration for 2nd calibration bottle valve (sampled outside air instead)
    # NOTE(review): the window listed for system 4 in sbs_times ends on
    # 2024-03-20, so this cut currently removes nothing - confirm whether the
    # window end or the cut dates are the intended ones.
    valve_cut = (
        datetime(2024, 3, 24, 3, 30, 0, tzinfo=utc),
        datetime(2024, 3, 29, 3, 30, 0, tzinfo=utc),
    )
    system_cuts = {
        10: [outage_cut],
        11: [outage_cut],
        13: [outage_cut],
        4: [valve_cut],
    }

    for cut_start, cut_end in shared_cuts:
        df_input = _drop_interval(df_input, cut_start, cut_end)

    all_systems = []
    for system_id, start_date, end_date in sbs_times:
        df_system = df_input.filter(pl.col("system_id") == system_id)

        for cut_start, cut_end in system_cuts.get(system_id, ()):
            df_system = _drop_interval(df_system, cut_start, cut_end)

        all_systems.append(
            df_system.filter(
                pl.col("creation_timestamp").is_between(start_date, end_date)
            )
        )

    return pl.concat(all_systems, how="diagonal")
    

In [26]:
# Reduce both frames to the per-system windows (1-minute data first, then the
# slope/intercept table). The results replace the unfiltered frames.
df_1min, df_slope_intercept = (
    extract_timeframes(frame) for frame in (df_1min, df_slope_intercept)
)

In [27]:
# System to inspect visually. A dedicated name is used so the builtin `id`
# is not rebound at notebook scope.
inspect_system_id = 1

# One line plot each for the filtered 1-minute signal and for the slope and
# intercept of the same system.
for frame, y_column in (
    (df_1min, "gmp343_dry"),
    (df_slope_intercept, "slope"),
    (df_slope_intercept, "intercept"),
):
    fig = px.line(
        frame.filter(pl.col("system_id") == inspect_system_id),
        x="creation_timestamp",
        y=y_column,
        color="system_id",
    )
    fig.show()

In [28]:
def perform_calibration_correction(sensor_id:list = [1], frequency_days: int = 2):
    """Compute the per-system RMSE against the Picarro data for one calibration frequency.

    For every system in ``sensor_id`` the function

    1. keeps the slope/intercept rows whose ``ordinal_day`` is a multiple of
       ``frequency_days``,
    2. attaches them to the 1-minute data (nearest match within 10 minutes),
       interpolates between matches and fills the leading/trailing gaps,
    3. applies ``gmp343_dry * slope + intercept``,
    4. averages the corrected signal per hour and joins it with ``df_p_1h``
       on ``creation_timestamp``,
    5. reduces the hourly differences to a single RMSE.

    Reads the notebook-level frames ``df_slope_intercept``, ``df_1min`` and
    ``df_p_1h``.

    Parameters
    ----------
    sensor_id : list
        System ids to evaluate. The default list is only read, never mutated.
    frequency_days : int
        Keep only calibrations from days whose ordinal day is divisible by
        this value. Must be >= 1.

    Returns
    -------
    pl.DataFrame
        One row per system that has overlapping hourly data, with the columns
        ``system_id`` and ``rmse_{frequency_days}days``.

    Raises
    ------
    ValueError
        If ``frequency_days`` is smaller than 1.

    Notes
    -----
    ``ordinal_day`` restarts at 1 on every 1 January, so across a year change
    the spacing between selected calibration days is not exactly
    ``frequency_days``.
    """
    if frequency_days < 1:
        # A zero/negative divisor would silently select no calibration at all.
        raise ValueError(f"frequency_days must be >= 1, got {frequency_days}")

    rmse_column = f"rmse_{frequency_days}days"
    error = pl.col("picarro_corrected") - pl.col("gmp343_corrected")
    rmse = (error * error).mean().sqrt().alias(rmse_column)

    # The day-of-year thinning does not depend on the system: do it once.
    df_calibrations = df_slope_intercept.filter(
        pl.col("ordinal_day") % frequency_days == 0
    )

    df_systems = []
    for system_id in sensor_id:
        df_calibrations_id = (
            df_calibrations
            .filter(pl.col("system_id") == system_id)
            .sort("creation_timestamp")
            .drop("system_id")
        )

        df_system = (
            df_1min.filter(pl.col("system_id") == system_id)
            .sort("creation_timestamp")
            .join_asof(df_calibrations_id, on="creation_timestamp", strategy="nearest", tolerance="10m")
            # Rows without a calibration within the tolerance are interpolated
            # between neighbouring matches; the edges are filled afterwards.
            .with_columns([
                pl.col("slope").interpolate(),
                pl.col("intercept").interpolate(),
            ])
            .with_columns([
                pl.col("slope").forward_fill().backward_fill(),
                pl.col("intercept").forward_fill().backward_fill(),
            ])
            .with_columns(
                (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("gmp343_corrected")
            )
            .select("creation_timestamp", "system_id", "gmp343_corrected")
            .group_by_dynamic("creation_timestamp", every='1h', by="system_id")
            .agg(pl.col("gmp343_corrected").mean())
            # Inner join: only hours present in both data sets are compared.
            .join(df_p_1h, on="creation_timestamp")
            .group_by("system_id")
            .agg(rmse)
            .sort("system_id")
        )

        df_systems.append(df_system)

    return pl.concat(df_systems, how="vertical")

In [29]:
# Systems to evaluate: ids 1 to 20 (use [1] for a quick single-system run).
sensor_id = [*range(1, 21)]

# Start with the 1-day result, then add one RMSE column per further frequency
# (2 to 10 days) by joining on system_id.
df_plot = perform_calibration_correction(sensor_id=sensor_id, frequency_days=1)

for frequency_days in range(2, 11):
    df_rmse = perform_calibration_correction(
        sensor_id=sensor_id, frequency_days=frequency_days
    )
    df_plot = df_plot.join(df_rmse, on="system_id")

In [30]:
# Display the joined result: one row per system_id, one rmse_<n>days column
# per evaluated frequency.
df_plot

system_id,rmse_1days,rmse_2days,rmse_3days,rmse_4days,rmse_5days,rmse_6days,rmse_7days,rmse_8days,rmse_9days,rmse_10days
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,1.065359,1.10005,1.687015,1.366203,2.297369,1.73975,2.001627,1.214418,1.378633,2.297369
2,0.685147,1.007577,0.733174,1.029758,0.845497,1.028979,0.738798,1.054241,1.351022,1.054241
3,1.343295,1.332189,1.414858,1.753237,1.44476,1.994411,1.44558,1.981701,1.96929,1.791542
4,1.262997,1.260741,1.918129,1.50318,1.642934,1.916499,1.769396,1.508435,1.719027,1.642934
5,0.932753,0.963522,0.957468,0.804556,0.948707,1.071135,1.145928,0.923212,0.798284,0.948707
6,1.172259,1.148265,1.272518,1.109891,1.483734,1.260355,1.34366,1.267475,1.361371,1.451034
7,0.784584,1.143699,0.772233,1.139394,0.917841,1.122866,0.926049,1.718271,1.205283,1.70784
8,1.602697,1.606961,2.191018,1.940991,2.370201,2.202978,1.996692,1.984873,2.093081,2.370201
9,0.850758,0.889207,0.955493,0.914981,1.344151,0.991115,1.177734,1.349499,1.152981,1.344151
10,1.632818,1.610787,1.916721,1.55569,1.634763,2.060196,1.695987,2.513203,2.256929,1.619751


In [35]:
# One figure per system: RMSE as a function of the calibration frequency.
# The names `list` and `id` are deliberately not used as variables here, so the
# builtins stay usable in later cells.
for system_id in sensor_id:
    df_rmse_row = df_plot.filter(pl.col("system_id") == system_id).drop("system_id")

    if df_rmse_row.height == 0:
        # A system drops out of df_plot when it has no result for one of the
        # frequencies (the columns are combined with inner joins).
        print(f"no RMSE values for system {system_id}, skipping plot")
        continue

    # Column order is rmse_1days ... rmse_<n>days, so position k (1-based)
    # corresponds to a frequency of k days.
    rmse_values = [*df_rmse_row.row(0)]

    fig = px.line(
        x=range(1, len(rmse_values) + 1),
        y=rmse_values,
        labels={"x": "calibration frequency (days)", "y": "RMSE"},
    )
    fig.update_layout(title=f"{system_id}", yaxis_range=[0.5, 2.5])
    fig.show()

In [None]:
# plot as relative performance and stack