In [26]:

import polars as pl
import os
from datetime import datetime, timezone
import plotly.express as px

# Root directories are read from environment variables; each value is None
# when the corresponding variable is not set.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Alias for the original, mis-capitalised spelling so existing references keep working.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [27]:
# Preprocessed 1h acropolis data, opened as a lazy scan of the parquet file
# (rows are only materialised when the query is collected).
df_1h = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1h_acropolis_with_picarro.parquet")
)

In [28]:
# Common cut-off used for the systems that have no individual end date.
end_date = datetime(2024, 7, 2, 23, 59, 59, tzinfo=timezone.utc)

# One (system_id, start, end) tuple per system, all timestamps in UTC.
# The table below lists (system_id, first day, last day); the first day is
# expanded to 00:00:00 and the last day to 23:59:59. A last day of None means
# "use end_date".
# NOTE(review): system 17 ends on 2024-07-07, which is later than end_date
# (2024-07-02) — confirm this is intended.
sbs_times = [
    (
        system_id,
        datetime(*first_day, tzinfo=timezone.utc),
        end_date
        if last_day is None
        else datetime(*last_day, 23, 59, 59, tzinfo=timezone.utc),
    )
    for system_id, first_day, last_day in [
        (1, (2024, 2, 7), (2024, 2, 26)),
        (2, (2024, 3, 13), None),
        (3, (2024, 1, 13), (2024, 2, 18)),
        (4, (2024, 2, 14), None),
        (5, (2024, 2, 7), (2024, 2, 25)),
        (6, (2024, 2, 20), None),
        (7, (2024, 2, 21), (2024, 6, 21)),
        (8, (2024, 2, 13), (2024, 3, 11)),
        (9, (2024, 2, 12), (2024, 4, 6)),
        (10, (2024, 1, 13), (2024, 4, 7)),
        (11, (2024, 1, 12), (2024, 4, 7)),
        (12, (2023, 12, 23), (2024, 2, 11)),
        (13, (2024, 1, 13), (2024, 1, 30)),
        (14, (2024, 3, 3), (2024, 6, 21)),
        (15, (2024, 2, 26), (2024, 6, 12)),
        (16, (2023, 12, 23), (2024, 2, 5)),
        (17, (2024, 3, 28), (2024, 7, 7)),
        (18, (2023, 12, 23), (2024, 2, 5)),
        (19, (2024, 3, 23), None),
        (20, (2023, 12, 23), (2024, 2, 11)),
    ]
]

In [29]:
# Events during sbs: (start, end) intervals in UTC that are removed for every
# system. Rows with start <= creation_timestamp <= end are dropped.
_GLOBAL_CUTS = [
    # roof-top power out + warm up period
    (
        datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc),
        datetime(2024, 1, 17, 0, 0, 0, tzinfo=timezone.utc),
    ),
    # inlet change
    (
        datetime(2024, 2, 5, 13, 0, 0, tzinfo=timezone.utc),
        datetime(2024, 2, 5, 15, 30, 0, tzinfo=timezone.utc),
    ),
    # inlet maintenance
    (
        datetime(2024, 2, 13, 10, 30, 0, tzinfo=timezone.utc),
        datetime(2024, 2, 13, 12, 0, 0, tzinfo=timezone.utc),
    ),
]

# maintenance / power outage from 23.01.2024
_OUTAGE_2024_01_23 = (
    datetime(2024, 1, 23, 14, 0, 0, tzinfo=timezone.utc),
    datetime(2024, 1, 24, 3, 30, 0, tzinfo=timezone.utc),
)

# Additional intervals that are removed only for the listed system ids.
_SYSTEM_CUTS = {
    # wrong configuration for 2nd calibration bottle valve (sampled outside air instead)
    4: [
        (
            datetime(2024, 3, 24, 3, 30, 0, tzinfo=timezone.utc),
            datetime(2024, 3, 29, 3, 30, 0, tzinfo=timezone.utc),
        ),
    ],
    10: [_OUTAGE_2024_01_23],
    11: [_OUTAGE_2024_01_23],
    13: [_OUTAGE_2024_01_23],
}


def _outside_interval(start, end):
    """Return a predicate that is true for rows strictly before `start` or strictly after `end`."""
    timestamp = pl.col("creation_timestamp")
    return (timestamp < start) | (timestamp > end)


def extract_timeframes(df_raw, timeframes=None):
    """Select the valid measurement window of every system.

    All intervals in `_GLOBAL_CUTS` are removed first, then, per system, the
    intervals in `_SYSTEM_CUTS`. What remains is restricted to the system's own
    rows inside its [start, end] window with `gmp343_corrected > 0`.

    Each cut is expressed as a single filter predicate instead of
    concatenating a "before" and an "after" copy of the frame, so the query
    plan does not double in size with every cut. Per-system cuts are chained
    on the running frame, so several cuts for one system accumulate.

    Parameters
    ----------
    df_raw : pl.LazyFrame or pl.DataFrame
        Input with at least the columns `creation_timestamp`, `system_id`
        and `gmp343_corrected`.
    timeframes : iterable of (system_id, start, end), optional
        Windows to extract. Defaults to the notebook-level `sbs_times`.

    Returns
    -------
    pl.DataFrame
        The collected per-system frames concatenated with `how="diagonal"`,
        in the order of `timeframes`. Rows keep their input order within
        each system.

    Raises
    ------
    ValueError
        Raised by `pl.concat` when `timeframes` is empty.
    """
    if timeframes is None:
        timeframes = sbs_times

    # .lazy() is a no-op for a LazyFrame and lets a DataFrame be passed as well.
    lf_base = df_raw.lazy()
    for cut_start, cut_end in _GLOBAL_CUTS:
        lf_base = lf_base.filter(_outside_interval(cut_start, cut_end))

    all_systems = []
    for system_id, start_date, end_date in timeframes:
        print(f"processing {system_id}")

        lf_system = lf_base
        for cut_start, cut_end in _SYSTEM_CUTS.get(system_id, ()):
            lf_system = lf_system.filter(_outside_interval(cut_start, cut_end))

        df_filtered = (
            lf_system
            .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
            .filter(pl.col("system_id") == system_id)
            .filter(pl.col("gmp343_corrected") > 0)
            .collect()
        )
        all_systems.append(df_filtered)

    return pl.concat(all_systems, how="diagonal")
    

In [33]:
print("Processing 1h data:")

# Keep only rows whose `diff` lies strictly inside (-20, 20); rows with a
# null `diff` are dropped by the filter as well.
df_sbs = extract_timeframes(df_1h).filter(
    (pl.col("diff") > -20) & (pl.col("diff") < 20)
)

# Persist the filtered side-by-side data next to the other processed files.
df_sbs.write_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1h_warm_sbs_acropolis.parquet")
)

Processing 1h data:
processing 1
processing 2
processing 3
processing 4
processing 5
processing 6
processing 7
processing 8
processing 9
processing 10
processing 11
processing 12
processing 13
processing 14
processing 15
processing 16
processing 17
processing 18
processing 19
processing 20


In [35]:
# plot daily mean and median per station

# Aggregate `diff` to one row per (date, system). Mean and median do not
# depend on row order, so no sort is needed before grouping. The result is
# sorted afterwards because `group_by` does not guarantee an output order,
# which would otherwise let the row and legend order vary between runs.
df_plot = (
    df_sbs
    .group_by(["date", "sys_name_short"])
    .agg([
        pl.col("diff").mean().alias("daily_mean"),
        pl.col("diff").median().alias("daily_median"),
    ])
    .sort(["sys_name_short", "date"])
)

# One scatter plot per statistic, coloured by system.
for stat_column in ("daily_mean", "daily_median"):
    fig = px.scatter(
        df_plot,
        x="date",
        y=stat_column,
        color="sys_name_short",
        title=f"{stat_column} of diff per system",
    )
    fig.show()