In [29]:

import polars as pl
import os
from datetime import datetime, timezone
import plotly.express as px
from plotly.subplots import make_subplots

# Base folders are taken from the environment; os.environ.get yields None
# when a variable is not set.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Backward-compatible alias for the original, misspelled constant name.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [30]:
# Hourly inputs from the "processed" folder: the acropolis frame is scanned
# lazily (LazyFrame), the picarro frame is read eagerly (DataFrame).
acropolis_1h_path = os.path.join(DATA_DIRECTORY, "processed", "1h_acropolis_with_picarro.parquet")
picarro_1h_path = os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_picarro.parquet")

df_1h = pl.scan_parquet(acropolis_1h_path)
df_p_1h = pl.read_parquet(picarro_1h_path)

In [31]:
# Default end of the sbs window (UTC), used by every system without an override.
end_date = datetime(2024, 7, 2, 23, 59, 59, tzinfo=timezone.utc)

# All systems share the same window start (UTC).
sbs_start = datetime(2024, 6, 5, tzinfo=timezone.utc)

# Systems whose window ends at a different time than end_date.
sbs_end_overrides = {
    7: datetime(2024, 6, 21, 23, 59, 59, tzinfo=timezone.utc),
    14: datetime(2024, 6, 21, 23, 59, 59, tzinfo=timezone.utc),
    15: datetime(2024, 6, 12, 23, 59, 59, tzinfo=timezone.utc),
    # NOTE(review): this end lies after end_date - confirm it is intended.
    17: datetime(2024, 7, 7, 23, 59, 59, tzinfo=timezone.utc),
}

# One (system_id, start, end) tuple per system.
sbs_times = [
    (system_id, sbs_start, sbs_end_overrides.get(system_id, end_date))
    for system_id in (2, 4, 6, 7, 14, 15, 17, 19)
]

In [32]:
def _drop_interval(df, start, end):
    """Return ``df`` without the rows whose ``creation_timestamp`` is in [start, end].

    Rows strictly before ``start`` or strictly after ``end`` are kept, in their
    input order; rows on either boundary are dropped.
    """
    timestamp = pl.col("creation_timestamp")
    return df.filter((timestamp < start) | (timestamp > end))


def extract_timeframes(df_raw):
    """Collect the rows of every system listed in ``sbs_times`` into one DataFrame.

    Known event intervals are removed first. Then, for each
    ``(system_id, start, end)`` entry of the module-level ``sbs_times``, the
    frame is restricted to that system, to ``start <= creation_timestamp <= end``
    and to ``gmp343_corrected > 0``.

    Args:
        df_raw: polars LazyFrame (``.collect()`` is called on it) providing the
            columns ``creation_timestamp``, ``system_id`` and ``gmp343_corrected``.

    Returns:
        polars DataFrame with the per-system results concatenated
        (``how="diagonal"``) in the order of ``sbs_times``.
    """
    # --- events during sbs, cut for every system
    global_cuts = [
        # roof-top power out + warm up period
        (
            datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc),
            datetime(2024, 1, 17, 0, 0, 0, tzinfo=timezone.utc),
        ),
        # inlet change
        (
            datetime(2024, 2, 5, 13, 0, 0, tzinfo=timezone.utc),
            datetime(2024, 2, 5, 15, 30, 0, tzinfo=timezone.utc),
        ),
        # inlet maintenance
        (
            datetime(2024, 2, 13, 10, 30, 0, tzinfo=timezone.utc),
            datetime(2024, 2, 13, 12, 0, 0, tzinfo=timezone.utc),
        ),
    ]

    df_clean = df_raw
    for cut_start, cut_end in global_cuts:
        df_clean = _drop_interval(df_clean, cut_start, cut_end)

    all_systems = []

    for system_id, window_start, window_end in sbs_times:
        print(f"processing {system_id}")

        # Per-system cuts are chained on df_system so that one cut never
        # discards another one applied before it.
        df_system = df_clean

        # cut maintenance / power outage from 23.01.2024
        if system_id in (10, 11, 13):
            df_system = _drop_interval(
                df_system,
                datetime(2024, 1, 23, 14, 0, 0, tzinfo=timezone.utc),
                datetime(2024, 1, 24, 3, 30, 0, tzinfo=timezone.utc),
            )

        # wrong configuration for 2nd calibration bottle valve (sampled outside air instead)
        if system_id == 4:
            df_system = _drop_interval(
                df_system,
                datetime(2024, 3, 24, 3, 30, 0, tzinfo=timezone.utc),
                datetime(2024, 3, 29, 3, 30, 0, tzinfo=timezone.utc),
            )

        df_filtered = (
            df_system
            # is_between includes both window boundaries
            .filter(pl.col("creation_timestamp").is_between(window_start, window_end))
            .filter(pl.col("system_id") == system_id)
            .filter(pl.col("gmp343_corrected") > 0)
            .collect()
        )

        all_systems.append(df_filtered)

    return pl.concat(all_systems, how="diagonal")
    

In [33]:
print("Processing 1h data:")

# Keep only rows whose "diff" lies strictly between -20 and 20, then store
# the result next to the other processed files.
diff_in_range = pl.col("diff").is_between(-20, 20, closed="none")
df_sbs = extract_timeframes(df_1h).filter(diff_in_range)

sbs_output_path = os.path.join(DATA_DIRECTORY, "processed", "1h_warm_sbs_acropolis.parquet")
df_sbs.write_parquet(sbs_output_path)

Processing 1h data:
processing 2
processing 4
processing 6
processing 7
processing 14
processing 15
processing 17
processing 19


In [34]:
# plot daily mean and daily median of "diff" per station

df_plot = (
    df_sbs
    .group_by(["date", "sys_name_short"])
    .agg([
        pl.col("diff").mean().alias("daily_mean"),
        pl.col("diff").median().alias("daily_median"),
    ])
    # group_by does not guarantee the order of its output, so sort after
    # aggregating to get the same trace/legend order on every run.
    .sort(["sys_name_short", "date"])
)

for column, title in (
    ("daily_mean", "Daily mean of diff per station"),
    ("daily_median", "Daily median of diff per station"),
):
    fig = px.scatter(df_plot, x="date", y=column, color="sys_name_short", title=title)
    fig.show()

In [38]:
# Stack the picarro frame and the filtered acropolis frame under a shared
# "CO2" column; the diagonal concat fills columns missing on one side with null.
df_plot_1 = df_p_1h.select(
    ["creation_timestamp", "sys_name_short", "picarro_corrected"]
).rename({"picarro_corrected": "CO2"})
df_plot_2 = df_sbs.select(
    [
        "creation_timestamp",
        "sys_name_short",
        "gmp343_corrected",
        "gmp343_temperature",
        "gmp343_temperature_change",
        "diff",
    ]
).rename({"gmp343_corrected": "CO2"})
df_plot = pl.concat([df_plot_1, df_plot_2], how="diagonal")

# Quality Check: restrict to the inspected time window (boundaries included)
start = datetime(2024, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
end = datetime(2024, 7, 2, 0, 0, 0, tzinfo=timezone.utc)

df_temp = df_plot.filter(pl.col("creation_timestamp").is_between(start, end))

# One scatter figure per quantity, stacked top to bottom in this order.
figures = [
    px.scatter(df_temp, x="creation_timestamp", y=column, color="sys_name_short")
    for column in ("CO2", "diff", "gmp343_temperature_change", "gmp343_temperature")
]

fig = make_subplots(rows=len(figures), cols=1)

for row, figure in enumerate(figures, start=1):
    for trace in figure["data"]:
        # add_trace with row/col replaces the deprecated append_trace
        fig.add_trace(trace, row=row, col=1)

fig.update_layout(title_text='Plot', height=1000, width=1300, showlegend=False)
fig.show()