In [14]:
import polars as pl
import os
from datetime import datetime, timezone
import plotly.express as px
from plotly.subplots import make_subplots

DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORy = os.environ.get("PICARRO_DATA_DIRECTORY")

In [15]:
# preprocessed acropolis data
df_1h = pl.read_parquet(os.path.join(DATA_DIRECTORY, "processed", "side-by-side", "1_h_diff_with_picarro.parquet"))
df_p_1h = pl.read_parquet(os.path.join(DATA_DIRECTORY,"processed", "picarro", "Calibrated_1_h_DWD_Picarro_G2301_413.parquet"))

In [16]:
sbs_times = [
    (
        1,
        datetime(2024, 2, 7, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 2, 26, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        2,
        datetime(2024, 3, 13, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 17, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        3,
        datetime(2024, 1, 13, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 2, 18, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        4,
        datetime(2024, 2, 14, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 3, 20, 0, 0, 0).replace(tzinfo=timezone.utc),
    ),
    (
        5,
        datetime(2024, 2, 7, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 2, 25, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        6,
        datetime(2024, 2, 20, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 17, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        7,
        datetime(2024, 2, 21, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 17, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        8,
        datetime(2024, 2, 13, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 3, 11, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        9,
        datetime(2024, 2, 12, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 5, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        10,
        datetime(2024, 1, 13, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 7, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        11,
        datetime(2024, 1, 12, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 7, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        12,
        datetime(2023, 12, 23, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 2, 11, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        13,
        datetime(2024, 1, 13, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 1, 30, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        14,
        datetime(2024, 3, 3, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 17, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        15,
        datetime(2024, 2, 26, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 17, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        16,
        datetime(2023, 12, 23, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 2, 5, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        17,
        datetime(2024, 3, 28, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 17, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        18,
        datetime(2023, 12, 23, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 2, 5, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        19,
        datetime(2024, 3, 23, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 4, 17, 9, 59, 59).replace(tzinfo=timezone.utc),
    ),
    (
        20,
        datetime(2023, 12, 23, 0, 0, 0).replace(tzinfo=timezone.utc),
        datetime(2024, 2, 11, 23, 59, 59).replace(tzinfo=timezone.utc),
    ),
]

In [17]:
def extract_timeframes(df_raw):
    all_systems = []

    #--- events during sbs

    # cut roof-top power out + warm up period
    before = df_raw.filter(pl.col("creation_timestamp") < datetime(2024, 1, 15, 18, 0, 0).replace(tzinfo=timezone.utc))
    after = df_raw.filter(pl.col("creation_timestamp") > datetime(2024, 1, 17, 0, 0, 0).replace(tzinfo=timezone.utc))
        
    df_raw = pl.concat([before, after], how="diagonal")

    # cut inlet change
    before = df_raw.filter(pl.col("creation_timestamp") < datetime(2024, 2, 5, 13, 0, 0).replace(tzinfo=timezone.utc))
    after = df_raw.filter(pl.col("creation_timestamp") > datetime(2024, 2, 5, 15, 30, 0).replace(tzinfo=timezone.utc))
        
    df_raw = pl.concat([before, after], how="diagonal")

    # cut inlet maintainence
    before = df_raw.filter(pl.col("creation_timestamp") < datetime(2024, 2, 13, 10, 30, 0).replace(tzinfo=timezone.utc))
    after = df_raw.filter(pl.col("creation_timestamp") > datetime(2024, 2, 13, 12, 0, 0).replace(tzinfo=timezone.utc))
        
    df_raw = pl.concat([before, after], how="diagonal")

    #---

    for id, start_date, end_date in sbs_times:
        print(f"processing {id}")
        
        df_temp = df_raw
        
        #cut maintainence / power outtage from 23.01.2024
        if id in [10,11,13]:
            before = df_raw.filter(pl.col("creation_timestamp") < datetime(2024, 1, 23, 14, 0, 0).replace(tzinfo=timezone.utc))
            after = df_raw.filter(pl.col("creation_timestamp") > datetime(2024, 1, 24, 3, 30, 0).replace(tzinfo=timezone.utc))
            
            df_temp = pl.concat([before, after], how="diagonal")
            
        # wrong configuration for 2nd calibration bottle valve (sampled outside air instead)
        if id == 4:
            before = df_raw.filter(pl.col("creation_timestamp") < datetime(2024, 3, 24, 3, 30, 0).replace(tzinfo=timezone.utc))
            after = df_raw.filter(pl.col("creation_timestamp") > datetime(2024, 3, 29, 3, 30, 0).replace(tzinfo=timezone.utc))
            
            df_temp = pl.concat([before, after], how="diagonal")

        
        #-------

        df_filtered = df_temp.filter(pl.col("creation_timestamp").is_between(start_date, end_date))  \
            .filter(pl.col("system_id") == id)  \
            .filter(pl.col("co2") > 0) \
            
        all_systems.append(df_filtered)
        
    return pl.concat(all_systems, how="diagonal")
    

In [18]:
#print("Processing 10m data:")
#df_sbs = extract_timeframes(df_10m)
#df_sbs.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "10m_sbs_acropolis.parquet"))

print("Processing 1h data:")
df_sbs = extract_timeframes(df_1h).with_columns(date = pl.col("creation_timestamp").dt.date())
df_sbs.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "side-by-side", "1_h_sbs_period_acropolis.parquet"))

Processing 1h data:
processing 1
processing 2
processing 3
processing 4
processing 5
processing 6
processing 7
processing 8
processing 9
processing 10
processing 11
processing 12
processing 13
processing 14
processing 15
processing 16
processing 17
processing 18
processing 19
processing 20


In [19]:
# plot daily mean per station

df_plot = df_sbs.sort("creation_timestamp") \
    .group_by([pl.col("date"), pl.col("sys_name_short")]) \
    .agg([
        (pl.col("diff").mean()).alias("daily_mean"),
        (pl.col("diff").median()).alias("daily_median")
        ]) 
    
fig = px.scatter(df_plot, x="date", y="daily_mean", color = "sys_name_short")
fig.show()
fig = px.scatter(df_plot, x="date", y="daily_median", color = "sys_name_short")
fig.show()

In [21]:
df_p_1h = df_p_1h.with_columns(sys_name_short = pl.lit("Picarro"))

In [22]:
df_sbs

system_id,sys_name_short,creation_timestamp,co2,sensor_temperature,h2o,ws,wd,OriginalFlag,Flag,picarro_corrected,diff,std,date
i64,str,"datetime[μs, UTC]",f32,f64,f64,f64,f64,f64,str,f64,f64,f32,date
1,"""acropolis-1""",2024-02-07 00:00:00 UTC,434.258606,27.36152,0.866406,,,0.0,,434.095549,0.162983,0.917915,2024-02-07
1,"""acropolis-1""",2024-02-07 01:00:00 UTC,433.584625,27.26994,0.865792,,,0.0,,433.758836,-0.174289,0.964072,2024-02-07
1,"""acropolis-1""",2024-02-07 02:00:00 UTC,432.616638,27.199138,0.863854,,,0.0,,432.898855,-0.28219,0.802239,2024-02-07
1,"""acropolis-1""",2024-02-07 03:00:00 UTC,432.564117,27.215217,0.861336,,,0.0,,432.805598,-0.241462,0.770869,2024-02-07
1,"""acropolis-1""",2024-02-07 04:00:00 UTC,434.090302,27.827401,0.685201,,,0.0,,434.324278,-0.233952,1.573511,2024-02-07
1,"""acropolis-1""",2024-02-07 05:00:00 UTC,434.586945,27.429583,0.81374,,,0.0,,434.594763,-0.007819,1.269557,2024-02-07
1,"""acropolis-1""",2024-02-07 06:00:00 UTC,433.299316,27.285278,0.842763,,,0.0,,433.858231,-0.55891,0.883854,2024-02-07
1,"""acropolis-1""",2024-02-07 07:00:00 UTC,432.844788,27.370833,0.856129,,,0.0,,433.406828,-0.561938,1.11164,2024-02-07
1,"""acropolis-1""",2024-02-07 08:00:00 UTC,431.500427,27.486389,0.86548,,,0.0,,431.822734,-0.322267,1.817399,2024-02-07
1,"""acropolis-1""",2024-02-07 09:00:00 UTC,429.803192,27.6025,0.881236,,,0.0,,429.467027,0.336136,1.016455,2024-02-07


In [23]:
df_plot_1 = df_p_1h[["creation_timestamp", "sys_name_short", "picarro_corrected"]].rename({"picarro_corrected": "CO2"})
df_plot_2 = df_sbs[["creation_timestamp", "sys_name_short", "co2","sensor_temperature", "diff"]].rename({"co2": "CO2"})
df_plot = pl.concat([df_plot_1,df_plot_2], how="diagonal")

# Quality Check
start = datetime(2024, 2, 14, 12, 0, 0).replace(tzinfo=timezone.utc)
end = datetime(2024, 4, 17, 0, 0, 0).replace(tzinfo=timezone.utc)

df_temp = df_plot.filter(pl.col("creation_timestamp").is_between(start, end)).filter(pl.col("sys_name_short").is_in(["acropolis-14", "acropolis-7","acropolis-6","acropolis-4",  "acropolis-2", "Picarro"]))

figures = [
            px.scatter(df_temp, x="creation_timestamp", y = "CO2", color = "sys_name_short"),
            px.scatter(df_temp, x="creation_timestamp", y = "diff", color = "sys_name_short"),
            px.scatter(df_temp, x="creation_timestamp", y = "gmp343_temperature", color = "sys_name_short")
            
    ]

fig = make_subplots(rows=len(figures), cols=1) 

for i, figure in enumerate(figures):
    for trace in range(len(figure["data"])):
        fig.append_trace(figure["data"][trace], row=i+1, col=1)

fig.update_layout(title_text='Plot', height=1000, width=1300, showlegend=False) 
fig.show()

SchemaError: cannot extend/append Float64 with Float32

In [None]:
# warm week

df_plot_1 = df_p_1h[["creation_timestamp", "sys_name_short", "picarro_corrected"]].rename({"picarro_corrected": "CO2"})
df_plot_2 = df_sbs[["creation_timestamp", "sys_name_short", "gmp343_corrected"," sensor_temperature"]].rename({"gmp343_corrected": "CO2"})
df_plot = pl.concat([df_plot_1,df_plot_2], how="diagonal")

# Quality Check
start = datetime(2024, 4, 4, 0, 0, 0).replace(tzinfo=timezone.utc)
end = datetime(2024, 4, 10, 0, 0, 0).replace(tzinfo=timezone.utc)

df_temp = df_plot.filter(pl.col("creation_timestamp").is_between(start, end)).filter(pl.col("sys_name_short").is_in(["acropolis-14", "acropolis-7","acropolis-6","acropolis-4",  "acropolis-2", "Picarro"]))

figures = [
            px.scatter(df_temp, x="creation_timestamp", y = "CO2", color = "sys_name_short"),
            #px.scatter(df_temp, x="creation_timestamp", y = "diff", color = "sys_name_short"),
            px.scatter(df_temp, x="creation_timestamp", y = "gmp343_temperature", color = "sys_name_short")
            
    ]

fig = make_subplots(rows=len(figures), cols=1) 

for i, figure in enumerate(figures):
    for trace in range(len(figure["data"])):
        fig.append_trace(figure["data"][trace], row=i+1, col=1)

fig.update_layout(title_text='Plot', height=1000, width=1300, showlegend=False) 
fig.show()