In [1]:

import polars as pl
import os
from datetime import datetime, timezone
import plotly.express as px

# Dataset root directories, read from the environment (None when the
# variable is not set).
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Backward-compatible alias: the constant was originally spelled with a
# lower-case trailing "y".
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

# local imports
import os
import sys
# Put the parent of the current working directory on sys.path exactly once,
# presumably so the project-local `utils` package can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from utils.sbs_times import sbs_times

In [2]:
# Lazily scan the preprocessed acropolis data at 10m and 1h resolution.
# Nothing is read from disk until a query on these frames is collected.
processed_dir = os.path.join(DATA_DIRECTORY, "processed")
df_10m = pl.scan_parquet(os.path.join(processed_dir, "10m_cal_corr_acropolis.parquet"))
df_1h = pl.scan_parquet(os.path.join(processed_dir, "1h_cal_corr_acropolis.parquet"))

In [3]:
def _drop_interval(frame, start, end):
    """Return `frame` without the rows whose creation_timestamp lies in [start, end]."""
    timestamp = pl.col("creation_timestamp")
    return frame.filter((timestamp < start) | (timestamp > end))


def extract_timeframes(df_raw):
    """Collect the sbs measurement window of every system into one DataFrame.

    Known disturbance intervals are removed first. For every
    ``(system_id, start_date, end_date)`` entry in ``sbs_times`` the rows of
    that system inside ``[start_date, end_date]`` with a positive
    ``gmp343_corrected`` value are then collected.

    Args:
        df_raw: polars LazyFrame providing at least the columns
            ``creation_timestamp``, ``system_id`` and ``gmp343_corrected``.

    Returns:
        A polars DataFrame: the diagonal concatenation of the collected
        per-system frames, in ``sbs_times`` order.
    """
    # --- events during sbs that apply to every system
    shared_cuts = (
        # roof-top power outage
        (datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc),
         datetime(2024, 1, 17, 0, 0, 0, tzinfo=timezone.utc)),
        # inlet change
        (datetime(2024, 2, 5, 13, 0, 0, tzinfo=timezone.utc),
         datetime(2024, 2, 5, 15, 30, 0, tzinfo=timezone.utc)),
        # inlet maintenance
        (datetime(2024, 2, 13, 10, 30, 0, tzinfo=timezone.utc),
         datetime(2024, 2, 13, 12, 0, 0, tzinfo=timezone.utc)),
    )
    # A single boolean filter per interval keeps the lazy query plan linear;
    # concatenating "before" and "after" frames doubles it with every cut.
    for cut_start, cut_end in shared_cuts:
        df_raw = _drop_interval(df_raw, cut_start, cut_end)

    # --- maintenance / power outage from 23.01.2024, only for these systems
    maintenance_systems = (10, 11, 13)
    maintenance_start = datetime(2024, 1, 23, 14, 0, 0, tzinfo=timezone.utc)
    maintenance_end = datetime(2024, 1, 24, 3, 30, 0, tzinfo=timezone.utc)

    all_systems = []

    for system_id, start_date, end_date in sbs_times:
        print(f"processing {system_id}")

        # Work on a per-system frame: writing the cut back to df_raw would
        # also remove this interval for every system processed afterwards.
        system_frame = df_raw
        if system_id in maintenance_systems:
            system_frame = _drop_interval(system_frame, maintenance_start, maintenance_end)

        df_filtered = (
            system_frame
            .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
            .filter(pl.col("system_id") == system_id)
            .filter(pl.col("gmp343_corrected") > 0)
            .collect()
        )

        all_systems.append(df_filtered)

    return pl.concat(all_systems, how="diagonal")
    

In [4]:
# Extract the sbs windows for both resolutions and persist each result.
# df_sbs is rebound on every pass, so it ends up holding the 1h result,
# which is what the plotting code further down reads.
sbs_jobs = (
    ("Processing 10m data:", df_10m, "10m_sbs_acropolis.parquet"),
    ("Processing 1h data:", df_1h, "1h_sbs_acropolis.parquet"),
)
for banner, lazy_frame, out_name in sbs_jobs:
    print(banner)
    df_sbs = extract_timeframes(lazy_frame)
    df_sbs.write_parquet(os.path.join(DATA_DIRECTORY, "processed", out_name))

Processing 10m data:
processing 1
processing 3
processing 4
processing 5
processing 6
processing 7
processing 8
processing 9
processing 10
processing 11
processing 12
processing 13
processing 14
processing 15
processing 16
processing 18
processing 20
Processing 1h data:
processing 1
processing 3
processing 4
processing 5
processing 6
processing 7
processing 8
processing 9
processing 10
processing 11
processing 12
processing 13
processing 14
processing 15
processing 16
processing 18
processing 20


In [5]:
# plot daily mean and median of `diff` per station

# group_by() gives no ordering guarantee, so sorting before it has no effect
# on the aggregates. Sort the aggregated frame instead so that the legend
# order and colour assignment are the same on every run.
df_plot = (
    df_sbs
    .group_by(["date", "system_name"])
    .agg([
        pl.col("diff").mean().alias("daily_mean"),
        pl.col("diff").median().alias("daily_median"),
    ])
    .sort(["system_name", "date"])
)

fig = px.scatter(
    df_plot, x="date", y="daily_mean", color="system_name",
    title="Daily mean of diff per system",
)
fig.show()
fig = px.scatter(
    df_plot, x="date", y="daily_median", color="system_name",
    title="Daily median of diff per system",
)
fig.show()