In [None]:
import polars as pl
import os
from datetime import datetime
from datetime import timezone

# Root directories of the data sets, read from environment variables.
# os.environ.get returns None for an unset variable; the os.path.join(DATA_DIRECTORY, ...)
# calls further down will then fail, so make sure DATA_DIRECTORY is exported.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Backward-compatible alias: the constant was originally spelled with a lowercase "y".
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [None]:
# raw data: lazily scan the downloaded parquet file (nothing is read until .collect())
raw_parquet_path = os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet")
df_raw = pl.scan_parquet(raw_parquet_path)

In [None]:
def _utc_day_start(year, month, day):
    """Return 00:00:00 UTC on the given calendar day."""
    return datetime(year, month, day, tzinfo=timezone.utc)


def _utc_day_end(year, month, day):
    """Return 23:59:59 UTC on the given calendar day."""
    return datetime(year, month, day, 23, 59, 59, tzinfo=timezone.utc)


# End timestamp shared by every entry below that has no individual end date.
ongoing_sbs = _utc_day_end(2024, 3, 11)

# One (system id, start, end) tuple per system; all timestamps are timezone-aware UTC.
sbs_times = [
    (1, _utc_day_start(2024, 2, 7), ongoing_sbs),
    # 2 is still deployed
    (3, _utc_day_start(2024, 1, 13), _utc_day_end(2024, 2, 18)),
    (4, _utc_day_start(2024, 2, 14), ongoing_sbs),
    (5, _utc_day_start(2024, 2, 7), _utc_day_end(2024, 2, 25)),
    (6, _utc_day_start(2024, 2, 20), ongoing_sbs),
    (7, _utc_day_start(2024, 2, 21), ongoing_sbs),
    (8, _utc_day_start(2024, 2, 13), ongoing_sbs),
    (9, _utc_day_start(2024, 2, 12), ongoing_sbs),
    (10, _utc_day_start(2024, 1, 13), ongoing_sbs),
    (11, _utc_day_start(2024, 1, 12), ongoing_sbs),
    (12, _utc_day_start(2023, 12, 23), _utc_day_end(2024, 2, 11)),
    (13, _utc_day_start(2024, 1, 13), _utc_day_end(2024, 1, 30)),
    (14, _utc_day_start(2024, 3, 1), ongoing_sbs),
    (15, _utc_day_start(2024, 2, 21), ongoing_sbs),
    (16, _utc_day_start(2023, 12, 23), _utc_day_end(2024, 2, 5)),
    # 17 needs assembly
    (18, _utc_day_start(2023, 12, 23), _utc_day_end(2024, 2, 5)),
    # 19 needs assembly
    (20, _utc_day_start(2023, 12, 23), _utc_day_end(2024, 2, 11)),
]

In [None]:
# Build one filtered DataFrame per system for the raw data and collect them in all_systems.
all_systems = []

# extract system id: keep rows with cal_bottle_id > 0 and parse the first run of digits
# in system_name into an integer system_id column
df_raw = df_raw.filter(pl.col("cal_bottle_id") > 0).with_columns(
    pl.col("system_name").str.extract(r"(\d+)", 1).str.to_integer().alias("system_id")
)

#--- events during sbs

# Time windows that are removed for every system. A window is dropped including its
# boundaries: only rows with creation_timestamp < start or > end are kept.
sbs_event_windows = [
    # roof-top power out
    (
        datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc),
        datetime(2024, 1, 17, 0, 0, 0, tzinfo=timezone.utc),
    ),
    # inlet change
    (
        datetime(2024, 2, 5, 13, 0, 0, tzinfo=timezone.utc),
        datetime(2024, 2, 5, 15, 30, 0, tzinfo=timezone.utc),
    ),
    # inlet maintenance
    (
        datetime(2024, 2, 13, 10, 30, 0, tzinfo=timezone.utc),
        datetime(2024, 2, 13, 12, 0, 0, tzinfo=timezone.utc),
    ),
]

for window_start, window_end in sbs_event_windows:
    df_raw = df_raw.filter(
        (pl.col("creation_timestamp") < window_start)
        | (pl.col("creation_timestamp") > window_end)
    )

#---

# maintenance / power outage from 23.01.2024, removed only for the listed systems
outage_system_ids = [10, 11, 13]
outage_start = datetime(2024, 1, 23, 14, 0, 0, tzinfo=timezone.utc)
outage_end = datetime(2024, 1, 24, 3, 30, 0, tzinfo=timezone.utc)

for system_id, start_date, end_date in sbs_times:
    print(f"processing {system_id}")

    # Work on a per-system frame. The shared df_raw must NOT be overwritten here:
    # replacing it with a single system's rows leaves nothing for the systems that
    # are processed afterwards.
    df_system = df_raw.filter(pl.col("system_id") == system_id)

    if system_id in outage_system_ids:
        df_system = df_system.filter(
            (pl.col("creation_timestamp") < outage_start)
            | (pl.col("creation_timestamp") > outage_end)
        )

    #-------

    # restrict to the system's own time range and materialize the lazy query
    df_filtered = df_system.filter(
        pl.col("creation_timestamp").is_between(start_date, end_date)
    ).collect()

    all_systems.append(df_filtered)
    

In [None]:
# Preview the combined per-system frames. Built directly from all_systems because
# df_sbs is only assigned in the following cell (a fresh Restart & Run All would
# otherwise raise NameError here).
pl.concat(all_systems, how="diagonal").head(3)

In [None]:
# Stack the per-system frames and persist the combined result as parquet.
df_sbs = pl.concat(all_systems, how="diagonal")
sbs_output_path = os.path.join(DATA_DIRECTORY, "processed", "filtered_sbs_acropolis.parquet")
df_sbs.write_parquet(sbs_output_path)

In [None]:
# 10m preprocessed acropolis data: lazily scan the parquet file from the processed folder
preprocessed_parquet_path = os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_acropolis.parquet")
df_raw = pl.scan_parquet(preprocessed_parquet_path)

In [None]:
# Build one filtered DataFrame per system for the 10m data and collect them in all_systems.
all_systems = []

#--- events during sbs

# Time windows that are removed for every system. A window is dropped including its
# boundaries: only rows with creation_timestamp < start or > end are kept.
sbs_event_windows = [
    # roof-top power out
    (
        datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc),
        datetime(2024, 1, 17, 0, 0, 0, tzinfo=timezone.utc),
    ),
    # inlet change
    (
        datetime(2024, 2, 5, 13, 0, 0, tzinfo=timezone.utc),
        datetime(2024, 2, 5, 15, 30, 0, tzinfo=timezone.utc),
    ),
    # inlet maintenance
    (
        datetime(2024, 2, 13, 10, 30, 0, tzinfo=timezone.utc),
        datetime(2024, 2, 13, 12, 0, 0, tzinfo=timezone.utc),
    ),
]

for window_start, window_end in sbs_event_windows:
    df_raw = df_raw.filter(
        (pl.col("creation_timestamp") < window_start)
        | (pl.col("creation_timestamp") > window_end)
    )

#---

# maintenance / power outage from 23.01.2024, removed only for the listed systems
outage_system_ids = [10, 11, 13]
outage_start = datetime(2024, 1, 23, 14, 0, 0, tzinfo=timezone.utc)
outage_end = datetime(2024, 1, 24, 3, 30, 0, tzinfo=timezone.utc)

for system_id, start_date, end_date in sbs_times:
    print(f"processing {system_id}")

    # Work on a per-system frame. Writing the outage cut back to the shared df_raw
    # would also remove that window from every system processed afterwards, not just
    # from the systems listed in outage_system_ids.
    df_system = df_raw.filter(pl.col("system_id") == system_id)

    if system_id in outage_system_ids:
        df_system = df_system.filter(
            (pl.col("creation_timestamp") < outage_start)
            | (pl.col("creation_timestamp") > outage_end)
        )

    #-------

    # restrict to the system's own time range, keep rows with gmp343_filtered > 0,
    # and materialize the lazy query
    df_filtered = (
        df_system
        .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
        .filter(pl.col("gmp343_filtered") > 0)
        .collect()
    )

    all_systems.append(df_filtered)
    

In [None]:
# Stack the per-system 10m frames and persist the combined result as parquet.
df_sbs = pl.concat(all_systems, how="diagonal")
sbs_10m_output_path = os.path.join(DATA_DIRECTORY, "processed", "filtered_10m_sbs_acropolis.parquet")
df_sbs.write_parquet(sbs_10m_output_path)