In [1]:
import polars as pl
import os
from datetime import timedelta


# Root folder of the data tree, taken from the DATA_DIRECTORY environment variable.
# NOTE(review): this is None when the variable is unset; the os.path.join calls
# further down would then fail — confirm the variable is always exported.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")

In [2]:
# Load the flagged 1-minute L2 parquet file from the pipeline output folder.
l2_1_min_path = os.path.join(DATA_DIRECTORY, "processed", "pipeline", "flagged_L2_1_min_acropolis.parquet")
df_1_min = pl.read_parquet(l2_1_min_path)

In [3]:
# Build and save the 1 h product for the ICOS Cities portal.
#
# Only 1-minute rows flagged 'O' are aggregated. Per system and per hour:
#   - every remaining column is averaged,
#   - Stdev    : standard deviation of gmp343_corrected,
#   - NbPoints : count of gmp343_corrected values in the hour,
#   - Flag     : 'O' when NbPoints >= 40, 'K' when NbPoints < 40.
MIN_POINTS_PER_HOUR = 40

(
    df_1_min
    .sort("creation_timestamp")
    .filter(pl.col("Flag") == "O")
    .drop("Flag")
    # `group_by` replaces the deprecated `by` keyword of group_by_dynamic.
    .group_by_dynamic("creation_timestamp", every="1h", group_by=["system_id", "sys_name_short"])
    .agg(
        pl.all().exclude(["creation_timestamp", "system_id", "sys_name_short"]).mean(),
        pl.col("gmp343_corrected").std().alias("Stdev"),
        pl.col("gmp343_corrected").count().alias("NbPoints"),
    )
    .with_columns(
        pl.when(pl.col("NbPoints") < MIN_POINTS_PER_HOUR)
        .then(pl.lit("K"))
        .otherwise(pl.lit("O"))
        .alias("Flag"),
        # Shift the window timestamp by +30 min; presumably this moves the label
        # from the start of the hour to its centre (default left-edge label) — confirm.
        pl.col("creation_timestamp") + timedelta(minutes=30),
    )
    .write_parquet(os.path.join(DATA_DIRECTORY, "processed", "pipeline", "flagged_L2_1_h_acropolis.parquet"))
)

  .group_by_dynamic("creation_timestamp", every='1h', by=["system_id", "sys_name_short"]) \
