In [None]:
import polars as pl
import os
from datetime import datetime
from datetime import timezone
import plotly.express as px
import numpy as np

# Root directories for the input data, read from the environment.
# os.environ.get returns None when the variable is not set.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")
# Original (misspelled) name, kept as an alias so existing references keep working.
PICARRO_DATA_DIRECTORy = PICARRO_DATA_DIRECTORY

In [None]:
# Preprocessed acropolis data, opened lazily (nothing is read until .collect()).
# NOTE(review): an earlier comment called this 10-minute data, but the file name
# says "1h" — confirm the actual resolution.
if DATA_DIRECTORY is None:
    # Fail with a clear message instead of the TypeError os.path.join raises on None.
    raise RuntimeError("DATA_DIRECTORY environment variable is not set")

df_acropolis = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1h_sbs_rmse_acropolis.parquet")
)

In [None]:
# Lazy query: mean of slope, intercept and system_id for every (date, system_name) pair.
# NOTE(review): `df_filtered` is reassigned by the join cell further down before this
# query is ever collected, so this result appears to be unused — confirm and remove.
df_filtered = (
    df_acropolis
    .group_by(["date", "system_name"])
    .agg([pl.col(name).mean() for name in ("slope", "intercept", "system_id")])
)

In [None]:
# Smooth the slope / intercept columns of every system with a rolling mean.
all_systems = []

# Number of consecutive rows averaged per window; an integer window_size counts
# rows, not elapsed time.
rolling_window_rows = 72

# Resolve the distinct ids inside the lazy query so only that single column is
# materialised (instead of collecting the whole dataset). Sorting makes the
# concatenation order reproducible between runs.
system_list = (
    df_acropolis
    .select(pl.col("system_id").unique().sort())
    .collect()["system_id"]
    .to_list()
)

# `system_id` instead of `id` as loop variable: `id` would shadow the builtin for
# every later cell of the notebook.
for system_id in system_list:
    df_temp = (
        df_acropolis
        .filter(pl.col("system_id") == system_id)
        .sort("date")
        .with_columns([
            pl.col("slope").rolling_mean(window_size=rolling_window_rows).alias("slope_rolling_mean"),
            pl.col("intercept").rolling_mean(window_size=rolling_window_rows).alias("intercept_rolling_mean"),
        ])
        # The leading rows of a rolling mean are null until the window is full;
        # back-fill them. Note that this back-fills nulls in every column of the frame.
        .fill_null(strategy="backward")
    )
    all_systems.append(df_temp)

# Still lazy: one frame holding all systems, including the two rolling columns.
df_all_filtered = pl.concat(all_systems, how="diagonal")

In [None]:
# Attach the rolling calibration columns to the original rows and materialise the result.
# Only the join keys and the two rolling columns are taken from the right-hand side:
# df_all_filtered still carries every df_acropolis column, so joining it unrestricted
# duplicates all of them with a "_right" suffix.
# NOTE(review): assumes ("date", "system_name") identifies a single row per frame; if
# the keys repeat, this left join multiplies rows — confirm against the data.
df_filtered = (
    df_acropolis
    .join(
        df_all_filtered.select(
            ["date", "system_name", "slope_rolling_mean", "intercept_rolling_mean"]
        ),
        on=["date", "system_name"],
        how="left",
    )
    .collect()
)

In [None]:
# Raw vs. rolling-mean calibration coefficients over time, one colour per system.
# The second figure previously repeated the slope plot verbatim; it now shows the
# intercept pair, which is computed alongside the slope but was never plotted.
# line_dash="variable" separates the raw and the rolling series into their own
# traces; with only color="system_name" both series of a system share one trace.
fig = px.line(
    df_filtered,
    x="creation_timestamp",
    y=["slope", "slope_rolling_mean"],
    color="system_name",
    line_dash="variable",
    title="Slope: raw vs. rolling mean",
)
fig.show()
fig = px.line(
    df_filtered,
    x="creation_timestamp",
    y=["intercept", "intercept_rolling_mean"],
    color="system_name",
    line_dash="variable",
    title="Intercept: raw vs. rolling mean",
)
fig.show()

In [None]:
# Apply the rolling-mean calibration (slope * gmp343_dry + intercept) per system and
# derive the residual against CO2_corr together with its MSE / RMSE.
all_systems = []

# Sorted so that processing and concatenation order are reproducible between runs.
system_list = df_filtered["system_id"].unique().sort().to_list()

# Difference between CO2_corr and the corrected reading. Defined once with native
# polars arithmetic (instead of NumPy ufuncs on expressions) and reused below.
residual = pl.col("CO2_corr") - pl.col("gmp343_corrected_rolling_mean")

# `system_id` instead of `id` as loop variable: `id` would shadow the builtin for
# every later cell of the notebook.
for system_id in system_list:
    print(f"processing {system_id}")

    df_temp = (
        df_filtered
        .filter(pl.col("system_id") == system_id)
        .with_columns(
            (pl.col("gmp343_dry") * pl.col("slope_rolling_mean") + pl.col("intercept_rolling_mean"))
            .alias("gmp343_corrected_rolling_mean")
        )
        .with_columns([
            residual.alias("diff_rolling_mean"),
            # The mean runs over the already filtered frame, i.e. one value per
            # system, broadcast to all of that system's rows.
            (residual * residual).mean().alias("mse_rolling_mean"),
        ])
        .with_columns(pl.col("mse_rolling_mean").sqrt().alias("rmse_rolling_mean"))
    )
    all_systems.append(df_temp)

df_all_filtered = pl.concat(all_systems, how="diagonal")

In [None]:
# Residual time series per system: first the "diff" column, then the residual
# obtained with the rolling-mean correction ("diff_rolling_mean").
for residual_column in ("diff", "diff_rolling_mean"):
    fig = px.line(
        df_all_filtered,
        x="creation_timestamp",
        y=residual_column,
        color="system_name",
    )
    fig.show()

# Side-by-side view of the two RMSE columns for each system.
fig = px.scatter(
    df_all_filtered,
    x="system_name",
    y=["rmse_full_deployment", "rmse_rolling_mean"],
)
fig.show()