In [37]:
import os
import sys
import polars as pl
import plotly.express as px
import polars.selectors as cs

from datetime import datetime

# Project locations, resolved relative to the current working directory
# (two levels up from where the notebook is run).
PROJECT_PATH = os.path.abspath(os.path.join("..", ".."))
PIPELINE_PATH = os.path.join(PROJECT_PATH, "pipeline")
DATA_DIRECTORY = os.path.join(PROJECT_PATH, "data")

unflagged_data = False

# Make the pipeline folder importable so the `utils` imports below resolve.
if PIPELINE_PATH not in sys.path:
    sys.path.append(PIPELINE_PATH)

from utils.paths import DESPIKED_DATA_DIRECTORY, PROCESSED_PICARRO_DATA_DIRECTORY, POSTPROCESSED_DATA_DIRECTORY
from utils.import_data import import_acropolis_system_data
from utils.plot_dataframes import plot_column, plot_column_difference

# Fail early, and say which directory is missing. POSTPROCESSED_DATA_DIRECTORY
# is checked as well because the system data of this notebook is read from it.
assert os.path.exists(DESPIKED_DATA_DIRECTORY), \
    f"DESPIKED_DATA_DIRECTORY does not exist: {DESPIKED_DATA_DIRECTORY}"
assert os.path.exists(PROCESSED_PICARRO_DATA_DIRECTORY), \
    f"PROCESSED_PICARRO_DATA_DIRECTORY does not exist: {PROCESSED_PICARRO_DATA_DIRECTORY}"
assert os.path.exists(POSTPROCESSED_DATA_DIRECTORY), \
    f"POSTPROCESSED_DATA_DIRECTORY does not exist: {POSTPROCESSED_DATA_DIRECTORY}"

In [66]:
# Shared start of the side-by-side window used for every system.
start_time = datetime(2024, 4, 1)

# End of the side-by-side window per system id (system 17 is not included).
# NOTE(review): the end times of systems 1, 5, 8, 12, 13, 16, 18 and 20 lie
# before `start_time`; an `is_between(start, end)` filter on such a window
# selects no rows. Confirm whether `start_time` or these end times are intended.
_side_by_side_end_times = {
    1: datetime(2024, 2, 29),
    2: datetime(2024, 7, 30),
    4: datetime(2024, 7, 30),
    5: datetime(2024, 2, 28),
    6: datetime(2024, 8, 30),
    7: datetime(2024, 6, 23),
    8: datetime(2024, 3, 15),
    9: datetime(2024, 6, 26),
    10: datetime(2024, 4, 10),
    11: datetime(2024, 4, 8),
    12: datetime(2024, 2, 14),
    13: datetime(2024, 2, 22),
    14: datetime(2024, 6, 23),
    15: datetime(2024, 5, 20),
    16: datetime(2024, 2, 8),
    18: datetime(2024, 2, 8),
    20: datetime(2024, 2, 14),
}

# system id -> {"start_time": ..., "end_time": ...}
side_by_side_times = {
    system_id: {"start_time": start_time, "end_time": end_time}
    for system_id, end_time in _side_by_side_end_times.items()
}

In [67]:
def extract_timeframes(df: pl.LazyFrame, id: int) -> pl.DataFrame:
    """Cut one system's data to its side-by-side window and materialize it.

    Args:
        df: Lazy frame with the columns ``datetime``, ``system_id`` and
            ``gmp343_corrected``. An eager ``pl.DataFrame`` is accepted as well.
        id: System id; must be a key of the module-level ``side_by_side_times``.

    Returns:
        The collected frame, restricted to rows of system ``id`` whose
        ``datetime`` lies between the window's start and end time and whose
        ``gmp343_corrected`` is greater than 0.

    Raises:
        KeyError: If ``id`` has no entry in ``side_by_side_times``.
    """
    window = side_by_side_times[id]

    return (
        df.lazy()  # no-op for a LazyFrame; lets an eager DataFrame be passed too
        # cut to start and end time of side-by-side
        .filter(pl.col("datetime").is_between(window["start_time"], window["end_time"]))
        .filter(pl.col("system_id") == id)
        .filter(pl.col("gmp343_corrected") > 0)
        .collect()
    )
        
def join_picarro_data(df: pl.DataFrame, df_p: pl.DataFrame) -> pl.DataFrame:
    """Attach the Picarro columns of ``df_p`` to ``df`` by nearest-in-time match.

    Uses an as-of join on ``datetime`` (polars' default strategy) with a
    tolerance of one minute; exact timestamp matches are allowed.

    Args:
        df: System data with a ``datetime`` column.
        df_p: Picarro data with a ``datetime`` column.

    Returns:
        ``df`` sorted by ``datetime`` with the columns of ``df_p`` joined on;
        rows without a match inside the tolerance get nulls.
    """
    # join_asof expects BOTH inputs sorted on the join key, so sort the
    # right-hand frame too instead of relying on the caller.
    return df.sort("datetime").join_asof(
        df_p.sort("datetime"),
        on="datetime",
        tolerance="1m",
        allow_exact_matches=True,
    )

In [87]:
# Picarro data file inside the processed-Picarro directory.
picarro_file = os.path.join(
    PROCESSED_PICARRO_DATA_DIRECTORY,
    "Calibrated_1_min_DWD_Picarro_G2301_413.parquet",
)

# Time window of Picarro data to load.
start_date = start_time
end_date = datetime(2024, 4, 18, 0, 0, 0)

# Scan lazily, keep only the two columns needed, cut to the window, then load.
df_p = (
    pl.scan_parquet(picarro_file)
    .select(["datetime", "picarro_corrected"])
    .filter(pl.col("datetime").is_between(start_date, end_date))
    .collect()
)

# Display the first and last row to see which period was actually loaded.
df_p.head(1).vstack(df_p.tail(1))

datetime,picarro_corrected
datetime[ms],f64
2024-04-01 00:00:00,435.122086
2024-04-17 11:04:00,469.973389


In [88]:
# Load every system listed in `side_by_side_times`, cut it to its window and
# attach the Picarro reference, then stack all systems into one frame.
all_systems = []

# NOTE: the loop variable keeps the original name `id` (shadows the builtin).
for id in side_by_side_times:
    print("Processing system:", id)

    system_data = import_acropolis_system_data(
        years=[2024],
        target_directory=POSTPROCESSED_DATA_DIRECTORY,
        id=id,
        prefix="1min",
    )
    df_system = join_picarro_data(extract_timeframes(system_data, id=id), df_p)

    all_systems.append(df_system)

# "diagonal" concatenation tolerates systems whose frames differ in columns.
df = pl.concat(all_systems, how="diagonal")

Processing system: 1
Processing system: 2
Processing system: 4
Processing system: 5
Processing system: 6
Processing system: 7
Processing system: 8
Processing system: 9
Processing system: 10
Processing system: 11
Processing system: 12
Processing system: 13
Processing system: 14
Processing system: 15
Processing system: 16
Processing system: 18
Processing system: 20


In [98]:
# Analysis window for the comparison below.
start_date = datetime(2024, 4, 4)
end_date = datetime(2024, 4, 17)

in_window = pl.col("datetime").is_between(start_date, end_date)
below_500 = pl.col("gmp343_corrected") < 500
sensor_minus_picarro = pl.col("gmp343_corrected") - pl.col("picarro_corrected")

# Filter, add the sensor-minus-Picarro difference, then average every numeric
# column per system in 1-hour bins.
df_filtered = (
    df.filter(in_window)
    .filter(below_500)
    .sort("datetime")
    .with_columns(diff=sensor_minus_picarro)
    .group_by_dynamic("datetime", every="1h", group_by=["system_id", "system_name"])
    .agg(cs.numeric().mean())
    .sort("system_id")
)

In [99]:
# One scatter plot per column over time, colored by system.
for column in ("gmp343_temperature", "diff", "gmp343_corrected", "sht45_humidity"):
    fig = px.scatter(df_filtered, x="datetime", y=column, color="system_name")
    fig.show()