In [1]:
import polars as pl
import os
from datetime import datetime, timezone
import plotly.express as px

from sites_deloyment_times import deployment_times, datetime_format

# Root folder of the data files, taken from the environment.
# The value is None when the DATA_DIRECTORY variable is not set.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")

In [2]:
# Site metadata is read eagerly from a semicolon-separated CSV.
sites_meta = pl.read_csv("sites.csv", separator=";")

# The flagged pipeline outputs are only scanned lazily here; nothing is loaded
# until a query is collected. File names suggest 1-minute and 1-hour data.
pipeline_dir = os.path.join(DATA_DIRECTORY, "processed", "pipeline")
df_1m = pl.scan_parquet(os.path.join(pipeline_dir, "flagged_1_min_acropolis.parquet"))
df_1h = pl.scan_parquet(os.path.join(pipeline_dir, "flagged_1_h_acropolis.parquet"))
# Alternative inputs (nsigma 3.0 variants), kept for manual switching:
#df_1m = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "pipeline", "flagged_1_min_nsigma_3.0_acropolis.parquet"))
#df_1h = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "pipeline", "flagged_1_h_nsigma_3.0_acropolis.parquet"))

In [3]:
# Print the raw deployment record for TUMR, then display its metadata row.
print(deployment_times["TUMR"])
sites_meta.filter(pl.col("site").eq("TUMR"))

{'sensors': [{'id': 4, 'start_time': '2024-02-14T00:00:00+0000', 'end_time': '2024-7-1T00:00:00+0000'}, {'id': 2, 'start_time': '2024-7-1T00:00:00+0000', 'end_time': '2024-11-28T09:35:53+0000'}]}


site,site_name,latitude,longitude,elevation,site_type,installation_classification,height_of_building,responsible_party
str,str,f64,f64,f64,str,str,f64,str
"""TUMR""","""TUM Zentralgel…",48.150733,11.569168,511.39,"""city""","""rooftop""",31.06,"""tum:environmen…"


In [4]:
# Peek at the first five rows of the lazily scanned 1-minute frame.
df_1m.head(5).collect()

creation_timestamp,system_id,sys_name_short,co2,sensor_temperature,h2o,ws,wd,OriginalFlag,Flag
"datetime[μs, UTC]",i64,str,f64,f64,f64,f64,f64,i32,str
2024-01-02 12:36:00 UTC,1,"""acropolis-1""",426.914825,30.4,0.750941,6.6,221.0,0,"""O"""
2024-01-02 12:37:00 UTC,1,"""acropolis-1""",429.886505,30.5,0.756998,6.6,221.0,0,"""O"""
2024-01-02 12:38:00 UTC,1,"""acropolis-1""",430.034119,30.5,0.756623,10.8,232.0,0,"""O"""
2024-01-02 12:39:00 UTC,1,"""acropolis-1""",429.856812,30.533333,0.761341,10.8,232.0,0,"""O"""
2024-01-02 12:40:00 UTC,1,"""acropolis-1""",428.979248,30.6,0.765358,7.3,244.0,0,"""O"""


In [5]:
site_name = "TUMR"
sensor = 0

# Only the last expression of a cell is echoed, so of the three lookups below
# just end_time appears in the cell output.
sensor_entry = deployment_times[site_name]["sensors"][sensor]
sensor_entry["id"]
sensor_entry["start_time"]
sensor_entry["end_time"]

'2024-7-1T00:00:00+0000'

In [6]:
# Utility

def extract_site_data(df, deployment_times: dict, site_name:str):
    """Collect the rows recorded by each sensor while it was deployed at a site.

    For every entry in ``deployment_times[site_name]["sensors"]`` the rows of
    ``df`` are kept whose ``system_id`` equals the entry's ``id`` and whose
    ``creation_timestamp`` lies between the entry's ``start_time`` and
    ``end_time`` (``is_between`` with its default bounds). The per-sensor
    selections are stacked in the order the sensors are listed.

    Args:
        df: Polars LazyFrame with ``system_id`` and ``creation_timestamp``
            columns. An eager DataFrame is accepted as well.
        deployment_times: Mapping of site name to ``{"sensors": [...]}``; each
            sensor entry provides ``id``, ``start_time`` and ``end_time``,
            the times being strings parsed with ``datetime_format``.
        site_name: Key of the site to extract from ``deployment_times``.

    Returns:
        An eager ``pl.DataFrame`` with the concatenated per-sensor rows.

    Raises:
        KeyError: If ``site_name`` is unknown or a sensor entry lacks a key.
        ValueError: If the site lists no sensors, or a time string does not
            match ``datetime_format``.
    """
    sensors = deployment_times[site_name]["sensors"]
    if not sensors:
        # pl.concat rejects an empty list; fail early with a message that
        # names the offending site instead.
        raise ValueError(f"no sensor deployments listed for site {site_name!r}")

    # .lazy() is a no-op on a LazyFrame and lets an eager DataFrame through.
    lazy_df = df.lazy()

    per_sensor = []
    for sensor in sensors:
        start_time = datetime.strptime(sensor["start_time"], datetime_format)
        end_time = datetime.strptime(sensor["end_time"], datetime_format)

        per_sensor.append(
            lazy_df.filter(
                (pl.col("system_id") == sensor["id"])
                & pl.col("creation_timestamp").is_between(start_time, end_time)
            )
        )

    # Concatenate the lazy selections and collect once, so the source is
    # evaluated in a single query instead of once per sensor.
    return pl.concat(per_sensor).collect()

def plot_extracted_data(df_plot, filter_flag:bool = True):
    """Draw CO2 over time as separate line traces per ``OriginalFlag`` value.

    Two helper columns are derived from ``co2``: ``co2_185`` holds the value
    where ``OriginalFlag`` equals 185 and ``co2_0`` where it equals 0; all
    other rows are null in the respective column, and gaps are not connected.

    Args:
        df_plot: Frame with ``creation_timestamp``, ``co2`` and
            ``OriginalFlag`` columns.
        filter_flag: When true, only rows with ``OriginalFlag == 0`` are kept
            before plotting, which leaves ``co2_185`` entirely null.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    frame = df_plot.filter(pl.col("OriginalFlag") == 0) if filter_flag else df_plot

    # One masked copy of co2 per flag value; non-matching rows become null so
    # the line is interrupted there instead of being drawn through.
    for flag_value, column_name in ((185, "co2_185"), (0, "co2_0")):
        masked_co2 = (
            pl.when(pl.col("OriginalFlag") == flag_value)
            .then(pl.col("co2"))
            .otherwise(None)
            .alias(column_name)
        )
        frame = frame.with_columns(masked_co2)

    fig = px.line(frame, x="creation_timestamp", y=["co2_0", "co2_185"])
    fig.update_traces(connectgaps=False)
    fig.update_layout(
        yaxis_title='CO2 Concentration (ppm)',
        xaxis_title='',
        title='',
    )
    fig.show()

In [7]:
# TUMR rows from df_1h, plotted without dropping rows whose OriginalFlag
# is non-zero.
df_site = extract_site_data(df_1h, deployment_times, "TUMR")
plot_extracted_data(df_site, filter_flag=False)


In [10]:
# UTC bounds of the inspection window: 1 Nov 2024 00:00:00 to 25 Nov 2024 23:59:59.
start_date = datetime(2024, 11, 1, tzinfo=timezone.utc)
end_date = datetime(2024, 11, 25, 23, 59, 59, tzinfo=timezone.utc)

# Narrow df_1m to the date window first, then pull out the FINR
# deployments and plot them with every OriginalFlag value kept.
in_window = pl.col("creation_timestamp").is_between(start_date, end_date)
df_test = df_1m.filter(in_window)
df_site = extract_site_data(df_test, deployment_times, "FINR")
plot_extracted_data(df_site, filter_flag=False)