In [54]:
from datetime import datetime
from datetime import timezone
import polars as pl
import os
import plotly.express as px

# Root folder of the data set, taken from the environment.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if DATA_DIRECTORY is None:
    # Fail fast with a readable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise further down.
    raise RuntimeError(
        "Environment variable DATA_DIRECTORY is not set; "
        "it must point to the root folder of the data set."
    )

# Processed measurement data at 1-minute and 1-hour resolution.
# scan_parquet only builds lazy queries; rows are read when .collect() is called.
df_1m = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "1m_level_1_cities_portal.parquet"))
df_1h = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_level_1_cities_portal.parquet"))


In [59]:
# Preview the last two rows of the 1-minute data (materialises only this small tail).
df_1m.tail(2).collect()

#Datetime,system_id,sys_name_short,co2,h2o,ws,wd,OriginalFlag,Flag,Year,Month,Day,Hour,Minute,Second
"datetime[μs, UTC]",i64,str,f32,f64,f64,f64,i32,str,i32,i8,i8,i8,i8,i8
2024-07-17 07:11:00 UTC,20,"""acropolis-20""",431.944061,2.075215,1.4,240.3,0,"""O""",2024,7,17,7,11,0
2024-07-17 07:12:00 UTC,20,"""acropolis-20""",435.061432,2.119144,1.4,240.3,0,"""O""",2024,7,17,7,12,0


In [58]:
# Preview the last two rows of the 1-hour data (materialises only this small tail).
df_1h.tail(2).collect()

system_id,sys_name_short,#Datetime,co2,h2o,ws,wd,Stdev,NbPoints,OriginalFlag,Flag,Year,Month,Day,Hour,Minute,Second
i64,str,"datetime[μs, UTC]",f32,f64,f64,f64,f32,u32,i32,str,i32,i8,i8,i8,i8,i8
17,"""acropolis-17""",2024-07-17 06:30:00 UTC,458.691956,2.269222,0.906667,280.038333,16.240496,60,0,"""O""",2024,7,17,6,30,0
17,"""acropolis-17""",2024-07-17 07:30:00 UTC,437.800995,2.27189,0.892308,277.638462,2.147389,13,389,"""K""",2024,7,17,7,30,0


In [64]:
# Utility

def extract_site_data(df, dates: dict[str, list[tuple]], site_name: str, plot=False) -> "pl.DataFrame":
    """Extract all measurements that belong to one site.

    ``dates`` maps a site name to a list of periods ``(system_id, start, end)``.
    For every period of ``site_name`` the rows of ``df`` whose ``system_id``
    matches and whose ``#Datetime`` lies between ``start`` and ``end``
    (``is_between`` with its default, inclusive bounds) are selected. The
    selections are stacked in the order the periods are listed.

    Args:
        df: polars LazyFrame (an eager DataFrame is accepted too) providing the
            columns ``system_id`` and ``#Datetime``, plus ``co2`` when
            ``plot`` is true.
        dates: mapping of site name -> list of ``(system_id, start, end)``.
        site_name: key of ``dates`` selecting the site to extract.
        plot: when true, additionally show a plotly line chart of ``co2``
            over ``#Datetime``.

    Returns:
        An eager polars DataFrame with the selected rows.

    Raises:
        KeyError: if ``site_name`` is not a key of ``dates``.
        ValueError: if the site has no periods listed.
    """
    periods = dates[site_name]
    if not periods:
        raise ValueError(f"No periods defined for site {site_name!r}")

    # Keep everything lazy and collect once, instead of materialising every
    # period separately inside the loop.
    lazy_df = df.lazy()
    selections = [
        lazy_df.filter(
            (pl.col("system_id") == system_id)
            & pl.col("#Datetime").is_between(start, end)
        )
        for system_id, start, end in periods
    ]
    df_extracted = pl.concat(selections).collect()

    if plot:
        fig = px.line(df_extracted, x="#Datetime", y="co2", markers=True, title=f"{site_name}: CO2 Corrected [ppm]")
        fig.show()

    return df_extracted

In [62]:
def _utc(year, month, day, hour=0, minute=0, second=0):
    """Return a timezone-aware UTC datetime for the given calendar fields."""
    return datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)


# Fixed end timestamp used for every period that is still open.
# NOTE: hard-coded, not the current date; update it when the extraction is re-run.
today = _utc(2024, 7, 17)

# INFO: First two days of deployment are cut due to system warming up and adjusting to new environment

# Site name -> list of periods (system_id, start, end), all timestamps in UTC.
dates = {
    "KLEG": [
        (1, _utc(2024, 2, 29), today),
    ],
    "DLRO": [
        (14, _utc(2023, 11, 22), _utc(2023, 12, 22, 23, 59, 59)),
        (5, _utc(2024, 2, 28), today),
    ],
    "TUMN": [
        (6, _utc(2024, 2, 21), _utc(2024, 5, 11)),
        (6, _utc(2024, 5, 31), today),
    ],
    "KRDI": [
        (2, _utc(2023, 9, 13), _utc(2023, 12, 22)),
        (8, _utc(2024, 3, 15), today),
    ],
    "SWGG": [
        (10, _utc(2024, 4, 11), today),
    ],
    "RFIN": [
        (15, _utc(2023, 11, 16), _utc(2023, 12, 22)),
        (3, _utc(2024, 2, 22), _utc(2024, 4, 2, 23, 59, 59)),
        (11, _utc(2024, 4, 11), today),
    ],
    "WKRT": [
        (8, _utc(2023, 10, 27), _utc(2023, 12, 22)),
        (12, _utc(2024, 2, 14), today),
    ],
    "RFEL": [
        (7, _utc(2023, 12, 14), _utc(2023, 12, 22)),
        (13, _utc(2024, 2, 22), today),
    ],
    "SWMZ": [
        (15, _utc(2024, 6, 14), today),
    ],
    "SMAI": [
        (1, _utc(2023, 9, 8), _utc(2023, 12, 22)),
        (16, _utc(2024, 2, 8), today),
    ],
    "SBBG": [
        (5, _utc(2023, 11, 16), _utc(2024, 2, 6)),
        (18, _utc(2024, 2, 8), today),
    ],
    "LGRO": [
        (4, _utc(2023, 9, 22), _utc(2024, 2, 12)),
        (20, _utc(2024, 2, 14), today),
    ],
    "FB48": [
        (14, _utc(2024, 6, 23), today),
    ],
    "FB85": [
        (7, _utc(2024, 6, 23), today),
    ],
    "KNPL": [
        (9, _utc(2024, 6, 26), today),
    ],
    "KBOG": [
        (17, _utc(2024, 7, 9), today),
    ],
}


In [65]:
# Spot-check the helper: extract the 1-hour data for site "SMAI" and display the result.
extract_site_data(df=df_1h, dates = dates, site_name="SMAI")

system_id,sys_name_short,#Datetime,co2,h2o,ws,wd,Stdev,NbPoints,OriginalFlag,Flag,Year,Month,Day,Hour,Minute,Second
i64,str,"datetime[μs, UTC]",f32,f64,f64,f64,f32,u32,i32,str,i32,i8,i8,i8,i8,i8
1,"""acropolis-1""",2023-09-08 00:30:00 UTC,525.686646,1.536833,0.788333,150.666667,36.749001,60,0,"""O""",2023,9,8,0,30,0
1,"""acropolis-1""",2023-09-08 01:30:00 UTC,555.169189,1.487815,0.668333,185.6,34.284832,60,0,"""O""",2023,9,8,1,30,0
1,"""acropolis-1""",2023-09-08 02:30:00 UTC,586.737854,1.449841,0.595,239.566667,26.875011,60,0,"""O""",2023,9,8,2,30,0
1,"""acropolis-1""",2023-09-08 03:30:00 UTC,606.083923,1.408887,1.026667,186.5,18.97349,60,0,"""O""",2023,9,8,3,30,0
1,"""acropolis-1""",2023-09-08 04:30:00 UTC,623.262939,1.391929,0.743333,229.95,20.104853,60,0,"""O""",2023,9,8,4,30,0
1,"""acropolis-1""",2023-09-08 05:30:00 UTC,611.634399,1.540214,0.73,232.483333,20.82835,60,0,"""O""",2023,9,8,5,30,0
1,"""acropolis-1""",2023-09-08 06:30:00 UTC,549.209229,1.59476,1.18,245.0,31.699793,60,0,"""O""",2023,9,8,6,30,0
1,"""acropolis-1""",2023-09-08 07:30:00 UTC,497.243591,1.671611,0.87,237.666667,9.433885,60,0,"""O""",2023,9,8,7,30,0
1,"""acropolis-1""",2023-09-08 08:30:00 UTC,466.497528,1.7407,0.77,220.6,11.224157,60,0,"""O""",2023,9,8,8,30,0
1,"""acropolis-1""",2023-09-08 09:30:00 UTC,450.882324,1.88543,0.495,158.533333,3.305336,60,0,"""O""",2023,9,8,9,30,0


In [66]:
# Write one parquet file per site and resolution:
#   <DATA_DIRECTORY>/processed/acropolis_sites/{1m,1h}_acropolis_<site>.parquet
output_directory = os.path.join(DATA_DIRECTORY, "processed/acropolis_sites")
# write_parquet does not create missing folders, so make sure the target exists.
os.makedirs(output_directory, exist_ok=True)

# Collects the 1-hour frame of every site (the 1-minute frames are only written to disk).
extracted_dates = []

for site_name in dates:
    for resolution, source in (("1m", df_1m), ("1h", df_1h)):
        # Tag every row with its site so the per-site files can be combined later.
        site_data = extract_site_data(df=source, dates=dates, site_name=site_name).with_columns(
            site_name=pl.lit(site_name)
        )
        site_data.write_parquet(
            os.path.join(output_directory, f"{resolution}_acropolis_{site_name}.parquet")
        )

        if resolution == "1h":
            extracted_dates.append(site_data)