In [154]:
import polars as pl
import os
from datetime import datetime, timezone
import plotly.express as px
import numpy as np
import csv

from sites_deloyment_times import deployment_times, datetime_format

# Root folder of the data tree, read from the DATA_DIRECTORY environment variable.
# os.environ.get returns None when the variable is unset; the os.path.join calls
# that build paths from this value would then fail.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")

In [155]:
# Site metadata table (semicolon-separated CSV in the working directory).
sites_meta = pl.read_csv("sites.csv", separator=";")

# Lazily scan the flagged 1-minute and 1-hour pipeline products; nothing is
# read into memory until .collect() is called on these frames.
pipeline_dir = os.path.join(DATA_DIRECTORY, "processed", "pipeline")
df_1_min = pl.scan_parquet(os.path.join(pipeline_dir, "flagged_1_min_acropolis.parquet"))
df_1_h = pl.scan_parquet(os.path.join(pipeline_dir, "flagged_1_h_acropolis.parquet"))

In [156]:
# Inspect the deployment record and the metadata row of site TUMR.
print(deployment_times["TUMR"])
sites_meta.filter(pl.col("site") == "TUMR")

{'sensors': [{'id': 4, 'start_time': '2024-02-14T00:00:00+0000', 'end_time': '2024-7-1T00:00:00+0000'}, {'id': 2, 'start_time': '2024-7-1T00:00:00+0000', 'end_time': '2024-12-04T12:11:46+0000'}]}


site,site_name,latitude,longitude,elevation,site_type,installation_classification,height_of_building,responsible_party
str,str,f64,f64,f64,str,str,f64,str
"""TUMR""","""TUM Zentralgelände Nord Maxvor…",48.150733,11.569168,511.39,"""city""","""rooftop""",31.06,"""tum:environmental_sensing and_…"


In [157]:
# Preview the first rows (and the schema) of the 1-minute product.
df_1_min.head().collect()

creation_timestamp,system_id,sys_name_short,gmp343_corrected,gmp343_temperature,h2o_v%,bme280_pressure,wxt532_speed_avg,wxt532_direction_avg,OriginalFlag,Flag
"datetime[μs, UTC]",i64,str,f64,f64,f64,f64,f64,f64,i32,str
2024-01-02 12:36:00 UTC,1,"""acropolis-1""",426.914825,30.4,0.750941,924.82,6.6,221.0,0,"""O"""
2024-01-02 12:37:00 UTC,1,"""acropolis-1""",429.886505,30.5,0.756998,923.82,6.6,221.0,0,"""O"""
2024-01-02 12:38:00 UTC,1,"""acropolis-1""",430.034119,30.5,0.756623,924.566667,10.8,232.0,0,"""O"""
2024-01-02 12:39:00 UTC,1,"""acropolis-1""",429.856812,30.533333,0.761341,924.711667,10.8,232.0,0,"""O"""
2024-01-02 12:40:00 UTC,1,"""acropolis-1""",428.979248,30.6,0.765358,926.625,7.3,244.0,0,"""O"""


In [158]:
# Preview the first rows (and the schema) of the 1-hour product.
df_1_h.head().collect()

system_id,sys_name_short,creation_timestamp,gmp343_corrected,gmp343_temperature,h2o_v%,bme280_pressure,wxt532_speed_avg,wxt532_direction_avg,Stdev,NbPoints,OriginalFlag,Flag
i64,str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,u32,i32,str
1,"""acropolis-1""",2024-01-02 12:30:00 UTC,428.685467,30.804861,0.813515,924.885278,11.170833,236.333333,0.861973,24,389,"""K"""
1,"""acropolis-1""",2024-01-02 13:30:00 UTC,429.014985,30.563611,0.977683,924.944139,10.825,233.616667,0.835584,60,0,"""O"""
1,"""acropolis-1""",2024-01-02 14:30:00 UTC,428.934966,30.199222,1.068206,924.2424,10.853333,234.566667,1.00751,60,0,"""O"""
1,"""acropolis-1""",2024-01-02 15:30:00 UTC,428.746457,30.053898,1.168354,924.225864,8.905085,233.355932,0.926748,59,0,"""O"""
1,"""acropolis-1""",2024-01-02 16:30:00 UTC,428.376926,29.929444,1.275909,924.556278,9.388333,233.183333,0.701865,60,0,"""O"""


In [159]:
# Utility

def extract_site_data(df, deployment_times: dict, site_name:str):
    """Collect the rows measured at one site by the sensors deployed there.

    Every entry of ``deployment_times[site_name]["sensors"]`` provides a sensor
    ``id`` plus ``start_time`` / ``end_time`` strings, which are parsed with the
    imported ``datetime_format``. For each entry the rows of ``df`` whose
    ``system_id`` equals the id and whose ``creation_timestamp`` passes
    ``is_between(start, end)`` are collected.

    Args:
        df: Frame with ``system_id`` and ``creation_timestamp`` columns;
            ``.collect()`` is called on the filtered result, so a lazy frame
            is expected.
        deployment_times: Mapping of site name to its deployment record.
        site_name: Key of the site to extract.

    Returns:
        The per-sensor frames concatenated in the order of the sensor list.
    """

    def _collect_deployment(sensor):
        # One sensor's stay at the site: match on system id, then on the time window.
        sensor_id = sensor["id"]
        window_start = datetime.strptime(sensor["start_time"], datetime_format)
        window_end = datetime.strptime(sensor["end_time"], datetime_format)

        in_window = pl.col("creation_timestamp").is_between(window_start, window_end)
        return df.filter(pl.col("system_id") == sensor_id).filter(in_window).collect()

    return pl.concat(
        [_collect_deployment(sensor) for sensor in deployment_times[site_name]["sensors"]]
    )

def plot_1_h_product(df_1_h):
    """Show a line plot of the hourly CO2 values that carry no flag.

    Only rows with ``OriginalFlag == 0`` are kept; ``gmp343_corrected`` is
    drawn against ``creation_timestamp`` with markers, and gaps are not
    connected.

    Args:
        df_1_h: Frame holding the hourly product of one site.
    """
    unflagged = df_1_h.filter(pl.col("OriginalFlag") == 0)

    layout = dict(
        yaxis_title='CO2 Concentration (ppm)',
        xaxis_title='',
        title='',
    )

    fig = px.line(unflagged, x="creation_timestamp", y="gmp343_corrected", markers=True)
    fig.update_traces(connectgaps=False)
    fig.update_layout(**layout)
    fig.show()
    
    
def plot_1_min_product(df_1_min):
    """Show the 1-minute CO2 values as two traces split by ``OriginalFlag``.

    Two helper columns are derived from ``gmp343_corrected``: ``co2_0`` keeps
    the value where ``OriginalFlag == 0`` and ``co2_389`` keeps it where
    ``OriginalFlag == 185``; everywhere else they hold NaN. Both are plotted
    against ``creation_timestamp`` without connecting gaps.

    Args:
        df_1_min: Frame holding the 1-minute product of one site.
    """

    def _only_where_flag(flag_value, column_name):
        # Keep the CO2 value for rows carrying this flag, NaN for all others.
        return (
            pl.when(pl.col("OriginalFlag") == flag_value)
            .then(pl.col("gmp343_corrected"))
            .otherwise(np.nan)
            .alias(column_name)
        )

    # NOTE(review): the column is named "co2_389" but the mask tests flag 185 -
    # confirm which of the two values is the intended one.
    df_plot = (
        df_1_min
        .with_columns(_only_where_flag(185, "co2_389"))
        .with_columns(_only_where_flag(0, "co2_0"))
    )

    layout = dict(
        yaxis_title='CO2 Concentration (ppm)',
        xaxis_title='',
        title='',
    )

    fig = px.line(df_plot, x="creation_timestamp", y=["co2_0", "co2_389"])
    fig.update_traces(connectgaps=False)
    fig.update_layout(**layout)
    fig.show()
    
def calculate_decimal_year(date) -> float:
    """Convert a timezone-aware datetime into a decimal year.

    The result is ``year + elapsed / year_length``, where ``elapsed`` is the
    number of seconds since 1 January 00:00 UTC of ``date.year`` and
    ``year_length`` is the number of seconds in that same year (so leap years
    are divided by 366 days). The value is rounded to six decimals.

    Args:
        date: Timezone-aware ``datetime``; subtracting the UTC year start from
            a naive datetime raises ``TypeError``.

    Returns:
        The decimal year, e.g. ``2024.5`` for 2024-07-02 00:00 UTC.
    """
    year = date.year
    year_start = datetime(year, 1, 1, 0, 0, 0).replace(tzinfo=timezone.utc)
    next_year_start = datetime(year + 1, 1, 1, 0, 0, 0).replace(tzinfo=timezone.utc)

    seconds_elapsed = (date - year_start).total_seconds()
    # Length of the CURRENT year; measuring the previous year instead gives the
    # wrong denominator in every leap year and in the year following one.
    seconds_total_year = (next_year_start - year_start).total_seconds()

    x = (seconds_elapsed / seconds_total_year) + year
    return float("{:.6f}".format(x))

# 1 min data (Level 1)

In [160]:
# Test export: plot the 1-minute data of site TUMR for 1-25 November 2024 (UTC).
start_date = datetime(2024, 11, 1, tzinfo=timezone.utc)
end_date = datetime(2024, 11, 25, 23, 59, 59, tzinfo=timezone.utc)

# Narrow the lazy frame to the test window before the per-sensor extraction.
in_test_window = pl.col("creation_timestamp").is_between(start_date, end_date)
df_temp = df_1_min.filter(in_test_window)
df_site = extract_site_data(df_temp, deployment_times, "TUMR")
plot_1_min_product(df_site)

# 1 hour data (Level 2)

In [161]:
# Test export: plot the complete hourly record of site TUMR.
df_site = extract_site_data(df_1_h, deployment_times, "TUMR")
plot_1_h_product(df_site)

In [162]:
# Export one ICOS Cities portal Level-2 CSV file (semicolon-separated) per site.

# Column order of the data table; also written as the last header line.
field = ["#Datetime", "Year", "Month", "Day", "Hour", "Minute", "Second", "DecimalDate", "co2", "h2o", "ws", "wd", "NbPoints", "Stdev", "Flag", "OriginalFlag"]

# Number of lines in front of the first data row: the metadata lines plus the
# column-name line. Checked against the header actually built for each site.
HEADER_LINES = 39

output_dir = os.path.join(DATA_DIRECTORY, "processed", "icos_cities_portal_processing", "level_2")
os.makedirs(output_dir, exist_ok=True)

for site in deployment_times.keys():
    print(site)

    # Hourly data of the site, renamed/rounded and expanded into the portal's
    # date-part columns; finally every column is turned into text.
    df_temp = (
        extract_site_data(df=df_1_h, deployment_times=deployment_times, site_name=site)
        .fill_null('')
        .rename({"gmp343_corrected": "co2", "h2o_v%": "h2o", "wxt532_speed_avg": "ws", "wxt532_direction_avg": "wd"})
        .with_columns(
            pl.col("co2").round(2),
            pl.col("h2o").round(2),
            pl.col("ws").round(2),
            pl.col("wd").round(2),
            pl.col("Stdev").round(2),
        )
        .with_columns(
            (pl.col("creation_timestamp").dt.year()).alias("Year"),
            (pl.col("creation_timestamp").dt.month()).alias("Month"),
            (pl.col("creation_timestamp").dt.day()).alias("Day"),
            (pl.col("creation_timestamp").dt.hour()).alias("Hour"),
            (pl.col("creation_timestamp").dt.minute()).alias("Minute"),
            (pl.col("creation_timestamp").dt.second()).alias("Second"),
            (pl.col('creation_timestamp').dt.to_string("%Y-%m-%d %H:%M:%S")).alias("#Datetime"),
        )
        .with_columns(
            pl.struct(['creation_timestamp'])
            .map_elements(lambda x: calculate_decimal_year(x['creation_timestamp']), return_dtype=pl.Float64)
            .alias("DecimalDate")
        )
        .select(field)
        .with_columns(pl.exclude(pl.Utf8).cast(str))
        .fill_null('')
    )

    # Without rows there is no covering period to report; skip this site
    # instead of aborting the export of all remaining sites.
    if len(df_temp) == 0:
        print(f"  no data for {site} - export skipped")
        continue

    # construct icos cities portal head

    file_name = f"munich_acropolis_{site}.csv"
    file_lines = len(df_temp) + HEADER_LINES
    data_level = 2
    site_short_name = site[:4]

    # Look the site up once; .item() still requires exactly one matching row.
    site_row = sites_meta.filter(pl.col("site") == site_short_name)
    site_long_name = site_row.select("site_name").item()
    latitude = site_row.select("latitude").item()
    longitude = site_row.select("longitude").item()
    altitude = site_row.select("elevation").item()
    sampling_height = site_row.select("height_of_building").item()

    start_date = df_temp.select("#Datetime").row(0)[0]
    stop_date = df_temp.select("#Datetime").row(-1)[0]

    # For the BLUT keys the sampling height is taken from the last two
    # characters of the key (e.g. "BLUT_48"); all other sites use the
    # building height from the metadata table.
    if site_short_name == "BLUT":
        sampling_height_line = f'# SAMPLING HEIGHTS: {float(site[-2:]):.1f} m agl'
    else:
        sampling_height_line = f'# SAMPLING HEIGHTS: {sampling_height:.1f} m agl'

    # NOTE(review): the TIME ZONE line names CET/CEST while the comment block
    # states "Times are UTC+0" - confirm which one the portal expects.
    # NOTE(review): the DATA POLICY text has an unclosed "(" before the licence
    # URL; left unchanged in case it mirrors the portal template.
    header_lines = [
        "# TITLE: co2 - continuous time series from low and mid cost sensors",
        f"# FILE NAME: {file_name}",
        "# DATA FORMAT: see the last line of this header for column description",
        f'# TOTAL LINES: {file_lines}',
        f'# HEADER LINES: {HEADER_LINES}',
        '# PROJECT: ICOS CITIES',
        f'# DATA VERSION: L{data_level}',
        f'# STATION CODE: {site_short_name}',
        f'# STATION NAME: {site_long_name} ({site_short_name})',
        '# OBSERVATION CATEGORY: Air sampling observation at a stationary platform',
        '# COUNTRY/TERRITORY: DE',
        '# RESPONSIBLE INSTITUTE: TUM, Technical University Munich',
        '# CONTRIBUTOR:  Patrick Aigner, Klaus Kürzinger, Jia Chen',
        '# CONTACT POINT: Patrick Aigner <patrick.aigner@tum.de>, Jia Chen <jia.chen@tum.de>',
        "# FUNDING: European Union's Horizon 2020 Research and Innovation Programme, Grant Agreement No. 101037319",
        f'# LATITUDE: {latitude:.6f}',
        f'# LONGITUDE: {longitude:.6f}',
        f'# ALTITUDE: {altitude:.1f} m asl',
        sampling_height_line,
        '# PARAMETER: co2',
        f'# COVERING PERIOD: {start_date} - {stop_date}',
        '# TIME INTERVAL: hourly',
        '# MEASUREMENT UNIT: µmol/mol',
        '# MEASUREMENT METHOD: NDIR',
        '# INSTRUMENT: Vaisala GMP343',
        '# SAMPLING TYPE: continuous',
        '# TIME ZONE: Central European Time (UTC+1), Central European Summer Time (UTC+2)',
        '# MEASUREMENT SCALE: WMO-CO2-X2019',
        '# DATA POLICY: ICOS CITIES DATA is licensed under a Creative Commons Attribution 4.0 international licence (http://creativecommons.org/licenses/by/4.0/.The ICOS CITIES data licence is described at https://data.icos-cities.eu/licence.',
        '# COMMENT:',
        '#',
        '#   - Times are UTC+0',
        '#   - Time-averaged values are reported at the middle of the averaging interval.',
        "#   - Flag 'O' = data correct after manual quality control",
        "#   - Flag 'K' = data incorrect after manual quality control",
        '#   - In case of gaps between instruments, the timeseries are filled with empty string',
        '#   - Release notes: ',
        '#',
    ]

    # The advertised header length must match what is written (+1 for the column-name line).
    assert len(header_lines) + 1 == HEADER_LINES, "HEADER_LINES is out of sync with the header content"

    # The header contains non-ASCII characters (µ, ü): write UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(os.path.join(output_dir, file_name), 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=';', lineterminator='\n')

        # Each header line is written as a single-field row.
        writer.writerows([line] for line in header_lines)
        writer.writerow(field)

        # All columns were cast to text above, so rows can be written as they are.
        writer.writerows(df_temp.iter_rows())


TUMR
FELR
TAUR
DLRR
SENR
RDIR
SCHR
FINR
SWMR
MAIR
PASR
GROR
BLUT_48
BLUT_85
NPLR
BOGR
HARR
BALR
