In [1]:
from datetime import datetime
from datetime import timezone
import polars as pl
import os
import plotly.express as px

# Root folder of the project data, read from the environment so that no
# machine-specific path is hard-coded in the notebook.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if not DATA_DIRECTORY:
    # Without this guard an unset variable only surfaces later as an opaque
    # TypeError inside os.path.join(None, ...).
    raise RuntimeError(
        "Environment variable DATA_DIRECTORY is not set; "
        "point it at the folder that contains the 'processed' directory."
    )

# processed average measurement data
# scan_parquet builds lazy frames; rows are only materialised by .collect().
PROCESSED_DIRECTORY = os.path.join(DATA_DIRECTORY, "processed")
df_1m = pl.scan_parquet(os.path.join(PROCESSED_DIRECTORY, "1m_level_1_cities_portal.parquet"))
df_1h = pl.scan_parquet(os.path.join(PROCESSED_DIRECTORY, "1h_level_1_cities_portal.parquet"))
df_1h_unfiltered = pl.scan_parquet(os.path.join(PROCESSED_DIRECTORY, "1h_cal_corr_acropolis.parquet"))


FileNotFoundError: No such file or directory (os error 2): ...cuments/PROJECTS/acropolis-visualisation/data/processed/1m_level_1_cities_portal.parquet

In [None]:
# Preview: execute the lazy query for the last two rows of the `1m_...` dataset.
df_1m.tail(2).collect()

In [None]:
# Preview: execute the lazy query for the last two rows of the `1h_...` dataset.
df_1h.tail(2).collect()

In [None]:
# Preview: execute the lazy query for the last two rows of the `1h_cal_corr_acropolis` dataset.
df_1h_unfiltered.tail(2).collect()

In [None]:
# Utility

def extract_site_data(df, dates: dict[str, list[tuple]], site_name: str, plot=False):
    """Gather the rows recorded at one site across all of its deployment windows.

    Args:
        df: Lazy polars frame providing at least the columns ``system_id`` and
            ``creation_timestamp``; ``.collect()`` is called on every filtered slice.
        dates: Mapping of site name to a list of ``(system_id, start, end)`` tuples.
        site_name: Key into ``dates`` selecting the site to extract.
        plot: When true, draw ``co2`` over ``creation_timestamp`` instead of
            returning the data.

    Returns:
        The concatenated polars DataFrame of all windows (in the order they are
        listed in ``dates``), or ``None`` when ``plot`` is true.

    Raises:
        KeyError: ``site_name`` is not a key of ``dates``.
        ValueError: The site has no deployment windows configured.
    """
    if site_name not in dates:
        raise KeyError(f"Unknown site {site_name!r}; known sites: {sorted(dates)}")

    windows = dates[site_name]
    if not windows:
        # Concatenating an empty list would fail later with a far less helpful message.
        raise ValueError(f"No deployment windows configured for site {site_name!r}")

    extracted_dates = []
    for system_id, start, end in windows:
        # One sensor system occupied the site between start and end
        # (is_between is called with its default bound handling).
        in_window = (pl.col("system_id") == system_id) & pl.col("creation_timestamp").is_between(start, end)
        extracted_dates.append(df.filter(in_window).collect())

    df_extracted = pl.concat(extracted_dates)

    if plot:
        fig = px.line(df_extracted, x="creation_timestamp", y="co2", markers=True, title=f"{site_name}: CO2 Corrected [ppm]")
        fig.show()
        # Plot mode intentionally yields no value so the cell shows only the figure.
        return None

    return df_extracted

In [None]:
def _utc(year, month, day, hour=0, minute=0, second=0):
    """Return a timezone-aware UTC datetime for the given calendar fields."""
    return datetime(year, month, day, hour, minute, second, tzinfo=timezone.utc)


# Fixed upper bound shared by every deployment window that has no explicit end.
today = _utc(2024, 11, 1)

# INFO: First two days of deployment are cut due to system warming up and adjusting to new environment

# site name -> list of (system_id, window start, window end)
dates = {
    "SENR": [
        (1, _utc(2024, 2, 29), today),
    ],
    "DLRR": [
        (14, _utc(2023, 11, 22), _utc(2023, 12, 22, 23, 59, 59)),
        (5, _utc(2024, 2, 28), today),
    ],
    "TUMR": [
        (6, _utc(2024, 2, 21), _utc(2024, 5, 11)),
        (6, _utc(2024, 5, 31), today),
    ],
    "RDIR": [
        (2, _utc(2023, 9, 13), _utc(2023, 12, 22)),
        (8, _utc(2024, 3, 15), today),
    ],
    "SCHR": [
        (10, _utc(2024, 4, 11), today),
    ],
    "FINR": [
        (15, _utc(2023, 11, 16), _utc(2023, 12, 22)),
        (3, _utc(2024, 2, 22), _utc(2024, 4, 2, 23, 59, 59)),
        (11, _utc(2024, 4, 11), today),
    ],
    "TAUR": [
        (8, _utc(2023, 10, 27), _utc(2023, 12, 22)),
        (12, _utc(2024, 2, 14), today),
    ],
    "FELR": [
        (7, _utc(2023, 12, 14), _utc(2023, 12, 22)),
        (13, _utc(2024, 2, 22), today),
    ],
    "SWMR": [
        (15, _utc(2024, 6, 14), today),
    ],
    "MAIR": [
        (1, _utc(2023, 9, 8), _utc(2023, 12, 22)),
        (16, _utc(2024, 2, 8), today),
    ],
    "PASR": [
        (5, _utc(2023, 11, 16), _utc(2024, 2, 6)),
        (18, _utc(2024, 2, 8), today),
    ],
    "GROR": [
        (4, _utc(2023, 9, 22), _utc(2024, 2, 12)),
        (20, _utc(2024, 2, 14), today),
    ],
    "BLUT_48": [
        (14, _utc(2024, 6, 23), today),
    ],
    "BLUT_85": [
        (7, _utc(2024, 6, 23), today),
    ],
    "NPLR": [
        (9, _utc(2024, 6, 26), today),
    ],
    "BOGR": [
        (17, _utc(2024, 7, 9), today),
    ],
    "HARR": [
        (4, _utc(2024, 7, 30), today),
    ],
}

In [None]:
# Hourly data from the `df_1h_unfiltered` frame for site DLRR, restricted to
# timestamps strictly after 2024-03-01 00:00 UTC.
df_plot = extract_site_data(
    df=df_1h_unfiltered,
    dates=dates,
    site_name="DLRR",
).filter(pl.col("creation_timestamp") > datetime(2024, 3, 1, tzinfo=timezone.utc))

# First figure: gmp343_temperature, with a labelled y axis and blank title / x label.
fig = px.line(df_plot, x="creation_timestamp", y="gmp343_temperature", markers=True)
fig.update_layout(title='', xaxis_title='', yaxis_title='Sensor Temperature (°C)')
fig.show()

# Second figure: enclosure_bme280_temperature, shown with plotly's default labels.
fig = px.line(df_plot, x="creation_timestamp", y="enclosure_bme280_temperature", markers=True)
fig.show()

In [None]:
# Plot co2 for site RDIR from df_1h; with plot=True the helper shows the figure
# and returns None, so the cell has no output value besides the plot.
extract_site_data(df=df_1h, dates = dates, site_name="RDIR", plot=True)

In [None]:
# Per-site exports: one parquet file per site and resolution, plus in-memory
# lists that hold every exported frame.
extracted_sites_1m = []
extracted_sites_1h = []

# Make sure the target folder exists; write_parquet does not create it.
sites_directory = os.path.join(DATA_DIRECTORY, "processed/acropolis_sites")
os.makedirs(sites_directory, exist_ok=True)

for key in dates:
    # The 1m frame is handled before the 1h frame, so after the loop `df_temp`
    # holds the hourly data of the last site in `dates`.
    for resolution, source, collected in (
        ("1m", df_1m, extracted_sites_1m),
        ("1h", df_1h, extracted_sites_1h),
    ):
        df_temp = extract_site_data(df=source, dates=dates, site_name=key)
        # Tag every row with its site so the frames stay distinguishable once combined.
        df_temp = df_temp.with_columns(site_name=pl.lit(key))
        df_temp.write_parquet(os.path.join(sites_directory, f"{resolution}_acropolis_{key}_2024.parquet"))
        collected.append(df_temp)

In [None]:
# Combine the per-site frames into one file per resolution.
# The 1m data is written first, so `df_sites` ends up bound to the 1h frame.
for resolution, site_frames in (("1m", extracted_sites_1m), ("1h", extracted_sites_1h)):
    df_sites = pl.concat(site_frames)
    df_sites.write_parquet(os.path.join(DATA_DIRECTORY, "processed/acropolis_sites", f"{resolution}_acropolis_site_data.parquet"))

# Only the last expression of a cell is rendered, so a single preview
# (the last row of the combined 1h data) is shown.
df_sites.tail(1)

In [None]:
def calculate_decimal_year(date):
    """Convert a timezone-aware datetime into a decimal year.

    The fractional part is the share of the date's own calendar year (measured
    in seconds between 1 January of that year and 1 January of the next, both
    at 00:00 UTC) that has elapsed at ``date``.

    Args:
        date: Timezone-aware ``datetime``; subtracting the UTC year start from a
            naive datetime raises ``TypeError``.

    Returns:
        ``year + elapsed fraction`` as a float rounded to six decimal places.
    """
    year = date.year
    year_start = datetime(year, 1, 1, 0, 0, 0).replace(tzinfo=timezone.utc)
    next_year_start = datetime(year + 1, 1, 1, 0, 0, 0).replace(tzinfo=timezone.utc)

    elapsed_seconds = (date - year_start).total_seconds()
    # Length of the *current* year, so leap years are divided by 366 days and
    # the result never reaches year + 1 before the year is actually over.
    seconds_total_year = (next_year_start - year_start).total_seconds()

    x = (elapsed_seconds / seconds_total_year) + year
    return float("{:.6f}".format(x))

In [None]:
# prepare the ICOS cities portal format
# NOTE: `df_temp` is whatever the per-site export loop left behind (the frame of
# its last iteration). The cell overwrites `df_temp` and drops
# `creation_timestamp`, so it cannot be re-run without re-running that loop.
# NOTE(review): assumes the input already has NbPoints, Stdev, Flag and
# OriginalFlag columns - confirm against the processed parquet schema.
df_temp = (
    df_temp
    # Split the timestamp into its calendar fields and a formatted string.
    .with_columns(
        pl.col("creation_timestamp").dt.year().alias("Year"),
        pl.col("creation_timestamp").dt.month().alias("Month"),
        pl.col("creation_timestamp").dt.day().alias("Day"),
        pl.col("creation_timestamp").dt.hour().alias("Hour"),
        pl.col("creation_timestamp").dt.minute().alias("Minute"),
        pl.col("creation_timestamp").dt.second().alias("Second"),
        pl.col("creation_timestamp").dt.to_string("%Y-%m-%d %H:%M:%S").alias("#Datetime"),
    )
    # Decimal year is computed row by row in Python; the explicit return dtype
    # avoids relying on polars' dtype inference for the UDF result.
    .with_columns(
        pl.struct(["creation_timestamp"])
        .map_elements(
            lambda row: calculate_decimal_year(row["creation_timestamp"]),
            return_dtype=pl.Float64,
        )
        .alias("DecimalDate")
    )
    # Keep only the export columns, in the order they are written to the file.
    .select(["#Datetime", "Year", "Month", "Day", "Hour", "Minute", "Second", "DecimalDate", "co2", "NbPoints", "Stdev", "Flag", "OriginalFlag"])
    # Everything that is not already a string is cast to string for the CSV export.
    .with_columns(pl.exclude(pl.Utf8).cast(str))
)

In [None]:
# construct icos cities portal head
import csv

file_name = "example_site.csv"
# Lines written before the first data row: 38 "#" metadata lines plus the
# column-name line. Reported in the header and used for the total line count.
HEADER_LINES = 39
file_lines = len(df_temp) + HEADER_LINES
data_level = 1
site_short_name = 'SMAI'
site_long_name = 'Mittelschule Maisach'
latitude = 12.345678
longitude = 12.3456789
altitude = 523.1234234
sampling_height = 24.234
# First and last formatted timestamp of the export frame.
start_date = df_temp.select("#Datetime").row(0)[0]
stop_date = df_temp.select("#Datetime").row(-1)[0]

# Column-name line; must match the column order of `df_temp`.
field = ["#Datetime","Year","Month","Day","Hour","Minute","Second","DecimalDate","co2","NbPoints","Stdev","Flag","OriginalFlag"]

# NOTE(review): "TIME ZONE" names CET/CEST while the comment section states
# "Times are UTC+0" - confirm which one the portal expects.
header_lines = [
    "# TITLE: co2 - continuous time series from low and mid cost sensors",
    f"# FILE NAME: {file_name}",
    "# DATA FORMAT: see the last line of this header for column description",
    f'# TOTAL LINES: {file_lines}',
    f'# HEADER LINES: {HEADER_LINES}',
    '# PROJECT: ICOS CITIES',
    f'# DATA VERSION: L{data_level}',
    f'# STATION CODE: {site_short_name}',
    f'# STATION NAME: {site_long_name} ({site_short_name})',
    '# OBSERVATION CATEGORY: Air sampling observation at a stationary platform',
    '# COUNTRY/TERRITORY: DE',
    '# RESPONSIBLE INSTITUTE: TUM, Technical University Munich',
    '# CONTRIBUTOR:  Patrick Aigner, Klaus Kürzinger, Jia Chen',
    '# CONTACT POINT: Patrick Aigner <patrick.aigner@tum.de>, Jia Chen <jia.chen@tum.de>',
    "# FUNDING: European Union's Horizon 2020 Research and Innovation Programme, Grant Agreement No. 101037319",
    f'# LATITUDE: {latitude:.6f}',
    f'# LONGITUDE: {longitude:.6f}',
    f'# ALTITUDE: {altitude:.1f} m asl',
    f'# SAMPLING HEIGHTS: {sampling_height:.1f} m agl',
    '# PARAMETER: co2',
    f'# COVERING PERIOD: {start_date} - {stop_date}',
    '# TIME INTERVAL: hourly',
    '# MEASUREMENT UNIT: µmol/mol',
    '# MEASUREMENT METHOD: NDIR',
    '# INSTRUMENT: Vaisala GMP343',
    '# SAMPLING TYPE: continuous',
    '# TIME ZONE: Central European Time (UTC+1), Central European Summer Time (UTC+2)',
    '# MEASUREMENT SCALE: WMO-CO2-X2019',
    '# DATA POLICY: ICOS CITIES DATA is licensed under a Creative Commons Attribution 4.0 international licence (http://creativecommons.org/licenses/by/4.0/). The ICOS CITIES data licence is described at https://data.icos-cities.eu/licence.',
    '# COMMENT:',
    '#',
    '#   - Times are UTC+0',
    '#   - Time-averaged values are reported at the middle of the averaging interval.',
    "#   - Flag 'O' = data correct after manual quality control",
    "#   - Flag 'K' = data incorrect after manual quality control",
    '#   - In case of gaps between instruments, the timeseries are filled with empty string',
    '#   - Release notes: ',
    '#',
]

# Guard against the advertised header size drifting from what is really written.
if len(header_lines) + 1 != HEADER_LINES:
    raise ValueError(f"Header has {len(header_lines) + 1} lines but HEADER_LINES is {HEADER_LINES}")

# Explicit UTF-8: the header contains non-ASCII characters (µ, ü) and the
# platform default encoding differs between machines.
with open(os.path.join(DATA_DIRECTORY,"processed/acropolis_sites",file_name), 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=';', lineterminator='\n')

    # Each metadata line is written as a single-field row.
    for header_line in header_lines:
        writer.writerow([header_line])
    writer.writerow(field)

    # csv writes None as an empty string, which yields the documented
    # "filled with empty string" behaviour for null cells instead of a TypeError.
    writer.writerows(df_temp.iter_rows())
