In [1]:
import os
import glob
import polars as pl
import pandas as pd
import plotly.express as px

# local imports
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utils import hermes_download_client
from utils import ambient_parameter_conversion as apc
from utils import calibration_processing as cp

# Root folders for the pipeline data, taken from the environment.
# Either value is None when the corresponding variable is not set.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.getenv("PICARRO_DATA_DIRECTORY")

# Numeric ids of the systems iterated over in the calibration step (1..20).
sensor_id = list(range(1, 21))

In [2]:
# Toggle for the Picarro cell below: when True the raw *.dat files are merged
# again and the derived Picarro parquet files are rewritten; when False that
# cell is skipped and the previously written parquet file is loaded later on.
merge_picarro_files = False

# Process Picarro Data

In [3]:
if merge_picarro_files:
    # Fail early with a readable message instead of a TypeError while building the glob pattern.
    if not PICARRO_DATA_DIRECTORY:
        raise RuntimeError("PICARRO_DATA_DIRECTORY is not set")

    # Raw Picarro logs are looked up two directory levels below the data root.
    # Sorting makes the read order independent of the filesystem listing order.
    filenames = sorted(glob.glob(os.path.join(PICARRO_DATA_DIRECTORY, "*", "*", "*.dat")))
    if not filenames:
        raise FileNotFoundError(f"No Picarro *.dat files found below {PICARRO_DATA_DIRECTORY}")

    # read all *.dat picarro measurement files and add to single db
    # (raw string: '\s' is not a valid escape sequence in a normal Python string)
    df_list = [pd.read_csv(filename, sep=r'\s+') for filename in filenames]

    df_p_files = pd.concat(df_list, ignore_index=True)
    df_p_files["datetime"] = pd.to_datetime(df_p_files['DATE'] + ' ' + df_p_files['TIME'])
    df_p_files = df_p_files.sort_values(by='datetime')

    picarro_parquet_path = os.path.join(DATA_DIRECTORY, "input", "picarro.parquet")
    df_p_files.to_parquet(path=picarro_parquet_path)

    # Calibration
    # Linear correction applied to CO2_dry. The original note reads "after 18.12";
    # presumably the date from which these coefficients apply - TODO confirm.
    picarro_slope = 1.0060713120261249
    picarro_intercept = 0.08088569875155827

    # 1m averaged corrected Picarro dataset
    df_p_1m = (
        pl.scan_parquet(picarro_parquet_path)
        .with_columns(
            pl.col("datetime")
            .dt.cast_time_unit("us")
            .dt.replace_time_zone("UTC")
            .alias("creation_timestamp")
        )
        .sort("creation_timestamp")
        .with_columns((pl.col("CO2_dry") * picarro_slope + picarro_intercept).alias("CO2_corr"))
        .group_by_dynamic("creation_timestamp", every='1m')
        # Only the two columns kept below are averaged; every other column was
        # dropped by the following select anyway.
        .agg(pl.col("CO2_corr", "h2o_reported").mean())
        .collect()
        .select(["creation_timestamp", "CO2_corr", "h2o_reported"])
        # Constant columns so the frame carries the same column names as the
        # calibrated sensor data it is combined with later in the notebook.
        .with_columns([
            pl.lit(picarro_slope).alias("slope"),
            pl.lit(picarro_intercept).alias("intercept"),
            pl.lit("Picarro").alias("system_name"),
            pl.lit(0.0).alias("diff"),
        ])
    )

    df_p_1m.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1m_cal_corr_picarro.parquet"))

# Perform Wet-to-Dry Conversion

# Import data files

In [4]:
# Calibration bottle concentrations (preprocessed CSV).
df_gas = pl.read_csv(
    os.path.join(DATA_DIRECTORY, "input", "averaged_gases.csv")
)

# Local db acropolis.parquet, scanned lazily; rows of the "test-sensor" system are excluded.
df_raw = (
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet"))
    .filter(pl.col("system_name") != "test-sensor")
)

# Calibration slope/intercept table read from the processed folder.
df_slope_intercept = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet")
)

# 1-minute calibration-corrected Picarro data read from the processed folder.
df_p_1m = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1m_cal_corr_picarro.parquet")
)

### Measurement Data

In [5]:
# aggregate data to 1m
# Rows with a non-positive CO2, temperature, humidity or pressure reading are
# dropped before averaging.
# NOTE(review): if gmp343_temperature is in degrees Celsius, the "> 0" filter
# also discards valid sub-zero readings - confirm this is intended.
# NOTE(review): newer polars releases rename the `by=` keyword of
# group_by_dynamic to `group_by=`; left unchanged to match the installed version.
df_wet_measurements = (
    df_raw.sort("creation_timestamp")
    .filter(pl.col('gmp343_filtered') > 0)
    .filter(pl.col('gmp343_temperature') > 0)
    .filter(pl.col('sht45_humidity') > 0)
    .filter(pl.col('bme280_pressure') > 0)
    .group_by_dynamic("creation_timestamp", every='1m', by="system_name")
    .agg(pl.all().exclude(["creation_timestamp", "system_name"]).mean())
)

# perform dry conversion for measurement data
# The three derived columns only depend on the aggregated sensor columns, so
# they are computed in a single with_columns call. return_dtype is given
# explicitly so the column type does not depend on per-value inference.
# Pressure is multiplied by 100 before it is passed to the helpers
# (presumably hPa -> Pa - TODO confirm against utils.ambient_parameter_conversion).
df_dry_measurements = (
    df_wet_measurements
    .with_columns([
        # h2o_ah: result of apc.rh_to_ah for relative humidity and absolute temperature
        pl.struct(['gmp343_temperature', 'sht45_humidity'])
        .map_elements(
            lambda x: apc.rh_to_ah(
                x['sht45_humidity'],
                apc.absolute_temperature(x['gmp343_temperature']),
            ),
            return_dtype=pl.Float64,
        )
        .alias("h2o_ah"),
        # h2o_v%: result of apc.rh_to_molar_mixing scaled by 100
        pl.struct(['gmp343_temperature', 'sht45_humidity', 'bme280_pressure'])
        .map_elements(
            lambda x: apc.rh_to_molar_mixing(
                x['sht45_humidity'],
                apc.absolute_temperature(x['gmp343_temperature']),
                x['bme280_pressure'] * 100,
            ) * 100,
            return_dtype=pl.Float64,
        )
        .alias("h2o_v%"),
        # gmp343_dry: result of apc.calculate_co2dry for the wet CO2 reading
        pl.struct(['gmp343_filtered', 'gmp343_temperature', 'sht45_humidity', 'bme280_pressure'])
        .map_elements(
            lambda x: apc.calculate_co2dry(
                x['gmp343_filtered'],
                x['gmp343_temperature'],
                x['sht45_humidity'],
                x['bme280_pressure'] * 100,
            ),
            return_dtype=pl.Float64,
        )
        .alias("gmp343_dry"),
    ])
    .collect()
)

In [6]:
# Show the last three rows, restricted to the raw CO2 reading and the derived columns.
df_dry_measurements.select(
    ["creation_timestamp", "system_name", "gmp343_filtered", "h2o_ah", "h2o_v%", "gmp343_dry"]
).tail(3)

creation_timestamp,system_name,gmp343_filtered,h2o_ah,h2o_v%,gmp343_dry
"datetime[μs, UTC]",str,f64,f64,f64,f64
2024-03-28 14:15:00 UTC,"""tum-esm-midcos…",446.183333,7.912228,1.1935,451.572856
2024-03-28 14:16:00 UTC,"""tum-esm-midcos…",446.55,7.95561,1.20068,451.976796
2024-03-28 14:17:00 UTC,"""tum-esm-midcos…",445.72,7.96669,1.202043,451.142931


In [7]:
# Persist the 1-minute dry-converted measurements to the processed folder.
df_dry_measurements.write_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1m_acropolis_dry.parquet")
)

# Perform Calibration Correction

## 1m aggregated data

In [8]:
# Derive a numeric system_id from the first run of digits in system_name and
# build a short label "mid-cost-<id>" from it.
# pl.format builds the label natively instead of calling a Python f-string per
# row; rows without a numeric id get a null label rather than "mid-cost-None".
df_dry_measurements = df_dry_measurements.with_columns(
    pl.col("system_name").str.extract(r'(\d+)', 1).str.to_integer().alias("system_id")
).with_columns(
    pl.format("mid-cost-{}", pl.col("system_id")).alias("sys_name_short")
)

In [9]:
# Apply the per-system calibration (slope/intercept) to the dry CO2 values.
# The loop variable is named system_id so the builtin `id` is not shadowed for
# the rest of the kernel session.
df_systems = []

for system_id in sensor_id:
    # Calibration points of this system only; the dropped columns already
    # exist on the measurement side or are recomputed below.
    df_system_calibration = (
        df_slope_intercept.filter(pl.col("system_id") == system_id)
        .sort("creation_timestamp")
        .drop("system_id", "system_name", "date")
    )

    df_system = (
        df_dry_measurements.filter(pl.col("system_id") == system_id)
        .sort("creation_timestamp")
        # Attach the nearest calibration point within 10 minutes; other rows stay null.
        .join_asof(
            df_system_calibration,
            on="creation_timestamp",
            strategy="nearest",
            tolerance="10m",
        )
        # Fill nulls between calibration points by interpolation, then carry
        # the last value forward. Rows before the first calibration stay null.
        # NOTE(review): Expr.interpolate works on row position, not on elapsed
        # time - confirm this is acceptable when the 1m series has gaps.
        .with_columns([
            pl.col("slope").interpolate().forward_fill(),
            pl.col("intercept").interpolate().forward_fill(),
        ])
        .with_columns(
            (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("gmp343_corrected")
        )
        .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    )

    df_systems.append(df_system)

df_dry_corrected_1m = pl.concat(df_systems, how="diagonal")

In [10]:
# Persist the calibration-corrected 1-minute data to the processed folder.
df_dry_corrected_1m.write_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1m_cal_corr_acropolis.parquet")
)

In [None]:
# Reload the calibration-corrected 1-minute data from the processed folder.
df_dry_corrected_1m = pl.read_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1m_cal_corr_acropolis.parquet")
)

In [11]:
# Sensor side: keep rows with a positive corrected value and expose it as CO2_corr.
df_dry_corrected_1m_filtered = (
    df_dry_corrected_1m
    .filter(pl.col("gmp343_corrected") > 0)
    .select(
        "creation_timestamp",
        "system_name",
        pl.col("gmp343_corrected").alias("CO2_corr"),
    )
)

# Picarro side: same three columns in the same order.
df_p_1m_filtered = df_p_1m.select(["creation_timestamp", "system_name", "CO2_corr"])

# Stack both sources into one long frame for plotting.
df_plot = pl.concat([df_dry_corrected_1m_filtered, df_p_1m_filtered])

In [12]:
# Show the first five rows of the combined plot frame.
df_plot.head(5)

creation_timestamp,system_name,CO2_corr
"datetime[μs, UTC]",str,f64
2023-07-06 09:10:00 UTC,"""tum-esm-midcos…",486.328999
2023-07-06 09:11:00 UTC,"""tum-esm-midcos…",429.804312
2023-07-06 09:12:00 UTC,"""tum-esm-midcos…",426.276491
2023-07-06 09:13:00 UTC,"""tum-esm-midcos…",420.606055
2023-07-06 09:14:00 UTC,"""tum-esm-midcos…",421.912972


In [13]:
# Quality Check
# Plot the corrected CO2 of a subset of systems together with the Picarro
# series for a fixed UTC time window.

from datetime import datetime, timezone

# Ids of the systems included in the comparison.
sbs = [2, 7, 9, 10, 11, 14, 15, 17]

# Time window, built directly as timezone-aware UTC datetimes.
start = datetime(2024, 3, 22, 6, 0, 0, tzinfo=timezone.utc)
end = datetime(2024, 3, 22, 23, 59, 59, tzinfo=timezone.utc)

df_temp = df_plot.filter(
    pl.col("system_name").is_in([f"tum-esm-midcost-raspi-{n}" for n in sbs] + ["Picarro"])
    & pl.col("creation_timestamp").is_between(start, end)
)

fig = px.line(df_temp, x="creation_timestamp", y="CO2_corr", color="system_name")
fig.show()