In [9]:
import os
import glob
import polars as pl
import pandas as pd

from utils import hermes_download_client
from utils import ambient_parameter_conversion as apc

# Root of the pipeline's data tree. Every input/output path in this notebook is
# built from it, so fail right here with a clear message when it is missing
# instead of letting os.path.join(None, ...) raise an opaque TypeError.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
if DATA_DIRECTORY is None:
    raise EnvironmentError("Environment variable DATA_DIRECTORY is not set.")

# Root of the raw Picarro *.dat files. Only read when `merge_picarro_files`
# is True, so it is allowed to be unset at this point.
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")

# NOTE(review): not referenced by any cell visible in this notebook.
sensor_id = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20]

# customize pipeline
download_files = True        # run the hermes download cell
merge_picarro_files = True   # rebuild the Picarro parquet files from the *.dat files

# load calibration bottle concentrations (preprocessed);
# the calibration helpers look up "CO2_dry" by "Bottle_ID" in this table
df_gas = pl.read_csv(os.path.join(DATA_DIRECTORY, "input", "averaged_gases.csv"))
# load local db: acropolis.parquet (lazy scan, nothing is read yet)
df = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet"))

# Download to local db

In [8]:
# download from hermes database
# Use Download/download_from_hermes notebook
# Only runs when the `download_files` switch of the configuration cell is True.
if download_files:
    # NOTE(review): judging by the recorded cell output, execute() downloads new
    # data and merges it into the local db — confirm in utils.hermes_download_client.
    component = hermes_download_client.Extract()
    # `result` is not used by any cell visible in this notebook.
    result = component.execute()

Start downloading from datetime: 
2024-02-23 09:12:51.994106+00:00
Performing merge.
Deleting merged chunks.


# Process Picarro Data

In [11]:
if merge_picarro_files:
    if PICARRO_DATA_DIRECTORY is None:
        raise EnvironmentError("Environment variable PICARRO_DATA_DIRECTORY is not set.")

    # Measurement files live three directory levels below the Picarro root.
    # Sorted so the merge order is reproducible between runs.
    filenames = sorted(glob.glob(os.path.join(PICARRO_DATA_DIRECTORY, "*", "*", "*.dat")))
    if not filenames:
        # pd.concat([]) would otherwise fail with "No objects to concatenate".
        raise FileNotFoundError(
            f"No Picarro *.dat files found below {PICARRO_DATA_DIRECTORY!r}"
        )

    # read all *.dat picarro measurement files and add to single db
    # (raw string: '\s' is not a valid escape sequence in a normal string literal)
    df_list = [pd.read_csv(filename, sep=r"\s+") for filename in filenames]

    df_p_files = pd.concat(df_list, ignore_index=True)
    df_p_files["datetime"] = pd.to_datetime(df_p_files["DATE"] + " " + df_p_files["TIME"])
    df_p_files = df_p_files.sort_values(by="datetime")

    picarro_parquet = os.path.join(DATA_DIRECTORY, "input", "picarro.parquet")
    df_p_files.to_parquet(path=picarro_parquet)

    # Calibration

    # TODO: Add ability for multiple calibration dates
    # before 23.10
    # picarro_slope = 1.0061589132696314
    # picarro_intercept = 0.14607153970888476

    # after 23.10
    # picarro_slope = 1.006374633215469
    # picarro_intercept = 0.0709482571842841

    # after 18.12
    picarro_slope = 1.0060429925902534
    picarro_intercept = 0.09305508001614271

    # Only these measured columns are averaged and exported.
    picarro_value_columns = ["CO2_corr", "h2o_reported"]

    # Constant bookkeeping columns attached to every exported Picarro row.
    # They are re-attached after each aggregation instead of being averaged:
    # `system_name` is a string column and has no mean.
    picarro_constants = [
        pl.lit(picarro_slope).alias("slope"),
        pl.lit(picarro_intercept).alias("intercept"),
        pl.lit("Picarro").alias("system_name"),
        pl.lit(0.0).alias("diff"),
    ]

    # 10m averaged corrected Picarro dataset
    # NOTE(review): the naive Picarro timestamps are labelled as UTC without
    # conversion — confirm the instrument logs in UTC.
    df_p_10m = (
        pl.scan_parquet(picarro_parquet)
        .with_columns([
            pl.col("datetime")
            .dt.cast_time_unit("us")
            .dt.replace_time_zone("UTC")
            .alias("creation_timestamp"),
            (pl.col("CO2_dry") * picarro_slope + picarro_intercept).alias("CO2_corr"),
        ])
        .sort("creation_timestamp")
        .group_by_dynamic("creation_timestamp", every="10m")
        # Average only the exported columns; the remaining raw columns
        # (including the DATE/TIME strings) were dropped afterwards anyway.
        .agg(pl.col(picarro_value_columns).mean())
        .select(["creation_timestamp"] + picarro_value_columns)
        .collect()
        .with_columns(picarro_constants)
    )

    df_p_10m.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_picarro.parquet"))

    # 1h averaged corrected Picarro dataset (mean of the 10-minute means)
    df_p_1h = (
        df_p_10m.sort("creation_timestamp")
        .group_by_dynamic("creation_timestamp", every="1h")
        .agg(pl.col(picarro_value_columns).mean())
        .with_columns(picarro_constants)
    )

    df_p_1h.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_picarro.parquet"))

In [None]:
# Preview the first three rows of the 10-minute Picarro frame.
# NOTE: df_p_10m is only assigned inside the `if merge_picarro_files:` branch of
# the previous cell, so this cell needs that switch to be True.
df_p_10m.head(3)

# Perform Dry-Wet Conversion

In [13]:
# --- Row-wise conversion helpers (each receives one struct row as a dict) ---

def _row_h2o_ah(row):
    """Apply apc.rh_to_ah to the row's humidity and absolute temperature."""
    return apc.rh_to_ah(
        row['sht45_humidity'],
        apc.absolute_temperature(row['gmp343_temperature']),
    )


def _row_h2o_ppm(row):
    """Apply apc.rh_to_molar_mixing to humidity, absolute temperature and pressure * 100."""
    # NOTE(review): the factor 100 presumably converts hPa to Pa — confirm.
    return apc.rh_to_molar_mixing(
        row['sht45_humidity'],
        apc.absolute_temperature(row['gmp343_temperature']),
        row['bme280_pressure']*100,
    )


def _row_co2_dry(row):
    """Apply apc.calculate_co2dry to the filtered CO2 reading and ambient parameters."""
    return apc.calculate_co2dry(
        row['gmp343_filtered'],
        row['gmp343_temperature'],
        row['sht45_humidity'],
        row['bme280_pressure']*100,
    )


# Keep only rows where all four inputs are strictly positive.
# NOTE(review): this also drops every reading with gmp343_temperature <= 0 —
# confirm that is intended if the sensor reports degrees Celsius.
_all_inputs_positive = (
    (pl.col('gmp343_filtered') > 0)
    & (pl.col('gmp343_temperature') > 0)
    & (pl.col('sht45_humidity') > 0)
    & (pl.col('bme280_pressure') > 0)
)

# aggregate data to 10m (mean per system and 10-minute window)
df_wet_station = (
    df.select(["creation_timestamp", "system_name", 'gmp343_filtered','gmp343_temperature','sht45_humidity','bme280_pressure'])
    .sort("creation_timestamp")
    .filter(_all_inputs_positive)
    .group_by_dynamic("creation_timestamp", every='10m', by= "system_name")
    .agg(pl.all().exclude(["creation_timestamp","system_name"]).mean())
)

# derive the humidity quantities and the dry CO2 value, then persist the result
(
    df_wet_station
    .with_columns(
        pl.struct(['gmp343_temperature','sht45_humidity'])
        .map_elements(_row_h2o_ah)
        .alias("h2o_ah")
    )
    .with_columns(
        pl.struct(['gmp343_temperature','sht45_humidity','bme280_pressure'])
        .map_elements(_row_h2o_ppm)
        .alias("h2o_ppm")
    )
    .with_columns(
        pl.struct(['gmp343_filtered','gmp343_temperature','sht45_humidity','bme280_pressure'])
        .map_elements(_row_co2_dry)
        .alias("gmp343_dry")
    )
    .select(["creation_timestamp", "system_name", "gmp343_dry", "h2o_ah", "h2o_ppm","gmp343_temperature","bme280_pressure","sht45_humidity"])
    .collect()
    .write_parquet(os.path.join(DATA_DIRECTORY, "processed", "acropolis_dry.parquet"))
)

# Process Calibration Data

In [14]:
# define functions

def average_bottle(data):
    """Average the tail part of one calibration-bottle measurement series.

    The number of samples decides which averaging window is used (labels
    follow the original author's comments):

    * ``50 < n < 70``   -> "2nd bottle": samples from 30 % to 95 % of the series
    * ``70 <= n < 130`` -> "1st bottle": samples from 50 % to 95 % of the series

    A series of exactly 70 samples used to fall between the two ranges and
    was rejected. It is now handled with the 50 %-95 % window, which is a
    subset of the 30 %-95 % window and therefore the conservative choice for
    either bottle.

    Args:
        data: object exposing ``to_list()`` (the notebook passes the polars
            Series of ``cal_gmp343_filtered`` readings of one group).

    Returns:
        The mean of the selected window, or ``0.0`` when the series length is
        outside both ranges. The calibration cell discards non-positive means.
    """
    samples = data.to_list()
    n_samples = len(samples)

    if 50 < n_samples < 70:
        # 2nd bottle
        start_fraction = 0.3
    elif 70 <= n_samples < 130:
        # 1st bottle (also covers the former gap at exactly 70 samples)
        start_fraction = 0.5
    else:
        # too short or too long to be a regular calibration run
        return 0.0

    window = samples[int(n_samples * start_fraction):int(n_samples * 0.95)]
    return sum(window) / len(window)
    
def two_point_calibration(measured_values, true_values):
    """Fit the line ``true = slope * measured + intercept`` through two points.

    Args:
        measured_values: the two measured readings.
        true_values: the two reference values, in the same order.

    Returns:
        Tuple ``(slope, intercept)``.

    Raises:
        ValueError: if either argument does not have exactly two entries.
        ZeroDivisionError: for plain Python numbers, when both measured
            readings are equal.
    """
    if not (len(measured_values) == 2 and len(true_values) == 2):
        raise ValueError("Both measured_values and true_values must have length 2")

    meas_first, meas_second = measured_values[0], measured_values[1]
    true_first, true_second = true_values[0], true_values[1]

    # rise over run between the two calibration points
    slope = (true_second - true_first) / (meas_second - meas_first)
    # solve y_true = m * y_meas + t for t at the first point
    intercept = true_first - slope * meas_first

    return slope, intercept

def _bottle_calibration(meas_low, meas_high, id_low, id_high):
    """Shared implementation of calc_slope / calc_intercept.

    Looks up the reference "CO2_dry" value of both bottles in ``df_gas`` (rows
    matched on "Bottle_ID") and runs the two-point calibration against the
    measured means.

    Returns:
        ``(slope, intercept)``, or ``None`` when either measured mean is None.
    """
    if meas_low is None or meas_high is None:
        return None

    bottles_meas = [meas_low, meas_high]
    bottles_true = [
        df_gas.filter(pl.col("Bottle_ID") == bottle_id)["CO2_dry"][0]
        for bottle_id in (id_low, id_high)
    ]

    return two_point_calibration(bottles_meas, bottles_true)


def calc_slope(meas_low, meas_high, id_low, id_high):
    """Return the calibration slope for one low/high bottle pair.

    Args:
        meas_low: measured mean of the low-span bottle (or None).
        meas_high: measured mean of the high-span bottle (or None).
        id_low: "Bottle_ID" of the low-span bottle in ``df_gas``.
        id_high: "Bottle_ID" of the high-span bottle in ``df_gas``.

    Returns:
        The slope, or ``None`` when either measured mean is None.
    """
    calibration = _bottle_calibration(meas_low, meas_high, id_low, id_high)
    return None if calibration is None else calibration[0]


def calc_intercept(meas_low, meas_high, id_low, id_high):
    """Return the calibration intercept for one low/high bottle pair.

    Takes the same arguments as ``calc_slope``.

    Returns:
        The intercept, or ``None`` when either measured mean is None.
    """
    calibration = _bottle_calibration(meas_low, meas_high, id_low, id_high)
    return None if calibration is None else calibration[1]

In [15]:
# Preview: add a calendar `date` column derived from creation_timestamp and
# show the first rows of the local db.
df.with_columns((pl.col("creation_timestamp").dt.date()).alias("date")).head().collect()

creation_timestamp,system_name,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,wxt532_speed_avg,wxt532_speed_min,wxt532_speed_max,wxt532_direction_avg,wxt532_direction_min,wxt532_direction_max,wxt532_last_update_time,raspi_cpu_usage,raspi_cpu_temperature,raspi_disk_usage,enclosure_bme280_humidity,enclosure_bme280_pressure,enclosure_bme280_temperature,sht45_humidity,sht45_temperature,bme280_humidity,bme280_temperature,bme280_pressure,cal_bottle_id,cal_gmp343_raw,cal_gmp343_compensated,cal_gmp343_filtered,cal_gmp343_temperature,cal_bme280_temperature,cal_bme280_humidity,cal_bme280_pressure,cal_sht45_temperature,cal_sht45_humidity,revision,receipt_timestamp,raspi_memory_usage,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,ups_battery_error_detected,ups_battery_above_voltage_threshold,ups_battery_is_fully_charged,ups_powered_by_grid,date
"datetime[μs, UTC]",str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,"datetime[ns, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64,date
2023-06-01 00:00:00.960 UTC,"""tum-esm-midcos…",,,,,,,,,,,,0.031,54.5,0.426,15.1,956.48,33.77,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01
2023-06-01 00:00:02.250 UTC,"""tum-esm-midcos…",,,,,0.7,0.2,1.0,356.0,156.0,255.0,1685600000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01
2023-06-01 00:02:11.070 UTC,"""tum-esm-midcos…",,,,,,,,,,,,0.032,54.0,0.426,15.15,956.43,33.69,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01
2023-06-01 00:02:12.350 UTC,"""tum-esm-midcos…",,,,,1.1,0.2,1.6,56.0,112.0,231.0,1685600000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01
2023-06-01 00:04:21.160 UTC,"""tum-esm-midcos…",,,,,,,,,,,,0.031,54.0,0.426,15.18,956.4,33.66,,,,,,,,,,,,,,,,,,,,,,,,,,,2023-06-01


In [21]:
# Bottle means below this value are treated as the low-span bottle, means above
# it as the high-span bottle.
# NOTE(review): presumably ppm CO2 — confirm.
CAL_LOW_HIGH_SPLIT = 460

# --- one row per (date, system, bottle): collect the readings and average them ---
# Only the four needed columns and the calibration rows are materialised; the
# group contents are unchanged because `cal_bottle_id` is a grouping key.
df_cal_per_bottle = (
    df.select(["creation_timestamp", "system_name", "cal_bottle_id", "cal_gmp343_filtered"])
    .filter(pl.col("cal_bottle_id") > 0)
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .collect()
    .lazy()
    .group_by(["date", "system_name", "cal_bottle_id"])
    .agg([
        pl.col("cal_gmp343_filtered").drop_nulls(),
        pl.col("creation_timestamp").last(),
    ])
    # perform averaging
    .with_columns(
        pl.col("cal_gmp343_filtered")
        .map_elements(average_bottle, return_dtype=pl.Float64)
        .alias("mean_cal")
    )
    # average_bottle() marks unusable runs with 0.0. Drop them here: otherwise
    # they count as "low" (0.0 < split) and their bottle id is added to the
    # low-bottle id sum, so the wrong reference value would be looked up.
    .filter(pl.col("mean_cal") > 0.0)
)

# --- one row per (date, system): identify low and high span bottle ---
is_low_bottle = pl.col("mean_cal") < CAL_LOW_HIGH_SPLIT
is_high_bottle = pl.col("mean_cal") > CAL_LOW_HIGH_SPLIT

df_cal_daily = (
    df_cal_per_bottle
    .with_columns([
        pl.when(is_low_bottle).then(pl.col("mean_cal")).otherwise(None).alias("mean_cal_low"),
        pl.when(is_high_bottle).then(pl.col("mean_cal")).otherwise(None).alias("mean_cal_high"),
        pl.when(is_low_bottle).then(pl.col("cal_bottle_id")).otherwise(None).alias("id_cal_bottle_low"),
        pl.when(is_high_bottle).then(pl.col("cal_bottle_id")).otherwise(None).alias("id_cal_bottle_high"),
    ])
    # `date` already is a Date column, no further conversion needed
    .group_by(["date", "system_name"])
    # sum() merges the low row and the high row of a day into one row; this
    # relies on at most one valid low and one valid high bottle per day/system
    .agg([
        pl.col("mean_cal_low").sum(),
        pl.col("mean_cal_high").sum(),
        pl.col("id_cal_bottle_low").sum(),
        pl.col("id_cal_bottle_high").sum(),
        pl.col("creation_timestamp").last(),
    ])
    # filter for days that have a valid calibration for both bottles
    .sort("date")
    .filter(pl.col("mean_cal_low") > 0.0)
    .filter(pl.col("mean_cal_high") > 0.0)
)

# --- calculate slope and intercept ---
calibration_inputs = pl.struct(['mean_cal_low','mean_cal_high','id_cal_bottle_low','id_cal_bottle_high'])

df_cal = (
    df_cal_daily
    .with_columns([
        calibration_inputs
        .map_elements(
            lambda x: calc_slope(x['mean_cal_low'], x['mean_cal_high'], x['id_cal_bottle_low'], x['id_cal_bottle_high']),
            return_dtype=pl.Float64,
        )
        .alias('slope'),
        calibration_inputs
        .map_elements(
            lambda x: calc_intercept(x['mean_cal_low'], x['mean_cal_high'], x['id_cal_bottle_low'], x['id_cal_bottle_high']),
            return_dtype=pl.Float64,
        )
        .alias('intercept'),
    ])
    .select(["date", "system_name", "slope", "intercept", "creation_timestamp"])
    .collect()
)

# save results to parquet
df_cal.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet"))
df_cal.tail()

date,system_name,slope,intercept,creation_timestamp
date,str,f64,f64,"datetime[μs, UTC]"
2024-02-23,"""tum-esm-midcos…",1.004392,-33.840128,2024-02-23 03:31:30.780 UTC
2024-02-23,"""tum-esm-midcos…",0.98298,-38.761326,2024-02-23 03:32:13.520 UTC
2024-02-23,"""tum-esm-midcos…",1.024995,-31.269105,2024-02-23 03:31:29.590 UTC
2024-02-23,"""tum-esm-midcos…",1.019868,-41.202191,2024-02-23 03:21:17.860 UTC
2024-02-23,"""tum-esm-midcos…",1.03498,-5.535651,2024-02-23 03:20:33.140 UTC


# Perform Calibration Correction

In [17]:
# Lazily re-open every input of the calibration correction from disk.
_processed_dir = os.path.join(DATA_DIRECTORY, "processed")

# raw station data (reduced version for calibration correction)
df_raw = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet"))

# dry-converted 10-minute station data
df_dry = pl.scan_parquet(os.path.join(_processed_dir, "acropolis_dry.parquet"))

# per-day calibration slope / intercept of every system
df_cal = pl.scan_parquet(os.path.join(_processed_dir, "slope_intercept_acropolis.parquet"))

# corrected 10-minute Picarro data
df_p_10m = pl.scan_parquet(os.path.join(_processed_dir, "10m_cal_corr_picarro.parquet"))

## 10m aggregated data

In [18]:
# Lookup tables joined onto the aggregated station data.
_calibration_lookup = df_cal.select("date","system_name","slope","intercept")
_dry_lookup = df_dry.select("creation_timestamp","system_name", "gmp343_dry")
_picarro_lookup = df_p_10m.select("creation_timestamp", "CO2_corr")

# Mean of every raw column per system and 10-minute window, plus the calendar
# date that keys the calibration table.
_station_10m = (
    df_raw.sort("creation_timestamp")
    .group_by_dynamic("creation_timestamp", every='10m', by= "system_name")
    .agg(pl.all().exclude(["creation_timestamp","system_name"]).mean())
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
)

df_raw_agg_10m = (
    _station_10m
    .join(_calibration_lookup, on = ["date","system_name"], how= "left")
    .join(_dry_lookup, on = ["creation_timestamp","system_name"], how= "left")
    .join(_picarro_lookup, on = ["creation_timestamp"], how= "left")
    .with_columns([
        # numeric id taken from the first run of digits in the system name
        pl.col("system_name").str.extract(r'(\d+)',1).str.to_integer().alias("system_id"),
        # apply the day's calibration line to the dry CO2 value
        ((pl.col("gmp343_dry")) * pl.col("slope") + pl.col("intercept")).alias("gmp343_corrected"),
    ])
    .with_columns([
        # difference between the Picarro value and the corrected station value
        (pl.col("CO2_corr") - pl.col("gmp343_corrected")).alias("diff"),
    ])
    .collect()
)

df_raw_agg_10m.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_acropolis.parquet"))


In [19]:
# Hourly means per system. This is what the "1h_..." file name promises and
# what the Picarro counterpart ("1h_cal_corr_picarro.parquet") contains.
# Previously the daily means were written under this hourly file name.
# NOTE(review): confirm that consumers of this file expect hourly rows.
df_raw_agg_1h = (
    df_raw_agg_10m.sort("creation_timestamp")
    .group_by_dynamic("creation_timestamp", every='1h', by="system_name")
    .agg(pl.all().exclude(["creation_timestamp","system_name"]).mean())
)

df_raw_agg_1h.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_acropolis.parquet"))

# Daily means per system, kept in memory for the overview plots.
df_raw_agg_1d = (
    df_raw_agg_10m.sort("creation_timestamp")
    .group_by_dynamic("creation_timestamp", every='1d', by="system_name")
    .agg(pl.all().exclude(["creation_timestamp","system_name"]).mean())
)

In [20]:
import plotly.express as px

# One line chart per quantity of the daily aggregate, one trace per system:
# (column of df_raw_agg_1d, figure title)
_overview_plots = [
    ("gmp343_corrected", "CO2"),
    ("diff", "Diff"),
    ("slope", "slope"),
    ("intercept", "intercept"),
]

for _column, _title in _overview_plots:
    fig = px.line(
        df_raw_agg_1d,
        x="creation_timestamp",
        y=_column,
        markers=True,
        title=_title,
        color="system_name",
    )
    fig.show()
