In [1]:
import os
import glob
from datetime import datetime
from datetime import timezone
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px

from download import extract
from utils import processing_utils as pu

# Root folders for the pipeline data and the raw Picarro files. Both come from
# the environment and are None when the variable is not set.
DATA_DIRECTORY = os.getenv("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.getenv("PICARRO_DATA_DIRECTORY")

# Numeric suffixes of the "tum-esm-midcost-raspi-<n>" systems processed below
# (1 through 16, plus 18 and 20).
sensor_id = list(range(1, 17)) + [18, 20]

# customize pipeline
download_files = True          # run the download cell
merge_picarro_files = False    # rebuild the Picarro parquet files from the *.dat files

# load calibration bottle concentrations (preprocessed); read eagerly
df_gas = pl.read_csv(
    os.path.join(DATA_DIRECTORY, "input", "averaged_gases.csv")
)

# load local db: acropolis.parquet; scanned lazily, nothing is read yet
df = pl.scan_parquet(
    os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet")
)

# Download to local db

In [2]:
# Download from the hermes database; only runs when `download_files` is set in the
# configuration cell.
# NOTE(review): the original note here reads "Use Download/download_from_hermes notebook" --
# presumably that notebook is an alternative way to run this step; confirm.
if download_files:
    # Build the project's extract component and run it; the return value is kept in `result`.
    component = extract.Extract()
    result = component.execute()

Start downloading from datetime: 
2024-02-20 12:56:14.221898+00:00
2024-02-20 13:12:47.262879+00:00
2024-02-20 13:29:18.085299+00:00
2024-02-20 13:45:49.170653+00:00
2024-02-20 14:02:17.614481+00:00
2024-02-20 14:18:06.463232+00:00
2024-02-20 14:33:18.381416+00:00
2024-02-20 14:48:46.930536+00:00
2024-02-20 15:03:30.274214+00:00
2024-02-20 15:18:41.264162+00:00
2024-02-20 15:33:29.041352+00:00
2024-02-20 15:48:11.754161+00:00
2024-02-20 16:03:12.297150+00:00
2024-02-20 16:17:54.352815+00:00
2024-02-20 16:32:45.099620+00:00
2024-02-20 16:47:30.413754+00:00
2024-02-20 17:02:15.479428+00:00
2024-02-20 17:17:01.004375+00:00
2024-02-20 17:31:44.059652+00:00
2024-02-20 17:46:28.901341+00:00
2024-02-20 18:01:13.282297+00:00
2024-02-20 18:15:58.436474+00:00
2024-02-20 18:30:41.573054+00:00
2024-02-20 18:45:25.192721+00:00
2024-02-20 19:00:10.440829+00:00
2024-02-20 19:14:53.109382+00:00
2024-02-20 19:29:38.781256+00:00
2024-02-20 19:44:22.611697+00:00
2024-02-20 19:59:03.030723+00:00
2024-02-2

# Process Picarro Data

In [3]:
def _aggregate_picarro(every, slope, intercept):
    """Build one time-averaged, calibration-corrected Picarro table.

    Scans input/picarro.parquet below DATA_DIRECTORY, labels the `datetime`
    column as UTC (stored as `creation_timestamp`), applies the linear
    correction ``CO2_corr = CO2_dry * slope + intercept`` and averages all
    columns over consecutive windows of width `every`.

    Args:
        every: Window width as understood by polars, e.g. "1h" or "10m".
        slope: Slope of the Picarro calibration correction.
        intercept: Intercept of the Picarro calibration correction.

    Returns:
        A polars DataFrame with the columns creation_timestamp, CO2_corr,
        h2o_reported, slope, intercept, system_name (always "Picarro") and
        diff (always 0.0).
    """
    return (
        pl.scan_parquet(os.path.join(DATA_DIRECTORY, "input", "picarro.parquet"))
        .with_columns(
            pl.col("datetime")
            .dt.cast_time_unit("us")
            .dt.replace_time_zone("UTC")
            .alias("creation_timestamp")
        )
        # groupby_dynamic needs the index column to be sorted
        .sort("creation_timestamp")
        .with_columns((pl.col("CO2_dry") * slope + intercept).alias("CO2_corr"))
        .groupby_dynamic("creation_timestamp", every=every)
        .agg(pl.all().exclude("creation_timestamp").mean())
        .collect()
        .select(["creation_timestamp", "CO2_corr", "h2o_reported"])
        .with_columns([
            pl.lit(slope).alias("slope"),
            pl.lit(intercept).alias("intercept"),
            pl.lit("Picarro").alias("system_name"),
            pl.lit(0.0).alias("diff"),
        ])
    )


if merge_picarro_files:
    filenames = glob.glob(PICARRO_DATA_DIRECTORY + "/*/*/*.dat")
    if not filenames:
        # pd.concat([]) would otherwise fail with an unrelated-looking message
        raise FileNotFoundError(
            f"No Picarro *.dat files found below {PICARRO_DATA_DIRECTORY!r}"
        )

    # read all *.dat picarro measurement files and add to single db
    # (raw string: '\s' is not a valid escape sequence in a normal string literal)
    df_list = [pd.read_csv(filename, sep=r'\s+') for filename in filenames]

    df_p_files = pd.concat(df_list, ignore_index=True)
    df_p_files["datetime"] = pd.to_datetime(df_p_files['DATE'] + ' ' + df_p_files['TIME'])
    df_p_files = df_p_files.sort_values(by='datetime')

    df_p_files.to_parquet(path=os.path.join(DATA_DIRECTORY, "input", "picarro.parquet"))

    # Calibration

    # TODO: Add ability for multiple calibration dates
    # before 23.10
    # picarro_slope = 1.0061589132696314
    # picarro_intercept = 0.14607153970888476

    # after 23.10
    # picarro_slope = 1.006374633215469
    # picarro_intercept = 0.0709482571842841

    # after 18.12
    picarro_slope = 1.0060429925902534
    picarro_intercept = 0.09305508001614271

    # 1h averaged corrected Picarro dataset
    df_p_1h = _aggregate_picarro("1h", picarro_slope, picarro_intercept)
    df_p_1h.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_picarro.parquet"))

    # 10m averaged corrected Picarro dataset
    df_p_10m = _aggregate_picarro("10m", picarro_slope, picarro_intercept)
    df_p_10m.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_picarro.parquet"))

In [4]:
# Show the first row of the merged Picarro parquet file to inspect its columns and dtypes.
(
    pl.scan_parquet(os.path.join(DATA_DIRECTORY, "input", "picarro.parquet"))
    .head(1)
    .collect()
)

DATE,TIME,FRAC_DAYS_SINCE_JAN1,FRAC_HRS_SINCE_JAN1,JULIAN_DAYS,EPOCH_TIME,ALARM_STATUS,INST_STATUS,CavityPressure,CavityTemp,DasTemp,EtalonTemp,species,OutletValve,CH4,CH4_dry,CO2,CO2_dry,h2o_reported,ch4_base,ch4_pzt_std,co2_base,co2_pzt_std,wlm1_offset,wlm2_offset,datetime,__index_level_0__
str,str,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns],i64
"""2023-06-23""","""00:00:01.489""",173.000017,4152.000414,174.000017,1687500000.0,0,963,139.983005,44.999779,43.375,45.151695,2.0,22398.639168,1.96398,2.009492,425.117804,437.149219,2.186075,1182.223133,84.583058,1085.124996,71.320269,-0.079042,-0.047342,2023-06-23 00:00:01.489,13303676


# Perform Dry-Wet Conversion

In [5]:
# Show the first row of the lazily scanned acropolis.parquet to inspect its columns and dtypes.
df.head(1).collect()

creation_timestamp,system_name,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,wxt532_speed_avg,wxt532_speed_min,wxt532_speed_max,wxt532_direction_avg,wxt532_direction_min,wxt532_direction_max,wxt532_last_update_time,raspi_cpu_usage,raspi_cpu_temperature,raspi_disk_usage,enclosure_bme280_humidity,enclosure_bme280_pressure,enclosure_bme280_temperature,sht45_humidity,sht45_temperature,bme280_humidity,bme280_temperature,bme280_pressure,cal_bottle_id,cal_gmp343_raw,cal_gmp343_compensated,cal_gmp343_filtered,cal_gmp343_temperature,cal_bme280_temperature,cal_bme280_humidity,cal_bme280_pressure,cal_sht45_temperature,cal_sht45_humidity,revision,receipt_timestamp,raspi_memory_usage,wxt532_temperature,wxt532_heating_voltage,wxt532_supply_voltage,wxt532_reference_voltage,ups_battery_error_detected,ups_battery_above_voltage_threshold,ups_battery_is_fully_charged,ups_powered_by_grid
"datetime[μs, UTC]",str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,"datetime[ns, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,f64
2023-06-01 00:00:00.960 UTC,"""tum-esm-midcos…",,,,,,,,,,,,0.031,54.5,0.426,15.1,956.48,33.77,,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
# Per-station lazy queries; nothing is computed until .collect() below.
df_wet_stations = []

for station_id in sensor_id:
    # aggregate data to 10m means, keeping only rows where all four inputs are positive
    df_wet_station = (
        df.filter(pl.col("system_name") == f"tum-esm-midcost-raspi-{station_id}")
        .select(["creation_timestamp", "system_name", 'gmp343_filtered', 'gmp343_temperature', 'sht45_humidity', 'bme280_pressure'])
        .sort("creation_timestamp")
        .filter(pl.col('gmp343_filtered') > 0)
        .filter(pl.col('gmp343_temperature') > 0)
        .filter(pl.col('sht45_humidity') > 0)
        .filter(pl.col('bme280_pressure') > 0)
        .groupby_dynamic("creation_timestamp", every='10m')
        .agg([
            pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
            pl.col("system_name"),
        ])
        # system_name was collected as a list per window; reduce it back to a scalar
        .with_columns(pl.col("system_name").list.last())
    )

    df_wet_stations.append(df_wet_station)

# concat all station specific aggregated dfs -- once, after the loop, so the
# conversion and the parquet write below run a single time instead of once per station
df_wet_concat = pl.concat(df_wet_stations, how="diagonal")

# Derive the water columns and the dry CO2 value row by row with the helpers from
# processing_utils, then persist the result.
# NOTE(review): bme280_pressure is multiplied by 100 before being passed on --
# presumably a hPa -> Pa conversion; confirm against processing_utils.
(
    df_wet_concat
    .with_columns(
        pl.struct(['gmp343_temperature', 'sht45_humidity'])
        .apply(lambda x: pu.rh_to_ah(x['sht45_humidity'], pu.absolute_temperature(x['gmp343_temperature'])))
        .alias("h2o_ah")
    )
    .with_columns(
        pl.struct(['gmp343_temperature', 'sht45_humidity', 'bme280_pressure'])
        .apply(lambda x: pu.rh_to_molar_mixing(x['sht45_humidity'], pu.absolute_temperature(x['gmp343_temperature']), x['bme280_pressure']*100))
        .alias("h2o_ppm")
    )
    .with_columns(
        pl.struct(['gmp343_filtered', 'gmp343_temperature', 'sht45_humidity', 'bme280_pressure'])
        .apply(lambda x: pu.calculate_co2dry(x['gmp343_filtered'], x['gmp343_temperature'], x['sht45_humidity'], x['bme280_pressure']*100))
        .alias("gmp343_dry")
    )
    .select(["creation_timestamp", "system_name", "gmp343_dry", "h2o_ah", "h2o_ppm", "gmp343_temperature", "bme280_pressure", "sht45_humidity"])
    .collect()
    .write_parquet(os.path.join(DATA_DIRECTORY, "processed", "acropolis_dry.parquet"))
)

ImportError: cannot import name 're_escape' from 'polars.utils.various' (/Users/patrickaigner/Documents/PROJECTS/acropolis-visualisation/.venv/lib/python3.11/site-packages/polars/utils/various.py)

# Process Calibration Data

In [None]:
# define functions

def average_bottle(data):
    """Average the trailing part of one calibration-bottle run.

    Args:
        data: Object exposing ``to_list()`` (a polars Series in this notebook)
            holding the readings of a single bottle run.

    Returns:
        The mean of the readings between a start fraction and 95 % of the run:
        the start fraction is 0.3 for runs with 51-69 readings (labelled
        "2nd bottle" in the original) and 0.5 for runs with 71-129 readings
        ("1st bottle"). Any other length yields 0.0.
    """
    samples = data.to_list()
    count = len(samples)

    if 50 < count < 70:
        # 2nd bottle
        start_fraction = 0.3
    elif 70 < count < 130:
        # 1st bottle
        start_fraction = 0.5
    else:
        # NOTE(review): a run of exactly 70 readings also ends up here -- confirm
        # whether that gap between the two ranges is intended.
        return 0.0

    window = samples[int(count * start_fraction):int(count * 0.95)]
    return sum(window) / len(window)
    
def two_point_calibration(measured_values, true_values):
    """Fit the line ``y_true = slope * y_meas + intercept`` through two points.

    Args:
        measured_values: Two measured values.
        true_values: The two corresponding reference values.

    Returns:
        Tuple ``(slope, intercept)``.

    Raises:
        ValueError: If either argument does not have exactly two elements.
        ZeroDivisionError: If both measured values are equal (for plain numbers).
    """
    if not (len(measured_values) == 2 and len(true_values) == 2):
        raise ValueError("Both measured_values and true_values must have length 2")

    meas_first, meas_second = measured_values[0], measured_values[1]
    true_first, true_second = true_values[0], true_values[1]

    # slope from the two differences, intercept from the first point
    slope = (true_second - true_first) / (meas_second - meas_first)
    intercept = true_first - slope * meas_first

    return slope, intercept

def calc_slope(meas_low, meas_high, id_low, id_high):
    """Return the two-point calibration slope for one pair of bottle runs.

    Args:
        meas_low: Averaged reading of the low-span bottle, or None.
        meas_high: Averaged reading of the high-span bottle, or None.
        id_low: Bottle_ID of the low-span bottle, looked up in `df_gas`.
        id_high: Bottle_ID of the high-span bottle, looked up in `df_gas`.

    Returns:
        The slope from `two_point_calibration`, or None when either averaged
        reading is missing.
    """
    # identity check: `== None` is the non-idiomatic way to test for a missing value
    if meas_low is None or meas_high is None:
        return None

    bottles_meas = [meas_low, meas_high]
    # reference CO2_dry concentration of each bottle (first matching row in df_gas)
    bottles_true = [
        df_gas.filter(pl.col("Bottle_ID") == bottle_id)["CO2_dry"][0]
        for bottle_id in (id_low, id_high)
    ]

    slope, _ = two_point_calibration(bottles_meas, bottles_true)

    return slope

def calc_intercept(meas_low, meas_high, id_low, id_high):
    """Return the two-point calibration intercept for one pair of bottle runs.

    Args:
        meas_low: Averaged reading of the low-span bottle, or None.
        meas_high: Averaged reading of the high-span bottle, or None.
        id_low: Bottle_ID of the low-span bottle, looked up in `df_gas`.
        id_high: Bottle_ID of the high-span bottle, looked up in `df_gas`.

    Returns:
        The intercept from `two_point_calibration`, or None when either averaged
        reading is missing.
    """
    # identity check: `== None` is the non-idiomatic way to test for a missing value
    if meas_low is None or meas_high is None:
        return None

    bottles_meas = [meas_low, meas_high]
    # reference CO2_dry concentration of each bottle (first matching row in df_gas)
    bottles_true = [
        df_gas.filter(pl.col("Bottle_ID") == bottle_id)["CO2_dry"][0]
        for bottle_id in (id_low, id_high)
    ]

    _, intercept = two_point_calibration(bottles_meas, bottles_true)

    return intercept

In [None]:
# Preview the first rows with the calendar date derived from creation_timestamp.
(
    df
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .head()
    .collect()
)

In [None]:

# Collect, per day / system / calibration bottle, the non-null calibration readings
# together with the last timestamp of the group. Rows without a bottle id > 0 are dropped.
# NOTE(review): .collect().lazy() materialises the whole dataset before grouping --
# presumably a workaround; confirm whether it is still needed.
df_cal = (
    df.with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .collect()
    .lazy()
    .groupby([pl.col("date"), pl.col("system_name"), pl.col("cal_bottle_id")])
    .agg([
        pl.col("cal_gmp343_filtered").drop_nulls(),
        pl.col("creation_timestamp").last(),
    ])
    .filter(pl.col("cal_bottle_id") > 0)
)

# perform averaging
df_cal = df_cal.with_columns(
    pl.col("cal_gmp343_filtered").apply(average_bottle).alias("mean_cal")
)

# identify low and high span bottle: readings below the threshold count as the low
# bottle, readings above it as the high bottle (a mean of exactly 460 is neither)
span_threshold = 460
is_low_span = pl.col("mean_cal") < span_threshold
is_high_span = pl.col("mean_cal") > span_threshold

# NOTE(review): the per-day values are merged with sum(); if several runs of one day
# fall into the same class, their means and bottle ids are added together -- confirm
# that at most one valid run per class and day can occur.
df_cal = (
    df_cal.with_columns([
        pl.when(is_low_span).then(pl.col("mean_cal")).otherwise(None).alias("mean_cal_low"),
        pl.when(is_high_span).then(pl.col("mean_cal")).otherwise(None).alias("mean_cal_high"),
        pl.when(is_low_span).then(pl.col("cal_bottle_id")).otherwise(None).alias("id_cal_bottle_low"),
        pl.when(is_high_span).then(pl.col("cal_bottle_id")).otherwise(None).alias("id_cal_bottle_high"),
    ])
    .groupby([pl.col("date").dt.date(), pl.col("system_name")])
    .agg([
        pl.col("mean_cal_low").sum(),
        pl.col("mean_cal_high").sum(),
        pl.col("id_cal_bottle_low").sum(),
        pl.col("id_cal_bottle_high").sum(),
        pl.col("creation_timestamp").last(),
    ])
)

# calculate slope and intercept

# filter for days that have a valid calibration for both bottles
df_cal = (
    df_cal.sort(pl.col("date"))
    .filter(pl.col("mean_cal_low") > 0.0)
    .filter(pl.col("mean_cal_high") > 0.0)
)

# calculate slope
df_cal = df_cal.with_columns(
    pl.struct(['mean_cal_low', 'mean_cal_high', 'id_cal_bottle_low', 'id_cal_bottle_high'])
    .apply(lambda x: calc_slope(x['mean_cal_low'], x['mean_cal_high'], x['id_cal_bottle_low'], x['id_cal_bottle_high']))
    .alias('slope')
)

# calculate intercept and keep only the columns needed downstream
df_cal = (
    df_cal.with_columns(
        pl.struct(['mean_cal_low', 'mean_cal_high', 'id_cal_bottle_low', 'id_cal_bottle_high'])
        .apply(lambda x: calc_intercept(x['mean_cal_low'], x['mean_cal_high'], x['id_cal_bottle_low'], x['id_cal_bottle_high']))
        .alias('intercept')
    )
    .select(["date", "system_name", "slope", "intercept", "creation_timestamp"])
)

# single materialisation of the whole lazy pipeline
df_cal = df_cal.collect()

# save results to parquet
df_cal.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet"))
df_cal.tail()

# Perform Calibration Correction

In [None]:
# reduced version for calibration correction
df_dry = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "acropolis_dry.parquet"))
    
df_cal = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet"))

df_p_10m = pl.read_parquet(os.path.join(DATA_DIRECTORY,"processed", "10m_cal_corr_picarro.parquet"))   
df_p_1h = pl.read_parquet(os.path.join(DATA_DIRECTORY,"processed", "1h_cal_corr_picarro.parquet"))

In [None]:
# Show the last calibration rows stored for system tum-esm-midcost-raspi-1.
(
    df_cal
    .filter(pl.col("system_name") == "tum-esm-midcost-raspi-1")
    .tail()
    .collect()
)

## 1h aggregated data

In [None]:
# reduce timestamp to date DD.XX.YYYY for measurement dataset and join slope and intercept from df_cal
# Reduce each measurement timestamp to its calendar date and left-join that day's
# slope and intercept from df_cal; rows with non-positive gmp343_dry are dropped first.
df_date = (
    df_dry
    .filter(pl.col("gmp343_dry") > 0)
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
)

# Picarro CO2 serves as reference for the offset; renamed so it does not clash
# with the station's own CO2_corr column in the join below.
picarro_ref_1h = (
    df_p_1h
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "temp"})
)

# The Picarro rows are the first entry of the combined output.
l_df_cal_corr = [df_p_1h]

for station_id in sensor_id:
    station_name = f"tum-esm-midcost-raspi-{station_id}"

    station_1h = (
        df_date
        .filter(pl.col("system_name") == station_name)
        # broadcast: nulls (e.g. days without calibration) are filled backward, then forward
        .fill_null(strategy="backward")
        .fill_null(strategy="forward")
        # calibration correction: via column operation
        .with_columns(
            (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
        )
        .sort("creation_timestamp")
        # aggregation: mean per 1h window
        .groupby_dynamic("creation_timestamp", every='1h')
        .agg([
            pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
            pl.col("system_name"),
        ])
        .with_columns(pl.col("system_name").list.last())
        .collect()
    )

    # offset calculation to reference instrument PICARRO
    df_cal_corr = (
        station_1h
        .join(picarro_ref_1h, on="creation_timestamp", how="left")
        .with_columns((pl.col("CO2_corr") - pl.col("temp")).alias("diff"))
        .drop("temp")
    )

    l_df_cal_corr.append(df_cal_corr)

df_cal_corr_agg = pl.concat(l_df_cal_corr, how="diagonal")
df_cal_corr_agg.write_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_acropolis.parquet")
)


In [None]:
# One line plot per column of the 1h aggregated result, coloured by system.
# plotly.express is already imported as `px` in the notebook's import cell.
# NOTE(review): the "diff" plot carries the title "CO2", same as the CO2_corr plot --
# confirm whether a distinct title was intended.
for column, title in (
    ("CO2_corr", "CO2"),
    ("diff", "CO2"),
    ("slope", "slope"),
    ("intercept", "intercept"),
):
    fig = px.line(
        df_cal_corr_agg,
        x="creation_timestamp",
        y=column,
        markers=True,
        title=title,
        color="system_name",
    )
    fig.show()


## 10m aggregated data

In [None]:
# reduce timestamp to date DD.XX.YYYY for measurement dataset and join slope and intercept from df_cal
# Reduce each measurement timestamp to its calendar date and left-join that day's
# slope and intercept from df_cal.
df_date = (
    df_dry
    .with_columns(pl.col("creation_timestamp").dt.date().alias("date"))
    .join(df_cal, on=["date", "system_name"], how="left")
)

# Picarro CO2 serves as reference for the offset; renamed so it does not clash
# with the station's own CO2_corr column in the join below.
picarro_ref_10m = (
    df_p_10m
    .select("creation_timestamp", "CO2_corr")
    .rename({"CO2_corr": "temp"})
)

# The Picarro rows are the first entry of the combined output.
l_df_cal_corr = [df_p_10m]

for station_id in sensor_id:
    station_name = f"tum-esm-midcost-raspi-{station_id}"

    station_10m = (
        df_date
        .filter(pl.col("system_name") == station_name)
        .filter(pl.col("gmp343_dry") > 0)
        # broadcast: nulls (e.g. days without calibration) are filled backward, then forward
        .fill_null(strategy="backward")
        .fill_null(strategy="forward")
        # calibration correction: via column operation
        .with_columns(
            (pl.col("gmp343_dry") * pl.col("slope") + pl.col("intercept")).alias("CO2_corr")
        )
        .sort("creation_timestamp")
        # aggregation: mean per 10m window
        .groupby_dynamic("creation_timestamp", every='10m')
        .agg([
            pl.all().exclude(["creation_timestamp", "system_name"]).mean(),
            pl.col("system_name"),
        ])
        .with_columns(pl.col("system_name").list.last())
        .collect()
    )

    # offset calculation to reference instrument PICARRO
    df_cal_corr = (
        station_10m
        .join(picarro_ref_10m, on="creation_timestamp", how="left")
        .with_columns((pl.col("CO2_corr") - pl.col("temp")).alias("diff"))
        .drop("temp")
    )

    l_df_cal_corr.append(df_cal_corr)

df_cal_corr_agg = pl.concat(l_df_cal_corr, how="diagonal")
df_cal_corr_agg.write_parquet(
    os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_acropolis.parquet")
)
