In [129]:
import os
import glob
import polars as pl
import pandas as pd
import plotly.express as px

from utils import hermes_download_client
from utils import ambient_parameter_conversion as apc
from utils import calibration_processing as cp

DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")

sensor_id = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]

# customize pipeline
download_files = False
merge_picarro_files = False

# Download to local db

In [130]:
# download from hermes database
# Use Download/download_from_hermes notebook
if download_files:
    
    while(True):
        try:
            component = hermes_download_client.Extract()
            result = component.execute()
            break
        except Exception as e:
            print(e)

# Process Picarro Data

In [131]:
if merge_picarro_files:
    filenames = glob.glob(PICARRO_DATA_DIRECTORY + "/*/*/*.dat")

    # read all *.dat picarro measurement files and add to single db
    df_list = []
    for filename in filenames:
        df_list.append(pd.read_csv(filename,sep='\s+'))

    df_p_files = pd.concat(df_list, ignore_index=True)
    df_p_files["datetime"] = pd.to_datetime((df_p_files['DATE'] + ' ' + df_p_files['TIME']))
    df_p_files.sort_values(by='datetime', inplace = True)

    df_p_files.to_parquet(path = os.path.join(DATA_DIRECTORY, "input", "picarro.parquet"))

    #Calibration

    # TODO: Add ability for multiple calibration dates
    # before 23.10
    # picarro_slope = 1.0061589132696314
    # picarro_intercept = 0.14607153970888476

    # after 23.10
    #picarro_slope = 1.0063874771746113
    #picarro_intercept = 0.06621464961165202
    
     
    #after 18.12
    picarro_slope = 1.0060713120261249
    picarro_intercept = 0.08088569875155827
    
    #1m averaged corrected Picarro dataset
    df_p_1m = pl.scan_parquet(os.path.join(DATA_DIRECTORY,"input", "picarro.parquet")) \
        .with_columns(pl.col("datetime").dt.cast_time_unit("us").dt.replace_time_zone("UTC").alias("creation_timestamp")) \
        .sort("creation_timestamp") \
        .with_columns((pl.col("CO2_dry") * picarro_slope + picarro_intercept).alias("CO2_corr")) \
        .group_by_dynamic("creation_timestamp", every='1m') \
        .agg(pl.all().exclude("creation_timestamp").mean()).collect() \
        .select(["creation_timestamp", "CO2_corr", "h2o_reported"]) \
        .with_columns([pl.lit(picarro_slope).alias("slope"),
            pl.lit(picarro_intercept).alias("intercept"),
            pl.lit("Picarro").alias("sys_name_short"),
            pl.lit(0.0).alias("diff")]
            ) 

    df_p_1m.write_parquet(os.path.join(DATA_DIRECTORY,"processed", "1m_cal_corr_picarro.parquet"))
    
    
    #1h averaged corrected Picarro dataset
    df_p_10m = df_p_1m.sort("creation_timestamp") \
        .group_by_dynamic("creation_timestamp", every='10m') \
        .agg(pl.all().exclude("creation_timestamp").mean())
        
    df_p_10m.write_parquet(os.path.join(DATA_DIRECTORY,"processed", "10m_cal_corr_picarro.parquet"))
    
    #1h averaged corrected Picarro dataset
    df_p_1h = df_p_1m.sort("creation_timestamp") \
        .group_by_dynamic("creation_timestamp", every='1h') \
        .agg(pl.all().exclude("creation_timestamp").mean())
        
    df_p_1h.write_parquet(os.path.join(DATA_DIRECTORY,"processed", "1h_cal_corr_picarro.parquet"))

# Import data files

In [132]:
# load calibration bottle concentrations (preprocessed)
df_gas = pl.read_csv(os.path.join(DATA_DIRECTORY,"input", "averaged_gases.csv"))
# load local db: acropolis.parquet
df_raw = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "acropolis.parquet")) \
    .filter(pl.col("system_name") != "test-sensor") \
    .with_columns(pl.col("system_name").str.extract(r'(\d+)',1).str.to_integer().alias("system_id"))
    
df_p_1m = pl.read_parquet(os.path.join(DATA_DIRECTORY,"processed", "1m_cal_corr_picarro.parquet"))  

# Perform Dry-Wet Conversion

### Measurement Data

In [133]:
# aggregate data to 1m
df_1_m = df_raw.sort("creation_timestamp") \
    .select(pl.all().exclude('^wxt532_.*$')) \
    .select(pl.all().exclude('^cal_.*$')) \
    .select(pl.all().exclude('^enclosure_.*$')) \
    .select(pl.all().exclude('^raspi_.*$')) \
    .select(pl.all().exclude('^ups_.*$')) \
    .filter(pl.col('gmp343_filtered') > 0) \
    .filter(pl.col('gmp343_temperature') > 0) \
    .filter(pl.col('sht45_humidity') > 0) \
    .filter(pl.col('bme280_pressure') > 0) \
    .group_by_dynamic("creation_timestamp", every='1m', by= "system_id")  \
    .agg(pl.all().exclude(["creation_timestamp","system_id"]).mean()) \
    
# perform dry conversion for measurement data                
df_1_m = df_1_m.with_columns(pl.struct(['gmp343_temperature','sht45_humidity']) \
    .map_elements(lambda x: apc.rh_to_ah(x['sht45_humidity'],apc.absolute_temperature(x['gmp343_temperature'])))
    .alias("h2o_ah")) \
    .with_columns(pl.struct(['gmp343_temperature','sht45_humidity','bme280_pressure'])
    .map_elements(lambda x: (apc.rh_to_molar_mixing(x['sht45_humidity'],apc.absolute_temperature(x['gmp343_temperature']),x['bme280_pressure']*100))*100) \
    .alias("h2o_v%")) \
    .with_columns(pl.struct(['gmp343_filtered','gmp343_temperature','sht45_humidity','bme280_pressure']) \
    .map_elements(lambda x: apc.calculate_co2dry(x['gmp343_filtered'],x['gmp343_temperature'],x['sht45_humidity'],x['bme280_pressure']*100))
    .alias("gmp343_dry")).collect()

In [None]:
df_1_m

system_id,creation_timestamp,system_name,gmp343_raw,gmp343_compensated,gmp343_filtered,gmp343_temperature,sht45_humidity,sht45_temperature,bme280_humidity,bme280_temperature,bme280_pressure,revision,receipt_timestamp,h2o_ah,h2o_v%,gmp343_dry
i64,"datetime[μs, UTC]",str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,"datetime[ns, UTC]",f64,f64,f64
7,2023-06-26 13:50:00 UTC,,447.633333,494.3,494.3,28.666667,34.293333,27.815,34.733333,27.67,947.113333,,,9.681615,1.423876,501.439884
7,2023-06-26 13:51:00 UTC,,451.1,498.116667,498.116667,28.766667,34.341667,27.823333,34.775,27.673333,948.611667,,,9.748392,1.431907,505.352849
7,2023-06-26 13:53:00 UTC,,451.516667,500.15,500.15,28.833333,34.296667,27.831667,34.746667,27.68,948.365,,,9.771131,1.435937,507.436469
7,2023-06-26 13:54:00 UTC,,457.7,506.4,506.4,28.8,34.28,27.84,34.72,27.68,947.5,,,9.748621,1.433779,513.766272
7,2023-06-26 14:29:00 UTC,,464.15,517.625,517.625,30.375,30.47,30.145,30.8225,29.9825,946.7075,,,9.438548,1.396584,524.956457
7,2023-06-26 14:30:00 UTC,,462.65,514.833333,514.833333,30.416667,30.573333,30.203333,30.781667,30.046667,948.613333,,,9.491868,1.401844,522.153106
7,2023-06-26 14:31:00 UTC,,456.0,508.05,508.05,30.5,30.368333,30.295,30.57,30.13,947.39,,,9.47068,1.400905,515.268424
7,2023-06-26 14:32:00 UTC,,456.2,508.166667,508.166667,30.5,30.253333,30.378333,30.451667,30.201667,947.246667,,,9.434816,1.395812,515.360123
7,2023-06-26 14:33:00 UTC,,453.471429,504.685714,504.685714,30.542857,30.081429,30.47,30.278571,30.317143,948.428571,,,9.402898,1.389552,511.797406
7,2023-06-27 08:39:00 UTC,,526.2,573.1,573.1,25.9,32.91,25.63,33.74,25.5,951.54,,,7.973691,1.156536,579.805664


In [None]:
df_1_m.tail(3).select("creation_timestamp","system_id","gmp343_filtered", "h2o_ah", "h2o_v%" ,"gmp343_dry")

creation_timestamp,system_id,gmp343_filtered,h2o_ah,h2o_v%,gmp343_dry
"datetime[μs, UTC]",i64,f64,f64,f64,f64
2024-07-11 07:12:00 UTC,17,460.416667,20.122035,3.048661,474.894594
2024-07-11 07:13:00 UTC,17,458.7,20.176543,3.060798,473.183182
2024-07-11 07:14:00 UTC,17,458.78,20.155355,3.066917,473.295583


In [None]:
#df_1_m.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1m_acropolis_dry.parquet"))

### Calibration Data

In [None]:
df_dry_calibration = df_raw.filter(pl.col("cal_gmp343_filtered") > 0) \
    .filter(pl.col("cal_gmp343_temperature") > 0) \
    .filter(pl.col("cal_bme280_pressure") > 0) \
    .with_columns(pl.col("cal_sht45_humidity").fill_null(0.0)) \
    .with_columns(pl.struct(['cal_gmp343_temperature','cal_sht45_humidity','cal_bme280_pressure'])
    .map_elements(lambda x: apc.rh_to_molar_mixing(x['cal_sht45_humidity'],apc.absolute_temperature(x['cal_gmp343_temperature']),x['cal_bme280_pressure']*100)) \
    .alias("cal_h2o_v%")) \
    .with_columns(pl.struct(['cal_gmp343_filtered','cal_gmp343_temperature','cal_sht45_humidity','cal_bme280_pressure']) \
    .map_elements(lambda x: apc.calculate_co2dry(x['cal_gmp343_filtered'],x['cal_gmp343_temperature'],x['cal_sht45_humidity'],x['cal_bme280_pressure']*100))
    .alias("cal_gmp343_dry")) \
    .collect()

In [None]:
df_dry_calibration.tail(3).select("creation_timestamp","system_id","cal_gmp343_filtered", "cal_h2o_v%", "cal_gmp343_dry")

creation_timestamp,system_id,cal_gmp343_filtered,cal_h2o_v%,cal_gmp343_dry
"datetime[μs, UTC]",i64,f64,f64,f64
2024-07-11 03:35:57.620 UTC,1,443.6,0.000307,443.736384
2024-07-11 03:36:07.590 UTC,1,447.6,0.000314,447.740407
2024-07-11 03:36:17.590 UTC,1,443.2,0.000319,443.341496


# Calculate Calibration Correction

In [None]:
df_slope_intercept = df_dry_calibration.filter((pl.col("cal_bottle_id") > 0) & (pl.col("cal_bottle_id") < 81)) \
    .join(df_gas.cast({"cal_bottle_id": pl.Float64}), on = ["cal_bottle_id"], how= "left") \
    .with_columns(diff = pl.col("cal_gmp343_dry") - pl.col("cal_bottle_CO2")) \
    .with_columns((pl.col("creation_timestamp").dt.date()).alias("date")) \
    .sort("date") \
    .group_by([pl.col("date"), pl.col("system_id"), pl.col("cal_bottle_id")]) \
    .agg([
        pl.col("cal_sht45_humidity"),
        pl.col("cal_gmp343_temperature"),
        pl.col("cal_bme280_pressure"),
        pl.col("cal_gmp343_filtered"),
        pl.col("cal_gmp343_dry"),
        pl.col("cal_bottle_CO2").last(),
        pl.col("creation_timestamp").last(),
        ]) \
    .with_columns([
        (pl.col("cal_sht45_humidity").list.len()).alias("len"),
        pl.when(pl.col("cal_bottle_CO2") > 460).then(pl.lit("high")).otherwise(pl.lit("low")).alias("cal_bottle_span"),
        pl.col("cal_gmp343_dry").map_elements(lambda x: cp.process_bottle(x)).alias("median_cal_gmp343_dry"),
        pl.col("cal_gmp343_temperature").map_elements(lambda x: cp.process_bottle(x)).alias("median_cal_gmp343_temperature"),
        pl.col("cal_bme280_pressure").map_elements(lambda x: cp.process_bottle(x)).alias("median_cal_bme280_pressure"),
        pl.col("cal_sht45_humidity").map_elements(lambda x: cp.process_bottle(x)).alias("median_cal_sht45_humidity")
        ]) \
    .with_columns([
        (pl.col("median_cal_gmp343_dry") - pl.col("cal_bottle_CO2")).alias("diff_cut_avg_gmp343_dry")
                  ]) \
    .filter(pl.col("median_cal_gmp343_dry") > 0) \
    .sort("cal_bottle_span") \
    .group_by(["date", "system_id"]) \
    .agg([
        pl.col("cal_bottle_span"),
        pl.col("median_cal_gmp343_dry"),
        pl.col("cal_bottle_CO2"),
        pl.col("creation_timestamp").last()
        ]) \
    .with_columns(pl.struct(['median_cal_gmp343_dry','cal_bottle_CO2']) \
    .map_elements(lambda x: cp.two_point_calibration(x['median_cal_gmp343_dry'],x['cal_bottle_CO2'])) \
    .alias('slope, intercept')) \
    .with_columns([(pl.col("slope, intercept").list.first()).alias("slope"),
                   (pl.col("slope, intercept").list.last()).alias("intercept"),
                   (pl.col("median_cal_gmp343_dry").list.first()).alias("median_bottle_1"),
                   (pl.col("median_cal_gmp343_dry").list.last()).alias("median_bottle_2"),
                   ]) \
    .select("creation_timestamp", "date", "system_id", "slope", "intercept","median_bottle_1", "median_bottle_2") \
    .filter(pl.col("slope") > 0)

In [None]:
df_slope_intercept.head(3)

creation_timestamp,date,system_id,slope,intercept,median_bottle_1,median_bottle_2
"datetime[μs, UTC]",date,i64,f64,f64,f64,f64
2023-07-15 17:32:15.020 UTC,2023-07-15,9,1.010083,-22.747309,820.15,431.752038
2023-07-18 20:42:17.880 UTC,2023-07-18,5,1.030215,2.883579,781.8,391.95
2023-08-05 13:20:59.530 UTC,2023-08-05,11,1.014272,-7.965972,802.244525,409.178784


In [None]:
# safe results to parquet
df_slope_intercept.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet"))

In [None]:
fig = px.scatter(df_slope_intercept.filter((pl.col("slope") > 0.7) & (pl.col("slope") < 1.1)), x="creation_timestamp", y = "slope", color = "system_id")
fig.show()
fig = px.histogram(df_slope_intercept.filter((pl.col("slope") > 0.7) & (pl.col("slope") < 1.1)), x="slope", color = "system_id")
fig.show()
fig = px.histogram(df_slope_intercept.filter((pl.col("intercept") < 100) & (pl.col("intercept") > -100)), x="intercept", color = "system_id")
fig.show()

# Perform Calibration Correction

## 1m aggregated data

In [None]:
df_1_m = df_1_m.with_columns(pl.struct(["system_id"]) \
    .map_elements(lambda x: f"acropolis-{x['system_id']}") \
    .alias("sys_name_short")) \
    .drop("system_name")

In [None]:
df_wind = df_raw.select(pl.col("^(wxt532_.*|creation_timestamp|system_id)$")).filter(pl.col('wxt532_direction_avg') > 0).collect()

In [None]:
df_systems = []

for id in sensor_id:
    df_system_calibration = df_slope_intercept.filter(pl.col("system_id") == id) \
        .sort("creation_timestamp") \
        .drop("system_id", "system_name", "date")
    
    df_system = df_1_m.filter(pl.col("system_id") == id) \
        .sort("creation_timestamp") \
        .join_asof(df_system_calibration, on="creation_timestamp", strategy="nearest", tolerance="10m") \
        .join_asof(df_wind.filter(pl.col("system_id") == id).sort("creation_timestamp"), on="creation_timestamp", strategy="nearest", tolerance="10m") \
        .join(df_p_1m.select("creation_timestamp", "CO2_corr"), on = ["creation_timestamp"], how= "left") \
        .with_columns([
            pl.col("slope").interpolate(),
            pl.col("intercept").interpolate()
            ]) \
        .with_columns([
            pl.col("slope").forward_fill(),
            pl.col("intercept").forward_fill()
            ]) \
        .with_columns(((pl.col("gmp343_dry")) * pl.col("slope") + pl.col("intercept")).alias("gmp343_corrected")) \
        .with_columns((pl.col("CO2_corr") - pl.col("gmp343_corrected")).alias("diff")) \
        .with_columns((pl.col("creation_timestamp").dt.date()).alias("date"))
    
    df_systems.append(df_system)
        

df_1_m = pl.concat(df_systems, how="diagonal")

In [None]:
df_1_m.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1m_cal_corr_acropolis.parquet"))

## 10m aggregated data

In [None]:
df_10_m = df_1_m.sort("creation_timestamp") \
        .group_by_dynamic("creation_timestamp", every='10m', by=["system_name", "sys_name_short"]) \
        .agg(pl.all().exclude(["creation_timestamp","system_name","sys_name_short"]).mean())
        
df_10_m.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "10m_cal_corr_acropolis.parquet"))

## 1h aggregated data

In [None]:
# TODO: implement despiking algorithm

In [None]:
df_1_h = df_1_m.sort("creation_timestamp") \
        .group_by_dynamic("creation_timestamp", every='1h', by=["system_name", "sys_name_short"]) \
        .agg(pl.all().exclude(["creation_timestamp","system_name","sys_name_short"]).mean())
        
df_1_h.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_cal_corr_acropolis.parquet"))

# Plot data

In [None]:
df_plot_1 = df_p_1m[["creation_timestamp", "sys_name_short", "CO2_corr"]].rename({"CO2_corr": "CO2"})

In [None]:
df_plot_2 = df_1_m[["creation_timestamp", "sys_name_short", "gmp343_corrected"]].rename({"gmp343_corrected": "CO2"})

In [None]:
df_plot = pl.concat([df_plot_1,df_plot_2])

In [None]:
# Quality Check

from datetime import datetime, timezone

start = datetime(2024, 3, 16, 10, 0, 0).replace(tzinfo=timezone.utc)
end = datetime(2024, 3, 16, 22, 0, 0).replace(tzinfo=timezone.utc)

df_temp = df_plot.filter(pl.col("creation_timestamp").is_between(start, end)).filter(pl.col("sys_name_short").is_in(["acropolis-7", "Picarro"]))

fig = px.scatter(df_temp, x="creation_timestamp", y = "CO2", color = "sys_name_short")
fig.show()