In [31]:
import glob
import polars as pl
import os
import sys
import polars.selectors as cs

# Resolve the project folders relative to the current working directory
# (the project path is taken to be two levels above it).
PROJECT_PATH = os.path.abspath(os.path.join("..", ".."))
PIPELINE_PATH = os.path.join(PROJECT_PATH, "pipeline")
DATA_DIRECTORY = os.path.join(PROJECT_PATH, "data")

# Extend the import search path once; the membership check keeps re-running
# this cell from appending duplicate entries.
# NOTE(review): presumably `utils.paths` (imported below) lives under
# PIPELINE_PATH — confirm.
if PIPELINE_PATH not in sys.path:
    sys.path.append(PIPELINE_PATH)

from utils.paths import PROCESSED_PICARRO_DATA_DIRECTORY

# Fail early if the processed-data directory does not exist.
assert os.path.exists(PROCESSED_PICARRO_DATA_DIRECTORY)

In [33]:
def read_picarro_data(directory_path:str, device_name:str, filter:str) -> pl.DataFrame:
    """Load a device's parquet files and average them onto a regular time grid.

    Every file matching ``<directory_path>/<device_name>/*/*.parquet`` is
    scanned lazily, reduced to the columns ``datetime``, ``CO2_dry`` and
    ``h2o_reported``, and its ``datetime`` column is cast to millisecond
    resolution. The chunks are concatenated, sorted by ``datetime`` and
    aggregated into windows of width ``filter`` using the mean of all
    numeric columns.

    Args:
        directory_path: Root directory that contains one folder per device.
        device_name: Name of the device folder below ``directory_path``.
        filter: Window width passed to ``group_by_dynamic(every=...)``,
            e.g. ``"1m"``. (The name shadows the builtin ``filter``; it is
            kept for compatibility with existing keyword callers.)

    Returns:
        A DataFrame with one row per time window, sorted by ``datetime``.

    Raises:
        FileNotFoundError: If no parquet file matches the search pattern.
    """
    pattern = os.path.join(directory_path, device_name, "*/*.parquet")
    # glob returns files in arbitrary order; sorting keeps runs reproducible.
    parquet_paths = sorted(glob.glob(pattern))

    # Without this guard an empty match list reaches pl.concat, which fails
    # with an error that does not mention the missing files.
    if not parquet_paths:
        raise FileNotFoundError(f"No parquet files found matching {pattern!r}")

    lazy_chunks = [
        pl.scan_parquet(parquet_path)
        .select("datetime", "CO2_dry", "h2o_reported")
        # Align time units so chunks with differing resolutions can be concatenated.
        .with_columns(pl.col("datetime").dt.cast_time_unit("ms"))
        for parquet_path in parquet_paths
    ]

    return (
        pl.concat(lazy_chunks, how="diagonal")
        .collect()
        # group_by_dynamic requires the index column to be sorted.
        .sort("datetime")
        .group_by_dynamic("datetime", every=filter)
        .agg(cs.numeric().mean())
    )

In [34]:
# Load device DWD_Picarro_G2301_413 with a "1m" window and tag every row
# with its system name and id.
df_p_413 = read_picarro_data(
    directory_path=PROCESSED_PICARRO_DATA_DIRECTORY,
    device_name="DWD_Picarro_G2301_413",
    filter="1m",
).with_columns(
    system_name=pl.lit("Picarro_G2301"),
    system_id=pl.lit(413),
)

# Show the first and the last row to check the covered time span.
df_p_413.head(1).vstack(df_p_413.tail(1))

datetime,CO2_dry,h2o_reported,system_name,system_id
datetime[ms],f64,f64,str,i32
2023-07-01 00:00:00,433.484034,2.266186,"""Picarro_G2301""",413
2024-12-20 16:17:00,607.152171,-0.001034,"""Picarro_G2301""",413


In [35]:
# Load device ICOS_Picarro_G2401_529 with a "1m" window and tag every row
# with its system name and id.
df_p_529 = read_picarro_data(
    directory_path=PROCESSED_PICARRO_DATA_DIRECTORY,
    device_name="ICOS_Picarro_G2401_529",
    filter="1m",
).with_columns(
    system_name=pl.lit("Picarro_G2401"),
    system_id=pl.lit(529),
)

# Show the first and the last row to check the covered time span.
df_p_529.head(1).vstack(df_p_529.tail(1))

datetime,CO2_dry,h2o_reported,system_name,system_id
datetime[ms],f64,f64,str,i32
2024-09-23 11:21:00,695.688105,1.795375,"""Picarro_G2401""",529
2025-03-03 15:27:00,445.753775,0.489036,"""Picarro_G2401""",529


In [42]:
# Calibration table; the columns used later are "datetime", "Picarro ID",
# "slope" and "intercept".
df_cal = pl.read_parquet(
    os.path.join(
        PROCESSED_PICARRO_DATA_DIRECTORY,
        "picarro_slope_intercept.parquet",
    )
)

# Show the first and the last calibration entry.
df_cal.head(1).vstack(df_cal.tail(1))

datetime,Picarro ID,Bottle_1_Median,Bottle_2_Median,slope,intercept
datetime[ms],i64,f64,f64,f64,f64
2023-10-23 13:31:00,413,424.60174,607.006545,1.006388,0.065919
2025-02-24 12:55:00,529,424.605472,607.069369,1.006062,0.200554


In [44]:
def apply_calibration_correction(df_p: pl.DataFrame, df_cal: pl.DataFrame, id: int) -> pl.DataFrame:
    """Apply slope/intercept calibration to the ``CO2_dry`` measurements.

    Calibration rows for the requested device are matched to the measurement
    timestamps with a nearest-neighbour as-of join (tolerance 10 minutes).
    Between matched rows, slope and intercept are filled with
    ``Expr.interpolate()`` (row-wise, not time-weighted); rows before the
    first and after the last match take the nearest available value via
    forward/backward fill. The corrected value is
    ``CO2_dry * slope + intercept``.

    If no calibration row matches ``id`` (or none falls within the
    tolerance), slope and intercept stay null and ``picarro_corrected`` is
    null for every row.

    Args:
        df_p: Measurements with the columns ``datetime``, ``CO2_dry``,
            ``h2o_reported``, ``system_name`` and ``system_id``.
        df_cal: Calibration table with the columns ``datetime``,
            ``Picarro ID``, ``slope`` and ``intercept``.
        id: Value of ``Picarro ID`` selecting the calibration rows to use.
            (The name shadows the builtin ``id``; it is kept for
            compatibility with existing keyword callers.)

    Returns:
        A DataFrame sorted by ``datetime`` with the columns ``datetime``,
        ``system_name``, ``system_id``, ``picarro_corrected`` and
        ``h2o_reported``.
    """
    # join_asof requires both frames to be sorted by the join key, so the
    # calibration rows are sorted explicitly instead of relying on file order.
    df_cal_device = df_cal.filter(pl.col("Picarro ID") == id).sort("datetime")

    return (
        df_p.sort("datetime")
        .join_asof(df_cal_device, on="datetime", strategy="nearest", tolerance="10m")
        .with_columns(
            # Interpolate between calibration points, then extend the first
            # and last known values to the edges of the series.
            pl.col("slope").interpolate().forward_fill().backward_fill().alias("slope_interpolated"),
            pl.col("intercept").interpolate().forward_fill().backward_fill().alias("intercept_interpolated"),
        )
        .with_columns(
            (pl.col("CO2_dry") * pl.col("slope_interpolated") + pl.col("intercept_interpolated"))
            .alias("picarro_corrected")
        )
        .select("datetime", "system_name", "system_id", "picarro_corrected", "h2o_reported")
    )

In [46]:
# Calibrate the measurements of device 413.
df_corr = apply_calibration_correction(df_p=df_p_413, df_cal=df_cal, id=413)

# Export the calibrated series as loaded.
df_corr.write_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, "Calibrated_1_min_DWD_Picarro_G2301_413.parquet"))

# Export hourly means. The window must be "1h" to match the "1_h" file name;
# the previous "1m" window wrote 1-minute data into the hourly file.
df_corr.group_by_dynamic("datetime", every="1h", group_by=["system_name", "system_id"]).agg(cs.numeric().mean()) \
    .write_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, "Calibrated_1_h_DWD_Picarro_G2301_413.parquet"))

# Show the first and the last calibrated row.
df_corr.head(1).vstack(df_corr.tail(1))

datetime,system_name,system_id,picarro_corrected,h2o_reported
datetime[ms],str,i32,f64,f64
2023-07-01 00:00:00,"""Picarro_G2301""",413,436.319033,2.266186
2024-12-20 16:17:00,"""Picarro_G2301""",413,610.944729,-0.001034


In [47]:
# Calibrate the measurements of device 529.
df_corr = apply_calibration_correction(df_p=df_p_529, df_cal=df_cal, id=529)

# Export the calibrated series as loaded.
df_corr.write_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet"))

# Export hourly means. The window must be "1h" to match the "1_h" file name;
# the previous "1m" window wrote 1-minute data into the hourly file.
df_corr.group_by_dynamic("datetime", every="1h", group_by=["system_name", "system_id"]).agg(cs.numeric().mean()) \
    .write_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, "Calibrated_1_h_ICOS_Picarro_G2401_529.parquet"))

# Show the first and the last calibrated row.
df_corr.head(1).vstack(df_corr.tail(1))

datetime,system_name,system_id,picarro_corrected,h2o_reported
datetime[ms],str,i32,f64,f64
2024-09-23 11:21:00,"""Picarro_G2401""",529,700.150721,1.795375
2025-03-03 15:27:00,"""Picarro_G2401""",529,448.656505,0.489036
