In [9]:
# Imports & Inits
import os
import sys
import polars as pl
import polars.selectors as cs

# Project layout, resolved from the notebook's working directory (two levels up).
PROJECT_PATH = os.path.abspath(os.path.join("..", ".."))
PIPELINE_PATH = os.path.join(PROJECT_PATH, "pipeline")
DATA_DIRECTORY = os.path.join(PROJECT_PATH, "data")

# Put the pipeline folder on the import path (once) before importing from it;
# presumably this is where the project-local `utils` package lives.
if PIPELINE_PATH not in sys.path:
    sys.path.append(PIPELINE_PATH)

from utils.paths import PROCESSED_PICARRO_DATA_DIRECTORY


def _scan_picarro_parquet(file_name):
    """Lazily open a processed Picarro parquet file.

    The `datetime` column is cast to millisecond resolution so that all
    frames loaded through this helper share the same time unit.
    """
    parquet_path = os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, file_name)
    return pl.scan_parquet(parquet_path).with_columns(
        pl.col("datetime").dt.cast_time_unit("ms")
    )


# Lazy frames: nothing is read from disk until `.collect()` is called.
df_p_413 = _scan_picarro_parquet("DWD_Picarro_G2301_413.parquet")
df_p_529 = _scan_picarro_parquet("ICOS_Picarro_G2401_529.parquet")
df_p_cal_corr = _scan_picarro_parquet("picarro_slope_intercept.parquet")

In [10]:
# Materialize the lazily scanned calibration table so the notebook renders
# its slope/intercept records as the cell output.
df_p_cal_corr.collect()

datetime,Picarro ID,Bottle_1_Median,Bottle_2_Median,slope,intercept
datetime[ms],i64,f64,f64,f64,f64
2023-10-23 13:31:00,413,424.60174,607.006545,1.006388,0.065919
2023-12-18 15:32:00,413,424.720504,607.182719,1.006071,0.080886
2024-05-06 12:31:00,413,424.737684,607.144954,1.006374,-0.065117
2024-09-04 14:00:00,413,424.37114,606.636933,1.007156,-0.027737
2024-09-24 10:02:00,413,424.671389,607.080922,1.006362,0.006904
2024-12-03 13:43:00,413,424.494511,606.837327,1.00673,0.028601
2024-12-20 17:17:00,413,424.70576,607.157409,1.00613,0.070976
2024-09-23 13:22:00,529,424.55278,607.022187,1.006032,0.266464
2024-11-21 13:23:00,529,424.618376,607.100354,1.005962,0.229899
2025-02-24 12:55:00,529,424.605472,607.069369,1.006062,0.200554


In [11]:
# Calibration correction for the Picarro with ID 413.
#
# Steps:
#   1. Keep only this instrument's calibration records.
#   2. Attach to every measurement row the calibration record closest in time,
#      but only if it lies within 10 minutes; all other rows get null
#      slope/intercept.
#   3. Fill the nulls between calibration events by linear interpolation,
#      then hold the first/last known value constant towards both ends.
#   4. Apply corrected = CO2_dry * slope + intercept.
df_cal_corr = (
    df_p_cal_corr
    .filter(pl.col("Picarro ID") == 413)
    .sort("datetime")
)

df_p_413_corr = (
    df_p_413
    .sort("datetime")
    .join_asof(df_cal_corr, on="datetime", strategy="nearest", tolerance="10m")
    .with_columns(
        # Interpolate against the timestamp. A plain `.interpolate()` is linear
        # in row position, which skews the coefficients whenever the record
        # has gaps or an uneven sampling rate between two calibrations.
        # NOTE(review): `interpolate_by` expects a null-free `datetime`
        # column - confirm against the raw data.
        pl.col("slope").interpolate_by("datetime").alias("slope_interpolated"),
        pl.col("intercept").interpolate_by("datetime").alias("intercept_interpolated"),
    )
    .with_columns(
        # Rows before the first / after the last calibration have nothing to
        # interpolate between; reuse the nearest available coefficients.
        pl.col("slope_interpolated").forward_fill().backward_fill(),
        pl.col("intercept_interpolated").forward_fill().backward_fill(),
    )
    .with_columns(
        (
            pl.col("CO2_dry") * pl.col("slope_interpolated")
            + pl.col("intercept_interpolated")
        ).alias("picarro_corrected")
    )
    .with_columns(system_name=pl.lit("Picarro_G2301"))
    .with_columns(system_id=pl.lit(413))
    .collect()
)

# save raw file
df_p_413_corr.write_parquet(
    os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, "Calibrated_Raw_DWD_Picarro_G2301_413.parquet")
)

# 1m and 1h means of the numeric columns, one output file per window size
for window, file_name in (
    ("1m", "Calibrated_1_min_DWD_Picarro_G2301_413.parquet"),
    ("1h", "Calibrated_1_h_DWD_Picarro_G2301_413.parquet"),
):
    (
        df_p_413_corr
        .select("datetime", "system_name", "system_id", "picarro_corrected", "h2o_reported")
        .group_by_dynamic("datetime", every=window, group_by=["system_name", "system_id"])
        .agg(cs.numeric().mean())
        .write_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, file_name))
    )

# Show the first and last row as a quick sanity check of the covered period.
df_p_413_corr.head(1).vstack(df_p_413_corr.tail(1))

In [12]:
# Calibration correction for the Picarro with ID 529.
#
# Steps:
#   1. Keep only this instrument's calibration records.
#   2. Attach to every measurement row the calibration record closest in time,
#      but only if it lies within 10 minutes; all other rows get null
#      slope/intercept.
#   3. Fill the nulls between calibration events by linear interpolation,
#      then hold the first/last known value constant towards both ends.
#   4. Apply corrected = CO2_dry * slope + intercept.
df_cal_corr = (
    df_p_cal_corr
    .filter(pl.col("Picarro ID") == 529)
    .sort("datetime")
)

df_p_529_corr = (
    df_p_529
    .sort("datetime")
    .join_asof(df_cal_corr, on="datetime", strategy="nearest", tolerance="10m")
    .with_columns(
        # Interpolate against the timestamp. A plain `.interpolate()` is linear
        # in row position, which skews the coefficients whenever the record
        # has gaps or an uneven sampling rate between two calibrations.
        # NOTE(review): `interpolate_by` expects a null-free `datetime`
        # column - confirm against the raw data.
        pl.col("slope").interpolate_by("datetime").alias("slope_interpolated"),
        pl.col("intercept").interpolate_by("datetime").alias("intercept_interpolated"),
    )
    .with_columns(
        # Rows before the first / after the last calibration have nothing to
        # interpolate between; reuse the nearest available coefficients.
        pl.col("slope_interpolated").forward_fill().backward_fill(),
        pl.col("intercept_interpolated").forward_fill().backward_fill(),
    )
    .with_columns(
        (
            pl.col("CO2_dry") * pl.col("slope_interpolated")
            + pl.col("intercept_interpolated")
        ).alias("picarro_corrected")
    )
    .with_columns(system_name=pl.lit("Picarro_G2401"))
    .with_columns(system_id=pl.lit(529))
    .collect()
)

# save raw file
df_p_529_corr.write_parquet(
    os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, "Calibrated_Raw_ICOS_Picarro_G2401_529.parquet")
)

# 1m and 1h means of the numeric columns, one output file per window size
for window, file_name in (
    ("1m", "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet"),
    ("1h", "Calibrated_1_h_ICOS_Picarro_G2401_529.parquet"),
):
    (
        df_p_529_corr
        .select("datetime", "system_name", "system_id", "picarro_corrected", "h2o_reported")
        .group_by_dynamic("datetime", every=window, group_by=["system_name", "system_id"])
        .agg(cs.numeric().mean())
        .write_parquet(os.path.join(PROCESSED_PICARRO_DATA_DIRECTORY, file_name))
    )

# Show the first and last row as a quick sanity check of the covered period.
df_p_529_corr.head(1).vstack(df_p_529_corr.tail(1))

DATE,TIME,FRAC_DAYS_SINCE_JAN1,FRAC_HRS_SINCE_JAN1,JULIAN_DAYS,EPOCH_TIME,ALARM_STATUS,INST_STATUS,CavityPressure,CavityTemp,DasTemp,EtalonTemp,WarmBoxTemp,species,MPVPosition,OutletValve,CO,CO2,CO2_dry,CH4,CH4_dry,H2O,h2o_reported,co2_base,ch4_base,wlm2_offset,co2_pzt_std,ch4_pzt_std,b_h2o_pct,peak_14,peak84_raw,datetime,__index_level_0__,Picarro ID,Bottle_1_Median,Bottle_2_Median,slope,intercept,slope_interpolated,intercept_interpolated,picarro_corrected,system_name,system_id
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ms],i64,i64,f64,f64,f64,f64,f64,f64,f64,str,i32
"""2024-09-23""","""11:21:13.016""",266.473067,6395.353616,267.473067,1727100000.0,0.0,963.0,140.010322,44.854485,37.625,44.7799,44.805504,1.0,0.0,32230.379956,0.0,725.528657,0.0,0.0,0.0,0.0,0.0,761.902019,0.0,0.0,570.310965,0.0,0.0,1031.44608,0.0,2024-09-23 11:21:13.016,17234767,,,,,,1.006032,0.266464,0.266464,"""Picarro_G2401""",529
"""2025-03-03""","""15:27:57.660""",61.644417,1479.466017,62.644417,1741000000.0,0.0,963.0,139.997345,44.998833,39.0625,45.102409,44.99865,3.0,0.0,32583.15625,0.150781,443.572592,446.22185,2.081851,2.092025,0.381206,0.487782,760.844567,913.553978,-0.033572,81.322562,118.540749,0.51286,630.539621,0.336018,2025-03-03 15:27:57.660,241953,,,,,,1.006062,0.200554,449.127417,"""Picarro_G2401""",529
