In [161]:
from datetime import datetime
import glob
import polars as pl
import os
import plotly.express as px
import sys

# Project root: two directory levels above the current working directory.
PROJECT_PATH = os.path.abspath(os.path.join("..", ".."))
PIPELINE_PATH = os.path.join(PROJECT_PATH, "pipeline")
DATA_DIRECTORY = os.path.join(PROJECT_PATH, "data")


# Put the pipeline directory on the import path (only once, even if the cell is
# re-run). The `utils` imports below presumably resolve from there, so this has
# to happen before them.
if PIPELINE_PATH not in sys.path:
    sys.path.append(PIPELINE_PATH)

from utils.paths import PROCESSED_PICARRO_DATA_DIRECTORY
from utils.calibration_processing import two_point_calibration, process_bottle

# Fail early if the processed Picarro data directory does not exist.
assert(os.path.exists(PROCESSED_PICARRO_DATA_DIRECTORY))

In [162]:
def read_picarro_data(directory_path:str, device_name:str) -> pl.DataFrame:
    """Load all parquet chunks of one Picarro device into a single DataFrame.

    Files are discovered with the glob pattern
    ``<directory_path>/<device_name>/*/*.parquet``. From every file only the
    columns ``datetime``, ``CO2_dry`` and ``h2o_reported`` are read, and
    ``datetime`` is cast to millisecond resolution so that all chunks share
    one time unit before they are concatenated.

    Args:
        directory_path: Directory that contains one sub-directory per device.
        device_name: Name of the device sub-directory to read.

    Returns:
        The concatenated data of all chunks, sorted by ``datetime``.

    Raises:
        FileNotFoundError: If no parquet file matches the glob pattern.
    """
    pattern = os.path.join(directory_path, device_name, "*/*.parquet")
    # Sorted so the chunk order does not depend on the filesystem listing order.
    parquet_paths = sorted(glob.glob(pattern))

    # An empty list would otherwise only surface as an obscure concat error.
    if not parquet_paths:
        raise FileNotFoundError(f"No parquet files found for pattern: {pattern}")

    # Build lazy scans only; the data is read once on collect().
    lazy_chunks = [
        pl.scan_parquet(parquet_path)
        .select("datetime", "CO2_dry", "h2o_reported")
        .with_columns(pl.col("datetime").dt.cast_time_unit("ms"))
        for parquet_path in parquet_paths
    ]

    return pl.concat(lazy_chunks, how="diagonal").collect().sort("datetime")

In [163]:
# Load all processed data of Picarro 413 and preview its first and last sample.
df_p_413 = read_picarro_data(
    directory_path=PROCESSED_PICARRO_DATA_DIRECTORY,
    device_name="DWD_Picarro_G2301_413",
)
pl.concat([df_p_413.head(1), df_p_413.tail(1)])

datetime,CO2_dry,h2o_reported
datetime[ms],f64,f64
2023-07-01 00:00:01.150,433.462829,2.274335
2024-12-20 16:17:31.513,607.165968,-0.001279


In [164]:
# Load all processed data of Picarro 529 and preview its first and last sample.
df_p_529 = read_picarro_data(
    directory_path=PROCESSED_PICARRO_DATA_DIRECTORY,
    device_name="ICOS_Picarro_G2401_529",
)
pl.concat([df_p_529.head(1), df_p_529.tail(1)])

datetime,CO2_dry,h2o_reported
datetime[ms],f64,f64
2024-09-23 11:21:13.016,0.0,0.0
2025-03-03 15:27:57.660,446.22185,0.487782


In [165]:
# Notebook specific functions

# Empty results table: one row per calibration is appended to it later.
# Built from a schema mapping so every column is declared exactly once
# (the previous dict literal listed "slope" twice).
df = pl.DataFrame(schema={
    "datetime": pl.Datetime(),
    "Picarro ID": pl.Int64,
    "Bottle_1_Median": pl.Float64,
    "Bottle_2_Median": pl.Float64,
    "slope": pl.Float64,
    "intercept": pl.Float64,
})

# Append one calibration result to the results table
def add_row(dataframe, datetime, picarro_id, median_bottle_1, median_bottle_2, slope, intercept):
    """Return `dataframe` with one additional row stacked below it.

    The new row is built as a one-row DataFrame with the columns
    "datetime", "Picarro ID", "Bottle_1_Median", "Bottle_2_Median",
    "slope" and "intercept" (in that order) and appended with `vstack`.
    """
    column_names = (
        "datetime",
        "Picarro ID",
        "Bottle_1_Median",
        "Bottle_2_Median",
        "slope",
        "intercept",
    )
    row_values = (datetime, picarro_id, median_bottle_1, median_bottle_2, slope, intercept)

    # Each column holds a one-element list, i.e. exactly one row.
    single_row = pl.DataFrame({name: [value] for name, value in zip(column_names, row_values)})
    return dataframe.vstack(single_row)

def process_two_point_picarro_calibration(df, low_start_date, low_end_date, high_start_date, high_end_date, true_values):
    """Run a two-point calibration from two time windows of Picarro data.

    For the low window (400 ppm bottle) and the high window (600 ppm bottle)
    the "CO2_dry" samples whose "datetime" lies between the window bounds are
    passed to `process_bottle`. The two resulting values and `true_values`
    are then handed to `two_point_calibration`.

    Returns:
        Tuple of (low bottle value, high bottle value, slope, intercept).
    """
    time_ordered = df.sort("datetime")

    def _bottle_value(window_start, window_end):
        # Reduce the CO2_dry samples of one window to a single bottle value.
        in_window = time_ordered.filter(pl.col("datetime").is_between(window_start, window_end))
        samples = in_window.get_column("CO2_dry").to_list()
        return process_bottle(data=samples, ignore_len=True)

    # 400 ppm
    low_value = _bottle_value(low_start_date, low_end_date)
    # 600 ppm
    high_value = _bottle_value(high_start_date, high_end_date)

    calibration = two_point_calibration([low_value, high_value], true_values)

    return low_value, high_value, calibration["slope"], calibration["intercept"]
    
def process_calibration_times(df_p:pl.DataFrame, id:int, calibration_times: dict, df:pl.DataFrame, true_values:list[float]) -> pl.DataFrame:
    """Process every calibration of one Picarro and append the results to `df`.

    Args:
        df_p: Measurement data of the Picarro.
        id: Picarro ID; `str(id)` is the key looked up in `calibration_times`.
        calibration_times: Mapping ``{picarro id: {label: windows}}`` where each
            `windows` dict provides "low_start_date", "low_end_date",
            "high_start_date" and "high_end_date".
        df: Results table the new rows are appended to.
        true_values: True values of the two bottles, passed through to
            `process_two_point_picarro_calibration`.

    Returns:
        `df` with one row appended per calibration. Each row is stamped with
        the calibration's "high_end_date". If there are no calibrations for
        `id`, `df` is returned unchanged.
    """
    # Only the window definitions are needed; the date labels are not used.
    for windows in calibration_times[str(id)].values():
        median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(
            df_p,
            windows["low_start_date"],
            windows["low_end_date"],
            windows["high_start_date"],
            windows["high_end_date"],
            true_values,
        )

        df = add_row(df, windows["high_end_date"], id, median_bottle_1, median_bottle_2, slope, intercept)
    return df
    
def plot_calibration_time(df_p, low_start_date, low_end_date, high_start_date, high_end_date):
    """Show line plots of the low and high calibration windows.

    For each window (low first, then high) two figures are shown with
    "datetime" on the x-axis: first "h2o_reported", then "CO2_dry".
    """
    time_ordered = df_p.sort("datetime")
    window_bounds = (
        (low_start_date, low_end_date),
        (high_start_date, high_end_date),
    )

    # Select both windows up front, then plot them in order.
    window_frames = [
        time_ordered.filter(pl.col("datetime").is_between(window_start, window_end))
        for window_start, window_end in window_bounds
    ]

    #plots
    for window_frame in window_frames:
        for column in ("h2o_reported", "CO2_dry"):
            px.line(window_frame, x="datetime", y=column).show()

In [166]:
# Calibration windows per Picarro.
# Outer key: Picarro ID as a string (looked up via str(id)).
# Inner key: label of the calibration (written as a DD.MM.YYYY date).
# Each entry holds the start/end datetimes of the low-bottle window and of the
# high-bottle window used for the two-point calibration.
# NOTE(review): in the "23.10.2023" entry of "413" the high window lies before
# the low window, unlike every other entry — confirm this is intended.
calibration_times = {
    "413": { 
        "23.10.2023": {
            "low_start_date": datetime(2023, 10, 23, 13, 31, 30),
            "low_end_date": datetime(2023, 10, 23, 14, 6, 0),
            "high_start_date": datetime(2023, 10, 23, 13, 6, 0),
            "high_end_date": datetime(2023, 10, 23, 13, 31, 0)
        },
        "18.12.2023": {
            "low_start_date":datetime(2023, 12, 18, 14, 33, 0),
            "low_end_date": datetime(2023, 12, 18, 15, 2, 0),
            "high_start_date":  datetime(2023, 12, 18, 15, 4, 0),
            "high_end_date": datetime(2023, 12, 18, 15, 32, 0)
        },
        "06.05.2024":
        {
            "low_start_date": datetime(2024, 5, 6, 11, 33, 0),
            "low_end_date": datetime(2024, 5, 6, 12, 1, 0),
            "high_start_date": datetime(2024, 5, 6, 12, 2, 30),
            "high_end_date": datetime(2024, 5, 6, 12, 31, 0)
        }, 
        "04.09.2024":
        {
            "low_start_date": datetime(2024, 9, 4, 13, 0, 0),
            "low_end_date": datetime(2024, 9, 4, 13, 30, 0),
            "high_start_date": datetime(2024, 9, 4, 13, 31, 0),
            "high_end_date": datetime(2024, 9, 4, 14, 0, 0)
        },
        "24.09.2024":
        {
            "low_start_date": datetime(2024, 9, 24, 9, 11, 0),
            "low_end_date": datetime(2024, 9, 24, 9, 37, 0),
            "high_start_date": datetime(2024, 9, 24, 9, 37, 0),
            "high_end_date": datetime(2024, 9, 24, 10, 2, 0)
        },
        "03.12.2024":
        {
            "low_start_date": datetime(2024, 12, 3, 12, 42, 0),
            "low_end_date": datetime(2024, 12, 3, 13, 23, 0),
            "high_start_date": datetime(2024, 12, 3, 13, 23, 0),
            "high_end_date": datetime(2024, 12, 3, 13, 53, 0)
        },
        "20.12.2024":
        {
            "low_start_date": datetime(2024, 12, 20, 15, 10, 0),
            "low_end_date": datetime(2024, 12, 20, 15, 47, 0),
            "high_start_date": datetime(2024, 12, 20, 15, 47, 40),
            "high_end_date": datetime(2024, 12, 20, 16, 17, 0)
        }},
    "529": {
        "23.09.2024":
        {
            "low_start_date": datetime(2024, 9, 23, 12, 19, 0),
            "low_end_date": datetime(2024, 9, 23, 12, 52, 0),
            "high_start_date": datetime(2024, 9, 23, 12, 54, 0),
            "high_end_date": datetime(2024, 9, 23, 13, 22)
        },
        "21.11.2024":
        {
            "low_start_date": datetime(2024, 11, 21, 12, 26, 0),
            "low_end_date": datetime(2024, 11, 21, 12, 53, 30),
            "high_start_date": datetime(2024,11, 21, 12, 55, 0),
            "high_end_date": datetime(2024, 11, 21, 13, 23, 0)
        },
        "24.02.2025":
        {
            "low_start_date": datetime(2025, 2, 24, 12, 8, 0),
            "low_end_date": datetime(2025, 2, 24, 12, 36, 40),
            "high_start_date": datetime(2025, 2, 24, 12, 38, 0),
            "high_end_date": datetime(2025, 2, 24, 12, 55, 0)
        }}
    }


In [167]:
# Plot tool to tailor calibration times: pick one calibration and pull out its
# four window bounds for visual inspection.

id = 529
key = "24.02.2025"

low_start_date, low_end_date, high_start_date, high_end_date = [
    calibration_times[str(id)][key][bound]
    for bound in ("low_start_date", "low_end_date", "high_start_date", "high_end_date")
]

#plot_calibration_time(df_p_529, low_start_date, low_end_date, high_start_date, high_end_date)

In [168]:
# True values of the two calibration bottles, in the order [low, high].
bottle_true_values = [427.38, 610.95]

# Start from an empty copy of the results table (same schema, zero rows) so
# that re-running this cell does not append the same calibration rows again.
df = process_calibration_times(df_p=df_p_413, id=413, calibration_times=calibration_times, df=df.clear(), true_values=bottle_true_values)
df = process_calibration_times(df_p=df_p_529, id=529, calibration_times=calibration_times, df=df, true_values=bottle_true_values)

df

datetime,Picarro ID,Bottle_1_Median,Bottle_2_Median,slope,intercept
datetime[μs],i64,f64,f64,f64,f64
2023-10-23 13:31:00,413,424.60174,607.006545,1.006388,0.065919
2023-12-18 15:32:00,413,424.720504,607.182719,1.006071,0.080886
2024-05-06 12:31:00,413,424.737684,607.144954,1.006374,-0.065117
2024-09-04 14:00:00,413,424.37114,606.636933,1.007156,-0.027737
2024-09-24 10:02:00,413,424.671389,607.080922,1.006362,0.006904
2024-12-03 13:53:00,413,424.494511,606.837327,1.00673,0.028601
2024-12-20 16:17:00,413,424.70576,607.157409,1.00613,0.070976
2024-09-23 13:22:00,529,424.55278,607.022187,1.006032,0.266464
2024-11-21 13:23:00,529,424.618376,607.100354,1.005962,0.229899
2025-02-24 12:55:00,529,424.605472,607.069369,1.006062,0.200554


In [169]:
# Persist the calibration table; timestamps are stored with millisecond resolution.
output_path = os.path.join(DATA_DIRECTORY, "input", "picarro", "picarro_slope_intercept.parquet")
df_out = df.with_columns(pl.col("datetime").dt.cast_time_unit("ms"))
df_out.write_parquet(output_path)