In [34]:
# Imports & Inits

from datetime import datetime
from datetime import timezone
import polars as pl
import os
import plotly.express as px
import sys

module_path = os.path.abspath(os.path.join("..", ".."))

if module_path not in sys.path:
    sys.path.append(module_path)

from functions.calibration_processing import two_point_calibration, process_bottle

DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")

df_p_413 = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "input", "picarro", "DWD_Picarro_G2301_413.parquet")) 
df_p_529 = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "input", "picarro", "ICOS_Picarro_G2401_529.parquet")) 

In [35]:
# Initialize the DataFrame with the specified columns
df = pl.DataFrame({
    "datetime": pl.Series([], dtype=pl.Datetime(time_zone="UTC")),
    "Picarro ID": pl.Series([], dtype=pl.Int64),
    "Bottle_1_Median": pl.Series([], dtype=pl.Float64),
    "Bottle_2_Median": pl.Series([], dtype=pl.Float64),
    "slope": pl.Series([], dtype=pl.Float64),
    "slope": pl.Series([], dtype=pl.Float64),
    "intercept": pl.Series([], dtype=pl.Float64)
})

# Function to add a new row
def add_row(dataframe, datetime, picarro_id, median_bottle_1, median_bottle_2, slope, intercept):
    new_row = pl.DataFrame({
        "datetime": [datetime],
        "Picarro ID": [picarro_id],
        "Bottle_1_Median": [median_bottle_1],
        "Bottle_2_Median": [median_bottle_2],
        "slope": [slope],
        "intercept": [intercept]
    })
    return dataframe.vstack(new_row)

In [36]:
df_p_413.head(3).collect()

DATE,TIME,FRAC_DAYS_SINCE_JAN1,FRAC_HRS_SINCE_JAN1,JULIAN_DAYS,EPOCH_TIME,ALARM_STATUS,INST_STATUS,CavityPressure,CavityTemp,DasTemp,EtalonTemp,species,OutletValve,CH4,CH4_dry,CO2,CO2_dry,h2o_reported,ch4_base,ch4_pzt_std,co2_base,co2_pzt_std,wlm1_offset,wlm2_offset,datetime,__index_level_0__
str,str,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns],i64
"""2023-07-01""","""00:00:01.150""",181.000013,4344.000319,182.000013,1688200000.0,0,963,140.012831,44.99987,40.375,45.15374,2.0,22129.628477,1.997874,2.046012,421.023033,433.462829,2.274335,1181.940043,82.55631,1084.999673,78.542541,-0.074797,-0.045017,2023-07-01 00:00:01.150,25156741
"""2023-07-01""","""00:00:02.069""",181.000024,4344.000575,182.000024,1688200000.0,0,963,140.004444,44.99987,40.375,45.15374,1.0,22128.306954,1.997874,2.046012,421.041592,433.462829,2.274335,1181.940043,82.55631,1084.974592,78.542541,-0.074794,-0.045017,2023-07-01 00:00:02.069,25156742
"""2023-07-01""","""00:00:02.579""",181.00003,4344.000717,182.00003,1688200000.0,0,963,139.998415,44.99987,40.375,45.15374,3.0,22127.101788,1.997874,2.046037,421.041592,433.454005,2.268414,1181.940043,82.55631,1084.974592,78.542541,-0.074794,-0.045017,2023-07-01 00:00:02.579,25156743


In [37]:
def process_two_point_picarro_calibration(df, low_start_date, low_end_date, high_start_date, high_end_date, plot:bool=False):

    df_p_400 = df.select(pl.col("datetime").dt.cast_time_unit("ns").dt.replace_time_zone("UTC").alias("creation_timestamp"),
                    pl.col("CO2_dry"), 
                    pl.col("h2o_reported")) \
                    .sort("creation_timestamp").filter(pl.col("creation_timestamp").is_between(low_start_date, low_end_date)) \
                    .collect()


    df_p_600 = df.select(pl.col("datetime").dt.cast_time_unit("ns").dt.replace_time_zone("UTC").alias("creation_timestamp"),
                    pl.col("CO2_dry"), 
                    pl.col("h2o_reported")) \
                    .sort("creation_timestamp").filter(pl.col("creation_timestamp").is_between(high_start_date, high_end_date)) \
                    .collect()


    measured_values = [None, None]
    true_values = [427.38, 610.95]

    # 400 ppm
    data = df_p_400.select(pl.col("CO2_dry")).to_series().to_list()
    measured_values[0] = process_bottle(data=data, ignore_len=True)

    # 600 ppm
    data = df_p_600.select(pl.col("CO2_dry")).to_series().to_list()
    measured_values[1] = process_bottle(data=data, ignore_len=True)

    print(measured_values)

    result = two_point_calibration(measured_values, true_values)
    picarro_slope = result["slope"] 
    picarro_intercept = result["intercept"] 
    
    print(picarro_slope, picarro_intercept)
    
    if plot:
        #plots
        fig = px.line(df_p_400, x="creation_timestamp", y="h2o_reported")
        fig.show()

        fig = px.line(df_p_400, x="creation_timestamp", y="CO2_dry")
        fig.show()

        fig = px.line(df_p_600, x="creation_timestamp", y="h2o_reported")
        fig.show()

        fig = px.line(df_p_600, x="creation_timestamp", y="CO2_dry")
        fig.show()
        
    return (measured_values[0], measured_values[1], picarro_slope, picarro_intercept)

In [38]:
# 23.10.2023

low_start_date = datetime(2023, 10, 23, 13, 31, 30).replace(tzinfo=timezone.utc)
low_end_date = datetime(2023, 10, 23, 14, 6, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2023, 10, 23, 13, 6, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2023, 10, 23, 13, 31, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_413, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2023, 10, 23, 13, 31, 0).replace(tzinfo=timezone.utc), 413 , median_bottle_1, median_bottle_2, slope, intercept)

[424.60174029, 607.0065449]
1.006387964354839 0.06591892802487109


In [39]:
# 18.12.2023

low_start_date = datetime(2023, 12, 18, 14, 33, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2023, 12, 18, 15, 2, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2023, 12, 18, 15, 4, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2023, 12, 18, 15, 32, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_413, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2023, 12, 18, 15, 32, 0).replace(tzinfo=timezone.utc), 413 , median_bottle_1, median_bottle_2, slope, intercept)

[424.7205036, 607.18271856]
1.0060713120261249 0.08088569875155827


In [40]:
# 06.05.2024

low_start_date = datetime(2024, 5, 6, 11, 33, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2024, 5, 6, 12, 1, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2024, 5, 6, 12, 2, 30).replace(tzinfo=timezone.utc)
high_end_date = datetime(2024, 5, 6, 12, 31, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_413, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2024, 5, 6, 12, 31, 0).replace(tzinfo=timezone.utc), 413 , median_bottle_1, median_bottle_2, slope, intercept)

[424.73768416, 607.1449536]
1.0063743652518329 -0.06511729505348285


In [41]:
# 04.09.2024: DWD Picarro

low_start_date = datetime(2024, 9, 4, 13, 00, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2024, 9, 4, 13, 30, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2024, 9, 4, 13, 31, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2024, 9, 4, 14, 00, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_413, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2024, 9, 4, 14, 00, 0).replace(tzinfo=timezone.utc), 413 , median_bottle_1, median_bottle_2, slope, intercept)

[424.37113967, 606.63693264]
1.007155522760185 -0.02773701867431555


In [42]:
# 24.09.2024

low_start_date = datetime(2024, 9, 24, 9, 11, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2024, 9, 24, 9, 37, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2024, 9, 24, 9, 37, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2024, 9, 24, 10, 2, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_413, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2024, 9, 24, 10, 2, 0).replace(tzinfo=timezone.utc), 413 , median_bottle_1, median_bottle_2, slope, intercept)

[424.67138877, 607.08092176]
1.006361876986241 0.006904095069103278


In [43]:
# 03.12.2024

low_start_date = datetime(2024, 12, 3, 12, 42, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2024, 12, 3, 13, 23, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2024, 12, 3, 13, 23, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2024, 12, 3, 13, 53, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_413, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2024, 12, 3, 13, 43, 0).replace(tzinfo=timezone.utc), 413 , median_bottle_1, median_bottle_2, slope, intercept)

[424.49451092, 606.8373265800001]
1.0067300942763118 0.028601001731544784


In [44]:
# 20.12.2024 DWD Picarro

low_start_date = datetime(2024, 12, 20, 15, 10, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2024, 12, 20, 15, 47, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2024, 12, 20, 15, 47, 40).replace(tzinfo=timezone.utc)
high_end_date = datetime(2024, 12, 20, 16, 17, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_413, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2024, 12, 20, 17, 17, 0).replace(tzinfo=timezone.utc), 413 , median_bottle_1, median_bottle_2, slope, intercept)

[424.70575989, 607.15740946]
1.006129571492698 0.07097579139355048


# ICOS Picarro

In [45]:
# 23.09.2024: ICOS Picarro

low_start_date = datetime(2024, 9, 23, 12, 19, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2024, 9, 23, 12, 52, 0).replace(tzinfo=timezone.utc)

high_start_date = datetime(2024, 9, 23, 12, 54, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2024, 9, 23, 13, 22, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_529, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2024, 9, 23, 13, 22, 0).replace(tzinfo=timezone.utc), 529 , median_bottle_1, median_bottle_2, slope, intercept)

[424.552779525, 607.02218673]
1.0060316565492182 0.2664639218892262


In [46]:
# 21.11.2024: ICOS Picarro

low_start_date = datetime(2024, 11, 21, 12, 26, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2024, 11, 21, 12, 53, 30).replace(tzinfo=timezone.utc)

high_start_date = datetime(2024,11, 21, 12, 55, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2024, 11, 21, 13, 23, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_529, low_start_date, low_end_date, high_start_date, high_end_date)

df = add_row(df, datetime(2024, 11, 21, 13, 23, 0).replace(tzinfo=timezone.utc), 529 , median_bottle_1, median_bottle_2, slope, intercept)

[424.61837593, 607.10035397]
1.0059623529495145 0.22989944385568606


In [52]:
# 24.02.2025: ICOS Picarro

low_start_date = datetime(2025, 2, 24, 12, 8, 0).replace(tzinfo=timezone.utc)
low_end_date = datetime(2025, 2, 24, 12, 36, 40).replace(tzinfo=timezone.utc)

high_start_date = datetime(2025, 2, 24, 12, 38, 0).replace(tzinfo=timezone.utc)
high_end_date = datetime(2025, 2, 24, 12, 55, 0).replace(tzinfo=timezone.utc)

median_bottle_1, median_bottle_2, slope, intercept = process_two_point_picarro_calibration(df_p_529, low_start_date, low_end_date, high_start_date, high_end_date, plot=True)

df = add_row(df, datetime(2025, 2, 24, 12, 55, 0).replace(tzinfo=timezone.utc), 529 , median_bottle_1, median_bottle_2, slope, intercept)

[424.60547195, 607.06936922]
1.006062036087957 0.20055435589506487


In [53]:
df

datetime,Picarro ID,Bottle_1_Median,Bottle_2_Median,slope,intercept
"datetime[μs, UTC]",i64,f64,f64,f64,f64
2023-10-23 13:31:00 UTC,413,424.60174,607.006545,1.006388,0.065919
2023-12-18 15:32:00 UTC,413,424.720504,607.182719,1.006071,0.080886
2024-05-06 12:31:00 UTC,413,424.737684,607.144954,1.006374,-0.065117
2024-09-04 14:00:00 UTC,413,424.37114,606.636933,1.007156,-0.027737
2024-09-24 10:02:00 UTC,413,424.671389,607.080922,1.006362,0.006904
2024-12-03 13:43:00 UTC,413,424.494511,606.837327,1.00673,0.028601
2024-12-20 17:17:00 UTC,413,424.70576,607.157409,1.00613,0.070976
2024-09-23 13:22:00 UTC,529,424.55278,607.022187,1.006032,0.266464
2024-11-21 13:23:00 UTC,529,424.618376,607.100354,1.005962,0.229899
2025-02-24 12:55:00 UTC,529,424.605472,607.069369,1.006062,0.200554


In [56]:
df.write_parquet(os.path.join(DATA_DIRECTORY, "input", "picarro", "picarro_slope_intercept.parquet"))