In [1]:
# Imports & Inits

import os
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")

df_p_413 = pl.read_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "DWD_Picarro_G2301_413.parquet")) \
    .with_columns(pl.col("datetime").dt.cast_time_unit("us"))
df_p_529 = pl.read_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "ICOS_Picarro_G2401_529.parquet")) \
    .with_columns(pl.col("datetime").dt.cast_time_unit("us")) 
df_p_cal_corr = pl.read_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "picarro_slope_intercept.parquet")) \
    .with_columns(pl.col("datetime").dt.cast_time_unit("us"))

In [2]:
df_p_cal_corr

datetime,Picarro ID,Bottle_1_Median,Bottle_2_Median,slope,intercept
"datetime[μs, UTC]",i64,f64,f64,f64,f64
2023-10-23 13:31:00 UTC,413,424.60174,607.006545,1.006388,0.065919
2023-12-18 15:32:00 UTC,413,424.720504,607.182719,1.006071,0.080886
2024-05-06 12:31:00 UTC,413,424.737684,607.144954,1.006374,-0.065117
2024-09-04 14:00:00 UTC,413,424.37114,606.636933,1.007156,-0.027737
2024-09-24 10:02:00 UTC,413,424.671389,607.080922,1.006362,0.006904
2024-12-03 13:43:00 UTC,413,424.494511,606.837327,1.00673,0.028601
2024-09-23 13:22:00 UTC,529,424.55278,607.022187,1.006032,0.266464
2024-11-21 13:23:00 UTC,529,424.618376,607.100354,1.005962,0.229899


In [3]:
df_cal_corr = df_p_cal_corr.filter(pl.col("Picarro ID") == 413) \
    .sort("datetime")

df_p_413_corr = df_p_413.with_columns(pl.col("datetime").dt.replace_time_zone("UTC")) \
    .sort("datetime") \
    .join_asof(df_cal_corr, on="datetime", strategy="nearest", tolerance="10m") \
    .with_columns([
        pl.col("slope").interpolate().alias("slope_interpolated"),
        pl.col("intercept").interpolate().alias("intercept_interpolated")
        ]) \
    .with_columns([
        pl.col("slope_interpolated").forward_fill(),
        pl.col("intercept_interpolated").forward_fill()
        ]) \
    .with_columns([
        pl.col("slope_interpolated").backward_fill(),
        pl.col("intercept_interpolated").backward_fill()
        ]) \
    .with_columns(((pl.col("CO2_dry")) * pl.col("slope_interpolated") + pl.col("intercept_interpolated")).alias("picarro_corrected"))
    
# save raw file
df_p_413_corr.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_Raw_DWD_Picarro_G2301_413.parquet"))

In [4]:
# 1m
df_p_413_corr.with_columns(pl.lit("Picarro").alias("sys_name_short")) \
    .select("datetime", "sys_name_short", "picarro_corrected", "h2o_reported") \
    .rename({"datetime":"creation_timestamp"}) \
    .group_by_dynamic("creation_timestamp", every='1m') \
    .agg(pl.all().exclude(["creation_timestamp","sys_name_short"]).mean(),
            pl.col("picarro_corrected").std().alias("std")) \
    .write_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_min_DWD_Picarro_G2301_413.parquet"))
    
# 1h  
df_p_413_corr.with_columns(pl.lit("Picarro").alias("sys_name_short")) \
    .select("datetime", "sys_name_short", "picarro_corrected", "h2o_reported") \
    .rename({"datetime":"creation_timestamp"}) \
    .group_by_dynamic("creation_timestamp", every='1h') \
    .agg(pl.all().exclude(["creation_timestamp","sys_name_short"]).mean(),
            pl.col("picarro_corrected").std().alias("std")) \
    .write_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_h_DWD_Picarro_G2301_413.parquet"))

In [5]:
df_cal_corr = df_p_cal_corr.filter(pl.col("Picarro ID") == 529) \
    .sort("datetime")

df_p_529_corr = df_p_529.with_columns(pl.col("datetime").dt.replace_time_zone("UTC")) \
    .sort("datetime") \
    .join_asof(df_cal_corr, on="datetime", strategy="nearest", tolerance="10m") \
    .with_columns([
        pl.col("slope").interpolate().alias("slope_interpolated"),
        pl.col("intercept").interpolate().alias("intercept_interpolated")
        ]) \
    .with_columns([
        pl.col("slope_interpolated").forward_fill(),
        pl.col("intercept_interpolated").forward_fill()
        ]) \
    .with_columns([
        pl.col("slope_interpolated").backward_fill(),
        pl.col("intercept_interpolated").backward_fill()
        ]) \
    .with_columns(((pl.col("CO2_dry")) * pl.col("slope_interpolated") + pl.col("intercept_interpolated")).alias("picarro_corrected"))
    
# save raw file
df_p_529_corr.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_Raw_ICOS_Picarro_G2401_529.parquet"))

In [6]:
# 1m
df_p_529_corr.with_columns(pl.lit("Picarro").alias("sys_name_short")) \
    .select("datetime", "sys_name_short", "picarro_corrected", "h2o_reported") \
    .rename({"datetime":"creation_timestamp"}) \
    .group_by_dynamic("creation_timestamp", every='1m') \
    .agg(pl.all().exclude(["creation_timestamp","sys_name_short"]).mean(),
            pl.col("picarro_corrected").std().alias("std")) \
    .write_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_min_ICOS_Picarro_G2401_529.parquet"))
    
# 1h  
df_p_529_corr.with_columns(pl.lit("Picarro").alias("sys_name_short")) \
    .select("datetime", "sys_name_short", "picarro_corrected", "h2o_reported") \
    .rename({"datetime":"creation_timestamp"}) \
    .group_by_dynamic("creation_timestamp", every='1h') \
    .agg(pl.all().exclude(["creation_timestamp","sys_name_short"]).mean(),
            pl.col("picarro_corrected").std().alias("std")) \
    .write_parquet(os.path.join(DATA_DIRECTORY, "processed", "picarro", "Calibrated_1_h_ICOS_Picarro_G2401_529.parquet"))

In [7]:
# verify by plotting

from datetime import datetime, timezone, timedelta
import plotly.express as px

df_p = df_p_529_corr.with_columns(pl.lit("Picarro").alias("sys_name_short")) \
    .select("datetime", "sys_name_short", "picarro_corrected", "h2o_reported") \
    .rename({"datetime":"creation_timestamp"}) \
    .group_by_dynamic("creation_timestamp", every='1h') \
    .agg(pl.all().exclude(["creation_timestamp","sys_name_short"]).mean(),
            pl.col("picarro_corrected").std().alias("std")) \
    .with_columns((pl.col("creation_timestamp") + timedelta(minutes=30)))
    
df_1h = pl.read_parquet(os.path.join(DATA_DIRECTORY,"processed", "pipeline","flagged_1_h_acropolis.parquet")) \
    .filter(pl.col("system_id") == 2) \
    .filter(pl.col("Flag") == 'O') \


start_date = datetime(2024, 10, 14, 8, 0, 0).replace(tzinfo=timezone.utc)
end_date = datetime(2024, 11, 21, 9, 59, 59).replace(tzinfo=timezone.utc)


df_plot = df_1h.join(df_p.select("creation_timestamp", "picarro_corrected"), on = ["creation_timestamp"], how= "left") \
    .with_columns(diff = pl.col("gmp343_corrected") - pl.col("picarro_corrected")) \
    .filter(pl.col("creation_timestamp").is_between(start_date, end_date))
    
fig = px.scatter(df_plot, x="creation_timestamp", y = ["gmp343_corrected", "picarro_corrected"])
fig.show()

fig = px.scatter(df_plot, x="creation_timestamp", y = "diff")
fig.show()

  df_plot = df_1h.join(df_p.select("creation_timestamp", "picarro_corrected"), on = ["creation_timestamp"], how= "left") \


In [8]:
import numpy as np

df_plot.select((np.square(pl.col("diff")).mean()).alias("mse_full_deployment")) \
        .with_columns(pl.col("mse_full_deployment").sqrt().alias("rmse_full_deployment")) 

mse_full_deployment,rmse_full_deployment
f64,f64
0.39602,0.629301
