In [21]:
import os
import glob
from datetime import datetime
from datetime import timezone
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px

# Root directories are taken from environment variables so that the notebook
# contains no machine-specific paths.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")

# Every path below is built from DATA_DIRECTORY; without this check an unset
# variable only surfaces as an opaque TypeError inside os.path.join.
if DATA_DIRECTORY is None:
    raise RuntimeError("Environment variable DATA_DIRECTORY is not set")

# load calibration bottle concentrations (preprocessed)
df_gas = pl.read_csv(os.path.join(DATA_DIRECTORY, "input", "averaged_gases.csv"))
# load local db: measurements.parquet
# scan_parquet is lazy: nothing is read until the query is collected.
df = pl.scan_parquet(os.path.join(DATA_DIRECTORY, "download", "measurements.parquet"))

# Download to local db

In [22]:
# The measurements are downloaded from the Hermes database in a separate notebook.
# Run the Download/download_from_hermes notebook before running this one.

# Process Calibration Data

In [23]:
# define functions

def average_bottle(data):
    """Average the tail of one calibration-bottle measurement series.

    The number of samples decides which bottle of the calibration sequence the
    series is treated as, and with it how much of the beginning is discarded
    before averaging. The last 5 % of the samples are always discarded.
    NOTE(review): the leading samples are presumably dropped because the
    reading has not settled yet -- confirm.

    * more than 50 and fewer than 70 samples ("2nd bottle"): first 30 % skipped
    * 70 up to, but excluding, 130 samples ("1st bottle"): first 50 % skipped

    A series of exactly 70 samples used to match neither range and was
    discarded; it is now handled as a "1st bottle" series.

    Parameters
    ----------
    data
        Object exposing ``to_list()`` that yields the numeric samples.

    Returns
    -------
    float
        Mean of the retained samples, or ``0.0`` when the sample count fits
        neither range. Callers use ``0.0`` as the "no valid calibration" marker.
    """
    samples = data.to_list()
    n = len(samples)

    if 50 < n < 70:
        # 2nd bottle
        start = int(n * 0.3)
    elif 70 <= n < 130:
        # 1st bottle
        start = int(n * 0.5)
    else:
        return 0.0

    window = samples[start:int(n * 0.95)]
    return sum(window) / len(window)
    
def two_point_calibration(measured_values, true_values):
    """Fit the straight line that maps two measured values onto their true values.

    The returned parameters satisfy ``y_true = slope * y_meas + intercept``
    for both supplied points.

    Parameters
    ----------
    measured_values
        Sequence of exactly two measured values.
    true_values
        Sequence of exactly two reference values, in the same order.

    Returns
    -------
    tuple
        ``(slope, intercept)``.

    Raises
    ------
    ValueError
        If either sequence does not contain exactly two entries.
    ZeroDivisionError
        If both measured values are identical.
    """
    for values in (measured_values, true_values):
        if len(values) != 2:
            raise ValueError("Both measured_values and true_values must have length 2")

    meas_first, meas_second = measured_values[0], measured_values[1]
    true_first, true_second = true_values[0], true_values[1]

    # Gain is the ratio of the true span to the measured span.
    gain = (true_second - true_first) / (meas_second - meas_first)
    # Offset anchors the line at the first calibration point.
    offset = true_first - gain * meas_first

    return gain, offset

def _reference_co2(bottle_id):
    """Return the ``CO2_dry`` value of the first ``df_gas`` row whose ``Bottle_ID`` matches."""
    return df_gas.filter(pl.col("Bottle_ID") == bottle_id)["CO2_dry"][0]


def _calibration_parameters(meas_low, meas_high, id_low, id_high):
    """Return ``(slope, intercept)`` for one calibration, or ``None`` if a measurement is missing."""
    if meas_low is None or meas_high is None:
        return None

    bottles_meas = [meas_low, meas_high]
    bottles_true = [_reference_co2(id_low), _reference_co2(id_high)]

    return two_point_calibration(bottles_meas, bottles_true)


def calc_slope(meas_low, meas_high, id_low, id_high):
    """Slope of the two-point calibration for one pair of bottle measurements.

    Parameters
    ----------
    meas_low, meas_high
        Averaged sensor readings of the low and high span bottle; ``None``
        when a bottle has no reading.
    id_low, id_high
        Bottle ids used to look up the reference ``CO2_dry`` values in the
        module-level ``df_gas`` table.

    Returns
    -------
    The slope from ``two_point_calibration``, or ``None`` if either
    measurement is ``None``.
    """
    parameters = _calibration_parameters(meas_low, meas_high, id_low, id_high)
    if parameters is None:
        return None
    return parameters[0]


def calc_intercept(meas_low, meas_high, id_low, id_high):
    """Intercept of the two-point calibration for one pair of bottle measurements.

    Takes the same arguments as ``calc_slope``.

    Returns
    -------
    The intercept from ``two_point_calibration``, or ``None`` if either
    measurement is ``None``.
    """
    parameters = _calibration_parameters(meas_low, meas_high, id_low, id_high)
    if parameters is None:
        return None
    return parameters[1]

In [24]:
# Bottle means below this value are treated as the low span bottle, means
# above it as the high span bottle.
# NOTE(review): presumably ppm CO2 -- confirm against the bottle concentrations.
SPAN_SPLIT = 600

# Collect, per day, system and calibration bottle, all non-null filtered readings.
cal_readings = (
    df.groupby([
        pl.col("creation_timestamp").dt.date(),
        pl.col("system_name"),
        pl.col("cal_bottle_id"),
    ])
    .agg([pl.col("cal_gmp343_filtered").drop_nulls()])
    .filter(pl.col("cal_bottle_id") > 0)
)

# perform averaging
cal_means = cal_readings.with_columns(
    pl.col("cal_gmp343_filtered").apply(lambda x: average_bottle(x)).alias("mean_cal")
)

# identify low and high span bottle
# average_bottle returns 0.0 for a series it rejects. Such a row must not count
# as a low span bottle, otherwise its bottle id is added into the summed
# id_cal_bottle_low below and the reference lookup uses a wrong id.
is_low = (pl.col("mean_cal") > 0.0) & (pl.col("mean_cal") < SPAN_SPLIT)
is_high = pl.col("mean_cal") > SPAN_SPLIT

cal_daily = (
    cal_means.with_columns([
        pl.when(is_low).then(pl.col("mean_cal")).otherwise(None).alias("mean_cal_low"),
        pl.when(is_high).then(pl.col("mean_cal")).otherwise(None).alias("mean_cal_high"),
        pl.when(is_low).then(pl.col("cal_bottle_id")).otherwise(None).alias("id_cal_bottle_low"),
        pl.when(is_high).then(pl.col("cal_bottle_id")).otherwise(None).alias("id_cal_bottle_high"),
    ])
    # Collapse the per-bottle rows of one day and system into a single row.
    # The sums only equal the bottle values when exactly one low and one high
    # bottle are present per day and system.
    .groupby([pl.col("creation_timestamp").dt.date(), pl.col("system_name")])
    .agg([
        pl.col("mean_cal_low").sum(),
        pl.col("mean_cal_high").sum(),
        pl.col("id_cal_bottle_low").sum(),
        pl.col("id_cal_bottle_high").sum(),
    ])
    # filter for days that have a valid calibration for both bottles
    .sort(pl.col("creation_timestamp"))
    .filter(pl.col("mean_cal_low") > 0.0)
    .filter(pl.col("mean_cal_high") > 0.0)
)

# calculate slope and intercept
cal_inputs = pl.struct(["mean_cal_low", "mean_cal_high", "id_cal_bottle_low", "id_cal_bottle_high"])

dfg = (
    cal_daily
    .with_columns(
        cal_inputs.apply(
            lambda x: calc_slope(
                x["mean_cal_low"], x["mean_cal_high"], x["id_cal_bottle_low"], x["id_cal_bottle_high"]
            )
        ).alias("slope")
    )
    .with_columns(
        cal_inputs.apply(
            lambda x: calc_intercept(
                x["mean_cal_low"], x["mean_cal_high"], x["id_cal_bottle_low"], x["id_cal_bottle_high"]
            )
        ).alias("intercept")
    )
    .select(["creation_timestamp", "system_name", "slope", "intercept"])
    .rename({"creation_timestamp": "date"})
    .collect()
)
dfg.tail()

date,system_name,slope,intercept
date,str,f64,f64
2023-11-09,"""tum-esm-midcos…",1.045115,-10.468422
2023-11-09,"""tum-esm-midcos…",1.063665,-21.265094
2023-11-09,"""tum-esm-midcos…",1.045106,-3.516906
2023-11-09,"""tum-esm-midcos…",0.99701,-27.60323
2023-11-09,"""tum-esm-midcos…",0.967431,4.625379


In [25]:
# save the per-day slope/intercept table to parquet
slope_intercept_path = os.path.join(DATA_DIRECTORY, "processed", "slope_intercept_acropolis.parquet")
dfg.write_parquet(slope_intercept_path)

In [27]:
# visualise slope and intercept results: one line chart per quantity,
# with one coloured trace per system
x = dfg.select("date", "slope","intercept","system_name")

for quantity in ("slope", "intercept"):
    fig = px.line(x, x="date", y=quantity, color="system_name", markers=True, title=quantity)
    fig.show()

# Process Picarro Data

In [29]:
# Picarro *.dat files are expected two directory levels below PICARRO_DATA_DIRECTORY.
# Sorting makes the read order deterministic.
filenames = sorted(glob.glob(os.path.join(PICARRO_DATA_DIRECTORY, "*", "*", "*.dat")))
if not filenames:
    # pd.concat would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError(f"No Picarro *.dat files found below {PICARRO_DATA_DIRECTORY!r}")

# read all *.dat picarro measurement files and add to single db
# The separator is a regular expression (any run of whitespace); it has to be a
# raw string because "\s" is not a valid Python string escape.
df_list = [pd.read_csv(filename, sep=r'\s+') for filename in filenames]

big_frame = pd.concat(df_list, ignore_index=True)
# DATE and TIME are separate text columns; join them into one timestamp.
big_frame["datetime"] = pd.to_datetime(big_frame['DATE'] + ' ' + big_frame['TIME'])
big_frame = big_frame.sort_values(by='datetime')

big_frame.to_parquet(path = os.path.join(DATA_DIRECTORY, "input", "picarro.parquet"))

#Calibration

# TODO: Add ability for multiple calibration dates
# before 23.10
# picarro_slope = 1.0061589132696314
# picarro_intercept = 0.14607153970888476

# after 23.10
picarro_slope = 1.006374633215469
picarro_intercept = 0.0709482571842841


def load_corrected_picarro(every, slope, intercept):
    """Return the calibration-corrected Picarro CO2 series averaged over fixed windows.

    Reads ``input/picarro.parquet`` below ``DATA_DIRECTORY``, applies
    ``CO2_corr = CO2_dry * slope + intercept`` and averages ``CO2_corr`` over
    windows of length ``every``.

    Parameters
    ----------
    every
        Window length passed to ``groupby_dynamic``, e.g. ``"1h"`` or ``"10m"``.
    slope, intercept
        Linear correction applied to ``CO2_dry``.

    Returns
    -------
    polars.DataFrame
        Columns ``creation_timestamp``, ``CO2_corr``, ``slope``, ``intercept``,
        ``system_name`` (always ``"Picarro"``) and ``diff`` (always ``0.0``).
    """
    return (
        pl.scan_parquet(os.path.join(DATA_DIRECTORY, "input", "picarro.parquet"))
        # The stored timestamps carry no time zone; they are labelled as UTC here.
        # NOTE(review): assumes the Picarro clock runs on UTC -- confirm.
        .with_columns(
            pl.col("datetime")
            .dt.cast_time_unit("us")
            .dt.replace_time_zone("UTC")
            .alias("creation_timestamp")
        )
        .sort("creation_timestamp")
        .with_columns((pl.col("CO2_dry") * slope + intercept).alias("CO2_corr"))
        .groupby_dynamic("creation_timestamp", every=every)
        # Only CO2_corr is kept below, so it is the only column averaged.
        .agg(pl.col("CO2_corr").mean())
        .collect()
        .select(["creation_timestamp", "CO2_corr"])
        .with_columns([
            pl.lit(slope).alias("slope"),
            pl.lit(intercept).alias("intercept"),
            pl.lit("Picarro").alias("system_name"),
            pl.lit(0.0).alias("diff"),
        ])
    )


# 1h and 10m averaged corrected Picarro datasets
for every in ("1h", "10m"):
    df_p = load_corrected_picarro(every, picarro_slope, picarro_intercept)
    df_p.write_parquet(os.path.join(DATA_DIRECTORY, "processed", f"{every}_cal_corr_picarro.parquet"))