In [None]:
import os
import glob
from datetime import datetime
from datetime import timezone
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px

# Data locations. Each path can be overridden through an environment variable,
# so the notebook is not tied to one machine's directory layout. When a variable
# is unset the original hard-coded location is used, so existing setups keep
# working unchanged.
quickflow_directory = os.environ.get(
    "ACROPOLIS_QUICKFLOW_DIR",
    "/Users/patrickaigner/Documents/PROJECTS/ACROPOLIS/Software/quickflow/acropolis/",
)
local_path = os.environ.get("ACROPOLIS_LOCAL_DATA_DIR", "../data/")
picarro_path = os.environ.get(
    "ACROPOLIS_PICARRO_DIR",
    "/Users/patrickaigner/Documents/PROJECTS/ACROPOLIS/Database/PICARRO",
)

# Download to local db

In [None]:
# download from hermes database

# Perform Pivot on local db

In [None]:
# perform pivot on downloaded hermes database: every distinct `attribute`
# becomes its own column holding the matching `value`
# TODO: perform the pivot during the download to save memory space

measurements_path = os.path.join(quickflow_directory, "measurements.parquet")
pivot_path = os.path.join(local_path, "pivot_measurements.parquet")
pivot_index = ["system_name", "revision", "creation_timestamp", "receipt_timestamp"]

# the long frame is deliberately not bound to a name, only the pivoted result is kept
df_parq = (
    pl.scan_parquet(measurements_path)
    .collect()
    .pivot(
        values="value",
        index=pivot_index,
        columns="attribute",
        aggregate_function="first",  # keep the first value when an index/attribute pair repeats
    )
)

df_parq.write_parquet(pivot_path, statistics=True)

# release the reference to the pivoted frame once it is on disk
df_parq = None

# Process Calibration Data

In [None]:
# join old and new db

calibration_columns = ["creation_timestamp", "system_name", "cal_gmp343_filtered", "cal_bottle_id"]


def _scan_calibration_columns(parquet_name):
    """Lazily scan a parquet file in `local_path`, keeping only the calibration columns.

    `creation_timestamp` is cast to microsecond resolution so that frames from
    both databases share one dtype and can be concatenated.
    """
    return (
        pl.scan_parquet(os.path.join(local_path, parquet_name))
        .select(calibration_columns)
        .with_columns(pl.col("creation_timestamp").dt.cast_time_unit("us"))
    )


df_new = _scan_calibration_columns("pivot_measurements.parquet")
# old db was preprocessed and transformed to match the columns of the pivot(new_db)
df_old = _scan_calibration_columns("old_db_renamed_measurements.parquet")

df = pl.concat([df_new, df_old]).sort("creation_timestamp")

In [None]:
# split calibration readings into calibration bottles
# currently the split is performed by CO2 concentration as 400 and 800ppm is far apart
# TODO: think about splitting by bottle id

split_ppm = 600
cal_reading = pl.col("cal_gmp343_filtered")
cal_bottle = pl.col("cal_bottle_id")
# both comparisons are strict: a reading of exactly `split_ppm` ends up in neither group
below_split = cal_reading < split_ppm
above_split = cal_reading > split_ppm

df = df.with_columns(
    [
        pl.when(below_split).then(cal_reading).otherwise(None).alias("cal_400"),
        pl.when(above_split).then(cal_reading).otherwise(None).alias("cal_800"),
        pl.when(below_split).then(cal_bottle).otherwise(None).alias("cal_bottle_id_400"),
        pl.when(above_split).then(cal_bottle).otherwise(None).alias("cal_bottle_id_800"),
    ]
)

In [None]:
# group by day

# one row per (calendar day, system): the readings of each bottle are collected
# into a list, and the bottle id is added back for later processing
day_and_system = [pl.col("creation_timestamp").dt.date(), pl.col("system_name")]

daily_aggregates = [
    pl.col("cal_400").drop_nulls(),
    pl.col("cal_800").drop_nulls(),
    # NOTE(review): the median assumes a single bottle id per day and system;
    # with mixed ids it may yield a value that is not a real id - confirm
    pl.col("cal_bottle_id_400").drop_nulls().median(),
    pl.col("cal_bottle_id_800").drop_nulls().median(),
]

dfg = df.groupby(day_and_system).agg(daily_aggregates)

In [None]:
# perform averaging

def average_bottle(data):
    """Average the trailing part of one bottle's calibration readings.

    The number of readings decides which part of the sequence is averaged:

    * more than 50 and fewer than 70 readings (2nd bottle): the slice from
      30 % to 95 % of the sequence,
    * 70 up to (but excluding) 130 readings (1st bottle): the slice from
      50 % to 95 % of the sequence.

    A sequence of exactly 70 readings used to match neither range and was
    silently reported as invalid; it is now handled by the 1st-bottle branch.

    Args:
        data: object exposing ``to_list()`` that returns the readings in order.

    Returns:
        The mean of the selected slice, or ``0.0`` when the number of readings
        is outside both ranges. The notebook later keeps only rows whose mean
        is ``> 0.0``, so ``0.0`` acts as the "no valid calibration" marker.
    """
    readings = data.to_list()
    count = len(readings)

    if 50 < count < 70:
        # 2nd bottle
        start_fraction = 0.3
    elif 70 <= count < 130:
        # 1st bottle
        start_fraction = 0.5
    else:
        return 0.0

    # the last 5 % of the readings are always left out
    window = readings[int(count * start_fraction):int(count * 0.95)]
    return sum(window) / len(window)

# replace each day's list of readings by its trimmed mean; the bottle ids pass through
dfg = dfg.select(
    [
        pl.col("creation_timestamp"),
        pl.col("system_name"),
        pl.col("cal_400").apply(average_bottle).alias("mean_cal_400"),
        pl.col("cal_800").apply(average_bottle).alias("mean_cal_800"),
        pl.col("cal_bottle_id_400"),
        pl.col("cal_bottle_id_800"),
    ]
)

dfg.tail().collect()

In [None]:
# calculate slope and intercept

# per-bottle gas values (looked up by Bottle_ID / CO2_dry further down);
# this file was preprocessed on recorded lab data
averaged_gases_csv = os.path.join(local_path, "averaged_gases.csv")
df_gas = pl.read_csv(averaged_gases_csv)

def two_point_calibration(measured_values, true_values):
    """Fit the straight line ``y_true = slope * y_meas + intercept`` through two points.

    Args:
        measured_values: the two measured values.
        true_values: the two matching true values, in the same order.

    Returns:
        Tuple ``(slope, intercept)``.

    Raises:
        ValueError: if either argument does not have exactly two entries.
    """
    if any(len(values) != 2 for values in (measured_values, true_values)):
        raise ValueError("Both measured_values and true_values must have length 2")

    # slope is the ratio of the true span to the measured span
    true_span = true_values[1] - true_values[0]
    measured_span = measured_values[1] - measured_values[0]
    slope = true_span / measured_span

    # the line has to pass through the first point: y_true = m * y_meas + t
    intercept = true_values[0] - slope * measured_values[0]

    return slope, intercept

def _bottle_true_value(bottle_id):
    """Return the first `CO2_dry` entry of `df_gas` whose `Bottle_ID` equals `bottle_id`."""
    return df_gas.filter(pl.col("Bottle_ID") == bottle_id)["CO2_dry"][0]


def _calibration_parameters(meas_400, meas_800, id_400, id_800):
    """Shared implementation of `calc_slope` and `calc_intercept`.

    Returns `(slope, intercept)` from `two_point_calibration`, or `None` when
    either measured value is missing.
    """
    if meas_400 is None or meas_800 is None:
        return None

    bottles_meas = [meas_400, meas_800]
    bottles_true = [_bottle_true_value(id_400), _bottle_true_value(id_800)]

    return two_point_calibration(bottles_meas, bottles_true)


def calc_slope(meas_400, meas_800, id_400, id_800):
    """Slope of the two-point calibration for one day.

    Args:
        meas_400: averaged reading of the bottle in the "400" group, or None.
        meas_800: averaged reading of the bottle in the "800" group, or None.
        id_400: bottle id used to look up the true value for `meas_400` in `df_gas`.
        id_800: bottle id used to look up the true value for `meas_800` in `df_gas`.

    Returns:
        The slope, or None when either measured value is None.
    """
    parameters = _calibration_parameters(meas_400, meas_800, id_400, id_800)
    return None if parameters is None else parameters[0]


def calc_intercept(meas_400, meas_800, id_400, id_800):
    """Intercept of the two-point calibration for one day.

    Takes the same arguments as `calc_slope`.

    Returns:
        The intercept, or None when either measured value is None.
    """
    parameters = _calibration_parameters(meas_400, meas_800, id_400, id_800)
    return None if parameters is None else parameters[1]

# filter for days that have a valid calibration for both bottles
# (average_bottle reports 0.0 when the number of readings is out of range)
dfg = (
    dfg.sort(pl.col("creation_timestamp"))
    .filter(pl.col("mean_cal_400") > 0.0)
    .filter(pl.col("mean_cal_800") > 0.0)
)

# struct fields, listed in the positional order calc_slope / calc_intercept expect
calibration_inputs = ["mean_cal_400", "mean_cal_800", "cal_bottle_id_400", "cal_bottle_id_800"]

# calculate slope
dfg = dfg.with_columns(
    pl.struct(calibration_inputs)
    .apply(lambda row: calc_slope(*(row[field] for field in calibration_inputs)))
    .alias("slope")
)

# calculate intercept
dfg = dfg.with_columns(
    pl.struct(calibration_inputs)
    .apply(lambda row: calc_intercept(*(row[field] for field in calibration_inputs)))
    .alias("intercept")
)

dfg.tail().collect()

In [None]:
# save results to parquet

calibration_correction_path = os.path.join(local_path, "calibration_correction.parquet")
dfg.collect().write_parquet(calibration_correction_path)


In [None]:
# visualise slope and intercept results

x = dfg.select("creation_timestamp", "slope", "intercept", "system_name").collect()

# one line chart per calibration parameter, one trace per system
for metric in ("slope", "intercept"):
    fig = px.line(
        x,
        x="creation_timestamp",
        y=metric,
        markers=True,
        title=metric,
        color="system_name",
    )
    fig.show()

# Process Picarro Data

In [None]:
# all *.dat picarro measurement files two directory levels below picarro_path;
# sorted so the files are always read in the same order
filenames = sorted(glob.glob(picarro_path + "/*/*/*.dat"))
if not filenames:
    # fail with the searched location instead of pandas' generic
    # "No objects to concatenate" error
    raise FileNotFoundError(f"no Picarro *.dat files found below {picarro_path}")

# read all *.dat picarro measurement files and add to single db
# (raw string: "\s" is not a valid escape sequence in a normal string literal)
df_list = [pd.read_csv(filename, sep=r"\s+") for filename in filenames]

big_frame = pd.concat(df_list, ignore_index=True)
# build one timestamp column out of the separate DATE and TIME text columns
big_frame["datetime"] = pd.to_datetime(big_frame["DATE"] + " " + big_frame["TIME"])
big_frame = big_frame.sort_values(by="datetime")

big_frame.to_parquet(path=f"{picarro_path}/picarro.parquet")
big_frame.info()