In [1]:
import os
import glob
from datetime import datetime
from datetime import timezone
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px

# Data locations used throughout the notebook.
# Each location can be overridden with an environment variable so the notebook
# is not tied to one machine; the defaults are the original hard-coded paths.
quickflow_directory = os.environ.get(
    "ACROPOLIS_QUICKFLOW_DIR",
    "/Users/patrickaigner/Documents/PROJECTS/ACROPOLIS/Software/quickflow/acropolis/",
)
local_path = os.environ.get("ACROPOLIS_LOCAL_DATA_DIR", "../data/")
picarro_path = os.environ.get(
    "ACROPOLIS_PICARRO_DIR",
    "/Users/patrickaigner/Documents/PROJECTS/ACROPOLIS/Database/PICARRO",
)

# Download to local db

In [2]:
# download from hermes database

# Perform Pivot on local db

In [3]:
# Pivot the downloaded hermes database: every distinct value of the
# "attribute" column becomes its own column, filled from "value".
# TODO: perform the pivot during the download to save memory space

measurements_file = os.path.join(quickflow_directory, "measurements.parquet")
pivot_file = os.path.join(local_path, "pivot_measurements.parquet")
pivot_index = [
    "system_name",
    "revision",
    "creation_timestamp",
    "receipt_timestamp",
]

# The scan is collected before pivoting; "first" keeps the first value when
# several rows map to the same (index, attribute) cell.
df_parq = (
    pl.scan_parquet(measurements_file)
    .collect()
    .pivot(
        values="value",
        index=pivot_index,
        columns="attribute",
        aggregate_function="first",
    )
)

df_parq.write_parquet(pivot_file, statistics=True)

# Drop the reference to the pivoted frame; the next cell re-reads it lazily from disk.
df_parq = None

# Process Calibration Data

In [4]:
# join old and new db

calibration_columns = ["creation_timestamp", "system_name", "cal_gmp343_filtered", "cal_bottle_id"]


def _scan_calibration_columns(file_name):
    """Lazily scan a parquet file below `local_path`, keeping only the calibration columns.

    `creation_timestamp` is cast to microsecond resolution (presumably so the
    old and the new database share one dtype before they are concatenated).
    """
    return (
        pl.scan_parquet(os.path.join(local_path, file_name))
        .select(calibration_columns)
        .with_columns(pl.col("creation_timestamp").dt.cast_time_unit("us"))
    )


df_new = _scan_calibration_columns("pivot_measurements.parquet")
# old db was preprocessed and transformed to match the columns of the pivot (new db)
df_old = _scan_calibration_columns("old_db_renamed_measurements.parquet")

# one lazy frame holding both databases, ordered in time
df = pl.concat([df_new, df_old]).sort("creation_timestamp")

In [5]:
# split calibration readings into calibration bottles
# currently the split is performed by CO2 concentration as 400 and 800ppm is far apart
# TODO: think about splitting by bottle id

co2_reading = pl.col("cal_gmp343_filtered")
bottle_id_col = pl.col("cal_bottle_id")

# readings strictly below / above 600 are assigned to the 400 / 800 columns;
# everything else (including nulls) becomes null in the new columns
is_low_bottle = co2_reading < 600
is_high_bottle = co2_reading > 600

df = df.with_columns(
    [
        pl.when(is_low_bottle).then(co2_reading).otherwise(None).alias("cal_400"),
        pl.when(is_high_bottle).then(co2_reading).otherwise(None).alias("cal_800"),
        pl.when(is_low_bottle).then(bottle_id_col).otherwise(None).alias("cal_bottle_id_400"),
        pl.when(is_high_bottle).then(bottle_id_col).otherwise(None).alias("cal_bottle_id_800"),
    ]
)

In [6]:
# group by day

# Collect the calibration readings of each (day, system) pair into lists and
# carry one bottle id per concentration along for later processing.
# NOTE(review): the bottle id is reduced with a median. If a group can ever
# contain more than one distinct id per concentration, the median may not be
# an id that was actually used - confirm, or consider a different reduction.
calibration_day = pl.col("creation_timestamp").dt.date()
system = pl.col("system_name")

daily_aggregations = [
    pl.col("cal_400").drop_nulls(),
    pl.col("cal_800").drop_nulls(),
    pl.col("cal_bottle_id_400").drop_nulls().median(),
    pl.col("cal_bottle_id_800").drop_nulls().median(),
]

dfg = df.groupby([calibration_day, system]).agg(daily_aggregations)

In [7]:
# perform averaging

def average_bottle(data):
    """Average the tail of one day's calibration readings for a single bottle.

    The number of readings decides which leading part of the series is skipped:

    * 51-69 readings ("2nd bottle"): the first 30 % are skipped.
    * 70-129 readings ("1st bottle"): the first 50 % are skipped.

    In both cases the last 5 % are skipped as well. A series of exactly 70
    readings previously matched neither range and was reported as invalid; it
    is now handled by the first-bottle branch, which skips the larger share of
    leading readings.

    Args:
        data: Readings of one bottle, either an object with a ``to_list()``
            method (e.g. a polars Series) or a plain sequence of numbers.

    Returns:
        The mean of the selected window as a float, or ``0.0`` when the number
        of readings is outside both ranges. ``0.0`` acts as the "no valid
        calibration" marker.
    """
    values = data.to_list() if hasattr(data, "to_list") else list(data)
    n_values = len(values)

    if 50 < n_values < 70:
        # 2nd bottle
        start_fraction = 0.3
    elif 70 <= n_values < 130:
        # 1st bottle
        start_fraction = 0.5
    else:
        return 0.0

    window = values[int(n_values * start_fraction):int(n_values * 0.95)]
    return sum(window) / len(window)

# Reduce each day's list of readings to one mean per bottle; the key columns
# and the bottle ids are passed through unchanged.
daily_mean_columns = [
    pl.col("creation_timestamp"),
    pl.col("system_name"),
    pl.col("cal_400").apply(average_bottle).alias("mean_cal_400"),
    pl.col("cal_800").apply(average_bottle).alias("mean_cal_800"),
    pl.col("cal_bottle_id_400"),
    pl.col("cal_bottle_id_800"),
]

dfg = dfg.select(daily_mean_columns)

dfg.tail().collect()

creation_timestamp,system_name,mean_cal_400,mean_cal_800,cal_bottle_id_400,cal_bottle_id_800
date,str,f64,f64,f64,f64
2023-10-28,"""tum-esm-midcos…",429.6575,835.111111,6.0,34.0
2023-10-30,"""tum-esm-midcos…",0.0,0.0,,
2023-10-30,"""tum-esm-midcos…",0.0,0.0,,
2023-10-30,"""tum-esm-midcos…",0.0,0.0,,
2023-10-30,"""tum-esm-midcos…",0.0,0.0,,


In [8]:
# calculate slope and intercept

# Table queried by Bottle_ID for its CO2_dry value in calc_slope / calc_intercept.
# this was preprocessed on recorded lab data
gas_table_file = os.path.join(local_path, "averaged_gases.csv")
df_gas = pl.read_csv(gas_table_file)

def two_point_calibration(measured_values, true_values):
    """Fit the line ``y_true = slope * y_meas + intercept`` through two points.

    Args:
        measured_values: The two measured values, in the same order as
            ``true_values``.
        true_values: The two reference values.

    Returns:
        A ``(slope, intercept)`` tuple.

    Raises:
        ValueError: If either argument does not have exactly two elements.
    """
    if not (len(measured_values) == 2 and len(true_values) == 2):
        raise ValueError("Both measured_values and true_values must have length 2")

    meas_first, meas_second = measured_values[0], measured_values[1]
    true_first, true_second = true_values[0], true_values[1]

    # rise over run between the two calibration points
    gain = (true_second - true_first) / (meas_second - meas_first)
    # solve y_true = gain * y_meas + offset at the first point
    offset = true_first - gain * meas_first

    return gain, offset

def _bottle_calibration(meas_400, meas_800, id_400, id_800):
    """Return ``(slope, intercept)`` for one pair of bottle means, or ``None``.

    ``None`` is returned when a mean or a bottle id is missing, so the reference
    lookup is never attempted with a null id. The reference value of each bottle
    is the first ``CO2_dry`` entry of the ``df_gas`` rows whose ``Bottle_ID``
    matches; a bottle id that is not present in ``df_gas`` still raises.
    """
    if meas_400 is None or meas_800 is None or id_400 is None or id_800 is None:
        return None

    bottles_meas = [meas_400, meas_800]
    bottles_true = [
        df_gas.filter(pl.col("Bottle_ID") == bottle_id)["CO2_dry"][0]
        for bottle_id in (id_400, id_800)
    ]

    return two_point_calibration(bottles_meas, bottles_true)


def calc_slope(meas_400, meas_800, id_400, id_800):
    """Slope of the two-point calibration for one day.

    Args:
        meas_400: Mean reading of the low-concentration bottle.
        meas_800: Mean reading of the high-concentration bottle.
        id_400: Bottle id used to look up the low bottle's reference value.
        id_800: Bottle id used to look up the high bottle's reference value.

    Returns:
        The slope, or ``None`` when any of the inputs is ``None``.
    """
    calibration = _bottle_calibration(meas_400, meas_800, id_400, id_800)
    return None if calibration is None else calibration[0]


def calc_intercept(meas_400, meas_800, id_400, id_800):
    """Intercept of the two-point calibration for one day.

    Takes the same arguments as ``calc_slope``.

    Returns:
        The intercept, or ``None`` when any of the inputs is ``None``.
    """
    calibration = _bottle_calibration(meas_400, meas_800, id_400, id_800)
    return None if calibration is None else calibration[1]

# filter for days that have a valid calibration for both bottles
# (average_bottle reports 0.0 when a day has no usable series)
dfg = (
    dfg.sort(pl.col("creation_timestamp"))
    .filter(pl.col("mean_cal_400") > 0.0)
    .filter(pl.col("mean_cal_800") > 0.0)
)

# struct fields, in the positional order expected by calc_slope / calc_intercept
calibration_inputs = ["mean_cal_400", "mean_cal_800", "cal_bottle_id_400", "cal_bottle_id_800"]


def _calibration_args(row):
    """Unpack one struct row into the positional arguments of the calc_* helpers."""
    return [row[field] for field in calibration_inputs]


# calculate slope
dfg = dfg.with_columns(
    pl.struct(calibration_inputs)
    .apply(lambda row: calc_slope(*_calibration_args(row)))
    .alias("slope")
)

# calculate intercept
dfg = dfg.with_columns(
    pl.struct(calibration_inputs)
    .apply(lambda row: calc_intercept(*_calibration_args(row)))
    .alias("intercept")
)

dfg.tail().collect()

creation_timestamp,system_name,mean_cal_400,mean_cal_800,cal_bottle_id_400,cal_bottle_id_800,slope,intercept
date,str,f64,f64,f64,f64,f64,f64
2023-10-29,"""tum-esm-midcos…",387.848148,791.941026,1.0,38.0,0.040956,782.230649
2023-10-29,"""tum-esm-midcos…",408.074359,812.757407,23.0,1.0,1.007098,-20.411106
2023-10-29,"""tum-esm-midcos…",404.510256,840.474074,5.0,12.0,0.916767,45.753137
2023-10-29,"""tum-esm-midcos…",429.9725,835.498148,6.0,34.0,0.996699,-27.012043
2023-10-29,"""tum-esm-midcos…",429.09,838.788889,36.0,15.0,0.960639,-5.669046


In [9]:
# save results to parquet

calibration_file = os.path.join(local_path, "calibration_correction.parquet")
dfg.collect().write_parquet(calibration_file)


In [10]:
# visualise slope and intercept results

x = dfg.select("creation_timestamp", "slope", "intercept", "system_name").collect()

# one line chart per calibration parameter, one coloured trace per system
for parameter in ("slope", "intercept"):
    fig = px.line(
        x,
        x="creation_timestamp",
        y=parameter,
        markers=True,
        title=parameter,
        color="system_name",
    )
    fig.show()

  v = v.dt.to_pydatetime()



The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



# Process Picarro Data

In [11]:
# Collect all Picarro *.dat measurement files two directory levels below
# `picarro_path`. sorted() makes the read order (and therefore the row index
# assigned by the concat) reproducible, since glob returns files in arbitrary order.
filenames = sorted(glob.glob(picarro_path + "/*/*/*.dat"))

if not filenames:
    # fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No Picarro *.dat files found below {picarro_path!r}")

# read all *.dat picarro measurement files and add to single db
# (raw string: "\s" is not a valid escape sequence in a normal string literal)
df_list = [pd.read_csv(filename, sep=r"\s+") for filename in filenames]

big_frame = pd.concat(df_list, ignore_index=True)
# combine the separate DATE and TIME text columns into one datetime column
big_frame["datetime"] = pd.to_datetime(big_frame["DATE"] + " " + big_frame["TIME"])
big_frame = big_frame.sort_values(by="datetime")

# os.path.join avoids the doubled slash when local_path already ends in "/"
big_frame.to_parquet(path=os.path.join(local_path, "picarro.parquet"))
big_frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7884905 entries, 7718854 to 916783
Data columns (total 26 columns):
 #   Column                Dtype         
---  ------                -----         
 0   DATE                  object        
 1   TIME                  object        
 2   FRAC_DAYS_SINCE_JAN1  float64       
 3   FRAC_HRS_SINCE_JAN1   float64       
 4   JULIAN_DAYS           float64       
 5   EPOCH_TIME            float64       
 6   ALARM_STATUS          int64         
 7   INST_STATUS           int64         
 8   CavityPressure        float64       
 9   CavityTemp            float64       
 10  DasTemp               float64       
 11  EtalonTemp            float64       
 12  species               float64       
 13  OutletValve           float64       
 14  CH4                   float64       
 15  CH4_dry               float64       
 16  CO2                   float64       
 17  CO2_dry               float64       
 18  h2o_reported          float64       
 19  