In [70]:
import os
import glob
import polars as pl
import pandas as pd
import plotly.express as px
from datetime import datetime, timezone
from plotly.subplots import make_subplots

DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY")
PICARRO_DATA_DIRECTORY = os.environ.get("PICARRO_DATA_DIRECTORY")

sensor_id = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]

# customize pipeline
merge_picarro_files = True

In [75]:
df_1h = pl.read_parquet(os.path.join(DATA_DIRECTORY,"processed", "1h_cal_corr_acropolis.parquet"))

if not merge_picarro_files:
    df_p_1h = pl.read_parquet(os.path.join(DATA_DIRECTORY,"processed", "1h_cal_corr_picarro.parquet"))
else:
    filenames = glob.glob(PICARRO_DATA_DIRECTORY + "/*/*/*.dat")

    
    # read all *.dat picarro measurement files and add to single db
    df_list = []
    for filename in filenames:
        df_list.append(pd.read_csv(filename,sep='\s+'))

    #PANDAS DF
    df_p_files = pd.concat(df_list, ignore_index=True)
    df_p_files["datetime"] = pd.to_datetime((df_p_files['DATE'] + ' ' + df_p_files['TIME']))
    df_p_files.sort_values(by='datetime', inplace = True)

    df_p_files[["datetime", "CO2_dry","h2o_reported", "CavityPressure", "CavityTemp"]].to_parquet(path = os.path.join(DATA_DIRECTORY, "input", "picarro.parquet"))

    #Calibration

    # TODO: Add ability for multiple calibration dates
    # before 23.10
    # picarro_slope = 1.0061589132696314
    # picarro_intercept = 0.14607153970888476

    # after 23.10
    #picarro_slope = 1.0063874771746113
    #picarro_intercept = 0.06621464961165202
    
     
    #after 18.12
    picarro_slope = 1.0060713120261249
    picarro_intercept = 0.08088569875155827
    
    #POLARS DF
    #1m averaged corrected Picarro dataset
    df_p_1m = pl.scan_parquet(os.path.join(DATA_DIRECTORY,"input", "picarro.parquet")) \
        .with_columns(pl.col("datetime").dt.cast_time_unit("us").dt.replace_time_zone("UTC").alias("creation_timestamp")) \
        .sort("creation_timestamp") \
        .with_columns((pl.col("CO2_dry") * picarro_slope + picarro_intercept).alias("picarro_corrected")) \
        .group_by_dynamic("creation_timestamp", every='1m') \
        .agg(pl.all().exclude("creation_timestamp").mean()).collect() \
        .select(["creation_timestamp", "picarro_corrected", "h2o_reported", "CavityPressure", "CavityTemp"]) \
        .with_columns(pl.lit("Picarro").alias("sys_name_short"),
            pl.lit(0.0).alias("diff")) 

    df_p_1m.write_parquet(os.path.join(DATA_DIRECTORY,"processed", "1m_cal_corr_picarro.parquet"))

    #1h averaged corrected Picarro dataset
    df_p_10m = df_p_1m.sort("creation_timestamp") \
        .group_by_dynamic("creation_timestamp", every='10m', by=["sys_name_short"]) \
        .agg(pl.all().exclude("creation_timestamp").mean())
        
    df_p_10m.write_parquet(os.path.join(DATA_DIRECTORY,"processed", "10m_cal_corr_picarro.parquet"))
    
    #1h averaged corrected Picarro dataset
    df_p_1h = df_p_1m.sort("creation_timestamp") \
        .group_by_dynamic("creation_timestamp", every='1h', by=["sys_name_short"]) \
        .agg(pl.all().exclude("creation_timestamp").mean())
        
    df_p_1h.write_parquet(os.path.join(DATA_DIRECTORY,"processed", "1h_cal_corr_picarro.parquet"))
    


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.



In [None]:
fig = px.scatter(df_p_1h, x="creation_timestamp", y = "picarro_corrected")
fig.show()

In [None]:
df_1h = df_1h.join(df_p_1h.select("creation_timestamp", "picarro_corrected"), on = ["creation_timestamp"], how= "left") \
    .with_columns(diff = pl.col("gmp343_corrected") - pl.col("picarro_corrected"))

In [None]:
df_1h.write_parquet(os.path.join(DATA_DIRECTORY, "processed", "1h_acropolis_with_picarro.parquet"))

In [None]:
fig = px.scatter(df_1h.filter(pl.col("system_id") == 2), x="creation_timestamp", y = "diff")
fig.update_layout(yaxis_range=[-20,20])
fig.show()

fig = px.scatter(df_1h.filter(pl.col("system_id") == 2), x="creation_timestamp", y = "gmp343_temperature")
fig.show()