In [None]:
!pip install plotly dash ipyfilechooser 

In [None]:
import numpy as np
from scipy.integrate import cumulative_trapezoid
from scipy.signal import find_peaks
from scipy import signal
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output, callback
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from ipywidgets import widgets
from ipyfilechooser import FileChooser
from dataclasses import dataclass

In [None]:
import plotly.io as pio
# Make 'notebook_connected' the default plotly renderer for every figure shown in this notebook.
pio.renderers.default = 'notebook_connected'

In [None]:
@dataclass
class TestFile:
    """One test recording: an Excel workbook paired with its raw log file."""
    # the *.xlsx workbook found in the data directory
    excel_path: Path
    # the single '<prefix>*raw.log' file found next to the workbook
    raw_path: Path
    # file-name prefix shared by both files (text before the first '_'); used as display label
    label: str

def list_test_file_pairs(data_dir: Path) -> list[TestFile]:
    """Pair every Excel workbook in ``data_dir`` with its raw log.

    The prefix of a workbook is the part of its file name (extension removed)
    before the first ``_``. A workbook is paired when exactly one file named
    ``<prefix>*raw.log`` exists in the same directory; otherwise a message is
    printed and the workbook is skipped.

    Args:
        data_dir: directory that holds the ``*.xlsx`` and ``*raw.log`` files.

    Returns:
        The matched pairs, ordered by workbook path.
    """
    # local import: only this function needs it
    from glob import escape

    # Todo update this logic for files with name as prefix
    files: list[TestFile] = []
    # sorted() makes the order independent of the file system's directory order
    for xls in sorted(data_dir.glob('*.xlsx')):
        if xls.name.startswith('~$'):
            # lock file Excel creates next to an open workbook, not a recording
            continue
        # use the stem so a workbook without '_' yields 'name', not 'name.xlsx'
        prefix = xls.stem.split('_')[0]
        # escape the prefix so characters such as '[' are matched literally
        log_match = sorted(data_dir.glob(f'{escape(prefix)}*raw.log'))
        if len(log_match) == 0:
            print(f'No matching raw log for Excel file {xls}')
        elif len(log_match) > 1:
            print(f'More then 1 matching raw log for Excel file {xls}?')
        else:
            files.append(TestFile(xls, log_match[0], label=prefix))
    return files

In [None]:
# Column names given to the per-cycle Excel sheet once its second column has been dropped
# (see load_dataframes). display_click_data also uses this list to label clicked values.
cycle_col_names = ['phase', 'load', 'vo2/kg', 'fc', 'vo2', 'vco2', 'qr', 'vol_instant', 'bf', 've', 've/vo2', 've/vco2', 'peto2', 'petco2',
    'vol_in', 't_in', 'vol_ex', 't_ex', 'pulse_o2', 'spo2', 'sbp', 'dbp', 'rpm', 'fo2et', 'fco2et', 'fio2et', 'fico2', 'feo2', 'feco2', 'delay_o2', 'delay_co2', 'temp_ambient',
    'pressure_ambient', 'humidity_ambient', 'duration']
def load_dataframes(file: TestFile, debug=False):
    """Load the per-cycle Excel sheet and the raw log belonging to one test.

    Cycle sheet: read without a header, the second column is dropped and the
    remaining columns are named after ``cycle_col_names``. Every column but
    ``phase`` is coerced to numbers (text becomes NaN). Leading rows are
    dropped until the first row that has phase, volumes, times and duration;
    trailing rows are dropped back to the last row without any NaN in the
    numeric columns. The French phase labels are mapped to
    ``rest``/``load``/``recovery`` and forward-filled.

    Raw log: tab-separated columns ``t, flow, fo2, fco2``; leading rows whose
    absolute flow is below 1e-5 are dropped.

    Args:
        file: paths of the Excel workbook and the raw log.
        debug: print how many rows were trimmed.

    Returns:
        ``(df_cycles, df_raw)``, both with a fresh 0..n-1 index.

    Raises:
        ValueError: if the sheet has no usable row or the raw log has no flow
            at or above the threshold.
    """
    print(f'Loading files:\n{str(file.excel_path)}\n{str(file.raw_path)}')
    # some xls files have empty rows at the top, so read everything without a header and trim below
    df_cycles = pd.read_excel(file.excel_path, header=None)

    drop_cols = [1]
    keep_cols = [col for col in range(len(df_cycles.columns)) if col not in drop_cols]
    assert len(keep_cols) == len(cycle_col_names), \
        f'expected {len(cycle_col_names)} columns after dropping {drop_cols}, got {len(keep_cols)}'
    df_cycles = df_cycles.drop(columns=[df_cycles.columns[col] for col in drop_cols])
    df_cycles.columns = cycle_col_names
    # for all columns except col 0, cast to float and set string data in headers to NaN
    numeric_cols = df_cycles.columns[1:]
    for col in numeric_cols:
        df_cycles[col] = pd.to_numeric(df_cycles[col], errors='coerce')

    # first row that carries a phase label and the basic volume/timing values
    required_cols = ['phase', 'vol_instant', 'vol_in', 'vol_ex', 't_in', 't_ex', 'duration']
    has_required = df_cycles[required_cols].notna().all(axis=1).to_numpy()
    if not has_required.any():
        raise ValueError(f'No valid cycle row found in {file.excel_path}')
    first_valid_row = int(has_required.argmax())
    if debug: print(f'Dropping {first_valid_row} first rows')
    df_cycles = df_cycles.iloc[first_valid_row:].reset_index(drop=True)

    # trailing rows: drop everything after the last row without a NaN in the numeric columns
    n_rows = len(df_cycles)
    is_complete = df_cycles[numeric_cols].notna().all(axis=1).to_numpy()
    if not is_complete.any():
        raise ValueError(f'No cycle row with all numeric columns filled in {file.excel_path}')
    invalid_rows_at_end = n_rows - 1 - int(np.flatnonzero(is_complete)[-1])
    if debug: print(f'Dropping {invalid_rows_at_end} last rows of {n_rows}')
    if invalid_rows_at_end:
        df_cycles = df_cycles.iloc[:n_rows - invalid_rows_at_end]
    df_cycles = df_cycles.reset_index(drop=True)

    # the phase label is only written on some rows; translate it and carry it forward
    df_cycles['phase'] = df_cycles['phase'].replace('Repos', 'rest')
    df_cycles['phase'] = df_cycles['phase'].replace('Charge', 'load')
    df_cycles['phase'] = df_cycles['phase'].replace('Récupération', 'recovery')
    df_cycles['phase'] = df_cycles['phase'].ffill()
    if debug: print(f'{len(df_cycles)} cycle rows')

    df_raw = pd.read_csv(file.raw_path, delimiter='\t', names=['t', 'flow', 'fo2', 'fco2'])
    flow_thresh = 1e-5
    # a row ends the leading "no flow" stretch as soon as |flow| is not below the threshold
    has_flow = ~(df_raw['flow'].abs() < flow_thresh).to_numpy()
    if not has_flow.any():
        raise ValueError(f'No flow above {flow_thresh} found in {file.raw_path}')
    first_non_zero_flow_row = int(has_flow.argmax())
    if debug: print(f'Dropping first {first_non_zero_flow_row} rows with flow<{flow_thresh}')
    df_raw = df_raw.iloc[first_non_zero_flow_row:].reset_index(drop=True)
    return df_cycles, df_raw

def match_cycles_with_raw_data(df_cycles, df_raw, debug = False):
    """Locate every row of the cycle table inside the raw flow recording.

    Both frames are modified in place and must have a 0..n-1 index.

    Steps:
      1. Integrate the flow to an instantaneous volume (raw and high-pass
         filtered) and find its prominent peaks and valleys. The stretch
         between two consecutive valleys is one high-res cycle.
      2. Cross-correlate the high-res cycle durations with the ``duration``
         column of a window of cycle rows to find one anchor cycle.
      3. Walk backwards and forwards from the anchor. Each cycle is bounded
         by the valley whose distance best matches ``t_in + t_ex``; when no
         valley is within ``MAX_DURATION_ERROR`` the closest raw sample is
         used instead and the cycle is flagged.

    Columns added to ``df_raw``: ``instant_vol_raw``, ``instant_vol``,
    ``cycle_index``. Columns added to ``df_cycles``: ``hires_tstart``,
    ``hires_tend``, ``hires_mismatch``, ``hires_duration``.

    Args:
        df_cycles: cycle table with ``duration``, ``t_in`` and ``t_ex``.
        df_raw: raw samples with ``t`` and ``flow``.
        debug: print intermediate results and plot the correlation.

    Returns:
        None. When there is not enough data to find an anchor an error is
        printed and the function returns before adding the ``df_cycles``
        columns and ``cycle_index``.
    """
    raw_t = df_raw['t'].to_numpy(dtype=float)
    raw_flow = df_raw['flow'].to_numpy(dtype=float)
    sampling_freq_hz = 1 / np.diff(raw_t).mean()
    filter_freq_hz = 2*1e-6
    df_raw['instant_vol_raw'] = cumulative_trapezoid(y=raw_flow, x=raw_t, initial=0)
    # NOTE(review): Wn is passed as cutoff * sampling frequency, left exactly as it was - confirm the intended cutoff.
    sos = signal.butter(4, Wn=filter_freq_hz * sampling_freq_hz, btype='highpass', output='sos')
    flow_filtered = signal.sosfilt(sos, raw_flow)
    instant_vol = cumulative_trapezoid(y=flow_filtered, x=raw_t, initial=0)
    df_raw['instant_vol'] = instant_vol
    if debug: print(f'Sum over all instantaneous volume: {df_raw.instant_vol.sum()}')

    MIN_PROMINENCE = 0.15
    peaks, _ = signal.find_peaks(instant_vol, prominence=MIN_PROMINENCE)
    valls, _ = signal.find_peaks(-instant_vol, prominence=MIN_PROMINENCE)
    if debug: print(f'Found {len(peaks)} peaks, {len(valls)} valleys')
    if len(valls) == 0:
        print('Error: no breathing cycles found in raw data')
        return
    # keep only the peaks that lie between the first and the last valley
    first_peak_idx = int(np.searchsorted(peaks, valls[0], side='right'))
    last_peak_idx = int(np.searchsorted(peaks, valls[-1], side='left'))
    if debug: print(f'Dropping first {first_peak_idx} and last {len(peaks) - last_peak_idx} peaks')
    peaks = peaks[first_peak_idx:last_peak_idx]

    #   p   p   p   n peaks
    #  / \_/ \_/ \
    # v   v   v   v n+1 valleys

    # That makes n complete cycles

    assert len(peaks) == len(valls) - 1, 'peaks and valleys of the volume signal do not alternate'
    if debug: print(f'Found {len(peaks)} complete cycles in 125Hz data')

    want_winsize = 100
    winstart = min(100, len(df_cycles))
    winend = min(winstart + want_winsize, len(df_cycles))
    winsize = winend - winstart
    start_cutoff_hires = 30 # cut off first hires samples that may have extreme values
    if debug: print(f'Finding correlation over window {winstart}-{winend}')
    if winsize == 0:
        print('Error: not enough cycles in cycle data to build a correlation window')
        return
    if winsize < 50:
        print(f'Warning: small window to find initial cycle correlation')
    # the window must fit into the high-res durations that remain after the cutoff,
    # otherwise the 'valid' correlation below no longer yields a usable shift
    if len(valls) - 1 - start_cutoff_hires < winsize:
        print(f'Error: not enough cycles in raw data to match window of size {winsize}')
        return
    valley_t = raw_t[valls]
    # durations_hires[i] is duration of cycle from valls[i] to valls[i+1]
    durations_hires = np.diff(valley_t)
    win_durations_cycles = df_cycles['duration'].to_numpy(dtype=float)[winstart:winend]
    corrs = np.correlate(durations_hires[start_cutoff_hires:] - durations_hires.mean(), win_durations_cycles - win_durations_cycles.mean(), mode='valid')
    shift = int(corrs.argmax()) + start_cutoff_hires

    duration_diffs = win_durations_cycles - durations_hires[shift:shift+winsize]
    if debug: print(f'Mean cycle duration error in {winsize} cycle window: {np.abs(duration_diffs).mean()}')
    bad_match = np.abs(duration_diffs).mean() > 0.2
    if bad_match:
        print(f'Warning: potentially bad match between cycles and 125Hz data')
    if debug or bad_match:
        fig, axs = plt.subplots(2)
        axs[0].bar(range(len(corrs)), corrs)
        axs[0].set_title(f'Correlation for window of {winsize} cycles (raw data)')

        axs[1].plot(range(winsize), durations_hires[shift:shift+winsize], label='cycle duration 125Hz')
        axs[1].plot(range(winsize), win_durations_cycles, label='cycle duration')
        axs[1].legend()

    # the window cycle whose duration agrees best becomes the anchor of both walks
    best_duration_match_idx = int(np.abs(duration_diffs).argmin())
    matched_cycle_index = winstart + best_duration_match_idx
    matched_hires_valley_idx = shift + best_duration_match_idx
    if debug: print(f'Matched cycle {matched_cycle_index}')

    n_cycles = len(df_cycles)
    true_durations = (df_cycles['t_in'] + df_cycles['t_ex']).to_numpy(dtype=float)
    tstart = np.full(n_cycles, np.nan)
    tend = np.full(n_cycles, np.nan)
    mismatch = np.zeros(n_cycles, dtype=bool)
    tstart[matched_cycle_index] = valley_t[matched_hires_valley_idx]

    # accepted relative error (%) between durations from high-res and cycle-by-cycle data
    MAX_DURATION_ERROR = 10 / 100

    # valls[:current_valley_idx] are the valleys that may still start an earlier cycle
    current_valley_idx = matched_hires_valley_idx
    # walk backwards in time, matching up cycles before matched_cycle_index
    for cycle_idx in reversed(range(0, matched_cycle_index)):
        cycle_tend = tstart[cycle_idx + 1]
        tend[cycle_idx] = cycle_tend
        if current_valley_idx <= 0:
            print(f'Warning: not enough cycles in raw data ({cycle_idx+1} cycles left to match up during backwards walk, but no more local minima in 125Hz data)')
            break
        true_cycle_duration = true_durations[cycle_idx]
        # duration if cycle starts at a valley
        duration_valley_start = cycle_tend - valley_t[:current_valley_idx]
        best_valley_idx = int(np.argmin(np.abs(duration_valley_start - true_cycle_duration)))
        # relative to the true duration, same as in the forward walk
        duration_error = np.abs(duration_valley_start[best_valley_idx] - true_cycle_duration) / true_cycle_duration
        if duration_error < MAX_DURATION_ERROR:
            tstart[cycle_idx] = valley_t[best_valley_idx]
            current_valley_idx = best_valley_idx
        else: # no valley matches cycle duration in excel data
            # duration if cycle starts at any t
            best_raw_idx = int(np.argmin(np.abs(cycle_tend - raw_t - true_cycle_duration)))
            tstart[cycle_idx] = raw_t[best_raw_idx]
            mismatch[cycle_idx] = True
            # only valleys strictly before the chosen sample remain candidates (never below 0)
            current_valley_idx = int(np.searchsorted(valls, best_raw_idx, side='left'))

    # valls[current_valley_idx:] are the valleys that may still end a later cycle
    current_valley_idx = matched_hires_valley_idx + 1
    # walk forwards in time, matching up cycles after matched_cycle_index
    for cycle_idx in range(matched_cycle_index, n_cycles):
        if cycle_idx == matched_cycle_index:
            # same value the previous cycle's end would have, but also valid when the anchor is cycle 0
            cycle_tstart = tstart[cycle_idx]
        else:
            cycle_tstart = tend[cycle_idx - 1]
        tstart[cycle_idx] = cycle_tstart
        if current_valley_idx > len(valls) - 1:
            print(f'Warning: not enough cycles in raw data ({len(df_cycles)-cycle_idx-1} cycles left to match up during forwards walk, but no more local minima in 125Hz data)')
            break
        true_cycle_duration = true_durations[cycle_idx]
        # duration if cycle ends at a valley
        duration_valley_end = valley_t[current_valley_idx:] - cycle_tstart
        best_valley_idx = int(np.argmin(np.abs(duration_valley_end - true_cycle_duration)))
        duration_error = np.abs(duration_valley_end[best_valley_idx] - true_cycle_duration) / true_cycle_duration
        if duration_error < MAX_DURATION_ERROR:
            tend[cycle_idx] = valley_t[current_valley_idx + best_valley_idx]
            current_valley_idx += best_valley_idx + 1 # best_valley_idx is an offset on top of current_valley_idx
            continue
        # no valley matched
        # duration if cycle ends at any t; the absolute difference picks the sample closest to the true end
        best_raw_idx = int(np.argmin(np.abs(raw_t - cycle_tstart - true_cycle_duration)))
        tend[cycle_idx] = raw_t[best_raw_idx]
        mismatch[cycle_idx] = True
        # skip the valleys at or before the chosen sample; never move backwards
        current_valley_idx = max(current_valley_idx, int(np.searchsorted(valls, best_raw_idx, side='right')))

    df_raw['cycle_index'] = np.nan
    df_cycles['hires_tstart'] = tstart
    df_cycles['hires_tend'] = tend
    df_cycles['hires_mismatch'] = mismatch
    df_cycles['hires_duration'] = df_cycles['hires_tend'] - df_cycles['hires_tstart']
    # label every raw sample with the cycle whose [start, end) interval contains it
    for index, cycle_tstart, cycle_tend in zip(df_cycles.index, tstart, tend):
        df_raw.loc[(cycle_tstart <= raw_t) & (raw_t < cycle_tend), 'cycle_index'] = index

def find_sighs(df_cycles, window_size: int):
    """Add a boolean ``is_sigh`` column to ``df_cycles`` (in place).

    A cycle counts as a sigh when its ``vol_ex`` is more than twice the
    centered rolling median of ``vol_ex`` over ``window_size`` cycles. The
    undefined medians at both ends are filled from the nearest defined value.
    """
    expired = df_cycles['vol_ex']
    baseline = expired.rolling(window=window_size, center=True).median()
    baseline = baseline.bfill().ffill()
    df_cycles['is_sigh'] = expired.gt(2 * baseline)

In [None]:
def on_analyze_clicked(file: TestFile, debug=False):
    """Load one test, match cycles to the raw data and show the result in a Dash app.

    The loaded frames and the chosen file are stored in the globals
    ``df_cycles``, ``df_raw`` and ``test_file`` so later cells can inspect
    them. The figure shows the instantaneous volume over time, a vertical
    line at the start of every sigh, one clickable marker per cycle (at the
    cycle's volume maximum) and a shaded band per test phase.

    Args:
        file: the test to analyze.
        debug: forwarded to the loading and matching steps.
    """
    global df_cycles
    global df_raw
    global test_file
    test_file = file
    df_cycles, df_raw = load_dataframes(file, debug)
    match_cycles_with_raw_data(df_cycles, df_raw, debug)
    find_sighs(df_cycles, window_size=14)
    if 'hires_tstart' not in df_cycles.columns or 'cycle_index' not in df_raw.columns:
        # matching gave up early and already printed why; there is nothing to plot
        return

    # one row per cycle: the raw sample with the largest volume, joined with the cycle's table row
    cycle_maxs = df_raw.dropna(subset=['cycle_index']).sort_values('instant_vol', ascending=False).drop_duplicates('cycle_index').sort_values('cycle_index')
    cycle_maxs = cycle_maxs.join(df_cycles, on='cycle_index')

    fig = px.line(df_raw, y='instant_vol', x='t')
    # sighs without a matched start time cannot be drawn
    for tstart in df_cycles.loc[df_cycles['is_sigh'], 'hires_tstart'].dropna():
        fig.add_vline(x=tstart)

    scatter = go.Scatter(
            x=cycle_maxs['t'],
            y=cycle_maxs['instant_vol'],
            name='Cycles',
            mode='markers',
            # display_click_data reads the clicked cycle's values from here
            customdata=cycle_maxs,
        )
    fig.add_trace(scatter)
    fig.update_layout(clickmode='event+select')

    # rows where the phase differs from the previous row, i.e. the first cycle of each phase
    phase_changes = list(df_cycles.loc[df_cycles['phase'].shift(1) != df_cycles['phase']].iterrows())
    text_y = df_raw['instant_vol'].max()
    phase_colors = {
        'rest': 'LightGreen', 
        'recovery': 'LightGreen', 
        'load': 'LightSkyBlue'
    }
    phase_labels = {
        'rest': 'Repos',
        'recovery': 'Récupération',
        'load': 'Charge'
    }
    for i, (_, row) in enumerate(phase_changes):
        left = row['hires_tstart']
        right = df_raw['t'].max() if i == len(phase_changes) - 1 else phase_changes[i+1][1]['hires_tstart']
        # phases other than the three known ones get a neutral color and their own label
        color = phase_colors.get(row['phase'], 'LightGray')
        label = phase_labels.get(row['phase'], str(row['phase']))
        fig.add_vrect(x0=left, x1=right, fillcolor=color, opacity=0.3, line_width=0, layer='below')
        fig.add_annotation(x=left, y=text_y, showarrow=False, text=label, xanchor='left', xshift=10)
    # todo legend
    external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
    app = Dash("Soupirs", external_stylesheets=external_stylesheets)
    app.layout = html.Div([
        dcc.Graph(id='graph', figure=fig),
        html.Pre(id='click-data')
    ])
    app.run()

@callback(
    Output('click-data', 'children'),
    Input('graph', 'clickData'))
def display_click_data(clickData):
    """Render the cycle-table values of the clicked cycle marker as an HTML table.

    Args:
        clickData: click payload of the 'graph' component; may be empty.

    Returns:
        An ``html.Table`` with one row per entry of ``cycle_col_names``, or
        None when the click carries no ``customdata``.
    """
    if not clickData:
        return None
    points = clickData.get('points')
    if not points or 'customdata' not in points[0]:
        return None
    data = points[0]['customdata']
    # customdata is one row of cycle_maxs: the 7 raw-data columns come first, followed by the
    # cycle table columns and then the columns the analysis added. Slice by the number of cycle
    # columns, so that appending analysis columns cannot shift or overflow the labels.
    n_raw_cols = 7
    cols = data[n_raw_cols:n_raw_cols + len(cycle_col_names)]
    return html.Table([html.Tr([html.Td(name), html.Td(value)]) for name, value in zip(cycle_col_names, cols)])

def on_dir_chosen(chooser, out):
    """Show the file picker for the directory selected in ``chooser``.

    Lists the Excel/raw-log pairs found in the directory in a select box,
    next to a debug checkbox and an 'Analyze' button that runs
    ``on_analyze_clicked`` for the selected pair. The widgets are displayed
    inside the output widget ``out``.
    """
    test_files = list_test_file_pairs(Path(chooser.value))
    file_select = widgets.Select(
        options=[(test_file.label, test_file) for test_file in test_files],
        layout={'height': '300px'},
    )
    debug_toggle = widgets.Checkbox(value=False, description='Enable debug output')
    analyze_button = widgets.Button(description='Analyze')

    def run_analysis(_clicked_button):
        # read the widgets at click time, not at creation time
        on_analyze_clicked(file_select.value, debug_toggle.value)

    analyze_button.on_click(run_analysis)
    controls = widgets.VBox([file_select, widgets.HBox([analyze_button, debug_toggle])])
    with out:
        display(controls)

In [None]:
# Entry point of the UI: a directory chooser whose callback lists the test files of the chosen folder.
fc = FileChooser()
fc.show_only_dirs=True
# NOTE(review): absolute path on the author's machine; other users have to pick their own folder.
fc.default_path = '/home/thomas/p/soupirs/data/CPET ambu anonyme'
fc.title = 'Pick folder with raw csv and excel files'
# the chooser and the widgets created by on_dir_chosen are all rendered into this output widget
out = widgets.Output()
fc.register_callback(lambda chooser: on_dir_chosen(chooser, out))
with out:
    display(fc)
display(out)

In [None]:
# Scratch: show cycle row 73. Needs the global df_cycles that on_analyze_clicked sets.
df_cycles.loc[73]

In [None]:
# Scratch: re-run the analysis with debug output for the last analyzed file
# (test_file is the global set by a previous on_analyze_clicked call).
on_analyze_clicked(test_file, debug=True)

*Edge cases*
|file|time|
|-|-|
|ambu 1091879|836|

312525 is broken
491508
264992

### our fault

518108 broken

49683 breaks

98442996

### fancy matching algo

durations -> small window -> correlation to find anchor points to match hires time to cycle times

Then find all zero crossings (or values close to zero) of the flow/derivative (the points of interest) and build out the cycles by taking
the next point of interest that most closely matches the Excel cycle we're currently reconstructing.

In [None]:
# Prototype (step-by-step version of the first part of match_cycles_with_raw_data).
# Needs the global test_file set by on_analyze_clicked; overwrites the globals df_cycles and df_raw.
df_cycles, df_raw = load_dataframes(test_file)

# mean time step between consecutive samples -> sampling frequency
sampling_freq_hz = 1 / (df_raw['t'][:-1] - df_raw['t'].shift(1)[1:]).mean()
filter_freq_hz = 2*1e-6
# volume = integral of the flow, once from the raw flow and once from the high-pass filtered flow
df_raw['instant_vol_raw'] = cumulative_trapezoid(y=df_raw['flow'], x=df_raw['t'], initial=0) 
sos = signal.butter(4, Wn=filter_freq_hz * sampling_freq_hz, btype='highpass', output='sos')
flow_filtered = signal.sosfilt(sos, df_raw['flow'])
df_raw['instant_vol'] = cumulative_trapezoid(y=flow_filtered, x=df_raw['t'], initial=0) 
print(f'Sum over all instantaneous volume: {df_raw.instant_vol.sum()}')

# note: this prototype uses a prominence of 0.10, the function above uses 0.15
MIN_PROMINENCE = 0.10
peaks, peakprops  = signal.find_peaks(df_raw['instant_vol'], prominence=MIN_PROMINENCE)
valls, vallprops = signal.find_peaks(-df_raw['instant_vol'], prominence=MIN_PROMINENCE)
print(f'Found {len(peaks)} peaks, {len(valls)} valleys')
# index of first peak that comes after first valley
first_peak_idx = np.argwhere(peaks > valls[0]).min()
if not (valls[-1] > peaks).all(): # there is a peak after the last valley
    # index of last peak that comes before last valley
    last_peak_idx = np.argwhere(valls[-1] < peaks).min()
else:
    # NOTE(review): len(peaks - 1) equals len(peaks), so the slice below keeps every remaining peak
    last_peak_idx = len(peaks - 1)
print(f'Dropping first {first_peak_idx} and last {len(peaks) - last_peak_idx} peaks')
peaks = peaks[first_peak_idx:last_peak_idx]

#   p   p   p   n peaks
#  / \_/ \_/ \
# v   v   v   v n+1 valleys

# That makes n complete cycles

assert len(peaks) == len(valls) - 1
iv = df_raw['instant_vol']
# volume gained from the valley before each peak / lost to the valley after it
vins = iv[peaks].array - iv[valls[:-1]].array
vexs = iv[peaks].array - iv[valls[1:]].array
print(f'Found {len(peaks)} complete cycles')

In [None]:
# Prototype: find the offset between the cycle table and the high-res cycles by correlating durations.
# Uses df_cycles, df_raw and valls from the previous cell.
winstart = min(100, len(df_cycles))
want_winsize = 100
winend = min(winstart + want_winsize, len(df_cycles))
winsize = winend - winstart
if winsize < 50:
    print(f'Warning: small window to find initial cycle correlation')
if len(valls) < winsize:
    print(f'Error: not enough cycles in raw data to match window of size {winsize}')
# durations_hires[i] is the time from valls[i] to valls[i+1]
durations_hires = df_raw['t'][valls[1:]].array - df_raw['t'][valls[:-1]].array
win_durations_cycles = df_cycles.loc[range(winstart, winend), 'duration'].array
start_cutoff_hires = 30 # cut off first hires samples that may have extreme values
corrs = np.correlate(durations_hires[start_cutoff_hires:] - durations_hires.mean(), win_durations_cycles - win_durations_cycles.mean(), mode='valid')
# position of the best correlation, expressed as an index into durations_hires
shift = corrs.argmax() + start_cutoff_hires
plt.bar(range(len(corrs)), corrs), shift

In [None]:
# Compare the durations of the table window with the high-res durations at the found shift.
fig, ax = plt.subplots()
ax.plot(range(winsize), durations_hires[shift:shift+winsize], label='duration hires')
ax.plot(range(winsize), win_durations_cycles, label='duration cycles')
ax.legend()

In [None]:
# Difference between table and high-res durations inside the window; the cell stops here when any exceeds 0.15.
duration_diffs = df_cycles.loc[range(winstart, winend), 'duration'].array - durations_hires[shift:shift+winsize]
assert np.abs(duration_diffs).max() < 0.15
# window position whose durations agree best
best_duration_match_idx = np.abs(duration_diffs).argmin()
# this cycle starts at                           this time
df_cycles.loc[winstart+best_duration_match_idx], durations_hires[shift+best_duration_match_idx], df_raw.loc[valls[shift+best_duration_match_idx]], df_raw.loc[valls[shift+best_duration_match_idx+1]]

In [None]:
# Prototype of the backward/forward walk (superseded by match_cycles_with_raw_data).
# Uses df_cycles, df_raw, valls, winstart, shift and best_duration_match_idx from the cells above.
# Explicit float dtype: an empty Series without dtype would create object columns.
df_cycles['hires_tstart'] = pd.Series(dtype=float)
df_cycles['hires_tend'] = pd.Series(dtype=float)
matched_cycle_index = winstart + best_duration_match_idx
matched_hires_valley_idx = shift + best_duration_match_idx
df_cycles.loc[matched_cycle_index, 'hires_tstart'] = df_raw.loc[valls[matched_hires_valley_idx], 't']

# raw rows whose flow is (almost) zero
flow_zeros = df_raw['flow'][df_raw['flow'].abs() < 0.001].index

# accepted relative error between the high-res and the table duration
DURATION_ERROR = 0.1

current_valley_idx = matched_hires_valley_idx
# walk backwards in time
for cycle_idx in reversed(range(0, matched_cycle_index)):
    cycle_tend = df_cycles.loc[cycle_idx + 1, 'hires_tstart']
    df_cycles.loc[cycle_idx, 'hires_tend'] = cycle_tend
    valls_before = valls[:current_valley_idx]
    # duration if cycle starts at a valley 
    duration_valley_start = -df_raw.loc[valls_before, 't'].array + cycle_tend
    # this prototype only tries the valley closest to the cycle end
    best_valley_idx = np.argmin(np.abs(duration_valley_start))
    true_cycle_duration = df_cycles.loc[cycle_idx, 't_in'] + df_cycles.loc[cycle_idx, 't_ex']
    if np.abs(duration_valley_start[best_valley_idx] - true_cycle_duration) / true_cycle_duration < DURATION_ERROR:
        df_cycles.loc[cycle_idx, 'hires_tstart'] = df_raw.loc[valls_before[best_valley_idx], 't']
        current_valley_idx = best_valley_idx
    else:
        # duration if cycle starts at any t
        duration_t = -df_raw['t'].array + cycle_tend
        best_raw_idx = np.argmin(np.abs(duration_t - true_cycle_duration))
        print(f'error: {cycle_idx}, {duration_t[best_raw_idx] - true_cycle_duration}')
        # stop at 0: a negative index would wrap around to the last valleys
        while current_valley_idx > 0 and best_raw_idx <= valls[current_valley_idx]: 
            current_valley_idx -= 1
        df_cycles.loc[cycle_idx, 'hires_tstart'] = df_raw.loc[best_raw_idx, 't']

current_valley_idx = matched_hires_valley_idx + 1
# walk forwards in time
for cycle_idx in range(matched_cycle_index, len(df_cycles)):
    cycle_tstart = df_cycles.loc[cycle_idx - 1, 'hires_tend']
    df_cycles.loc[cycle_idx, 'hires_tstart'] = cycle_tstart
    valls_after = valls[current_valley_idx:]
    true_cycle_duration = df_cycles.loc[cycle_idx, 't_in'] + df_cycles.loc[cycle_idx, 't_ex']
    if len(valls_after) > 0:
        # duration if cycle ends at a valley 
        duration_valley_end = df_raw.loc[valls_after, 't'].array - cycle_tstart
        best_valley_idx = np.argmin(np.abs(duration_valley_end))
        if np.abs(duration_valley_end[best_valley_idx] - true_cycle_duration) / true_cycle_duration < DURATION_ERROR:
            df_cycles.loc[cycle_idx, 'hires_tend'] = df_raw.loc[valls_after[best_valley_idx], 't']
            current_valley_idx += best_valley_idx + 1 # best_valley_idx indexes into the slide valls_after so it's an offset on top of current_valley_idx
            continue
        print(f'no matching valley: {cycle_idx}, {true_cycle_duration}, {duration_valley_end[best_valley_idx]}, {duration_valley_end[best_valley_idx] - true_cycle_duration}')
    # no valley matched
    # duration if cycle ends at any t
    duration_t = df_raw['t'].array - cycle_tstart
    best_raw_idx = np.argmin(np.abs(duration_t - true_cycle_duration))
    # print(f'error: {cycle_idx}, {(duration_t[best_raw_idx] - true_cycle_duration) / true_cycle_duration}')
    # skip the valleys at or before the chosen sample (the comparison used to be reversed,
    # which skipped every remaining valley instead)
    while current_valley_idx < len(valls) and valls[current_valley_idx] <= best_raw_idx: 
        current_valley_idx += 1
    df_cycles.loc[cycle_idx, 'hires_tend'] = df_raw.loc[best_raw_idx, 't']

In [None]:
# Cycle duration according to the matched high-res start and end times.
df_cycles['hires_duration'] = df_cycles['hires_tend'] - df_cycles['hires_tstart']
df_cycles['hires_duration']

In [None]:
# Overlay the table durations and the matched high-res durations for every cycle.
fig, ax = plt.subplots() 
ax.plot(range(len(df_cycles)), df_cycles['duration'], label='duration cycles')
ax.plot(range(len(df_cycles)), df_cycles['hires_duration'], label='duration hires')
ax.legend()

In [None]:

# Earlier alignment experiment: align the cycle table with the high-res cycles by correlating
# inspired volumes, then trim the high-res cycles to the table length.
# Uses vins, peaks, valls, df_cycles and df_raw from the cells above; overwrites the global `shift`.
pad_cycle_data = len(vins) - len(df_cycles['vol_in'])
# left-pad the table volumes with zeros up to the number of high-res cycles
cyc_vol_in_padded = np.pad(df_cycles['vol_in'].array, (pad_cycle_data, 0))
corr = np.correlate(cyc_vol_in_padded, vins, 'full')
# plt.bar(range(len(corr)), corr)
# return
shift = corr.argmax() - (len(vins) - 1)
assert shift >= 0
drop_highres_cycles_front = pad_cycle_data - shift
drop_highres_cycles_back = shift 
vins_trimmed = vins[drop_highres_cycles_front:len(vins) - drop_highres_cycles_back]
peaks_trimmed = peaks[drop_highres_cycles_front:len(peaks) - drop_highres_cycles_back]
valls_trimmed = valls[drop_highres_cycles_front:len(valls) - drop_highres_cycles_back]
print(len(peaks_trimmed), len(valls_trimmed), len(df_cycles))
# the assignments below require one peak per table row and one more valley than rows
assert len(peaks_trimmed) == len(df_cycles)
assert len(valls_trimmed) == len(df_cycles) + 1
df_cycles['highres_t_start'] = df_raw['t'][valls_trimmed[:-1]].values
df_cycles['highres_t_max'] = df_raw['t'][peaks_trimmed].values
df_cycles['highres_t_end'] = df_raw['t'][valls_trimmed[1:]].values
df_cycles['highres_duration'] = df_cycles['highres_t_end'] - df_cycles['highres_t_start']
# df_raw.set_index('t', inplace=True)

In [None]:
# Table duration vs. duration between the aligned high-res start and end times, per cycle.
px.line(df_cycles, y=['duration', df_cycles['highres_t_end'] - df_cycles['highres_t_start']], hover_data=['highres_t_start'])

In [None]:
# Mark the aligned cycle starts on the volume curve: vol_cycle_start holds the volume at the raw
# samples whose time equals a cycle start and stays NaN everywhere else.
df_raw['vol_cycle_start'] = pd.Series(dtype=float)
df_raw.loc[df_raw['t'].isin(df_cycles['highres_t_start'].values), 'vol_cycle_start'] = df_raw[df_raw['t'].isin(df_cycles['highres_t_start'].values)]['instant_vol']
fig = px.line(df_raw, x='t', y='instant_vol')
fig = fig.add_scatter(x=df_raw.t, y=df_raw['vol_cycle_start'], mode='lines+markers')
fig.show()

In [None]:
# Side-by-side comparison of inspired volume: high-res estimate vs. cycle table.
df_correlated = pd.DataFrame()
df_correlated['vol_in_highres'] = vins_trimmed
df_correlated['vol_in_cycles'] = df_cycles['vol_in']
px.scatter(df_correlated, df_correlated.index, ['vol_in_highres', 'vol_in_cycles'], title='Vol Insp cycles/highres')

In [None]:
# Same comparison for the expired volume, using the trim offsets from the alignment cell.
vexs_trimmed = vexs[drop_highres_cycles_front:len(vexs)-drop_highres_cycles_back]
cyc_vol_ex = df_cycles['vol_ex']
df_correlated['vol_ex_highres'] = vexs_trimmed
df_correlated['vol_ex_cycles'] = df_cycles['vol_ex']
px.scatter(df_correlated, df_correlated.index, ['vol_ex_highres', 'vol_ex_cycles'], title='Vol Exp cycles/highres')

In [None]:
# Scratch: display the raw-data frame including the columns added above.
df_raw

In [None]:
# Filtered instantaneous volume and raw flow over time.
px.line(df_raw, x='t', y=['instant_vol', 'flow'])
# px.scatter(df_raw, x='t', y=peaks)
# px.line(df_raw, x='t', y=['flow', 'flow_ma', 'filtered', 'instant_vol'])