# Visualize PV curves from segment `.npz` files
This notebook mirrors *Visualize-PVcurve.ipynb* but reads the processed sensor segments stored as `.npz` under `data/MMDataset_segments`. Configure the segment path, optionally list available files, and the notebook will load flow/pressure traces, integrate flow to obtain volume (with baseline drift correction), and draw the PV curve.

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Set matplotlib's global 'figure.dpi' rcParam to 120 for figures created after this point.
plt.rcParams['figure.dpi'] = 120


In [None]:

def list_npz_files(segments_root: str, max_show: int = 5):
    """Recursively collect every ``.npz`` file below ``segments_root``.

    Prints the total number of matches followed by the first ``max_show``
    paths (each prefixed with its index in the returned list), and a trailing
    marker when more files exist than were printed.

    Args:
        segments_root: Directory to search. It is treated as a literal path:
            glob metacharacters (``*``, ``?``, ``[``) inside it are escaped,
            so directory names that contain brackets are searched correctly
            instead of being interpreted as a pattern.
        max_show: Maximum number of paths to print.

    Returns:
        The sorted list of all matching paths (not only the printed ones).
    """
    # Only the "**/*.npz" suffix should act as a pattern; the root is literal.
    pattern = os.path.join(glob.escape(segments_root), "**", "*.npz")
    files = sorted(glob.glob(pattern, recursive=True))
    print(f"Found {len(files)} segment files.")
    for i, f in enumerate(files[:max_show]):
        print(f"[{i}] {f}")
    if len(files) > max_show:
        print("... (and more)")
    return files


def _resolve_column(df: pd.DataFrame, hint: str, description: str) -> str:
    """Map a column hint onto an actual column name of ``df``.

    Lookup order: exact match, then case-sensitive prefix match, then
    case-insensitive prefix match. When several columns share the prefix, the
    longest name is returned (the first one in column order on a length tie).

    Args:
        df: Frame whose columns are searched.
        hint: Exact column name or prefix to look for.
        description: Human-readable role of the column, used in error messages.

    Raises:
        ValueError: If ``hint`` is empty or no column matches it.
    """
    if not hint:
        raise ValueError(f"No {description} hint provided.")

    columns = df.columns
    if hint in columns:
        return hint

    matches = [name for name in columns if name.startswith(hint)]
    if not matches:
        # Fall back to a case-insensitive comparison only when the strict one found nothing.
        lowered_hint = hint.lower()
        matches = [name for name in columns if name.lower().startswith(lowered_hint)]
    if not matches:
        available = ', '.join(columns[:10])
        raise ValueError(f"Could not find {description} matching '{hint}'. Available columns (first 10): {available}")

    # max() returns the first maximal element, i.e. the longest name, earliest on ties.
    return max(matches, key=len)


def _scalar_value(arr):
    """Unwrap a zero-dimensional value to a plain Python scalar.

    Zero-dimensional NumPy arrays and NumPy scalars are converted with
    ``.item()``. Plain Python scalars (``str``, ``float``, ``None``, ...) are
    also zero-dimensional according to ``np.ndim`` but have no ``.item()``
    method, so they are returned unchanged instead of raising
    ``AttributeError``. Anything with one or more dimensions is returned as-is.
    """
    if np.ndim(arr) != 0:
        return arr
    # Only NumPy objects expose .item(); pass other scalars straight through.
    unwrap = getattr(arr, "item", None)
    return unwrap() if callable(unwrap) else arr


def load_npz_segment(
    npz_path: str,
    flow_col: str,
    press_col: str,
    time_col: str = "sensor_time_epoch"
) -> tuple[pd.DataFrame, str, str, str, dict]:
    """Load one segment ``.npz`` file into a DataFrame plus metadata.

    The archive must contain ``sensor_cols`` (column names), ``sensor_values``
    (the sample matrix; presumably shaped samples x columns -- TODO confirm)
    and an array stored under ``time_col``. The time array is cast to float
    and inserted as the first column of the frame.

    Args:
        npz_path: Path of the ``.npz`` segment file.
        flow_col: Exact name or prefix of the flow column.
        press_col: Exact name or prefix of the pressure column.
        time_col: Key of the time array inside the archive; also used as the
            name of the inserted time column.

    Returns:
        ``(df, flow_name, press_name, time_name, meta)`` where the three names
        are the columns resolved by ``_resolve_column`` and ``meta`` holds
        ``label``, ``sensor_id``, ``audio_id`` (as ``str``) and
        ``audio_start_utc``, ``audio_end_utc`` (as ``float``). Each metadata
        entry is read with ``data.get`` using ``""`` / ``0.0`` as the default
        and passed through ``_scalar_value``.

    Raises:
        ValueError: If ``time_col`` is not a key of the archive, or a column
            hint cannot be resolved.
    """
    # NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which
    # can execute arbitrary code. Only load segment files from trusted sources.
    # The context manager closes the underlying zip file handle, which the
    # previous implementation left open.
    with np.load(npz_path, allow_pickle=True) as data:
        sensor_cols = [str(c) for c in data["sensor_cols"]]
        df = pd.DataFrame(data["sensor_values"], columns=sensor_cols)
        if time_col not in data:
            raise ValueError(f"'{time_col}' not found in npz (available keys: {data.files})")
        df.insert(0, time_col, data[time_col].astype(float))

        time_name = _resolve_column(df, time_col, "time column")
        flow_name = _resolve_column(df, flow_col, "flow column")
        press_name = _resolve_column(df, press_col, "pressure column")

        # Everything needed from the archive is read before it is closed.
        meta = {
            "label": str(_scalar_value(data.get("label", ""))),
            "sensor_id": str(_scalar_value(data.get("sensor_id", ""))),
            "audio_id": str(_scalar_value(data.get("audio_id", ""))),
            "audio_start_utc": float(_scalar_value(data.get("audio_start_utc", 0.0))),
            "audio_end_utc": float(_scalar_value(data.get("audio_end_utc", 0.0))),
        }
    return df, flow_name, press_name, time_name, meta


def select_sensor_window(
    df: pd.DataFrame,
    time_col: str = "sensor_time_epoch",
    start_utc: float | None = None,
    duration_sec: float | None = None,
    start_index: int | None = None,
    num_samples: int | None = None,
) -> pd.DataFrame:
    window = df
    if start_utc is not None and duration_sec is not None:
        end_utc = start_utc + duration_sec
        mask = (df[time_col] >= start_utc) & (df[time_col] <= end_utc)
        window = df.loc[mask]
    elif start_index is not None:
        end_idx = start_index + num_samples if num_samples is not None else None
        window = df.iloc[start_index:end_idx]
    return window.reset_index(drop=True)


def compute_volume_from_flow(
    df: pd.DataFrame,
    flow_col: str,
    time_col: str
) -> tuple[np.ndarray, np.ndarray]:
    """Integrate flow to get volume (litres) and remove slow baseline drift.

    The flow values are divided by 60 before integration, i.e. they are
    treated as a per-minute rate and the time column as seconds (presumably
    standard litres per minute, going by the ``slm`` naming -- confirm against
    the sensor documentation). Integration is a cumulative rectangle sum using
    the time step that precedes each sample, so the first sample contributes
    zero volume. A straight line fitted to the integrated volume is then
    subtracted to cancel drift, and the trace is shifted to start at zero.

    Args:
        df: Window of samples.
        flow_col: Name of the flow column.
        time_col: Name of the time column.

    Returns:
        ``(time_rel, volume_l)``: time relative to the first sample and the
        drift-corrected volume, both as float arrays of the same length. Two
        empty arrays are returned for an empty ``df`` (previously this raised
        ``IndexError``).
    """
    time_s = df[time_col].to_numpy(dtype=float)
    flow_slm = df[flow_col].to_numpy(dtype=float)

    # Nothing to integrate; avoid indexing the first sample of an empty array.
    if time_s.size == 0:
        return np.empty(0, dtype=float), np.empty(0, dtype=float)

    time_rel = time_s - time_s[0]
    flow_lps = flow_slm / 60.0
    # prepend=0.0 makes the first step zero because time_rel[0] is 0.
    dt = np.diff(time_rel, prepend=0.0)
    volume_l = np.cumsum(flow_lps * dt)
    volume_l -= volume_l[0]

    # A linear fit needs at least two samples.
    if len(volume_l) > 1:
        slope, intercept = np.polyfit(time_rel, volume_l, 1)
        drift = slope * time_rel + intercept
        volume_l = volume_l - drift
        # Re-anchor so the corrected trace starts at zero volume.
        volume_l -= volume_l[0]

    return time_rel, volume_l


def plot_flow_and_pressure(
    df: pd.DataFrame,
    time_rel: np.ndarray,
    flow_col: str,
    press_col: str
) -> None:
    """Plot flow and pressure against time on one figure with twin y-axes.

    Flow is drawn as a solid blue line on the left axis and pressure as a
    dashed orange line on the right axis. Each axis gets a thin horizontal
    zero line, and a single combined legend is attached to the flow axis.

    Args:
        df: Frame holding the flow and pressure columns.
        time_rel: X values for both traces (labelled as seconds).
        flow_col: Name of the flow column in ``df``.
        press_col: Name of the pressure column in ``df``.
    """
    flow_color = "tab:blue"
    press_color = "tab:orange"

    fig, flow_axis = plt.subplots(figsize=(10, 4))
    flow_lines = flow_axis.plot(time_rel, df[flow_col], color=flow_color, label="Flow (slm)")
    flow_axis.axhline(0.0, color=flow_color, linestyle="--", linewidth=0.5)
    flow_axis.set_xlabel("Time (s)")
    flow_axis.set_ylabel("Flow (slm)", color=flow_color)

    # Second y-axis sharing the same x-axis for the pressure trace.
    press_axis = flow_axis.twinx()
    press_lines = press_axis.plot(time_rel, df[press_col], color=press_color, linestyle="--", label="Pressure (Pa)")
    press_axis.axhline(0.0, color=press_color, linestyle=":", linewidth=0.5)
    press_axis.set_ylabel("Pressure (Pa)")

    # Merge the handles of both axes so one legend covers both traces.
    handles = [*flow_lines, *press_lines]
    flow_axis.legend(handles, [handle.get_label() for handle in handles], loc="best")
    flow_axis.set_title("Flow / Pressure")
    fig.tight_layout()
    plt.show()


def plot_pv_curve(
    volume_l: np.ndarray,
    pressure: np.ndarray,
    color: str = "#8bc34a",
    alpha: float = 0.55,
) -> None:
    """Draw PV curve with a light color so repeated loops stay readable.

    Args:
        volume_l: X values, labelled as volume in litres.
        pressure: Y values, labelled as pressure in pascals.
        color: Line color.
        alpha: Line opacity; partial transparency keeps overlapping loops visible.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(volume_l, pressure, color=color, alpha=alpha, linewidth=2.0)
    ax.set_xlabel("Volume (L)")
    ax.set_ylabel("Pressure (Pa)")
    # The en dash is written as an escape: the previous literal had been
    # corrupted into mojibake by an encoding round-trip.
    ax.set_title("Pressure\u2013Volume curve")
    ax.grid(True, linestyle=":", alpha=0.5)
    plt.show()


In [None]:
# --- Configuration ---
# Use list_npz_files(...) to locate candidate segments, then paste a path below.
npz_path = r"data/MMDataset_segments/251107/2025-11-07_22-35-56-SFM3300AW_24410080.edf__record_20251107T145040Z.wav/record_20251107T145040Z.wav__2025-11-07_22-35-56-SFM3300AW_24410080.edf_win00056.npz"
# Column hints: an exact column name or a prefix; load_npz_segment resolves
# them to real column names via _resolve_column.
flow_col = "F_"
press_col = "P_"
# Key of the time array inside the .npz; it also becomes the time column name.
time_col = "sensor_time_epoch"

# Window options (pick either UTC or index window)
# The UTC window is used only when BOTH start_utc and duration_sec are set;
# otherwise the index window (start_index / num_samples) applies.
start_utc = None
duration_sec = None
start_index = 0
num_samples = None  # None -> slice runs to the end of the segment

# --- Load, slice, and plot ---
# NOTE: flow_col / press_col / time_col are rebound here from hints to the
# resolved column names returned by load_npz_segment.
df, flow_col, press_col, time_col, meta = load_npz_segment(
    npz_path,
    flow_col=flow_col,
    press_col=press_col,
    time_col=time_col,
)
print("Resolved columns -> flow:", flow_col, "pressure:", press_col, "time:", time_col)
print("Meta:", meta)

window = select_sensor_window(
    df,
    time_col=time_col,
    start_utc=start_utc,
    duration_sec=duration_sec,
    start_index=start_index,
    num_samples=num_samples,
)
print(f"Window has {len(window):,} samples")
# NOTE(review): this check also rejects single-sample windows, although the
# message only mentions an empty window.
if len(window) < 2:
    raise RuntimeError("Selected window is empty. Adjust the configuration above.")

# Integrate flow into drift-corrected volume, then pair it with pressure.
relative_time_s, volume_l = compute_volume_from_flow(window, flow_col, time_col=time_col)
pressure = window[press_col].to_numpy(dtype=float)

plot_flow_and_pressure(window, relative_time_s, flow_col, press_col)
plot_pv_curve(volume_l, pressure)
