In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math

In [2]:
IN_DIR = Path(r"working path")
path_results=Path(r"result path")

In [3]:
def pick_col(cols, key_prefix):
    """select column"""
    for c in cols:
        if str(c).startswith(key_prefix):
            return c
    return None

def fisher_z(r):
    return np.arctanh(np.clip(r, -0.999999, 0.999999))

def inv_fisher_z(z):
    return np.tanh(z)

In [4]:
pairs = [("ECG_II", "PPG"), ("ECG_II", "ABP"), ("PPG", "ABP")]

In [5]:
# calculate by files
rows = []
csv_files = sorted(IN_DIR.glob("*.csv"))
for f in tqdm(csv_files, desc="Computing per-record correlations"):
    try:
        # select col
        head = pd.read_csv(f, nrows=0)
        col_ecg = pick_col(head.columns, "ECG_II")
        col_ppg = pick_col(head.columns, "PPG")
        col_abp = pick_col(head.columns, "ABP")
        needed = [c for c in [col_ecg, col_ppg, col_abp] if c is not None]
        if len(needed) < 3:
            continue

        df = pd.read_csv(f, usecols=needed)  # t_sec not read
        # omit missing
        corr = df.corr(method="pearson")  # pairwise complete obs

        # total lines
        n_valid_all3 = int(df.dropna().shape[0])

        # pairwise
        n_pair = {}
        for a,b in pairs:
            ca = pick_col(df.columns, a)
            cb = pick_col(df.columns, b)
            n_pair[(a,b)] = int(df[[ca,cb]].dropna().shape[0])

        rec_id = f.stem
        rows.append({
            "record": rec_id,
            "r_ECG_PPG": float(corr.loc[col_ecg, col_ppg]),
            "n_ECG_PPG": n_pair[("ECG_II","PPG")],
            "r_ECG_ABP": float(corr.loc[col_ecg, col_abp]),
            "n_ECG_ABP": n_pair[("ECG_II","ABP")],
            "r_PPG_ABP": float(corr.loc[col_ppg, col_abp]),
            "n_PPG_ABP": n_pair[("PPG","ABP")],
            "n_all3_complete": n_valid_all3,
        })
    except Exception:
        # skip error
        continue

per_record = pd.DataFrame(rows)
per_record_path = path_results / "per_record_corr.csv"
per_record.to_csv(per_record_path, index=False)
per_record.head()

Computing per-record correlations:   0%|          | 0/983 [00:00<?, ?it/s]

Unnamed: 0,record,r_ECG_PPG,n_ECG_PPG,r_ECG_ABP,n_ECG_ABP,r_PPG_ABP,n_PPG_ABP,n_all3_complete
0,3019644_0001,0.00421,1290750,0.047029,1290750,-0.141969,1290750,1290750
1,3019644_0002,0.011285,12013,-0.019972,12013,-0.168079,12125,12013
2,3019644_0003,-0.117771,2275638,0.118116,2275638,-0.115378,2275750,2275638
3,3019644_0004,-0.275423,125,0.226332,125,-0.414644,125,125
4,3019644_0005,-0.303626,125,0.315841,125,-0.369783,125,125


In [6]:
# overall corr

def pooled_fisher_z(r_vals, n_vals):
    """Fisher z + (n-3) weighted avg"""
    mask = np.isfinite(r_vals) & np.isfinite(n_vals) & (n_vals >= 4)
    if not mask.any():
        return np.nan
    z = fisher_z(r_vals[mask])
    w = n_vals[mask] - 3.0
    z_bar = np.sum(w * z) / np.sum(w)
    return float(inv_fisher_z(z_bar))

In [7]:
summary_rows = []
for key_r, key_n, label in [
    ("r_ECG_PPG", "n_ECG_PPG", "ECG_II vs PPG"),
    ("r_ECG_ABP", "n_ECG_ABP", "ECG_II vs ABP"),
    ("r_PPG_ABP", "n_PPG_ABP", "PPG vs ABP"),
]:
    r_vals = per_record[key_r].to_numpy()
    n_vals = per_record[key_n].to_numpy()
    pooled = pooled_fisher_z(r_vals, n_vals)
    median = float(np.nanmedian(r_vals)) if len(r_vals) else np.nan
    q25 = float(np.nanpercentile(r_vals, 25)) if len(r_vals) else np.nan
    q75 = float(np.nanpercentile(r_vals, 75)) if len(r_vals) else np.nan
    summary_rows.append({
        "pair": label,
        "pooled_fisher": pooled,      # overall
        "median_record_r": median,    # by record
        "IQR_low": q25,
        "IQR_high": q75,
        "num_records": int(np.sum(np.isfinite(r_vals))),
        "total_effective_n": int(np.nansum(n_vals)),
    })

overall = pd.DataFrame(summary_rows)
overall_path = path_results / "overall_corr_summary.csv"
overall.to_csv(overall_path, index=False)
overall

Unnamed: 0,pair,pooled_fisher,median_record_r,IQR_low,IQR_high,num_records,total_effective_n
0,ECG_II vs PPG,0.020647,0.018769,-0.064258,0.094544,922,1002152385
1,ECG_II vs ABP,0.017865,0.019206,-0.035539,0.093246,926,998912690
2,PPG vs ABP,-0.247482,-0.33852,-0.515874,-0.186022,959,998906600


In [8]:
def fisher_z(r):
    return np.arctanh(np.clip(r, -0.999999, 0.999999))

def inv_fisher_z(z):
    return np.tanh(z)

def pooled_numbers(df, r_col, n_col):
    r = df[r_col].to_numpy()
    n = df[n_col].to_numpy()
    mask = np.isfinite(r) & np.isfinite(n) & (n >= 2)

    if not mask.any():
        return dict(
            pooled_fisher=np.nan,
            mean_weighted_by_n=np.nan,
            mean_unweighted=np.nan,
            num_records=0,
            total_effective_n=0,
        )

    r_ = r[mask]; n_ = n[mask]

    # 1) Fisher z weighted
    z = fisher_z(r_)
    w = np.maximum(n_ - 3.0, 0.0)
    z_bar = np.sum(w * z) / np.sum(w) if np.sum(w) > 0 else np.nan
    pooled_fisher = float(inv_fisher_z(z_bar)) if np.isfinite(z_bar) else np.nan

    # 2) weighted by size
    mean_weighted_by_n = float(np.sum(n_ * r_) / np.sum(n_))

    # 3) simple avg
    mean_unweighted = float(np.nanmean(r_))

    return dict(
        pooled_fisher=pooled_fisher,
        mean_weighted_by_n=mean_weighted_by_n,
        mean_unweighted=mean_unweighted,
        num_records=int(mask.sum()),
        total_effective_n=int(np.nansum(n_)),
    )

rows = []
specs = [
    ("r_ECG_PPG", "n_ECG_PPG", "ECG_II vs PPG"),
    ("r_ECG_ABP", "n_ECG_ABP", "ECG_II vs ABP"),
    ("r_PPG_ABP", "n_PPG_ABP", "PPG vs ABP"),
]

for r_col, n_col, label in specs:
    res = pooled_numbers(per_record, r_col, n_col)
    res["pair"] = label
    rows.append(res)

pooled_simple = pd.DataFrame(rows)[
    ["pair", "pooled_fisher", "mean_weighted_by_n", "mean_unweighted",
     "num_records", "total_effective_n"]
]

pooled_simple_path = path_results / "pooled_corr_simple.csv"
pooled_simple.to_csv(pooled_simple_path, index=False)
pooled_simple

Unnamed: 0,pair,pooled_fisher,mean_weighted_by_n,mean_unweighted,num_records,total_effective_n
0,ECG_II vs PPG,0.020647,0.02064,0.010987,922,1001767938
1,ECG_II vs ABP,0.017865,0.017591,0.031535,926,998724429
2,PPG vs ABP,-0.247482,-0.242636,-0.339705,959,998662010
