# Create Normal Class in Dataset

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import pyedflib
import random

# ----------------- Config -----------------
# Input CSV of respiratory events; loaded with pd.read_csv in the script section.
combined_csv = "/Users/gusgoodman/Documents/V89/snore-apnea-analyze/EDF_RML/respiratory_events_all.csv"
# Root directory that is searched recursively for each patient's *.edf files.
edf_root = "/Users/gusgoodman/Documents/V89/data_2"
# Output CSV that receives only the generated "Normal" rows.
out_normals = "/Users/gusgoodman/Documents/V89/snore-apnea-analyze/EDF_RML/normal_events_all.csv"
# Output CSV that receives the respiratory rows followed by the generated normals.
out_merged = "/Users/gusgoodman/Documents/V89/snore-apnea-analyze/EDF_RML/respiratory_plus_normal.csv"

# Padding subtracted from every event start and added to every event end
# when the forbidden (non-normal) intervals are built.
exclusion_buffer_sec = 0.5
# Number of normal windows requested across all patients combined.
target_normal_total = 1157
# "fixed" -> every window lasts fixed_duration_sec; any other value draws the
# length at random from the patient's own respiratory-event durations.
duration_strategy = "match_distribution"
# Window length used only when duration_strategy == "fixed".
fixed_duration_sec = 10.0
# Upper clamp applied to every sampled window length.
max_normal_duration_sec = 20.0
# Lower clamp for window lengths; free gaps shorter than this are discarded.
min_normal_duration_sec = 1.0

# ------------------------------------------


def list_patient_edfs(edf_dir: Path, patient_id: str):
    """Return the EDF paths of one patient, ordered by segment number.

    Files are searched recursively under ``edf_dir`` using both the
    zero-padded (8 digits, e.g. ``00000995-...``) and the plain
    (e.g. ``995-...``) form of the ID.

    Args:
        edf_dir: Directory that is searched recursively for ``*.edf`` files.
        patient_id: Patient identifier; it is converted with ``str()``.

    Returns:
        A list of ``Path`` objects. Files are grouped by directory and by the
        part of the name before the ``[NNN]`` segment tag, then ordered by the
        numeric value of that tag, so ``[2]`` comes before ``[10]``. The list
        is empty when nothing matches.
    """
    # Function-scope import keeps this block self-contained.
    import re

    pid_str = str(patient_id)
    pid_padded = pid_str.zfill(8)  # e.g., 995 -> 00000995

    # The glob is only a coarse pre-filter: "*995*" also matches IDs such as
    # 00001995 or any other number in the name that merely contains "995".
    candidates = set()
    for pattern in (f"*{pid_padded}*.edf", f"*{pid_str}*.edf"):
        candidates.update(edf_dir.rglob(pattern))

    pid_is_numeric = re.fullmatch(r"[0-9]+", pid_str) is not None
    wanted = pid_str.lstrip("0") or "0"

    def _belongs_to_patient(path):
        """Accept a path only if the ID appears as a complete number in its name."""
        if not pid_is_numeric:
            # Non-numeric IDs cannot be compared as numbers; keep the glob result.
            return True
        runs = re.findall(r"[0-9]+", path.name)
        if not runs:
            return True
        if re.match(r"[0-9]", path.name):
            # Names that start with digits carry the patient ID as that first number.
            return (runs[0].lstrip("0") or "0") == wanted
        return any((run.lstrip("0") or "0") == wanted for run in runs)

    def _segment_order(path):
        """Sort key: directory, recording prefix, numeric segment tag, file name."""
        tag = re.search(r"\[([0-9]+)\]", path.name)
        if tag is None:
            return (str(path.parent), path.name, -1, path.name)
        return (str(path.parent), path.name[: tag.start()], int(tag.group(1)), path.name)

    edfs = sorted((p for p in candidates if _belongs_to_patient(p)), key=_segment_order)
    if not edfs:
        print(f"No EDF files found for patient {patient_id}")
    else:
        print(f"Found {len(edfs)} EDF files for patient {patient_id}: {edfs}")
    return edfs


def edf_segment_windows(edf_paths):
    """Return cumulative time windows and durations for a sequence of EDF files.

    The files are treated as back-to-back segments of one recording: the first
    readable file covers ``(0, dur0)``, the next ``(dur0, dur0 + dur1)``, etc.

    Args:
        edf_paths: Iterable of EDF file paths, in playback order.

    Returns:
        ``(wins, durs)`` where ``wins`` is a list of ``(t0, t1)`` tuples in
        seconds on the cumulative timeline and ``durs`` the matching list of
        file durations in seconds.

    NOTE(review): a file that cannot be read is reported and skipped, so the
    windows of all later files shift earlier and positions in ``wins`` no
    longer line up with positions in ``edf_paths``.
    """
    wins, durs = [], []
    t = 0.0
    for p in edf_paths:
        try:
            f = pyedflib.EdfReader(str(p))
            try:
                dur = float(f.file_duration)
            finally:
                # Release the handle even if reading the duration fails.
                f.close()
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole patient.
            print(f"Error reading EDF {p}: {e}")
            continue
        wins.append((t, t + dur))
        durs.append(dur)
        t += dur
    return wins, durs


def build_forbidden_intervals(events_df):
    """Return the merged, buffer-padded time spans occupied by events.

    Each row's ``start_sec``/``end_sec`` is widened by ``exclusion_buffer_sec``
    on both sides; rows whose padded span is empty are dropped. The padded
    spans are sorted by start and any that touch or overlap are fused.

    Returns:
        A list of ``[start, end]`` lists, sorted by start and non-overlapping.
    """
    padded = []
    for _, row in events_df.iterrows():
        lo = float(row["start_sec"]) - exclusion_buffer_sec
        hi = float(row["end_sec"]) + exclusion_buffer_sec
        if hi > lo:
            padded.append([lo, hi])
    padded.sort(key=lambda span: span[0])

    fused = []
    for lo, hi in padded:
        if fused and lo <= fused[-1][1]:
            # Touches or overlaps the previous span: extend it.
            fused[-1][1] = max(fused[-1][1], hi)
        else:
            fused.append([lo, hi])
    return fused


def subtract_intervals(total_iv, forb_list):
    """Return the parts of ``total_iv`` that no forbidden span covers.

    Args:
        total_iv: ``[a, b]`` overall interval.
        forb_list: Forbidden ``[start, end]`` spans, sorted and already merged.

    Returns:
        A list of ``[start, end]`` free gaps, in order; gaps shorter than
        ``min_normal_duration_sec`` are left out.
    """
    lower, upper = total_iv
    gaps = []
    cursor = lower  # everything before the cursor has been handled
    for blocked_start, blocked_end in forb_list:
        if blocked_end <= cursor:
            continue
        if blocked_start > upper:
            break
        if cursor < blocked_start:
            gaps.append([cursor, min(blocked_start, upper)])
        cursor = max(cursor, blocked_end)
        if cursor >= upper:
            break
    if cursor < upper:
        gaps.append([cursor, upper])

    usable = []
    for gap in gaps:
        if gap[1] - gap[0] >= min_normal_duration_sec:
            usable.append(gap)
    return usable


def sample_duration(resp_durations: np.ndarray):
    """Pick the length, in seconds, of one normal window.

    With ``duration_strategy == "fixed"`` the length is ``fixed_duration_sec``;
    otherwise one entry of ``resp_durations`` is drawn at random. The result is
    clamped to ``[min_normal_duration_sec, max_normal_duration_sec]``.
    """
    raw = (
        fixed_duration_sec
        if duration_strategy == "fixed"
        else float(np.random.choice(resp_durations))
    )
    capped = min(raw, max_normal_duration_sec)
    return max(min_normal_duration_sec, capped)


def draw_normals_for_patient(pid, df_patient, edf_dir: Path, target_normals):
    """Randomly place event-free "Normal" windows in one patient's recording.

    The patient's EDF segments are laid end to end, the buffered event spans
    are removed, and windows are drawn inside the remaining gaps (a gap is
    chosen with probability proportional to its length). A draw is rejected
    when the window does not fit in the chosen gap or would cross an EDF
    segment boundary. At most ``target_normals * 50`` draws are attempted.
    Accepted windows are not checked against each other, so they may overlap.

    Args:
        pid: Patient identifier.
        df_patient: This patient's event rows (``start_sec``, ``end_sec``,
            ``duration_sec``; ``recording_start_iso`` is copied if present).
        edf_dir: Directory searched for the patient's EDF files.
        target_normals: Number of windows wanted.

    Returns:
        A list of row dicts (possibly fewer than requested, possibly empty).
    """
    edf_files = list_patient_edfs(edf_dir, pid)
    if not edf_files:
        return []

    segment_spans, segment_lengths = edf_segment_windows(edf_files)
    if not segment_spans:
        print(f"No valid EDF durations for {pid}, skipping.")
        return []

    recording_span = [segment_spans[0][0], segment_spans[-1][1]]
    blocked = build_forbidden_intervals(df_patient)
    gaps = subtract_intervals(recording_span, blocked)

    if not gaps:
        print(f"No free intervals for {pid}, total duration: {sum(segment_lengths)} sec, forbidden: {sum(hi - lo for lo, hi in blocked)} sec")
        return []

    gap_sizes = np.array([hi - lo for lo, hi in gaps], dtype=float)
    total_free = gap_sizes.sum()
    if total_free <= 0:
        print(f"No valid free intervals for {pid}, skipping.")
        return []
    weights = gap_sizes / total_free

    event_lengths = df_patient["duration_sec"].astype(float).values
    rows = []
    attempts = 0
    attempt_cap = target_normals * 50

    while len(rows) < target_normals and attempts < attempt_cap:
        attempts += 1
        # Random draws happen in a fixed order: gap, length, then position.
        gap_lo, gap_hi = gaps[np.random.choice(len(gaps), p=weights)]
        length = sample_duration(event_lengths)
        if gap_hi - gap_lo < length:
            continue
        begin = float(np.random.uniform(gap_lo, gap_hi - length))
        finish = begin + length

        # Locate the EDF segment that contains the window start.
        hit = next(
            ((k, span) for k, span in enumerate(segment_spans) if span[0] <= begin < span[1]),
            None,
        )
        if hit is None:
            continue
        seg_no, (seg_lo, seg_hi) = hit
        offset = begin - seg_lo
        if offset + length > (seg_hi - seg_lo):
            # Window would run past the end of its segment.
            continue

        if "recording_start_iso" in df_patient.columns:
            recording_start = df_patient.iloc[0]["recording_start_iso"]
        else:
            recording_start = ""

        rows.append({
            "patient_id": str(pid),  # Ensure string for consistency
            "event_id": -1,
            "family": "Normal",
            "type": "Normal",
            "start_sec": round(begin, 3),
            "duration_sec": round(length, 3),
            "end_sec": round(finish, 3),
            "segment_index": seg_no,
            "segment_local_start_sec": round(offset, 3),
            "recording_start_iso": recording_start,
        })

    if len(rows) < target_normals:
        print(f"Generated only {len(rows)}/{target_normals} normals for {pid} due to limited free intervals.")
    return rows


# --------- Load respiratory events and plan balancing ---------
df_resp = pd.read_csv(combined_csv)
# Convert patient_id to string and remove leading zeros for consistency
df_resp["patient_id"] = df_resp["patient_id"].astype(str).str.lstrip("0")
patients = sorted(df_resp["patient_id"].unique())
resp_total = len(df_resp)
print(f"Total respiratory events: {resp_total}")
print(f"Patients: {patients}")

# Give each patient a quota of normals proportional to its number of
# respiratory events. Largest-remainder allocation is used so the quotas sum
# to exactly target_normal_total; rounding each share on its own can miss it.
counts = df_resp.groupby("patient_id").size()
_exact_shares = counts / counts.sum() * target_normal_total
_quotas = np.floor(_exact_shares).astype(int)
_leftover = int(target_normal_total - _quotas.sum())
if _leftover > 0:
    _bump = (_exact_shares - _quotas).sort_values(ascending=False, kind="stable").index[:_leftover]
    _quotas.loc[_bump] += 1
shares = _quotas.to_dict()

# --------- Generate normals per patient ---------
# NOTE(review): no random seed is set in the code shown here, so the sampled
# windows differ from run to run.
normals_all = []
for pid in patients:
    df_p = df_resp[df_resp["patient_id"] == pid].copy()
    need = shares.get(pid, 0)
    if need == 0:
        # Every pid comes from df_resp, so a zero quota is a rounding outcome.
        print(f"Allocated 0 normal events for {pid} after proportional rounding, skipping.")
        continue
    print(f"Generating {need} normal events for {pid}...")
    normals = draw_normals_for_patient(pid, df_p, Path(edf_root), need)
    normals_all.extend(normals)

# Safety net: trim any excess reproducibly, and warn on a shortfall
# (a patient may have too little free time to fill its quota).
if len(normals_all) > target_normal_total:
    normals_all = pd.DataFrame(normals_all).sample(n=target_normal_total, random_state=42).to_dict("records")
elif len(normals_all) < target_normal_total:
    print(f"Warning: Generated only {len(normals_all)}/{target_normal_total} normal events.")

df_norm = pd.DataFrame(normals_all)

# --------- Save outputs ---------
if not df_norm.empty:
    df_norm.to_csv(out_normals, index=False)
    df_merged = pd.concat([df_resp, df_norm], ignore_index=True)
    df_merged.to_csv(out_merged, index=False)

    print(f"\nResults:")
    print(f"Respiratory events: {len(df_resp)}")
    print(f"Normal events created: {len(df_norm)} (target={target_normal_total})")
    print(f"Saved normals to: {out_normals}")
    print(f"Saved merged dataset to: {out_merged}")
    print("\nMerged class counts:")
    print(df_merged["type"].value_counts())
else:
    print("No normal windows could be created. Check EDF availability, free time, or buffers.")
    print("Debug suggestions:")
    print("- Verify EDF files exist in", edf_root)
    print("- Check if free intervals are too short (reduce exclusion_buffer_sec or min_normal_duration_sec)")
    print("- Inspect respiratory event density in", combined_csv)

Total respiratory events: 1157
Patients: ['1000', '1006', '1008', '1089', '995', '999']
Generating 166 normal events for 1000...
Found 2 EDF files for patient 1000: [PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001000-100507[001].edf'), PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001000-100507[002].edf')]
Generating 167 normal events for 1006...
Found 4 EDF files for patient 1006: [PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001006-100507[001].edf'), PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001006-100507[002].edf'), PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001006-100507[003].edf'), PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001006-100507[004].edf')]
Generating 153 normal events for 1008...
Found 5 EDF files for patient 1008: [PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001008-100507[001].edf'), PosixPath('/Users/gusgoodman/Documents/V89/data_2/00001008-100507[002].edf'), PosixPath('/Users/gusgoodman/Documents/V89/data_2/0000

# Exploratory Data Analysis (EDA)

In [3]:
import pandas as pd

In [5]:
# Load the merged respiratory + normal CSV and display it.
df = pd.read_csv(r"C:\V89\Snore_Apnea_Analyze\EDF_RML\data_csv\respiratory_plus_normal.csv")
df

Unnamed: 0,patient_id,event_id,family,type,start_sec,duration_sec,end_sec,segment_index,segment_local_start_sec,recording_start_iso
0,999,20,Respiratory,ObstructiveApnea,155.000,17.0,172.000,0,155.000,2019-04-19T08:43:41
1,999,33,Respiratory,ObstructiveApnea,209.000,19.0,228.000,0,209.000,2019-04-19T08:43:41
2,999,39,Respiratory,ObstructiveApnea,264.500,14.0,278.500,0,264.500,2019-04-19T08:43:41
3,999,44,Respiratory,ObstructiveApnea,295.500,12.5,308.000,0,295.500,2019-04-19T08:43:41
4,999,49,Respiratory,ObstructiveApnea,347.000,11.5,358.500,0,347.000,2019-04-19T08:43:41
...,...,...,...,...,...,...,...,...,...,...
2064,999,-1,Normal,Normal,9385.342,16.5,9401.842,2,2185.342,2019-04-19T08:43:41
2065,999,-1,Normal,Normal,3995.079,15.0,4010.079,1,395.079,2019-04-19T08:43:41
2066,999,-1,Normal,Normal,14836.353,12.5,14848.853,4,436.353,2019-04-19T08:43:41
2067,999,-1,Normal,Normal,1199.751,12.5,1212.251,0,1199.751,2019-04-19T08:43:41


In [7]:
# Number of rows per value of the "type" column in the merged dataset.
df['type'].value_counts()

type
Normal              912
ObstructiveApnea    649
Hypopnea            398
MixedApnea           72
CentralApnea         38
Name: count, dtype: int64

In [9]:
# Load the respiratory-events-only CSV and count rows per "type" for comparison.
df_1 =  pd.read_csv(r"C:\V89\Snore_Apnea_Analyze\EDF_RML\data_csv\respiratory_events_all.csv")
df_1['type'].value_counts()

type
ObstructiveApnea    649
Hypopnea            398
MixedApnea           72
CentralApnea         38
Name: count, dtype: int64

In [10]:
# Display the respiratory-events table.
df_1

Unnamed: 0,patient_id,event_id,family,type,start_sec,duration_sec,end_sec,segment_index,segment_local_start_sec,recording_start_iso
0,999,20,Respiratory,ObstructiveApnea,155.0,17.0,172.0,0,155.0,2019-04-19T08:43:41
1,999,33,Respiratory,ObstructiveApnea,209.0,19.0,228.0,0,209.0,2019-04-19T08:43:41
2,999,39,Respiratory,ObstructiveApnea,264.5,14.0,278.5,0,264.5,2019-04-19T08:43:41
3,999,44,Respiratory,ObstructiveApnea,295.5,12.5,308.0,0,295.5,2019-04-19T08:43:41
4,999,49,Respiratory,ObstructiveApnea,347.0,11.5,358.5,0,347.0,2019-04-19T08:43:41
...,...,...,...,...,...,...,...,...,...,...
1152,995,1074,Respiratory,ObstructiveApnea,17610.0,10.0,17620.0,4,3210.0,2019-04-17T22:35:00
1153,995,1078,Respiratory,ObstructiveApnea,17642.5,16.0,17658.5,4,3242.5,2019-04-17T22:35:00
1154,995,1082,Respiratory,ObstructiveApnea,17682.0,18.5,17700.5,4,3282.0,2019-04-17T22:35:00
1155,995,1095,Respiratory,Hypopnea,17820.5,11.5,17832.0,4,3420.5,2019-04-17T22:35:00
