In [12]:
import os
from pathlib import Path
import numpy as np
import pandas as pd


INPUT_CSV = Path("margeAll2.csv")   



# Seting
SAMPLE_RATE = 50.0  #every 1 seconed represent in 50 row
DT = 1.0 / SAMPLE_RATE
WINDOW_SEC = 12.0
WINDOW_SAMPLES = int(WINDOW_SEC * SAMPLE_RATE)  # 600 point
OVERLAP = 0.5
STRIDE_SAMPLES = int(WINDOW_SAMPLES * (1.0 - OVERLAP))  # 50% = 300
EPS = 1e-8  # للأمان في القسمة

In [13]:
df = pd.read_csv(INPUT_CSV)
df.columns = [c.strip().lower() for c in df.columns]
df.head(6)

Unnamed: 0,sample_id,time,anomaly,anomaly_type,data_modality,volume,flow,pressure
0,0,0.054254,1,Auto Trigger,paw_flow,,,5.304917
1,0,0.185185,1,Auto Trigger,paw_flow,,1.096296,
2,0,0.294991,1,Auto Trigger,paw_flow,,,6.521218
3,0,0.314815,1,Auto Trigger,paw_flow,,-9.547325,
4,0,0.5,1,Auto Trigger,paw_flow,,25.685597,
5,0,0.518442,1,Auto Trigger,paw_flow,,,3.656475


In [15]:
num_rows, num_cols = df.shape

print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

Number of rows: 3983
Number of columns: 8


We notice that the dataset contains 3,983 rows.
Each waveform corresponds to one unique sample_id, meaning that each group of rows sharing the same sample_id represents one complete respiratory wave segment.

It is important to note that the waveforms were not originally recorded using the same sampling rate, which results in different waves having different numbers of rows.

In [16]:
column_types = df.dtypes

print("Types of columns:")
print(column_types)

Types of columns:
sample_id          int64
time             float64
anomaly            int64
anomaly_type      object
data_modality     object
volume           float64
flow             float64
pressure         float64
dtype: object


In [17]:
df.describe()

Unnamed: 0,sample_id,time,anomaly,volume,flow,pressure
count,3983.0,3983.0,3983.0,617.0,2535.0,2038.0
mean,8.595029,7.565135,0.900326,137.608664,-11.566264,13.412088
std,5.762628,5.971458,0.299602,183.516524,61.505976,9.172305
min,0.0,-0.01773,0.0,-19.403912,-309.364162,-0.215054
25%,2.0,3.03643,1.0,2.365605,-12.01802,6.381535
50%,9.0,6.396307,1.0,40.121581,0.143284,10.678246
75%,14.0,9.773259,1.0,246.200608,12.760192,17.849462
max,19.0,29.581152,1.0,796.02723,77.887556,60.002523


In [18]:
# just make sure that the coulmn have
assert "sample_id" in df.columns, "it must have sample_id"
assert "time" in df.columns, "it must have time"


signal_channels = ["flow", "volume", "pressure"] 


label_columns =["anomaly", "anomaly_type"]


df = df.sort_values(["sample_id", "time"])

In [19]:


OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
final_csv_path = OUTPUT_DIR / "margeAllAfter.csv"



global_wave_count = 0     
total_windows = 0


# We will collect all windows here, and then concatenate them at the end
all_windows = []


label_columns = ["anomaly_type"]  

def build_uniform_time_grid(t_min: float, t_max: float, dt: float) -> np.ndarray:
    if not np.isfinite(t_min) or not np.isfinite(t_max) or t_max <= t_min:
        return np.array([t_min], dtype=float)
    n_steps = int(np.floor((t_max - t_min) / dt)) + 1
    return t_min + np.arange(n_steps) * dt

def resample_channel(times_src: np.ndarray, values_src: np.ndarray, times_uniform: np.ndarray) -> np.ndarray:
    s = pd.Series(values_src, index=times_src)
    s = s[~s.index.duplicated(keep="first")].sort_index()
    su = s.reindex(s.index.union(times_uniform)).interpolate("index").reindex(times_uniform)
    return su.to_numpy(dtype=float)

# === MAIN LOOP ===
for sid, g in df.groupby("sample_id", sort=True):  #sid is value of current sample id

    g = g.dropna(subset=["time"]) #g is a smaller DataFrame that contains only the rows belonging to that specific waveform
    if g.empty:
        continue

    #
    t_min, t_max = float(g["time"].min()), float(g["time"].max()) #identify max and min value for the time
    t_uniform = build_uniform_time_grid(t_min, t_max, DT) #DT = 1 / SAMPLE_RATE
    T = len(t_uniform) #count poiny after resampling
 
    #
    X_list, M_list = [], []
    for ch in signal_channels:  # we olready idententify signal_channels in cell 2 {Flow Paw Presure}
        series = g[["time", ch]].copy() #take time and signal hannel
        if series[ch].notna().any():
            ch_times = series.loc[series[ch].notna(), "time"].to_numpy(float)
            ch_vals  = series.loc[series[ch].notna(), ch].to_numpy(float)
            xr = resample_channel(ch_times, ch_vals, t_uniform)  # RAW
            mask = np.ones_like(xr, dtype=np.float32)
        else:
            xr = np.zeros(T, dtype=float)
            mask = np.zeros(T, dtype=np.float32)  #mask is zero when there is no info and 1 if there is

        X_list.append(xr)
        M_list.append(mask)

    X = np.stack(X_list, axis=1) if len(X_list) else np.zeros((T, 0))
    M = np.stack(M_list, axis=1) if len(M_list) else np.zeros((T, 0))

    # extrect lebal
    label_values = {}
    for lc in label_columns:
        label_values[lc] = g[lc].dropna().iloc[0] if (lc in g.columns and g[lc].notna().any()) else None

    # calculate place of window
    window_starts = []
    if T < WINDOW_SAMPLES:
        window_starts = [0]
    else:
        s = 0
        while s + WINDOW_SAMPLES <= T:
            window_starts.append(s)
            s += STRIDE_SAMPLES
        if not window_starts or (window_starts[-1] + WINDOW_SAMPLES < T):
            start_tail = max(T - WINDOW_SAMPLES, 0)
            if not window_starts or start_tail != window_starts[-1]:
                window_starts.append(start_tail)

    # create window and write it
    local_window_id = 0
    for ws in window_starts:
        we = ws + WINDOW_SAMPLES

        if we <= T:
            xw = X[ws:we, :]
            mw = M[ws:we, :]
        else:
            real_len = T - ws
            if real_len <= 0:
                continue

            xw = np.zeros((WINDOW_SAMPLES, X.shape[1]), dtype=float)
            mw = np.zeros((WINDOW_SAMPLES, M.shape[1]), dtype=np.float32)

            xw[:real_len, :] = X[ws:, :]
            mw[:real_len, :] = M[ws:, :]

            # PADDING → mask=0
            if real_len < WINDOW_SAMPLES:
                xw[real_len:, :] = X[-1:, :]
                mw[real_len:, :] = 0.0

        # buld dataframe of window
        df_w = pd.DataFrame(index=np.arange(WINDOW_SAMPLES))
        df_w["time"] = np.arange(WINDOW_SAMPLES) * DT

        # signal and mask
        for idx, ch in enumerate(signal_channels):
            df_w[ch] = xw[:, idx]
            df_w[f"mask_{ch}"] = mw[:, idx].astype(int)

        # id
        df_w["sample_id"] = sid
        df_w["window_id"] = local_window_id
        df_w["wave_count"] = global_wave_count

        # 
        for lc in label_columns:
            df_w[lc] = label_values[lc]

        # colum order
        col_order = ["time", "sample_id", "window_id", "wave_count"]
        for ch in signal_channels:
            col_order += [ch, f"mask_{ch}"]
        col_order += [c for c in label_columns if c in df_w.columns]
        df_w = df_w[col_order]

       
        all_windows.append(df_w)

        total_windows += 1
        local_window_id += 1
        global_wave_count += 1

# نجمع كل النوافذ في DataFrame واحد داخل الذاكرة
df_all = pd.concat(all_windows, ignore_index=True) if all_windows else pd.DataFrame()
print("Total windows:", total_windows, "| df_all shape:", df_all.shape)

# طباعة أول 6 صفوف
print("\nFirst 6 rows:\n", df_all.head(6))




Total windows: 32 | df_all shape: (19200, 11)

First 6 rows:
    time  sample_id  window_id  wave_count  flow  mask_flow  volume  \
0  0.00          0          0           0   NaN          1     0.0   
1  0.02          0          0           0   NaN          1     0.0   
2  0.04          0          0           0   NaN          1     0.0   
3  0.06          0          0           0   NaN          1     0.0   
4  0.08          0          0           0   NaN          1     0.0   
5  0.10          0          0           0   NaN          1     0.0   

   mask_volume  pressure  mask_pressure  anomaly_type  
0            0  5.304917              1  Auto Trigger  
1            0  5.405965              1  Auto Trigger  
2            0  5.507013              1  Auto Trigger  
3            0  5.608061              1  Auto Trigger  
4            0  5.709109              1  Auto Trigger  
5            0  5.810158              1  Auto Trigger  


In this step, each respiratory waveform was processed individually to prepare the data for machine-learning training. Since the original waves differ in duration and sampling rate, we first aligned them onto a unified and evenly spaced time scale. This ensures that all signals follow a consistent temporal structure.

After standardizing the timeline, each waveform was divided into fixed-length windows. This windowing process allows longer waves to contribute multiple segments while still accommodating shorter ones through controlled padding.

During this step, we also generated a mask for every channel to clearly distinguish between real data points and padded values. This helps the model learn without being misled by artificial padding.

In [20]:
# === Cell 5: Encode anomaly_type -> anomaly_id and save final CSV ===

#
assert "anomaly_type" in df_all.columns, "anomaly_type note exist "

#
unique_labels = sorted(df_all["anomaly_type"].dropna().unique())
class_to_id = {label: idx for idx, label in enumerate(unique_labels)}


df_all["anomaly_id"] = df_all["anomaly_type"].map(class_to_id)


df_all = df_all.drop(columns=["anomaly_type"])
print(class_to_id) 
df_all.head(6)

 



{'Auto Trigger': 0, 'Delayed cycling': 1, 'Double trigger': 2, 'Early cycling': 3, 'Flow asynchrony': 4, 'Ineffective trigger': 5, 'Normal': 6, 'Reverse Trigger': 7}


Unnamed: 0,time,sample_id,window_id,wave_count,flow,mask_flow,volume,mask_volume,pressure,mask_pressure,anomaly_id
0,0.0,0,0,0,,1,0.0,0,5.304917,1,0
1,0.02,0,0,0,,1,0.0,0,5.405965,1,0
2,0.04,0,0,0,,1,0.0,0,5.507013,1,0
3,0.06,0,0,0,,1,0.0,0,5.608061,1,0
4,0.08,0,0,0,,1,0.0,0,5.709109,1,0
5,0.1,0,0,0,,1,0.0,0,5.810158,1,0


In this step, the original textual anomaly labels were converted into numeric class identifiers using label encoding. The anomaly_type column was replaced with a new integer-based label (anomaly_id)

In [21]:
# save the file 
final_csv_path ="margeAllAfter.csv"
df_all.to_csv(final_csv_path, index=False, encoding="utf-8-sig")

print(" Saved final CSV with numeric anomaly_id")

 Saved final CSV with numeric anomaly_id
