This notebook is for a pipeline of analyzing and visualizing a time-seires dataset. 

## Step 1: load package and preprocess functions

In [1]:
import pandas as pd
import numpy as np
import os
from wav2sleep.data.edf import load_edf_data
from wav2sleep.data.txt import parse_txt_annotations
from wav2sleep.data.utils import interpolate_index
from wav2sleep.data.xml import parse_xml_annotations
from wav2sleep.data.xml_parse_all import parse_all_annotations, annotate_waveform
from wav2sleep.data.rpoints import parse_process_rpoints_annotations
from wav2sleep.settings import *
from wav2sleep.config import *


In [2]:

import mne, pandas as pd, pathlib
### please modify path here
# annotation_path = '/scratch/besp/shared_data/shhs/polysomnography/annotations-events-nsrr/shhs1/shhs1-201935-nsrr.xml'
# edf_path = '/scratch/besp/shared_data/shhs/polysomnography/edfs/shhs1/shhs1-201935.edf'
# DATA_FOR_CHECK = 'shhs'
annotation_path = '/scratch/besp/shared_data/ccshs/polysomnography/annotations-events-nsrr/ccshs-trec-1800823-nsrr.xml'
edf_path = '/scratch/besp/shared_data/ccshs/polysomnography/edfs/ccshs-trec-1800823.edf'
# annotation_path = '/scratch/besp/shared_data/ccshs/polysomnography/annotations-events-nsrr/ccshs-trec-1800248-nsrr.xml'
# edf_path = '/scratch/besp/shared_data/ccshs/polysomnography/edfs/ccshs-trec-1800248.edf'
DATA_FOR_CHECK = 'ccshs'


###########################
edf = pathlib.Path(edf_path)
raw = mne.io.read_raw_edf(edf, preload=False, verbose="error")


hdr          = raw._raw_extras[0]              
rec_len_sec  = hdr['record_length']           
n_samps_list = hdr['n_samps']                  

rows = []
for idx, ch in enumerate(raw.info['chs']):
    sfreq = n_samps_list[idx] / rec_len_sec    
    rows.append(dict(channel   = ch['ch_name'],
                     sfreq_hz  = sfreq,
                     phys_unit = ch.get('unit', '—'),
                     lowpass   = ch.get('lowpass',  '—'),
                     highpass  = ch.get('highpass', '—')))

df = pd.DataFrame(rows)#.sort_values("sfreq_hz", ascending=False)
print(df)          

        channel          sfreq_hz  phys_unit lowpass highpass
0            C3    [128.0, 128.0]        107       —        —
1            C4    [128.0, 128.0]        107       —        —
2            A1    [128.0, 128.0]        107       —        —
3            A2    [128.0, 128.0]        107       —        —
4           LOC    [128.0, 128.0]        107       —        —
5           ROC    [128.0, 128.0]        107       —        —
6          ECG2    [256.0, 256.0]        107       —        —
7          ECG1    [256.0, 256.0]        107       —        —
8          EMG1    [256.0, 256.0]        107       —        —
9          EMG2    [256.0, 256.0]        107       —        —
10         EMG3    [256.0, 256.0]        107       —        —
11        L Leg      [64.0, 64.0]        107       —        —
12        R Leg      [64.0, 64.0]        107       —        —
13      AIRFLOW      [32.0, 32.0]        107       —        —
14  THOR EFFORT      [32.0, 32.0]        107       —        —
15  ABDO

In [3]:
'''
Preprocessing notes:
1. select a time window: here we use 10h 
2. select different frequency for different channel
3. resample using interpolate
4. channel-wise normalization
'''
def _mne_lowpass_series(s: pd.Series, fs,
                        cutoff=None) -> pd.Series:
    """
    Apply low-pass filter to a pd.Series using MNE.
    Keeps frequencies below the cutoff.
    
    Parameters:
    - s: input signal
    - fs: sampling rate
    - cutoff: cutoff frequency (Hz)
    """
    if cutoff is None:
        return s

    x = s.to_numpy(np.float64)[np.newaxis, :]  # shape (1, n)

    x_filt = mne.filter.filter_data(
        x, sfreq=fs,
        l_freq=None, h_freq=cutoff, 
        method='fir', phase='zero-double',
        n_jobs='cuda',
        verbose=False
    )[0]

    return pd.Series(x_filt, index=s.index, name=s.name)

def process_edf(edf: pd.DataFrame):
    """Process dataframe of EDF data."""
    signals = []

    def _process_edf_column(col, target_index, preprocessed_fs):
        """Process signal column of EDF"""
        if col in edf:
            
            raw = edf[col].dropna()
            
            # print(len(raw.loc[0:1]))
            raw_fs = len(raw.loc[0:1]) - 1
            
            if raw_fs > preprocessed_fs: 
                raw_hp = _mne_lowpass_series(raw, raw_fs, cutoff = preprocessed_fs/2)
            else:
                raw_hp = raw
            
            resampled = interpolate_index(raw_hp, target_index,
                              method="linear", squeeze=False)
            # normalized_wav = (resampled_wav - resampled_wav.mean()) / resampled_wav.std()
            print("col:", col, "length:", resampled.shape)
            signals.append(resampled)
            return 0
        else:
            return 1

    _process_edf_column(ECG, ECG_SIGNAL_INDEX, FREQ_ECG)
    _process_edf_column(HR, HR_SIGNAL_INDEX, FREQ_HR)

    _process_edf_column(SPO2, SPO2_SIGNAL_INDEX, FREQ_SPO2)
    _process_edf_column(OX, OX_SIGNAL_INDEX, FREQ_OX)
    _process_edf_column(ABD, ABD_SIGNAL_INDEX, FREQ_ABD)
    _process_edf_column(THX, THX_SIGNAL_INDEX, FREQ_THX)
    _process_edf_column(AF, AF_SIGNAL_INDEX, FREQ_AF)
    _process_edf_column(NP, NP_SIGNAL_INDEX, FREQ_NP)
    _process_edf_column(SN, SN_SIGNAL_INDEX, FREQ_SN)
    
    _process_edf_column(EMG_LLeg, EMG_LLeg_SIGNAL_INDEX, FREQ_EMG_LLeg)
    _process_edf_column(EMG_RLeg, EMG_RLeg_SIGNAL_INDEX, FREQ_EMG_RLeg)
    _process_edf_column(EMG_LChin, EMG_LChin_SIGNAL_INDEX, FREQ_EMG_LChin)
    _process_edf_column(EMG_RChin, EMG_RChin_SIGNAL_INDEX, FREQ_EMG_RChin)
    _process_edf_column(EMG_CChin, EMG_CChin_SIGNAL_INDEX, FREQ_EMG_CChin)
    _process_edf_column(EOG_L, EOG_L_SIGNAL_INDEX, FREQ_EOG_L)
    _process_edf_column(EOG_R, EOG_R_SIGNAL_INDEX, FREQ_EOG_R)
    
    is_na_C3 = _process_edf_column(EEG_C3, EEG_C3_SIGNAL_INDEX, FREQ_EEG_C3)
    is_na_C4 = _process_edf_column(EEG_C4, EEG_C4_SIGNAL_INDEX, FREQ_EEG_C4)
    is_na_A1 = _process_edf_column(EEG_A1, EEG_A1_SIGNAL_INDEX, FREQ_EEG_A1)
    is_na_A2 = _process_edf_column(EEG_A2, EEG_A2_SIGNAL_INDEX, FREQ_EEG_A2)
    is_na_O1 = _process_edf_column(EEG_O1, EEG_O1_SIGNAL_INDEX, FREQ_EEG_O1)
    is_na_O2 = _process_edf_column(EEG_O2, EEG_O2_SIGNAL_INDEX, FREQ_EEG_O2)
    is_na_F3 = _process_edf_column(EEG_F3, EEG_F3_SIGNAL_INDEX, FREQ_EEG_F3)
    is_na_F4 = _process_edf_column(EEG_F4, EEG_F4_SIGNAL_INDEX, FREQ_EEG_F4)
    
    # add a logic to check
    
    is_na_C3_A2 = _process_edf_column(EEG_C3_A2, EEG_C3_A2_SIGNAL_INDEX, FREQ_EEG_C3_A2)
    is_na_C4_A1 = _process_edf_column(EEG_C4_A1, EEG_C4_A1_SIGNAL_INDEX, FREQ_EEG_C4_A1)
    is_na_F3_A2 = _process_edf_column(EEG_F3_A2, EEG_F3_A2_SIGNAL_INDEX, FREQ_EEG_F3_A2)
    is_na_F4_A1 = _process_edf_column(EEG_F4_A1, EEG_F4_A1_SIGNAL_INDEX, FREQ_EEG_F4_A1)
    is_na_O1_A2 = _process_edf_column(EEG_O1_A2, EEG_O1_A2_SIGNAL_INDEX, FREQ_EEG_O1_A2)
    is_na_O2_A1 = _process_edf_column(EEG_O2_A1, EEG_O2_A1_SIGNAL_INDEX, FREQ_EEG_O2_A1)
    
    
    
    merged_df = pd.concat(signals, axis=1).astype(np.float32)
    
    if (EEG_C3_A2 not in merged_df.columns.to_list()) and (is_na_C3 == 0) and (is_na_A2 == 0):
        merged_df[EEG_C3_A2] = merged_df[EEG_C3] - merged_df[EEG_A2]
    if (EEG_C4_A1 not in merged_df.columns.to_list()) and (is_na_C4 == 0) and (is_na_A1 == 0):
        merged_df[EEG_C4_A1] = merged_df[EEG_C4] - merged_df[EEG_A1]
    if (EEG_F3_A2 not in merged_df.columns.to_list()) and (is_na_F3 == 0) and (is_na_A2 == 0):
        merged_df[EEG_F3_A2] = merged_df[EEG_F3] - merged_df[EEG_A2]
    if (EEG_F4_A1 not in merged_df.columns.to_list()) and (is_na_F4 == 0) and (is_na_A1 == 0):
        merged_df[EEG_F4_A1] = merged_df[EEG_F4] - merged_df[EEG_A1]
    if (EEG_O1_A2 not in merged_df.columns.to_list()) and (is_na_O1 == 0) and (is_na_A2 == 0):
        merged_df[EEG_O1_A2] = merged_df[EEG_O1] - merged_df[EEG_A2]
    if (EEG_O2_A1 not in merged_df.columns.to_list()) and (is_na_O2 == 0) and (is_na_A1 == 0):
        merged_df[EEG_O2_A1] = merged_df[EEG_O2] - merged_df[EEG_A1]    
    
    merged_df = (merged_df - merged_df.mean()) / merged_df.std()
    return merged_df



def process(edf_fp: str, label_fp: str, output_fp: str, overwrite: bool = False) -> bool:
    """Process night of data."""
    if os.path.exists(output_fp) and not overwrite:
        logger.debug(f'Skipping {edf_fp=}, {output_fp=}, already exists')
        return False
    else:
        os.makedirs(os.path.dirname(output_fp), exist_ok=True)
        
    # Process labels
    if label_fp.endswith('.xml'):
        try:
            labels = parse_xml_annotations(label_fp) # parse sleep stages
            all_df = parse_all_annotations(label_fp) # parse all other events (arousals, respiratory)
        except Exception as e:
            logger.error(f'Failed to parse: {label_fp}.')
            logger.error(e)
            return False
    else:
        labels = parse_txt_annotations(fp=label_fp)
        # NOTE: If we end up using a dataset with txt annotations, we will want to write another function to parse all other events
        if labels is None:
            logger.error(f'Failed to parse: {label_fp}.')
            return False
    labels = labels.reindex(TARGET_LABEL_INDEX).fillna(-1) # these are sleep stage labels
    # Check for N1, N3 or REM presence. (Recordings with just sleep-wake typically use N2 as sole sleep class)
    stage_counts = labels.value_counts()
    if stage_counts.get(1.0) is None and stage_counts.get(3.0) is None and stage_counts.get(4.0) is None: 
        logger.error(f'No N1, N3 or REM in {label_fp}.')
        output_fp = output_fp.replace('.parquet', 'sleepstage.issues.parquet') # note these are still useful since sleep stages are not strictly necessary for captions
    
    edf = load_edf_data(edf_fp, columns=EDF_COLS, raise_on_missing=False)
    waveform_df = process_edf(edf)
    output_df = pd.concat([waveform_df, labels], axis=1)
    
    
    # output_df = annotate_waveform(output_df, [all_df])
    
    output_df.to_parquet(output_fp)
    return True

In [4]:

output_path = f'./test/test_{DATA_FOR_CHECK}/test.parquet'
# process(edf_path, annotation_path, output_path)

## Step 2: Sanity Check for the pre-processed data

In [5]:
df = pd.read_parquet(output_path)

In [6]:
df.head(128)

Unnamed: 0,ECG,HR,SPO2,OX,ABD,THX,AF,NP,SN,EMG_LLeg,...,EMG_CChin,EOG_L,EOG_R,EEG_C3,EEG_C4,EEG_A1,EEG_A2,EEG_C3_A2,EEG_C4_A1,Stage
0.007812,0.440614,,,,,,,,,,...,,,,,,,,,,
0.015625,0.476442,,,,,,,,,0.013094,...,0.157685,0.022226,0.188558,0.155145,0.120566,0.164373,0.233547,-0.189578,-0.130112,
0.023438,0.483524,,,,,,,,,,...,,,,,,,,,,
0.031250,0.499479,,,,,,,,0.489620,-0.021708,...,0.125483,0.007087,0.196270,0.079324,0.107714,0.145194,0.300588,-0.459381,-0.112241,
0.039062,0.472029,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.968750,-0.484955,,,,,,,,-0.108981,0.054743,...,0.652792,0.138020,0.245651,0.267168,0.100432,0.194370,0.289232,-0.100508,-0.247061,
0.976562,-0.769167,,,,,,,,,,...,,,,,,,,,,
0.984375,-0.090049,,,,,,,,,0.023364,...,0.785541,0.080364,0.391951,0.263962,0.158410,-0.153812,0.327730,-0.183262,0.721707,
0.992188,1.936273,,,,,,,,,,...,,,,,,,,,,


In [8]:
# print(pd.isna(df['EMG_LLeg_events']))

In [9]:
print(len(df))

df_clean = df.dropna(how="all") 

print(len(df_clean))

4608000
4608000


In [10]:
df.describe()

Unnamed: 0,ECG,HR,SPO2,OX,ABD,THX,AF,NP,SN,EMG_LLeg,...,EMG_CChin,EOG_L,EOG_R,EEG_C3,EEG_C4,EEG_A1,EEG_A2,EEG_C3_A2,EEG_C4_A1,Stage
count,4608000.0,36000.0,36000.0,36000.0,288000.0,288000.0,288000.0,288000.0,1152000.0,2304000.0,...,2304000.0,2304000.0,2304000.0,2304000.0,2304000.0,2304000.0,2304000.0,2304000.0,2304000.0,1200.0
mean,1.17951e-08,8e-06,2e-06,2.24034e-07,-4.270011e-09,1.399757e-08,-1.32925e-07,3.009372e-08,8.162498e-06,2.144054e-07,...,4.960762e-08,-4.068596e-08,8.573797e-08,2.975927e-08,6.732345e-08,4.443858e-08,-1.231763e-07,4.157689e-07,4.000134e-09,1.838333
std,0.9999999,1.0,1.0,0.9999999,1.0,1.0,1.0,1.0,0.9999999,1.0,...,1.0,0.9999999,1.0,1.0,0.9999999,0.9999999,1.0,1.0,1.0,1.466512
min,-10.29667,-2.017526,-5.098369,-0.2143331,-10.71381,-8.26563,-6.586277,-10.21683,-41.95744,-8.836324,...,-6.33201,-4.87738,-7.013215,-7.183324,-7.620163,-6.986156,-7.583656,-16.61291,-24.28671,0.0
25%,-0.2802635,-0.517657,0.17614,-0.2143331,-0.2689425,-0.2474319,-0.6203564,-0.5299019,-0.2014888,-0.0473813,...,-0.1417342,-0.1027955,-0.1284004,-0.1545936,-0.1347163,-0.1372003,-0.1036812,-0.3824673,-0.4082511,0.0
50%,0.1480797,-0.517657,0.17614,-0.2143331,0.03234929,0.01112955,-0.03048699,-0.01614015,-0.05716186,0.001683881,...,0.041377,0.005204327,0.05371881,0.02018109,0.02574785,0.05248186,0.07561701,-0.07935584,-0.04985562,2.0
75%,0.3415336,0.320436,0.231155,-0.2143331,0.2580699,0.2419875,0.5803589,0.4427423,0.09129246,0.04675537,...,0.2204267,0.1150267,0.2186569,0.1899607,0.1802212,0.2201145,0.2381507,0.2568295,0.3150061,3.0
max,9.866091,5.5597,0.394943,5.906232,11.0501,8.177671,8.883821,24.46212,100.2395,9.857509,...,7.063334,4.901008,7.319209,7.830482,7.683206,7.371453,6.486403,22.56493,15.01821,4.0


In [12]:
print(df['ECG'])
print(df['ECG'].dropna())
print(df.columns)
print(df['Stage'].unique())

0.007812        0.440614
0.015625        0.476442
0.023438        0.483524
0.031250        0.499479
0.039062        0.472029
                  ...   
35999.968750   -0.312155
35999.976562   -0.322524
35999.984375   -0.327719
35999.992188   -0.384232
36000.000000   -0.356567
Name: ECG, Length: 4608000, dtype: float32
0.007812        0.440614
0.015625        0.476442
0.023438        0.483524
0.031250        0.499479
0.039062        0.472029
                  ...   
35999.968750   -0.312155
35999.976562   -0.322524
35999.984375   -0.327719
35999.992188   -0.384232
36000.000000   -0.356567
Name: ECG, Length: 4608000, dtype: float32
Index(['ECG', 'HR', 'SPO2', 'OX', 'ABD', 'THX', 'AF', 'NP', 'SN', 'EMG_LLeg',
       'EMG_RLeg', 'EMG_LChin', 'EMG_RChin', 'EMG_CChin', 'EOG_L', 'EOG_R',
       'EEG_C3', 'EEG_C4', 'EEG_A1', 'EEG_A2', 'EEG_C3_A2', 'EEG_C4_A1',
       'Stage'],
      dtype='object')
[nan  0.  1.  2.  3.  4.]


## Step 3: Check merge and separate storing channels

In [18]:
output_path = f'./test/test_{DATA_FOR_CHECK}/test.parquet'
for col_name in df.columns.to_list():
    output_path_temp = output_path.replace('.parquet', f'_{col_name}.parquet')
    df_temp = df[[col_name]].copy()
    df_temp.dropna(inplace = True)
    df_temp.to_parquet(output_path_temp)

               ECG
0.007812  0.440614
0.015625  0.476442
0.023438  0.483524
0.031250  0.499479
0.039062  0.472029 (4608000, 1)
               ECG
0.007812  0.440614
0.015625  0.476442
0.023438  0.483524
0.031250  0.499479
0.039062  0.472029 (4608000, 1)
./test/test_ccshs/test_ECG.parquet
          HR
0.007812 NaN
0.015625 NaN
0.023438 NaN
0.031250 NaN
0.039062 NaN (4608000, 1)
           HR
1.0  2.331671
2.0  1.161773
3.0  1.728171
4.0  1.235669
5.0  1.962005 (36000, 1)
./test/test_ccshs/test_HR.parquet
          SPO2
0.007812   NaN
0.015625   NaN
0.023438   NaN
0.031250   NaN
0.039062   NaN (4608000, 1)
        SPO2
1.0  0.17614
2.0  0.17614
3.0  0.17614
4.0  0.17614
5.0  0.17614 (36000, 1)
./test/test_ccshs/test_SPO2.parquet
          OX
0.007812 NaN
0.015625 NaN
0.023438 NaN
0.031250 NaN
0.039062 NaN (4608000, 1)
           OX
1.0 -0.214333
2.0 -0.214333
3.0 -0.214333
4.0 -0.214333
5.0 -0.214333 (36000, 1)
./test/test_ccshs/test_OX.parquet
          ABD
0.007812  NaN
0.015625  NaN
0

In [20]:
output_path = f'./test/test_{DATA_FOR_CHECK}/test.parquet'
df_list = []
for col_name in df.columns.to_list():
    output_path_temp = output_path.replace('.parquet', f'_{col_name}.parquet')
    df_temp = pd.read_parquet(output_path_temp)
    df_list.append(df_temp)
df_recover = pd.concat(df_list, axis=1)
print(df_recover)

                   ECG        HR     SPO2        OX       ABD       THX  \
0.007812      0.440614       NaN      NaN       NaN       NaN       NaN   
0.015625      0.476442       NaN      NaN       NaN       NaN       NaN   
0.023438      0.483524       NaN      NaN       NaN       NaN       NaN   
0.031250      0.499479       NaN      NaN       NaN       NaN       NaN   
0.039062      0.472029       NaN      NaN       NaN       NaN       NaN   
...                ...       ...      ...       ...       ...       ...   
35999.968750 -0.312155       NaN      NaN       NaN       NaN       NaN   
35999.976562 -0.322524       NaN      NaN       NaN       NaN       NaN   
35999.984375 -0.327719       NaN      NaN       NaN       NaN       NaN   
35999.992188 -0.384232       NaN      NaN       NaN       NaN       NaN   
36000.000000 -0.356567 -0.517657  0.17614 -0.214333  0.174837  0.067072   

                    AF        NP        SN  EMG_LLeg  ...  EMG_CChin  \
0.007812           NaN     