This notebook is for a pipeline of analyzing and visualizing a time-seires dataset. 

## Step 1: load package and preprocess functions

In [1]:
import pandas as pd
import numpy as np
import os
from wav2sleep.data.edf import load_edf_data
from wav2sleep.data.txt import parse_txt_annotations
from wav2sleep.data.utils import interpolate_index
from wav2sleep.data.xml import parse_xml_annotations
from wav2sleep.settings import *
from wav2sleep.config import *


In [2]:

import mne, pandas as pd, pathlib
### please modify path here
# annotation_path = '/scratch/besp/shared_data/shhs/polysomnography/annotations-events-nsrr/shhs1/shhs1-205804-nsrr.xml'
# edf_path = '/scratch/besp/shared_data/shhs/polysomnography/edfs/shhs1/shhs1-205373.edf'
# DATA_FOR_CHECK = 'shhs'
annotation_path = '/scratch/besp/shared_data/ccshs/polysomnography/annotations-events-nsrr/ccshs-trec-1800806-nsrr.xml'
edf_path = '/scratch/besp/shared_data/ccshs/polysomnography/edfs/ccshs-trec-1800823.edf'
DATA_FOR_CHECK = 'ccshs'


###########################
edf = pathlib.Path(edf_path)
raw = mne.io.read_raw_edf(edf, preload=False, verbose="error")


hdr          = raw._raw_extras[0]              
rec_len_sec  = hdr['record_length']           
n_samps_list = hdr['n_samps']                  

rows = []
for idx, ch in enumerate(raw.info['chs']):
    sfreq = n_samps_list[idx] / rec_len_sec    
    rows.append(dict(channel   = ch['ch_name'],
                     sfreq_hz  = sfreq,
                     phys_unit = ch.get('unit', '—'),
                     lowpass   = ch.get('lowpass',  '—'),
                     highpass  = ch.get('highpass', '—')))

df = pd.DataFrame(rows)#.sort_values("sfreq_hz", ascending=False)


print(df)          




        channel          sfreq_hz  phys_unit lowpass highpass
0            C3    [128.0, 128.0]        107       —        —
1            C4    [128.0, 128.0]        107       —        —
2            A1    [128.0, 128.0]        107       —        —
3            A2    [128.0, 128.0]        107       —        —
4           LOC    [128.0, 128.0]        107       —        —
5           ROC    [128.0, 128.0]        107       —        —
6          ECG2    [256.0, 256.0]        107       —        —
7          ECG1    [256.0, 256.0]        107       —        —
8          EMG1    [256.0, 256.0]        107       —        —
9          EMG2    [256.0, 256.0]        107       —        —
10         EMG3    [256.0, 256.0]        107       —        —
11        L Leg      [64.0, 64.0]        107       —        —
12        R Leg      [64.0, 64.0]        107       —        —
13      AIRFLOW      [32.0, 32.0]        107       —        —
14  THOR EFFORT      [32.0, 32.0]        107       —        —
15  ABDO

In [3]:
'''
Preprocessing notes:
1. select a time window: here we use 10h 
2. select different frequency for different channel
3. resample using interpolate
4. channel-wise normalization
'''

def process_edf(edf: pd.DataFrame):
    """Process dataframe of EDF data."""
    signals = []

    def _process_edf_column(col, target_index):
        """Process signal column of EDF"""
        if col in edf:
            resampled_wav = interpolate_index(edf[col].dropna(), target_index, method="linear", squeeze=False)
            # normalized_wav = (resampled_wav - resampled_wav.mean()) / resampled_wav.std()
            print("col:", col, "length:", resampled_wav.shape)
            signals.append(resampled_wav)
            return 0
        else:
            return 1

    _process_edf_column(ECG, ECG_SIGNAL_INDEX)
    _process_edf_column(HR, HR_SIGNAL_INDEX)

    _process_edf_column(SPO2, SPO2_SIGNAL_INDEX)
    _process_edf_column(OX, OX_SIGNAL_INDEX)
    _process_edf_column(ABD, ABD_SIGNAL_INDEX)
    _process_edf_column(THX, THX_SIGNAL_INDEX)
    _process_edf_column(AF, AF_SIGNAL_INDEX)
    _process_edf_column(NP, NP_SIGNAL_INDEX)
    _process_edf_column(SN, SN_SIGNAL_INDEX)
    
    _process_edf_column(EMG_LLeg, EMG_LLeg_SIGNAL_INDEX)
    _process_edf_column(EMG_RLeg, EMG_RLeg_SIGNAL_INDEX)
    _process_edf_column(EMG_LChin, EMG_LChin_SIGNAL_INDEX)
    _process_edf_column(EMG_RChin, EMG_RChin_SIGNAL_INDEX)
    _process_edf_column(EMG_CChin, EMG_CChin_SIGNAL_INDEX)
    _process_edf_column(EOG_L, EOG_L_SIGNAL_INDEX)
    _process_edf_column(EOG_R, EOG_R_SIGNAL_INDEX)
    
    is_na_C3 = _process_edf_column(EEG_C3, EEG_C3_SIGNAL_INDEX)
    is_na_C4 = _process_edf_column(EEG_C4, EEG_C4_SIGNAL_INDEX)
    is_na_A1 = _process_edf_column(EEG_A1, EEG_A1_SIGNAL_INDEX)
    is_na_A2 = _process_edf_column(EEG_A2, EEG_A2_SIGNAL_INDEX)
    is_na_O1 = _process_edf_column(EEG_O1, EEG_O1_SIGNAL_INDEX)
    is_na_O2 = _process_edf_column(EEG_O2, EEG_O2_SIGNAL_INDEX)
    is_na_F3 = _process_edf_column(EEG_F3, EEG_F3_SIGNAL_INDEX)
    is_na_F4 = _process_edf_column(EEG_F4, EEG_F4_SIGNAL_INDEX)
    
    # add a logic to check
    
    is_na_C3_A2 = _process_edf_column(EEG_C3_A2, EEG_C3_A2_SIGNAL_INDEX)
    is_na_C4_A1 = _process_edf_column(EEG_C4_A1, EEG_C4_A1_SIGNAL_INDEX)
    is_na_F3_A2 = _process_edf_column(EEG_F3_A2, EEG_F3_A2_SIGNAL_INDEX)
    is_na_F4_A1 = _process_edf_column(EEG_F4_A1, EEG_F4_A1_SIGNAL_INDEX)
    is_na_O1_A2 = _process_edf_column(EEG_O1_A2, EEG_O1_A2_SIGNAL_INDEX)
    is_na_O2_A1 = _process_edf_column(EEG_O2_A1, EEG_O2_A1_SIGNAL_INDEX)
    
    
    
    merged_df = pd.concat(signals, axis=1).astype(np.float32)
    
    if (EEG_C3_A2 not in merged_df.columns.to_list()) and (is_na_C3 == 0) and (is_na_A2 == 0):
        merged_df[EEG_C3_A2] = merged_df[EEG_C3] - merged_df[EEG_A2]
    if (EEG_C4_A1 not in merged_df.columns.to_list()) and (is_na_C4 == 0) and (is_na_A1 == 0):
        merged_df[EEG_C4_A1] = merged_df[EEG_C4] - merged_df[EEG_A1]
    if (EEG_F3_A2 not in merged_df.columns.to_list()) and (is_na_F3 == 0) and (is_na_A2 == 0):
        merged_df[EEG_F3_A2] = merged_df[EEG_F3] - merged_df[EEG_A2]
    if (EEG_F4_A1 not in merged_df.columns.to_list()) and (is_na_F4 == 0) and (is_na_A1 == 0):
        merged_df[EEG_F4_A1] = merged_df[EEG_F4] - merged_df[EEG_A1]
    if (EEG_O1_A2 not in merged_df.columns.to_list()) and (is_na_O1 == 0) and (is_na_A2 == 0):
        merged_df[EEG_O1_A2] = merged_df[EEG_O1] - merged_df[EEG_A2]
    if (EEG_O2_A1 not in merged_df.columns.to_list()) and (is_na_O2 == 0) and (is_na_A1 == 0):
        merged_df[EEG_O2_A1] = merged_df[EEG_O2] - merged_df[EEG_A1]    
    
    merged_df = (merged_df - merged_df.mean()) / merged_df.std()
    return merged_df


def process(edf_fp: str, label_fp: str, output_fp: str, overwrite: bool = False) -> bool:
    """Process night of data."""
    if os.path.exists(output_fp) and not overwrite:
        logger.debug(f'Skipping {edf_fp=}, {output_fp=}, already exists')
        return False
    else:
        os.makedirs(os.path.dirname(output_fp), exist_ok=True)

    # Process labels
    if label_fp.endswith('.xml'):
        try:
            labels = parse_xml_annotations(label_fp)
        except Exception as e:
            logger.error(f'Failed to parse: {label_fp}.')
            logger.error(e)
            return False
    else:
        labels = parse_txt_annotations(fp=label_fp)
        if labels is None:
            logger.error(f'Failed to parse: {label_fp}.')
            return False
    labels = labels.reindex(TARGET_LABEL_INDEX).fillna(-1)
    # Check for N1, N3 or REM presence. (Recordings with just sleep-wake typically use N2 as sole sleep class)
    stage_counts = labels.value_counts()
    
    if stage_counts.get(1.0) is None and stage_counts.get(3.0) is None and stage_counts.get(4.0) is None:
        
        logger.error(f'No N1, N3 or REM in {label_fp}.')
        output_fp = output_fp.replace('.parquet', '.issues.parquet')
    edf = load_edf_data(edf_fp, columns=EDF_COLS, raise_on_missing=False)
    waveform_df = process_edf(edf)
    output_df = pd.concat([waveform_df, labels], axis=1)
    output_df = waveform_df
    output_df.to_parquet(output_fp)
    return True

In [4]:

output_path = f'./test/test_{DATA_FOR_CHECK}/test.parquet'
process(edf_path, annotation_path, output_path)

{'C3': 0, 'C4': 1, 'A1': 2, 'A2': 3, 'LOC': 4, 'ROC': 5, 'ECG2': 6, 'ECG1': 7, 'EMG1': 8, 'EMG2': 9, 'EMG3': 10, 'L Leg': 11, 'R Leg': 12, 'AIRFLOW': 13, 'THOR EFFORT': 14, 'ABDO EFFORT': 15, 'SNORE': 16, 'SUM': 17, 'POSITION': 18, 'OX STATUS': 19, 'PULSE': 20, 'SpO2': 21, 'NASAL PRES': 22, 'PlethWV': 23, 'Light': 24, 'HRate': 25}
<class 'pandas.core.indexes.numeric.Float64Index'> <class 'pandas.core.indexes.numeric.Float64Index'>
col: ECG length: (4500000, 1)
<class 'pandas.core.indexes.numeric.Float64Index'> <class 'pandas.core.indexes.numeric.Float64Index'>
col: HR length: (36000, 1)
<class 'pandas.core.indexes.numeric.Float64Index'> <class 'pandas.core.indexes.numeric.Float64Index'>
col: SPO2 length: (36000, 1)
<class 'pandas.core.indexes.numeric.Float64Index'> <class 'pandas.core.indexes.numeric.Float64Index'>
col: OX length: (36000, 1)
<class 'pandas.core.indexes.numeric.Float64Index'> <class 'pandas.core.indexes.numeric.Float64Index'>
col: ABD length: (360000, 1)
<class 'pandas.

True

## Step 2: Sanity Check for the pre-processed data

In [5]:
df = pd.read_parquet(output_path)

In [6]:
df.head(125)

Unnamed: 0,ECG,HR,SPO2,OX,ABD,THX,AF,NP,SN,EMG_LLeg,...,EMG_RChin,EMG_CChin,EOG_L,EOG_R,EEG_C3,EEG_C4,EEG_A1,EEG_A2,EEG_C3_A2,EEG_C4_A1
0.008000,0.403133,,,,,,,,,,...,,,,,0.154027,0.128976,0.163462,0.215264,-0.156636,-0.110769
0.015625,,,,,,,,,,0.013094,...,0.124752,0.117541,,,,,,,,
0.016000,0.414034,,,,,,,,,,...,,,,,0.139240,0.115995,0.158330,0.262457,-0.278636,-0.127152
0.020000,,,,,,,,,,,...,,,0.019876,0.20092,,,,,,
0.024000,0.444919,,,,,,,,,,...,,,,,0.107257,0.112311,0.154962,0.305038,-0.422365,-0.127152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.568000,-0.018821,,,,,,,,,,...,,,,,0.143882,0.053722,0.127698,0.120417,0.016584,-0.191123
0.576000,0.128794,,,,,,,,,,...,,,,,0.143367,0.074246,0.163302,0.132715,-0.009182,-0.232080
0.578125,,,,,,,,,,-0.071914,...,0.055049,0.045678,,,,,,,,
0.580000,,,,,,,,,,,...,,,0.081009,0.07477,,,,,,


In [7]:
print(len(df))

df_clean = df.dropna(how="all") 

print(len(df_clean))

7693175
7693175


In [8]:
df.describe()

Unnamed: 0,ECG,HR,SPO2,OX,ABD,THX,AF,NP,SN,EMG_LLeg,...,EMG_RChin,EMG_CChin,EOG_L,EOG_R,EEG_C3,EEG_C4,EEG_A1,EEG_A2,EEG_C3_A2,EEG_C4_A1
count,4500000.0,36000.0,36000.0,36000.0,360000.0,360000.0,360000.0,1152000.0,360000.0,2304000.0,...,2304000.0,2304000.0,1800000.0,1800000.0,4500000.0,4500000.0,4500000.0,4500000.0,4500000.0,4500000.0
mean,8.677588e-09,1.5e-05,1.30256e-07,1.006656e-09,2.655089e-08,4.479024e-08,3.950728e-07,-1.740025e-07,-2e-06,-1.915296e-07,...,2.927002e-08,8.34941e-08,-1.011398e-08,-1.72463e-07,5.749342e-08,1.495656e-07,-6.382115e-08,-3.877513e-08,1.756939e-07,-8.812798e-09
std,1.0,1.0,0.9999999,0.9999999,1.0,0.9999999,0.9999999,0.9999999,1.0,1.0,...,0.9999999,1.0,0.9999999,1.0,1.0,1.0,1.0,0.9999999,1.0,0.9999999
min,-4.977956,-1.437151,-5.098371,-0.2143331,-10.22888,-7.564358,-6.5561,-11.40458,-180.547668,-8.836324,...,-3.859538,-3.354979,-3.060602,-5.108677,-5.180224,-5.579058,-5.104464,-5.455242,-17.05243,-25.43887
25%,-0.2276032,-0.500604,0.1761381,-0.2143331,-0.2693056,-0.246941,-0.6208002,-0.5288963,-0.250095,-0.04738171,...,-0.07200338,-0.09377032,-0.1013403,-0.124011,-0.1520447,-0.1329204,-0.1332368,-0.1006352,-0.381388,-0.4052687
50%,0.138484,-0.500604,0.1761381,-0.2143331,0.03236534,0.01108155,-0.03008326,-0.02605061,-0.036078,0.001683473,...,0.06034307,0.03455634,0.005761113,0.05390473,0.02042183,0.02583098,0.05280131,0.07553053,-0.07964984,-0.04796897
75%,0.3098698,0.363075,0.2311534,-0.2143331,0.2579848,0.2408245,0.5810915,0.4505597,0.15416,0.04675496,...,0.1900426,0.1678022,0.1151515,0.2166889,0.1891054,0.1791445,0.2186318,0.236324,0.2546838,0.3116712
max,4.943644,5.679806,0.3949414,5.906232,10.53698,7.751524,9.083836,24.82546,107.483765,9.857509,...,3.367901,3.652941,3.18924,5.329257,6.088043,5.916297,5.405406,4.618484,21.16093,15.20007


In [9]:
print(df['ECG'])
print(df['ECG'].dropna())

0.008000        0.403133
0.015625             NaN
0.016000        0.414034
0.020000             NaN
0.024000        0.444919
                  ...   
35999.980000         NaN
35999.984000   -0.284530
35999.984375         NaN
35999.992000   -0.338429
36000.000000   -0.331767
Name: ECG, Length: 7693175, dtype: float32
0.008        0.403133
0.016        0.414034
0.024        0.444919
0.032        0.431596
0.040        0.422512
               ...   
35999.968   -0.270601
35999.976   -0.287558
35999.984   -0.284530
35999.992   -0.338429
36000.000   -0.331767
Name: ECG, Length: 4500000, dtype: float32


## Step 3: Check the spectrogram of Each Channel