# Detection of stages: Light, Deep, REM, Wake Movement

Database source: 
    
    Goldberger AL, Amaral LAN, Glass L, Hausdorff JM, Ivanov PCh, Mark RG, Mietus JE, Moody GB, Peng C-K, Stanley HE. PhysioBank, PhysioToolkit, and PhysioNet: Components of a New Research Resource for Complex Physiologic Signals. Circulation 101(23):e215-e220 [Circulation Electronic Pages; http://circ.ahajournals.org/content/101/23/e215.full]; 2000 (June 13).


In [1]:
from __future__ import division
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.signal as ss
import seaborn as sns
from scipy.ndimage import convolve1d
import itertools
import sleep_utils as su

# import psycopg2
import datetime  
import time 
import wfdb
import glob

from biosppy.signals import ecg

%matplotlib inline
sns.set()
pd.set_option("display.max_columns", None)
%load_ext autoreload
%autoreload 2

## Load a file from database

In [2]:
data_dir = '../data/PSG_database/'

co2_sig = []
ppg_sig = []

filenames = []

sleep_data_sigs = {}
sleep_data_fields = {}

data_keys = []
for fname in glob.iglob(data_dir + '/*.dat'):#, recursive=True):
    # filename manipulation
    base = os.path.basename(fname)
    key = os.path.splitext(base)[0]
    data_keys.append(key)
    
    # load data
    sig, fields=wfdb.rdsamp(data_dir+key)
    sleep_data_sigs[key] = sig
    sleep_data_fields[key] = fields

In [3]:
ann_index = {}
ann_labels = {}

set_stages = set()
for key in data_keys:
    annotation = wfdb.rdann(data_dir + key, 'st')
    ann_index_sig = annotation[0]
    ann_label_sig = annotation[5]
    label_filt = [(x, y) for x, y in zip(ann_index_sig, ann_label_sig) if x >1 ]
    unzipped = zip(*label_filt)
    ann_index[key] = unzipped[0]
    ann_labels[key] = unzipped[1]


In [4]:
resp_data_set = {}
ecg_data_set = {}
time_data_set = {}
annot_data_set = {}

fs = 250
dec_prec = len(str(1/fs).split('.')[1])

for key in data_keys:
    
    resp_data_set[key] = (sleep_data_sigs[key])[:, (sleep_data_fields[key])['signame'].index('Resp')]
    ecg_data_set[key] = (sleep_data_sigs[key])[:, (sleep_data_fields[key])['signame'].index('ECG')]
    time_data_set[key] = np.arange(0, len(resp_data_set[key]))/fs

    # Note: Only the first character of the annotation was taken as label
    # Example: 1, 2, 3, 4, R, W, M
    stages_list = [x.rsplit(' ')[0][0] for x in ann_labels[key]]

    annot_data_set[key] = stages_list


#### *Annotations*

Stages:
   * W: subject is awake
   * 1: sleep stage 1
   * 2: sleep stage 2
   * 3: sleep stage 3
   * 4: sleep stage 4
   * R: REM sleep
   
Other descriptions:
   * H: Hypopnea
   * HA: Hypopnea with arousal
   * OA: Obstructive apnea
   * X: Obstructive apnea with arousal
   * CA: Central apnea
   * CAA: Central apnea with arousal
   * L: Leg movements
   * LA: Leg movements with arousal
   * A: Unspecified arousal
   * MT: Movement time

## Get Features

### I. Previous sleep stage

Note: Get features for every window of data

In [5]:
win_dur = 30 # duration of window, unit: seconds, size of one epoch 
win_size = fs*win_dur # length of window
win_step = 1 # duration by which the window slides, unit: seconds
win_int = win_step*fs # length by which the window slides

In [6]:
feature_prev_stage = {}

for key in data_keys:
    print("Computing for "+ key + "...")

    prev_stage = [np.nan] + annot_data_set[key][1:]
    feature_prev_stage[key] = {'_feat' : prev_stage}
    

Computing for slp01a...
Computing for slp01b...
Computing for slp02a...
Computing for slp02b...
Computing for slp03...
Computing for slp04...
Computing for slp14...
Computing for slp16...
Computing for slp32...
Computing for slp37...
Computing for slp41...
Computing for slp45...
Computing for slp48...
Computing for slp59...
Computing for slp60...
Computing for slp61...
Computing for slp66...
Computing for slp67x...


### II. Respiration signal

Divide respiration data per 30-second epoch. 

In [7]:
# Respiratory frequency range 
min_normrange = 4/60 # unit: cycles per second
max_normrange = 65/60 # unit: cycles per second

resp_data_set_epochs = {}

for key in data_keys:
    resp_sig = resp_data_set[key]
    fft_resp = np.fft.fft(resp_sig)
    fft_freqs = np.fft.fftfreq(len(resp_sig), 1/fs)
    
    # Remove frequencies which are outside the expected range
    fft_resp[abs(fft_freqs) < min_normrange] = 0
    fft_resp[abs(fft_freqs) > max_normrange] = 0
    
    resp_filt_sig = np.real(np.fft.ifft(fft_resp))
    time_sig = np.arange(len(resp_sig))/fs

    # Divide into windows
    resp_windows = su.divide_to_epochs(resp_filt_sig, ann_index[key], win_dur, fs)
    time_windows = su.divide_to_epochs(time_sig, ann_index[key], win_dur, fs)
    
    resp_data_set_epochs[key] = pd.DataFrame({'resp': list(resp_windows), '_time': list(time_windows)})


#### II.1 Respiration rate
*Frequency corresponding to the highest peak in the epoch's power spectrum (unit: breaths/minute)*

In [8]:
feature_resp_rate = {}
# feature_resp_rate_fd = {}
# feature_resp_rate_sd = {}

for key in data_keys:
    resp_data = resp_data_set_epochs[key]
    resp_data['power_spectrum'] = resp_data.resp.apply(lambda x: ss.periodogram(x, fs=fs))
    
    # Compute respiration rate
    resp_data['resp_rate'] = resp_data.power_spectrum.apply(lambda x: (x[0])[np.argmax(x[1])]*60)

    feature_resp_rate[key] = {'_feat' : resp_data.resp_rate.values}


#### II.2 Histogram of Respiration data
Use the counts for each bin of the magnitude histogram.

In [9]:
nbins = 10

feature_hist_resp = {}

for key in data_keys:
#     print("Computing for "+ key + "...")
    resp_data = resp_data_set_epochs[key]

    hist_windows = resp_data.resp.apply(lambda x: np.histogram(x, bins=nbins)[0])
    feature_hist_resp[key] = {'_feat' : (np.asarray(hist_windows))}

#### II.3

### III. ECG signal

Divide ECG data per 30-second epoch. 

In [12]:
ecg_data_set_epochs = {}

for key in data_keys:
    ecg_sig = ecg_data_set[key]
    time_sig = np.arange(len(ecg_sig))/fs

    # Divide into windows
    ecg_windows = su.divide_to_epochs(ecg_sig, ann_index[key], win_dur, fs)
    time_windows = su.divide_to_epochs(time_sig, ann_index[key], win_dur, fs)
    
    ecg_data_set_epochs[key] = pd.DataFrame({'ecg': list(ecg_windows), '_time': list(time_windows)})


#### III.1 Heart rate
*Reciprocal of the mean R-R interval in an epoch (unit: beats/minute)*


#### III.2 Heart rate variability
*Standard deviation of the R-R intervals in an epoch (unit: milliseconds)*


In [13]:
feature_heart_rate = {}
feature_heart_rate_var = {}

for key in data_keys:
    ecg_data = ecg_data_set_epochs[key]
    
    # Get time corresponding to R peaks
    ecg_data['r_peaks'] = ecg_data.ecg.apply(lambda x: ecg.hamilton_segmenter(x, sampling_rate=250)['rpeaks'])
    ecg_data['r_time'] = ecg_data.apply(lambda x: list(x._time[x.r_peaks]), axis = 1)
    
    # Compute heart rate
    ecg_data['heart_rate'] = ecg_data.r_time.apply(lambda x: su.heart_rate(np.array(x))*60)
    ecg_data['window_time'] = ecg_data['_time'].apply(lambda x: x[-1])
        
    # Compute heart rate variability
    ecg_data['heart_rate_var'] = ecg_data.r_time.apply(lambda x: su.heart_rate_var(np.array(x))*1000)
    
    feature_heart_rate[key] = {'_feat' : ecg_data.heart_rate.values}
    feature_heart_rate_var[key] = {'_feat' : ecg_data.heart_rate_var.values}

#### III.3 Histogram of ECG data
Use the counts for each bin of the magnitude histogram.

In [14]:
feature_hist_ecg = {}

for key in data_keys:
#     print("Computing for "+ key + "...")
    ecg_data = ecg_data_set_epochs[key]

    hist_windows = ecg_data.ecg.apply(lambda x: np.histogram(x, bins=nbins)[0])
    feature_hist_ecg[key] = {'_feat' : (np.asarray(hist_windows))}

## Compile features 
Combine features computed across all patients

In [81]:
features = [feature_prev_stage, feature_resp_rate, feature_hist_resp, 
            feature_heart_rate, feature_heart_rate_var, feature_hist_ecg]

f_merge = []
for f in features:
    fm = []
    for key in data_keys:
        fm += list(f[key]['_feat'])
    f_merge.append(np.reshape(fm, [len(fm), -1]))
    
sleep_features = np.hstack(f_merge)
sleep_features[np.where(sleep_features == '1')] = 1
sleep_features[np.where(sleep_features == '2')] = 2
sleep_features[np.where(sleep_features == '3')] = 3
sleep_features[np.where(sleep_features == '4')] = 4
sleep_features[np.where(sleep_features == 'R')] = 5
sleep_features[np.where(sleep_features == 'W')] = 6
sleep_features[np.where(sleep_features == 'M')] = 7

merge_labels = np.hstack([annot_data_set[key] for key in data_keys])

In [83]:
sleep_features = np.array(sleep_features, dtype = float)
sleep_data = sleep_features[np.isfinite(sleep_features).all(axis=1)]
sleep_labels = np.reshape(merge_labels, [len(merge_labels), 1])[np.isfinite(sleep_features).all(axis=1)]

## Classify

In [84]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [85]:
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [88]:
sleep_data_scaled = StandardScaler().fit_transform(sleep_data)
train_set, test_set, train_label, test_label = train_test_split(sleep_data_scaled, sleep_labels, test_size=0.33, random_state=42)

In [89]:
names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes", "QDA", "Linear SVM", "RBF SVM",]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(), 
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1)]

In [90]:
for i in np.arange(len(classifiers)):
    print('Training ' + names[i])
    clf = classifiers[i]
    clf.fit(train_set, np.ndarray.flatten(train_label))
    prediction = clf.predict(test_set)
    print(names[i], accuracy_score(test_label, prediction))
    

Training Nearest Neighbors
('Nearest Neighbors', 0.81222056631892703)
Training Decision Tree
('Decision Tree', 1.0)
Training Random Forest
('Random Forest', 0.72727272727272729)
Training AdaBoost
('AdaBoost', 0.86825633383010437)
Training Naive Bayes
('Naive Bayes', 1.0)
Training QDA
('QDA', 0.99821162444113265)
Training Linear SVM




('Linear SVM', 0.9904619970193741)
Training RBF SVM
('RBF SVM', 0.50760059612518627)
