In [3]:
import os
import numpy as np
import pandas as pd
from pylab import find
import scipy.io
from sklearn.externals import joblib

# -----------------------------------------------------------------------------
# Returns dataframes containing the training and test file locations
# Run in the same directory as the training and test folders
# -----------------------------------------------------------------------------

def get_files():
    """Locate the training and test files below the current directory.

    Walks the current working directory (following symlinks) and collects the
    files found in sub-directories of ``./training`` and ``./test``. Files that
    sit directly inside ``training`` or ``test`` are not collected.

    Training files are classified by substring match on the file name:
      * contains ``'.hea'``                        -> ``header``
      * contains ``'-arousal.mat'``                -> ``arousal``
      * contains ``'mat'`` but not ``'arousal'``   -> ``signal``
    In the test tree a file whose name contains ``'.hea'`` is a ``header`` and
    every other file is treated as a ``signal``.

    Paths are assembled with ``os.path.join`` so the function works with the
    native separator of the running platform; on Windows the resulting strings
    are the same backslash-separated paths as before.

    Returns:
        tuple: ``(df_training_files, df_test_files)`` where the first frame has
        the columns ``header``, ``arousal`` and ``signal`` and the second one
        the columns ``header`` and ``signal``.

    Raises:
        ValueError: raised by pandas when the collected lists of one frame do
            not all have the same length (e.g. a record without arousal file).
    """
    # The trailing '' makes join() append the separator, so only real
    # sub-directories match (and not e.g. './training_backup').
    train_prefix = os.path.join('.', 'training', '')
    test_prefix = os.path.join('.', 'test', '')

    header_loc_train, arousal_loc_train, signal_loc_train = [], [], []
    header_loc_test, signal_loc_test = [], []

    for dirName, subdirList, fileList in os.walk('.', followlinks=True):
        # Sorting in place makes os.walk visit directories in a fixed order,
        # so the row order no longer depends on the file system.
        subdirList.sort()

        if dirName.startswith(train_prefix):
            for fname in sorted(fileList):
                full_path = os.path.join(dirName, fname)
                if '.hea' in fname:
                    header_loc_train.append(full_path)
                if '-arousal.mat' in fname:
                    arousal_loc_train.append(full_path)
                if 'mat' in fname and 'arousal' not in fname:
                    signal_loc_train.append(full_path)

        elif dirName.startswith(test_prefix):
            for fname in sorted(fileList):
                full_path = os.path.join(dirName, fname)
                if '.hea' in fname:
                    header_loc_test.append(full_path)
                else:
                    signal_loc_test.append(full_path)

    data_locations_training = {'header':      header_loc_train,
                               'arousal':     arousal_loc_train,
                               'signal':      signal_loc_train,
                      }
    data_locations_test = {'header':     header_loc_test,
                           'signal':      signal_loc_test,}

    df_training_files = pd.DataFrame(data=data_locations_training)
    df_test_files = pd.DataFrame(data=data_locations_test)

    return df_training_files, df_test_files

# Build the file tables once for the rest of the notebook: file_locs[0] is the
# training table and file_locs[1] the test table returned by get_files().
file_locs = get_files()

In [175]:
import h5py
import numpy

# -----------------------------------------------------------------------------
# Helper functions to import content within each file
# -----------------------------------------------------------------------------

def import_signal_names(file_name):
    """Read signal names, sampling frequency and sample count from a header file.

    The file is read as whitespace-separated text. The first line supplies the
    number of signals (field 1), the sampling frequency (field 2) and the
    number of samples (field 3); each of the following ``n_signals`` lines
    supplies one signal name in its field 8 (0-based).
    NOTE(review): this layout looks like a WFDB ``.hea`` header - confirm.

    Lines are split with ``splitlines()`` and the signal lines are addressed
    explicitly, so the result does not depend on whether the file ends with a
    newline.

    Args:
        file_name: path of the header file.

    Returns:
        tuple: ``(names, Fs, n_samples)`` - list of signal names, sampling
        frequency as int, number of samples as int.

    Raises:
        IndexError: if the file has fewer lines or fields than described above.
        ValueError: if one of the numeric header fields is not an integer.
    """
    with open(file_name, 'r') as myfile:
        lines = [line.split() for line in myfile.read().splitlines()]

    record_line = lines[0]
    n_signals = int(record_line[1])
    n_samples = int(record_line[3])
    Fs        = int(record_line[2])

    # Signal specification lines follow the record line directly.
    names = [lines[1 + i][8] for i in range(n_signals)]
    return names, Fs, n_samples

def import_arousals(file_name):
    """Load the ``data/arousals`` dataset of an HDF5 file as a NumPy array.

    The file is opened read-only inside a ``with`` block so the handle is
    released as soon as the data has been read (it used to stay open).

    Args:
        file_name: path of the HDF5 (``h5py``-readable) arousal file.

    Returns:
        numpy.ndarray: in-memory copy of ``f['data']['arousals']``.
    """
    with h5py.File(file_name, 'r') as f:
        # np.array() materialises the dataset, so the result stays valid after
        # the file has been closed.
        arousals = np.array(f['data']['arousals'])
    return arousals

def import_sleepstages(file_name):
    """Load the five sleep-stage datasets of an HDF5 file as NumPy arrays.

    Reads ``nonrem1``, ``nonrem2``, ``nonrem3``, ``rem`` and ``undefined`` from
    the group ``data/sleep_stages``. The file is opened read-only inside a
    ``with`` block so the handle is released once all five datasets have been
    copied into memory (it used to stay open).

    Args:
        file_name: path of the HDF5 (``h5py``-readable) arousal file.

    Returns:
        tuple: ``(nonrem1, nonrem2, nonrem3, rem, undefined)``, each a
        ``numpy.ndarray`` copy of the dataset of the same name.
    """
    stage_names = ('nonrem1', 'nonrem2', 'nonrem3', 'rem', 'undefined')
    with h5py.File(file_name, 'r') as f:
        stages = f['data']['sleep_stages']
        # Materialise every dataset while the file is still open.
        loaded = [np.array(stages[name]) for name in stage_names]
    nonrem1, nonrem2, nonrem3, rem, undefined = loaded
    return nonrem1, nonrem2, nonrem3, rem, undefined

def import_signals(file_name):
    """Return the transposed ``'val'`` variable of a MATLAB file.

    Args:
        file_name: path of a file readable by ``scipy.io.loadmat``.

    Returns:
        The array stored under the key ``'val'`` with its axes reversed by
        ``np.transpose``.
    """
    mat_contents = scipy.io.loadmat(file_name)
    values = mat_contents['val']
    return np.transpose(values)

In [176]:
# -----------------------------------------------------------------------------
# Helper functions to load a single sample's relevant files
# -----------------------------------------------------------------------------

def get_subject_data_train(arousal_file, signal_file, signal_names):
    """Combine one subject's signals and arousal annotation in a DataFrame.

    The arousal array is appended to the signal array along axis 1, i.e. as
    extra column(s) to the right of the signal columns.

    Args:
        arousal_file: path passed to ``import_arousals``.
        signal_file: path passed to ``import_signals``.
        signal_names: column labels for the combined array; must cover the
            signal columns followed by the appended arousal column(s).

    Returns:
        pandas.DataFrame: the combined data with ``signal_names`` as columns
        and the default index.
    """
    arousal_block = import_arousals(arousal_file)
    signal_block = import_signals(signal_file)
    combined = np.append(signal_block, arousal_block, axis=1)
    return pd.DataFrame(data=combined, columns=signal_names)

def get_subject_data_test(signal_file, signal_names):
    """Load one test subject's signals into a DataFrame.

    Args:
        signal_file: path passed to ``import_signals``.
        signal_names: column labels, one per signal column.

    Returns:
        pandas.DataFrame: the signal array with ``signal_names`` as columns
        and the default index.
    """
    signal_block = import_signals(signal_file)
    return pd.DataFrame(data=signal_block, columns=signal_names)

In [203]:
import math

# -----------------------------------------------------------------------------
# Loads an individual subject's data
# file_location is the tuple returned by get_files()
# sam_ind is the sample index number
# kept_param is the list of signals to be kept from total_parameters (e.g. ['CHEST', 'ECG'])
# inv divides the window length: each window spans int(200*subsample/inv) timepoints
# subsample scales the window length (it is multiplied by 200 timepoints)
# -----------------------------------------------------------------------------

def _label_window(arousal, n1, n2, n3, rem_flags, undef_flags):
    """Return one float class label per sample of a window (1-D arrays in, 1-D out).

    Priority, highest first: arousal value 1 -> 0; arousal value other than
    0 -> -1; then, for arousal value 0, the first flag equal to 1 among
    nonrem1 (1), nonrem2 (2), nonrem3 (3), rem (4), undefined (-1); 6 when no
    flag is set.
    """
    labels = np.full(arousal.shape, 6.0)
    # Written from lowest to highest priority so later lines overwrite earlier ones.
    labels[undef_flags == 1] = -1.0
    labels[rem_flags == 1] = 4.0
    labels[n3 == 1] = 3.0
    labels[n2 == 1] = 2.0
    labels[n1 == 1] = 1.0
    labels[arousal != 0] = -1.0
    labels[arousal == 1] = 0.0
    return labels


def raw_data_processing(file_location, sam_ind, kept_param, inv, subsample, fs=200):
    """Cut one training recording into windows and pickle the single-class ones.

    The recording is split into consecutive, non-overlapping windows of
    ``step = int(fs * subsample / inv)`` samples. Every sample gets a label:

        0  arousal value is 1
        1  nonrem1, 2  nonrem2, 3  nonrem3, 4  rem   (arousal value is 0)
        -1 undefined stage, or arousal value that is neither 0 nor 1
        6  arousal value is 0 and no stage flag is set

    For each window the label counts ``[arousal, nrem1, nrem2, nrem3, rem,
    undefined]`` are collected (label 6 is not counted). If one of the first
    five counts reaches ``floor(0.95 * step)`` the window is written to
    ``./training/processed_data/<class>_training/<patient>_<i>.xz`` as a
    pickled one-row DataFrame (index ``Patient ID``; columns ``Arousal`` with
    shape ``(step, 1)``, ``Signal`` with shape ``(step, len(kept_param))`` and
    ``Distribution``). ``<i>`` is the 1-based index of the window's first
    sample and ``<patient>`` is the third component of the arousal file path.
    The class directory is created when it does not exist yet.

    Args:
        file_location: sequence whose element 0 is the training file table
            with the columns ``arousal`` and ``signal`` (see ``get_files``).
        sam_ind: row label of the subject in that table.
        kept_param: names of the channels to keep; must be keys of the channel
            map below. The output columns follow the order given here.
        inv: divisor of the window length.
        subsample: multiplier of the window length.
        fs: samples per unit of ``subsample``; defaults to the previously
            hard-coded 200. NOTE(review): presumably the sampling rate in Hz,
            which would make ``subsample`` a duration in seconds - confirm.

    Returns:
        None. The function only writes files.

    Raises:
        KeyError: if a name in ``kept_param`` is not in the channel map.
        ValueError: if the resulting window length is smaller than 1.
        IndexError: if the arousal path has fewer than three components.

    Notes:
        Windows start at samples 0, step, 2*step, ... while the 1-based start
        index is below ``n_samples - step``. Trailing samples are therefore
        dropped, including a final window that would end exactly at the last
        sample. This is kept as is so the generated file names do not change.
        The arousal array is flattened per window and the stage arrays are
        read from their row 0, matching how the data was indexed before.
    """
    scoring_file = file_location[0]['arousal'][sam_ind]
    signal_file = file_location[0]['signal'][sam_ind]
    nonrem1, nonrem2, nonrem3, rem, undefined = import_sleepstages(scoring_file)
    raw_arousals = import_arousals(scoring_file)
    raw_signals = import_signals(signal_file)

    total_parameters = {'F3-M2':0, 'F4-M1':1, 'C3-M2':2, 'C4-M1':3, 'O1-M2':4, 'O2-M1':5, 'E1-M2':6, 'Chin1-Chin2':7, 'ABD':8, 'CHEST':9, 'AIRFLOW':10, 'SaO2':11, 'ECG':12}
    kept_param_indices = [total_parameters[param] for param in kept_param]
    # Select the kept channels for all samples at once; float64 matches the
    # dtype of the buffer that used to be filled row by row.
    proc_signals = np.take(raw_signals, kept_param_indices, axis=1).astype(numpy.float64)

    step = int(fs*subsample/inv)
    if step < 1:
        raise ValueError("window length int(fs*subsample/inv) must be at least 1, got " + str(step))
    thresh = math.floor(0.95*step)

    # Accept both '\\' and '/' as separator; on Windows paths this yields the
    # same component as splitting on '\\' alone.
    patient_id = str(scoring_file).replace('\\', '/').split('/')[2]

    # Order matters: it is the order of the first five entries of `dist` and
    # the priority used when picking the output directory.
    class_dirs = ("arousal", "nrem1", "nrem2", "nrem3", "rem")
    f1 = "./training/processed_data/"

    for i in range(1, len(proc_signals)-step, step):
        start, stop = i-1, i-1+step
        labels = _label_window(
            np.ravel(raw_arousals[start:stop]),
            np.asarray(nonrem1[0][start:stop]),
            np.asarray(nonrem2[0][start:stop]),
            np.asarray(nonrem3[0][start:stop]),
            np.asarray(rem[0][start:stop]),
            np.asarray(undefined[0][start:stop]),
        )
        proc_arousals_frac = labels.reshape(-1, 1)
        proc_signals_frac = proc_signals[start:stop].copy()

        # Label counts in the order arousal, nrem1, nrem2, nrem3, rem, undefined.
        dist = numpy.array([int(np.count_nonzero(labels == code))
                            for code in (0.0, 1.0, 2.0, 3.0, 4.0, -1.0)])

        dominant = None
        for class_name, count in zip(class_dirs, dist):
            if count >= thresh:
                dominant = class_name
                break
        if dominant is None:
            continue

        file_dir = f1 + dominant + "_training/" + patient_id + "_" + str(i) + ".xz"
        os.makedirs(os.path.dirname(file_dir), exist_ok=True)

        regions = [(patient_id, proc_arousals_frac, proc_signals_frac, dist)]
        dataframe = pd.DataFrame(regions, index = None, columns = ['Patient ID', 'Arousal', 'Signal', 'Distribution']).set_index('Patient ID', drop = True)

        dataframe.to_pickle(file_dir)

# Process every training subject. The loop starts at index 0 so the first
# subject is no longer skipped, and the total shown in the progress line is
# taken from the training file table instead of being hard-coded.
n_subjects = len(file_locs[0])
for i in range(n_subjects):
    raw_data_processing(file_locs, i, ['O2-M1', 'E1-M2', 'Chin1-Chin2', 'ABD', 'CHEST', 'AIRFLOW', 'ECG'], 1, 60)
    print(str(i+1) + '/' + str(n_subjects) + ' completed')

2/994 completed
3/994 completed
4/994 completed
5/994 completed
6/994 completed
7/994 completed
8/994 completed
9/994 completed
10/994 completed
11/994 completed
12/994 completed
13/994 completed
14/994 completed
15/994 completed
16/994 completed
17/994 completed
18/994 completed
19/994 completed
20/994 completed
21/994 completed
22/994 completed
23/994 completed
24/994 completed
25/994 completed
26/994 completed
27/994 completed
28/994 completed
29/994 completed
30/994 completed
31/994 completed
32/994 completed
33/994 completed
34/994 completed
35/994 completed
36/994 completed
37/994 completed
38/994 completed
39/994 completed
40/994 completed
41/994 completed
42/994 completed
43/994 completed
44/994 completed
45/994 completed
46/994 completed
47/994 completed
48/994 completed
49/994 completed
50/994 completed
51/994 completed
52/994 completed
53/994 completed
54/994 completed
55/994 completed
56/994 completed
57/994 completed
58/994 completed
59/994 completed
60/994 completed
61/99

464/994 completed
465/994 completed
466/994 completed
467/994 completed
468/994 completed
469/994 completed
470/994 completed
471/994 completed
472/994 completed
473/994 completed
474/994 completed
475/994 completed
476/994 completed
477/994 completed
478/994 completed
479/994 completed
480/994 completed
481/994 completed
482/994 completed
483/994 completed
484/994 completed
485/994 completed
486/994 completed
487/994 completed
488/994 completed
489/994 completed
490/994 completed
491/994 completed
492/994 completed
493/994 completed
494/994 completed
495/994 completed
496/994 completed
497/994 completed
498/994 completed
499/994 completed
500/994 completed
501/994 completed
502/994 completed
503/994 completed
504/994 completed
505/994 completed
506/994 completed
507/994 completed
508/994 completed
509/994 completed
510/994 completed
511/994 completed
512/994 completed
513/994 completed
514/994 completed
515/994 completed
516/994 completed
517/994 completed
518/994 completed
519/994 co

920/994 completed
921/994 completed
922/994 completed
923/994 completed
924/994 completed
925/994 completed
926/994 completed
927/994 completed
928/994 completed
929/994 completed
930/994 completed
931/994 completed
932/994 completed
933/994 completed
934/994 completed
935/994 completed
936/994 completed
937/994 completed
938/994 completed
939/994 completed
940/994 completed
941/994 completed
942/994 completed
943/994 completed
944/994 completed
945/994 completed
946/994 completed
947/994 completed
948/994 completed
949/994 completed
950/994 completed
951/994 completed
952/994 completed
953/994 completed
954/994 completed
955/994 completed
956/994 completed
957/994 completed
958/994 completed
959/994 completed
960/994 completed
961/994 completed
962/994 completed
963/994 completed
964/994 completed
965/994 completed
966/994 completed
967/994 completed
968/994 completed
969/994 completed
970/994 completed
971/994 completed
972/994 completed
973/994 completed
974/994 completed
975/994 co

In [8]:
# Load the pickled frame '000.xz' (path is relative to the working directory)
# and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code - only open files you trust.
test = pd.read_pickle('000.xz')
test.head()

Unnamed: 0,Arousal,Signal
0,"[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0....","[[3.0, -9.0, -4.0, 71.0, -9.0, 19.0, 1.0], [0...."


In [11]:
# Print the loaded frame without its 'Arousal' column (test itself is not modified).
print(test.drop('Arousal', axis='columns'))

                                              Signal
0  [[3.0, -9.0, -4.0, 71.0, -9.0, 19.0, 1.0], [0....
