In [1]:
import os
import numpy as np
import pandas as pd
import pylab
import scipy.io
import sklearn.externals
import mat73
import wfdb
import scipy
import sklearn
from imblearn.over_sampling import SMOTE

In [2]:
# funcoes desenvolvidas pela propria physionet
# -----------------------------------------------------------------------------
# import the outcome vector, given the file name.
# e.g. /training/tr04-0808/tr04-0808-arousal.mat
# -----------------------------------------------------------------------------
def import_arousals(file_name):
    import h5py
    import numpy
    f = h5py.File(file_name, 'r')
    arousals = numpy.array(f['data']['arousals'])
    return arousals

def import_signals(file_name):
    return np.transpose(scipy.io.loadmat(file_name)['val'])

# -----------------------------------------------------------------------------
# Take a header file as input, and returns the names of the signals
# For the corresponding .mat file containing the signals.
# -----------------------------------------------------------------------------
def import_signal_names(file_name):
    with open(file_name, 'r') as myfile:
        s = myfile.read()
        s = s.split('\n')
        s = [x.split() for x in s]

        n_signals = int(s[0][1])
        n_samples = int(s[0][3])
        Fs        = int(s[0][2])

        s = s[1:-1]
        s = [s[i][8] for i in range(0, n_signals)]
    return s, Fs, n_samples

# -----------------------------------------------------------------------------
# Get a given subject's data
# -----------------------------------------------------------------------------
def get_subject_data(arousal_file, signal_file, signal_names):
    this_arousal   = import_arousals(arousal_file)
    this_signal    = mat73.loadmat(signal_file)
    this_data      = np.append(this_signal, this_arousal, axis=1)
    this_data      = pd.DataFrame(this_data, index=None, columns=signal_names)
    return this_data

def get_subject_data_test(signal_file, signal_names):
    this_signal    = mat73.loadmat(signal_file)
    this_data      = this_signal
    this_data      = pd.DataFrame(this_data, index=None, columns=signal_names)
    return this_data

tr03-0005.mat: a Matlab V4 file containing the signal data.
tr03-0005.hea: record header file - a text file which describes the format of the signal data.
tr03-0005.arousal: arousal and sleep stage annotations, in WFDB annotation format.
tr03-0005-arousal.mat: a Matlab V7 structure containing a sample-wise vector with three distinct values (+1, 0, -1) where:
    +1: Designates arousal regions
    0: Designates non-arousal regions
    -1: Designates regions that will not be scored

In [3]:
# importandos dados de tr03-0005, dados de resposta
arousals_03 = import_arousals('C:/Users/thiago.barral/OneDrive - B2W Digital/Documentos/GitHub/projetofinalmestrado/tr03-0005-arousal.mat')

In [4]:
arousals_03

array([[ 0.],
       [ 0.],
       [ 0.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [5]:
data_all = mat73.loadmat('C:/Users/thiago.barral/OneDrive - B2W Digital/Documentos/GitHub/projetofinalmestrado/tr03-0005-arousal.mat')

In [6]:
data_all

{'data': {'arousals': array([ 0.,  0.,  0., ..., -1., -1., -1.]),
  'sleep_stages': {'nonrem1': array([False, False, False, ..., False, False, False]),
   'nonrem2': array([False, False, False, ..., False, False, False]),
   'nonrem3': array([False, False, False, ..., False, False, False]),
   'rem': array([False, False, False, ..., False, False, False]),
   'undefined': array([ True,  True,  True, ..., False, False,  True]),
   'wake': array([False, False, False, ...,  True,  True, False])}}}

In [7]:
header = import_signal_names('C:/Users/thiago.barral/OneDrive - B2W Digital/Documentos/GitHub/projetofinalmestrado/tr03-0005.hea')

In [8]:
header

(['F3-M2',
  'F4-M1',
  'C3-M2',
  'C4-M1',
  'O1-M2',
  'O2-M1',
  'E1-M2',
  'Chin1-Chin2',
  'ABD',
  'CHEST',
  'AIRFLOW',
  'SaO2',
  'ECG'],
 200,
 5147000)

In [9]:
data_mat = scipy.io.loadmat('C:/Users/thiago.barral/OneDrive - B2W Digital/Documentos/GitHub/projetofinalmestrado/tr03-0005.mat')

In [10]:
data_mat = pd.DataFrame(data_mat['val']).T

In [11]:
data_mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-9,5,-5,9,5,12,-21,16,-41,-2,39,30496,-53
1,-36,2,-33,5,-27,3,-50,25,-36,0,93,30496,-103
2,-23,2,-23,1,-21,-6,-40,-4,-27,3,62,30496,6
3,-11,1,-12,-4,-11,-11,-30,-29,-17,6,41,30496,58
4,-23,5,-22,-1,-19,-8,-41,13,-8,9,72,30496,-64
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5146995,0,0,0,0,0,0,0,0,0,-107,6,-32443,-755
5146996,0,0,0,0,0,0,0,0,0,-107,0,-32443,405
5146997,0,0,0,0,0,0,0,0,0,-107,-6,-32443,543
5146998,0,0,0,0,0,0,0,0,0,-106,3,-32443,-728


In [12]:
columns_names = header[0]

In [13]:
data_mat.columns = columns_names

In [14]:
data_mat['F3-M2'] = (data_mat['F3-M2'] - data_mat['F3-M2'].min()) / (data_mat['F3-M2'].max() - data_mat['F3-M2'].min())
data_mat['F4-M1'] = (data_mat['F4-M1'] - data_mat['F4-M1'].min()) / (data_mat['F4-M1'].max() - data_mat['F4-M1'].min())
data_mat['C3-M2'] = (data_mat['C3-M2'] - data_mat['C3-M2'].min()) / (data_mat['C3-M2'].max() - data_mat['C3-M2'].min())
data_mat['C4-M1'] = (data_mat['C4-M1'] - data_mat['C4-M1'].min()) / (data_mat['C4-M1'].max() - data_mat['C4-M1'].min())
data_mat['O1-M2'] = (data_mat['O1-M2'] - data_mat['O1-M2'].min()) / (data_mat['O1-M2'].max() - data_mat['O1-M2'].min())
data_mat['O2-M1'] = (data_mat['O2-M1'] - data_mat['O2-M1'].min()) / (data_mat['O2-M1'].max() - data_mat['O2-M1'].min())
data_mat['E1-M2'] = (data_mat['E1-M2'] - data_mat['E1-M2'].min()) / (data_mat['E1-M2'].max() - data_mat['E1-M2'].min())
data_mat['Chin1-Chin2'] = (data_mat['Chin1-Chin2'] - data_mat['Chin1-Chin2'].min()) / (data_mat['Chin1-Chin2'].max() - data_mat['Chin1-Chin2'].min())
data_mat['ABD'] = (data_mat['ABD'] - data_mat['ABD'].min()) / (data_mat['ABD'].max() - data_mat['ABD'].min())
data_mat['CHEST'] = (data_mat['CHEST'] - data_mat['CHEST'].min()) / (data_mat['CHEST'].max() - data_mat['CHEST'].min())
data_mat['AIRFLOW'] = (data_mat['AIRFLOW'] - data_mat['AIRFLOW'].min()) / (data_mat['AIRFLOW'].max() - data_mat['AIRFLOW'].min())
data_mat['ECG'] = (data_mat['ECG'] - data_mat['ECG'].min()) / (data_mat['ECG'].max() - data_mat['ECG'].min())
data_mat['SaO2'] = (data_mat['SaO2'] - data_mat['SaO2'].min()) / (data_mat['SaO2'].max() - data_mat['SaO2'].min())

  data_mat['SaO2'] = (data_mat['SaO2'] - data_mat['SaO2'].min()) / (data_mat['SaO2'].max() - data_mat['SaO2'].min())


In [15]:
# juntado o dataframe com os dados de entrada e a resposta 

data_mat['Arousal'] = arousals_03

In [16]:
data_mat

Unnamed: 0,F3-M2,F4-M1,C3-M2,C4-M1,O1-M2,O2-M1,E1-M2,Chin1-Chin2,ABD,CHEST,AIRFLOW,SaO2,ECG,Arousal
0,0.479591,0.681240,0.498276,0.553212,0.598466,0.693206,0.495755,0.479799,0.502993,0.475416,0.527471,1.99923,0.497017,0.0
1,0.476136,0.680877,0.494562,0.552725,0.595058,0.691454,0.491908,0.481093,0.503616,0.475777,0.534833,1.99923,0.491052,0.0
2,0.477799,0.680877,0.495889,0.552237,0.595697,0.689702,0.493234,0.476923,0.504738,0.476320,0.530607,1.99923,0.504056,0.0
3,0.479335,0.680756,0.497347,0.551627,0.596762,0.688729,0.494561,0.473329,0.505985,0.476862,0.527744,1.99923,0.510260,0.0
4,0.477799,0.681240,0.496021,0.551993,0.595910,0.689313,0.493102,0.479367,0.507107,0.477404,0.531970,1.99923,0.495705,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5146995,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.477498,0.508105,0.456435,0.522972,-0.00000,0.413267,-1.0
5146996,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.477498,0.508105,0.456435,0.522154,-0.00000,0.551658,-1.0
5146997,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.477498,0.508105,0.456435,0.521336,-0.00000,0.568122,-1.0
5146998,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.477498,0.508105,0.456616,0.522563,-0.00000,0.416488,-1.0


In [17]:
# removendo os valores de resposta -1, pois nao serão identificados. E em seguida removendo a coluna "Arousal"

df_remove = data_mat.loc[(data_mat['Arousal'] == -1)]
data_mat = data_mat.drop(df_remove.index)
data_mat = data_mat.drop('Arousal',axis=1)

In [18]:
data_mat

Unnamed: 0,F3-M2,F4-M1,C3-M2,C4-M1,O1-M2,O2-M1,E1-M2,Chin1-Chin2,ABD,CHEST,AIRFLOW,SaO2,ECG
0,0.479591,0.681240,0.498276,0.553212,0.598466,0.693206,0.495755,0.479799,0.502993,0.475416,0.527471,1.999230,0.497017
1,0.476136,0.680877,0.494562,0.552725,0.595058,0.691454,0.491908,0.481093,0.503616,0.475777,0.534833,1.999230,0.491052
2,0.477799,0.680877,0.495889,0.552237,0.595697,0.689702,0.493234,0.476923,0.504738,0.476320,0.530607,1.999230,0.504056
3,0.479335,0.680756,0.497347,0.551627,0.596762,0.688729,0.494561,0.473329,0.505985,0.476862,0.527744,1.999230,0.510260
4,0.477799,0.681240,0.496021,0.551993,0.595910,0.689313,0.493102,0.479367,0.507107,0.477404,0.531970,1.999230,0.495705
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4803993,0.480742,0.680271,0.498806,0.551506,0.596975,0.688729,0.498541,0.476492,0.478678,0.479393,0.520791,3.497306,0.549988
4803994,0.480614,0.680514,0.498541,0.551627,0.596656,0.688923,0.498541,0.477211,0.478554,0.479393,0.521881,3.497306,0.399547
4803995,0.480486,0.680514,0.498408,0.551384,0.596549,0.688534,0.498541,0.477211,0.478429,0.479212,0.521609,3.497306,0.511930
4803996,0.480358,0.679908,0.498276,0.551140,0.596869,0.688145,0.498806,0.477354,0.478304,0.479212,0.520791,3.497306,0.594369


In [19]:
data_arousal_03 = pd.DataFrame(arousals_03)
data_arousal_03.columns = ['Respostas']
df_remove_arousal_03 = data_arousal_03.loc[(data_arousal_03.Respostas == -1)]
data_arousal_03 = data_arousal_03.drop(df_remove_arousal_03.index)
data_arousal_03

Unnamed: 0,Respostas
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
4803993,1.0
4803994,1.0
4803995,1.0
4803996,1.0


In [20]:
data_mat_29 = scipy.io.loadmat('C:/Users/thiago.barral/OneDrive - B2W Digital/Documentos/GitHub/projetofinalmestrado/tr03-0005.mat')
data_arousal_29 = import_arousals('C:/Users/thiago.barral/OneDrive - B2W Digital/Documentos/GitHub/projetofinalmestrado/tr03-0005-arousal.mat')

In [21]:
# manipulando data_mat_29_para ficar no formato ideal

data_mat_29 = pd.DataFrame(data_mat_29['val']).T
data_mat_29.columns = columns_names
data_mat_29['F3-M2'] = (data_mat_29['F3-M2'] - data_mat_29['F3-M2'].min()) / (data_mat_29['F3-M2'].max() - data_mat_29['F3-M2'].min())
data_mat_29['F4-M1'] = (data_mat_29['F4-M1'] - data_mat_29['F4-M1'].min()) / (data_mat_29['F4-M1'].max() - data_mat_29['F4-M1'].min())
data_mat_29['C3-M2'] = (data_mat_29['C3-M2'] - data_mat_29['C3-M2'].min()) / (data_mat_29['C3-M2'].max() - data_mat_29['C3-M2'].min())
data_mat_29['C4-M1'] = (data_mat_29['C4-M1'] - data_mat_29['C4-M1'].min()) / (data_mat_29['C4-M1'].max() - data_mat_29['C4-M1'].min())
data_mat_29['O1-M2'] = (data_mat_29['O1-M2'] - data_mat_29['O1-M2'].min()) / (data_mat_29['O1-M2'].max() - data_mat_29['O1-M2'].min())
data_mat_29['O2-M1'] = (data_mat_29['O2-M1'] - data_mat_29['O2-M1'].min()) / (data_mat_29['O2-M1'].max() - data_mat_29['O2-M1'].min())
data_mat_29['E1-M2'] = (data_mat_29['E1-M2'] - data_mat_29['E1-M2'].min()) / (data_mat_29['E1-M2'].max() - data_mat_29['E1-M2'].min())
data_mat_29['Chin1-Chin2'] = (data_mat_29['Chin1-Chin2'] - data_mat_29['Chin1-Chin2'].min()) / (data_mat['Chin1-Chin2'].max() - data_mat_29['Chin1-Chin2'].min())
data_mat_29['ABD'] = (data_mat_29['ABD'] - data_mat_29['ABD'].min()) / (data_mat_29['ABD'].max() - data_mat_29['ABD'].min())
data_mat_29['CHEST'] = (data_mat_29['CHEST'] - data_mat_29['CHEST'].min()) / (data_mat_29['CHEST'].max() - data_mat_29['CHEST'].min())
data_mat_29['AIRFLOW'] = (data_mat_29['AIRFLOW'] - data_mat_29['AIRFLOW'].min()) / (data_mat_29['AIRFLOW'].max() - data_mat_29['AIRFLOW'].min())
data_mat_29['ECG'] = (data_mat_29['ECG'] - data_mat_29['ECG'].min()) / (data_mat_29['ECG'].max() - data_mat_29['ECG'].min())
data_mat_29['SaO2'] = (data_mat_29['SaO2'] - data_mat_29['SaO2'].min()) / (data_mat_29['SaO2'].max() - data_mat_29['SaO2'].min())
data_mat_29

  data_mat_29['SaO2'] = (data_mat_29['SaO2'] - data_mat_29['SaO2'].min()) / (data_mat_29['SaO2'].max() - data_mat_29['SaO2'].min())


Unnamed: 0,F3-M2,F4-M1,C3-M2,C4-M1,O1-M2,O2-M1,E1-M2,Chin1-Chin2,ABD,CHEST,AIRFLOW,SaO2,ECG
0,0.479591,0.681240,0.498276,0.553212,0.598466,0.693206,0.495755,1.004550,0.502993,0.475416,0.527471,1.99923,0.497017
1,0.476136,0.680877,0.494562,0.552725,0.595058,0.691454,0.491908,1.007260,0.503616,0.475777,0.534833,1.99923,0.491052
2,0.477799,0.680877,0.495889,0.552237,0.595697,0.689702,0.493234,0.998530,0.504738,0.476320,0.530607,1.99923,0.504056
3,0.479335,0.680756,0.497347,0.551627,0.596762,0.688729,0.494561,0.991004,0.505985,0.476862,0.527744,1.99923,0.510260
4,0.477799,0.681240,0.496021,0.551993,0.595910,0.689313,0.493102,1.003647,0.507107,0.477404,0.531970,1.99923,0.495705
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5146995,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456435,0.522972,-0.00000,0.413267
5146996,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456435,0.522154,-0.00000,0.551658
5146997,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456435,0.521336,-0.00000,0.568122
5146998,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456616,0.522563,-0.00000,0.416488


In [22]:
# juntando os dataframes para excluir as linhas com arousals = -1

data_mat_29['Arousal'] = data_arousal_29

In [23]:
data_mat_29

Unnamed: 0,F3-M2,F4-M1,C3-M2,C4-M1,O1-M2,O2-M1,E1-M2,Chin1-Chin2,ABD,CHEST,AIRFLOW,SaO2,ECG,Arousal
0,0.479591,0.681240,0.498276,0.553212,0.598466,0.693206,0.495755,1.004550,0.502993,0.475416,0.527471,1.99923,0.497017,0.0
1,0.476136,0.680877,0.494562,0.552725,0.595058,0.691454,0.491908,1.007260,0.503616,0.475777,0.534833,1.99923,0.491052,0.0
2,0.477799,0.680877,0.495889,0.552237,0.595697,0.689702,0.493234,0.998530,0.504738,0.476320,0.530607,1.99923,0.504056,0.0
3,0.479335,0.680756,0.497347,0.551627,0.596762,0.688729,0.494561,0.991004,0.505985,0.476862,0.527744,1.99923,0.510260,0.0
4,0.477799,0.681240,0.496021,0.551993,0.595910,0.689313,0.493102,1.003647,0.507107,0.477404,0.531970,1.99923,0.495705,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5146995,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456435,0.522972,-0.00000,0.413267,-1.0
5146996,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456435,0.522154,-0.00000,0.551658,-1.0
5146997,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456435,0.521336,-0.00000,0.568122,-1.0
5146998,0.480742,0.680635,0.498939,0.552115,0.597934,0.690870,0.498541,0.999734,0.508105,0.456616,0.522563,-0.00000,0.416488,-1.0


In [24]:
# removendo as linhas com arousal = -1 

df_remove_29 = data_mat_29.loc[(data_mat_29['Arousal'] == -1)]
data_mat_29 = data_mat_29.drop(df_remove_29.index)
data_mat_29 = data_mat_29.drop('Arousal',axis=1)
data_mat_29

Unnamed: 0,F3-M2,F4-M1,C3-M2,C4-M1,O1-M2,O2-M1,E1-M2,Chin1-Chin2,ABD,CHEST,AIRFLOW,SaO2,ECG
0,0.479591,0.681240,0.498276,0.553212,0.598466,0.693206,0.495755,1.004550,0.502993,0.475416,0.527471,1.999230,0.497017
1,0.476136,0.680877,0.494562,0.552725,0.595058,0.691454,0.491908,1.007260,0.503616,0.475777,0.534833,1.999230,0.491052
2,0.477799,0.680877,0.495889,0.552237,0.595697,0.689702,0.493234,0.998530,0.504738,0.476320,0.530607,1.999230,0.504056
3,0.479335,0.680756,0.497347,0.551627,0.596762,0.688729,0.494561,0.991004,0.505985,0.476862,0.527744,1.999230,0.510260
4,0.477799,0.681240,0.496021,0.551993,0.595910,0.689313,0.493102,1.003647,0.507107,0.477404,0.531970,1.999230,0.495705
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4803993,0.480742,0.680271,0.498806,0.551506,0.596975,0.688729,0.498541,0.997627,0.478678,0.479393,0.520791,3.497306,0.549988
4803994,0.480614,0.680514,0.498541,0.551627,0.596656,0.688923,0.498541,0.999132,0.478554,0.479393,0.521881,3.497306,0.399547
4803995,0.480486,0.680514,0.498408,0.551384,0.596549,0.688534,0.498541,0.999132,0.478429,0.479212,0.521609,3.497306,0.511930
4803996,0.480358,0.679908,0.498276,0.551140,0.596869,0.688145,0.498806,0.999433,0.478304,0.479212,0.520791,3.497306,0.594369


In [25]:
data_arousal_29 = pd.DataFrame(data_arousal_29)
data_arousal_29.columns = ['Respostas']
df_remove_arousal_29 = data_arousal_29.loc[(data_arousal_29.Respostas == -1)]
data_arousal_29 = data_arousal_29.drop(df_remove_arousal_29.index)
data_arousal_29

Unnamed: 0,Respostas
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
4803993,1.0
4803994,1.0
4803995,1.0
4803996,1.0


In [26]:
data_mat_03_EEG = data_mat[['F3-M2', 'F4-M1', 'C3-M2', 'C4-M1', 'O1-M2', 'O2-M1']]
data_mat_29_EEG = data_mat_29[['F3-M2', 'F4-M1', 'C3-M2', 'C4-M1', 'O1-M2', 'O2-M1']]

In [27]:
#smt = SMOTE()
#X_mat_03, y_aurosal_03 = smt.fit_resample(data_mat, data_arousal_03)

In [27]:
#X_mat_29, y_aurosal_29 = smt.fit_resample(data_mat_29, data_arousal_29)

In [28]:
from sklearn.model_selection import train_test_split

In [28]:
X_train_03, X_test_03, y_train_03, y_test_03 = train_test_split(data_mat, data_arousal_03, test_size=0.33, random_state=42)

In [29]:
from sklearn import svm

In [None]:
# treinamento do modelo de SVM

svc = svm.SVC(C=0.0089, kernel='rbf')
svc.fit(data_mat_03_EEG, np.ravel(data_arousal_03))

In [None]:
# treinamento do modelo de SVM

svr = svm.SVR(C=1.0, epsilon=0.2)
svr.fit(X_train_03, y_train_03)

  return f(*args, **kwargs)


In [None]:
# prevendo valores para outro exemplo

y_pred = svc.predict(data_mat_29_EEG)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# avaliação das respostas previstas e resposta correta

confusion_matrix(data_arousal_29, y_pred)

In [None]:
print("Accuracy:",metrics.accuracy_score(data_arousal_29, y_pred))

In [None]:
auprc = sklearn.metrics.average_precision_score(data_arousal_29, y_pred)
auprc

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve

In [None]:
# Data to plot precision - recall curve
precision, recall, thresholds = precision_recall_curve(data_arousal_29, y_pred)
# Use AUC function to calculate the area under the curve of precision recall curve
auc_precision_recall = auc(recall, precision)
print(auc_precision_recall)

In [None]:
import sys
import os
import numpy
import h5py
import argparse

In [73]:
class Challenge2018Score:
    """Class used to compute scores for the 2018 PhysioNet/CinC Challenge.

    A Challenge2018Score object aggregates the outputs of a proposed
    classification algorithm, and calculates the area under the
    precision-recall curve, as well as the area under the receiver
    operating characteristic curve.

    After creating an instance of this class, call score_record() for
    each record being tested.  To calculate scores for a particular
    record, call record_auprc() and record_auroc().  After scoring all
    records, call gross_auprc() and gross_auroc() to obtain the scores
    for the database as a whole.
    """

    def __init__(self, input_digits=None):
        """Initialize a new scoring buffer.

        If 'input_digits' is given, it is the number of decimal digits
        of precision used in input probability values.
        """
        if input_digits is None:
            input_digits = 3
        self._scale = 10**input_digits
        self._pos_values = numpy.zeros(self._scale + 1, dtype=numpy.int64)
        self._neg_values = numpy.zeros(self._scale + 1, dtype=numpy.int64)
        self._record_auc = {}

    def score_record(self, truth, predictions, record_name=None):
        """Add results for a given record to the buffer.

        'truth' is a vector of arousal values: zero for non-arousal
        regions, positive for target arousal regions, and negative for
        unscored regions.

        'predictions' is a vector of probabilities produced by the
        classification algorithm being tested.  This vector must be
        the same length as 'truth', and each value must be between 0
        and 1.

        If 'record_name' is specified, it can be used to obtain
        per-record scores afterwards, by calling record_auroc() and
        record_auprc().
        """
        # Check if length is correct
        if len(predictions) != len(truth):
            raise ValueError("length of 'predictions' does not match 'truth'")

        # Compute the histogram of all input probabilities
        b = self._scale + 1
        r = (-0.5 / self._scale, 1.0 + 0.5 / self._scale)
        all_values = numpy.histogram(predictions, bins=b, range=r)[0]

        # Check if input contains any out-of-bounds or NaN values
        # (which are ignored by numpy.histogram)
        if numpy.sum(all_values) != len(predictions):
            raise ValueError("invalid values in 'predictions'")

        # Compute the histogram of probabilities within arousal regions
        pred_pos = predictions[truth > 0]
        pos_values = numpy.histogram(pred_pos, bins=b, range=r)[0]

        # Compute the histogram of probabilities within unscored regions
        pred_ign = predictions[truth < 0]
        ign_values = numpy.histogram(pred_ign, bins=b, range=r)[0]

        # Compute the histogram of probabilities in non-arousal regions,
        # given the above
        neg_values = all_values - pos_values - ign_values

        self._pos_values += pos_values
        self._neg_values += neg_values

        if record_name is not None:
            self._record_auc[record_name] = self._auc(pos_values, neg_values)

    def _auc(self, pos_values, neg_values):
        # Calculate areas under the ROC and PR curves by iterating
        # over the possible threshold values.

        # At the minimum threshold value, all samples are classified as
        # positive, and thus TPR = 1 and TNR = 0.
        tp = numpy.sum(pos_values)
        fp = numpy.sum(neg_values)
        tn = fn = 0
        tpr = 1
        tnr = 0
        if tp == 0 or fp == 0:
            # If either class is empty, scores are undefined.
            return (float('nan'), float('nan'))
        ppv = float(tp) / (tp + fp)
        auroc = 0
        auprc = 0

        # As the threshold increases, TP decreases (and FN increases)
        # by pos_values[i], while TN increases (and FP decreases) by
        # neg_values[i].
        for (n_pos, n_neg) in zip(pos_values, neg_values):
            tp -= n_pos
            fn += n_pos
            fp -= n_neg
            tn += n_neg
            tpr_prev = tpr
            tnr_prev = tnr
            ppv_prev = ppv
            tpr = float(tp) / (tp + fn)
            tnr = float(tn) / (tn + fp)
            if tp + fp > 0:
                ppv = float(tp) / (tp + fp)
            else:
                ppv = ppv_prev
            auroc += (tpr_prev - tpr) * (tnr + tnr_prev) * 0.5
            auprc += (tpr_prev - tpr) * ppv_prev
        return (auroc, auprc)

    def gross_auroc(self):
        """Compute the area under the ROC curve.

        The result will be NaN if none of the records processed so far
        contained any target arousals.
        """
        return self._auc(self._pos_values, self._neg_values)[0]

    def gross_auprc(self):
        """Compute the area under the precision-recall curve.

        The result will be NaN if none of the records processed so far
        contained any target arousals.
        """
        return self._auc(self._pos_values, self._neg_values)[1]

    def record_auroc(self, record_name):
        """Compute the area under the ROC curve for a single record.

        The result will be NaN if the record did not contain any
        target arousals.

        The given record must have previously been processed by
        calling score_record().
        """
        return self._record_auc[record_name][0]

    def record_auprc(self, record_name):
        """Compute the area under the PR curve for a single record.

        The result will be NaN if the record did not contain any
        target arousals.

        The given record must have previously been processed by
        calling score_record().
        """
        return self._record_auc[record_name][1]


In [72]:
Challenge2018Score.gross_auprc(data_arousal_29)

AttributeError: 'DataFrame' object has no attribute '_auc'

In [72]:
len(b)

4770000

In [73]:
len(data_arousal_03)

2541898

In [36]:
len(data_mat)

2541898

In [64]:
from sklearn import metrics

In [78]:
print("AUPRC:",sklearn.metrics.average_precision_score(data_arousal_29, y_pred))

AUPRC: 0.07318428984955337


In [None]:
print("Accuracy:",metrics.accuracy_score(data_arousal_29, y_pred))

In [57]:
data_arousal_29 = np.array(data_arousal_29)
y_pred = np.array(y_pred)

In [67]:
type(y_pred)

numpy.ndarray

In [68]:
type(data_arousal_29)

numpy.ndarray

In [49]:
import numpy as np
import os
import sys
import glob
import h5py
from pylab import find
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score
import gc

ImportError: cannot import name 'find' from 'pylab' (C:\Users\thiag\anaconda3\lib\site-packages\pylab\__init__.py)

In [51]:
!pip install gc

ERROR: Could not find a version that satisfies the requirement gc (from versions: none)
ERROR: No matching distribution found for gc


In [62]:
def score_training_set():
    AUROC, AUPRC = [], []
    for vec_file in glob.glob('tr*.vec'):
        gc.collect()
        record_name = x

        # Load predictions from .vec file
        predictions = np.y_pred

        # Load target arousals
        arousal_file = ('training/%s/%s-arousal.mat'
                        % (record_name, record_name))
        f = h5py.File(data_arousal_29, 'r')
        arousals = np.array(f['data']['arousals'])

        # Remove any segments that have a -1 label before grading
        keep = find(arousals != -1)
        used_predictions = predictions[keep]
        used_arousals    = arousals[keep]

        if np.any(used_arousals):
            precision, recall, thresholds = \
                precision_recall_curve(np.ravel(used_arousals),
                                       used_predictions,
                                       pos_label=1, sample_weight=None)
            auprc = auc(recall, precision)
            auroc = roc_auc_score(np.ravel(used_arousals), used_predictions)
            AUPRC.append(auprc)
            AUROC.append(auroc)
        else:
            auprc = auroc = float('nan')
        print('%s AUROC:%f AUPRC:%f' % (record_name, auroc, auprc))

    print()
    print('Training AUROC Performance: %f+/-%f'
          % (np.mean(AUROC), np.std(AUROC)))
    print('Training AUPRC Performance: %f+/-%f'
          % (np.mean(AUPRC), np.std(AUPRC)))
    print()

if __name__ == '__main__':
    score_training_set()    


Training AUROC Performance: nan+/-nan
Training AUPRC Performance: nan+/-nan



In [70]:
b = import_arousals('C:/Users/thiag/OneDrive/Área de Trabalho/Thiago/Mestrado/Projeto Final/Dados/physionet.orgcontentchallenge-20181.0.0/you-snooze-you-win-the-physionet-computing-in-cardiology-challenge-2018-1.0.0/tr03-0029-arousal.mat')

In [76]:
b

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])