## ML with Extracted Features

Gruss S, Walter S (2015) Data from: Pain intensity recognition rates via biopotential feature patterns with support vector machines. Dryad Digital Repository. https://doi.org/10.5061/dryad.2b09s


In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression

In [2]:
rawdata = pd.read_csv('features.csv') #(8500, 160)

In [3]:
Y_LABEL = 'Label'
rawdata = rawdata.sample(frac=1)

label = rawdata[Y_LABEL]
label = label.replace(['level_zero', 'level_one','level_two','level_three','level_four'],[0,1,2,3,4])
rawX = rawdata.drop([Y_LABEL],  axis=1)
rawX.iloc[:5]
N,D = rawX.shape

In [4]:
# check for NaN
rawX.isnull().sum().sum()

73

In [5]:
X=rawX.fillna(rawX.mean())
X.isnull().sum().sum()

0

In [6]:
# Normalization 
norm = lambda x: (x - x.mean()) / x.std()
X = X.apply(norm)

### Logistic regression

In [9]:
lreg = Pipeline([("imputer", Imputer()),
                     ('lreg', LogisticRegression(
                      solver = 'sag', max_iter = 500, 
                                                 class_weight = 'balanced',
                                                 multi_class = 'ovr',
                                                 warm_start = True, verbose =1))
                       ])
lreg_result = cross_validate(lreg, X, label, cv=10, return_train_score=True)
train_accu = lreg_result['train_score'].mean()
test_accu = lreg_result['test_score'].mean()
print('Train accuracy: %.2f%%, Test accuracy: %.2f%%' %( train_accu*100,test_accu*100))

max_iter reached after 9 seconds




max_iter reached after 8 seconds
max_iter reached after 9 seconds
max_iter reached after 8 seconds
max_iter reached after 9 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   42.6s finished


max_iter reached after 8 seconds




max_iter reached after 9 seconds
max_iter reached after 9 seconds
max_iter reached after 9 seconds
max_iter reached after 8 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   43.8s finished


max_iter reached after 8 seconds




max_iter reached after 9 seconds
max_iter reached after 8 seconds
max_iter reached after 9 seconds
max_iter reached after 9 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   43.1s finished


max_iter reached after 8 seconds




max_iter reached after 9 seconds
max_iter reached after 8 seconds
max_iter reached after 8 seconds
max_iter reached after 9 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   42.0s finished


max_iter reached after 8 seconds




max_iter reached after 8 seconds
max_iter reached after 9 seconds
max_iter reached after 8 seconds
max_iter reached after 9 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   41.8s finished


max_iter reached after 8 seconds




max_iter reached after 8 seconds
max_iter reached after 9 seconds
max_iter reached after 8 seconds
max_iter reached after 9 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   42.6s finished


max_iter reached after 9 seconds




max_iter reached after 9 seconds
max_iter reached after 9 seconds
max_iter reached after 9 seconds
max_iter reached after 8 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   43.6s finished


max_iter reached after 8 seconds




max_iter reached after 9 seconds
max_iter reached after 8 seconds
max_iter reached after 9 seconds
max_iter reached after 8 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   41.9s finished


max_iter reached after 8 seconds




max_iter reached after 9 seconds
max_iter reached after 8 seconds
max_iter reached after 9 seconds
max_iter reached after 8 seconds


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   42.2s finished


max_iter reached after 9 seconds




max_iter reached after 8 seconds
max_iter reached after 9 seconds
max_iter reached after 9 seconds
max_iter reached after 9 seconds
Train accuracy: 42.84%, Test accuracy: 38.95%


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   43.4s finished


### Random Forest


In [13]:
rf_estimator = [("imputer", Imputer()),
                      ("forest", RandomForestClassifier(random_state=0,n_estimators=100,
                                                    warm_start = True))
                     ]
rf_pipe = Pipeline(rf_estimator)
rf_score = cross_val_score(rf_pipe, X, label, cv=10).mean()
print(rf_score)

0.430705882353


### AdaBoostClassifier

In [27]:
ada_estimator = [("ada", AdaBoostClassifier(random_state=0,n_estimators=200))]
ada_pipe = Pipeline(ada_estimator)
ada_result = cross_validate(rf_pipe, X, label, cv=10, return_train_score=True)
train_accu = ada_result['train_score'].mean()
test_accu = ada_result['test_score'].mean()
print('Train accuracy: %.2f%%, Test accuracy: %.2f%%' %( train_accu*100,test_accu*100))

Train accuracy: 100.00%, Test accuracy: 43.07%


### SVM

In [None]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

svm_estimator = LinearSVC()
classifier = OneVsOneClassifier(svm_estimator)
svm_result = cross_validate(classifier, X, label, cv=10, return_train_score=True)
train_accu = svm_result['train_score'].mean()
test_accu = svm_result['test_score'].mean()
print('Train accuracy: %.2f%%, Test accuracy: %.2f%%' %( train_accu*100,test_accu*100))

### Neural Nets

In [17]:
# convert label into one-hot encoding
dummy_y = pd.get_dummies(label)
dummy_y.head()

Unnamed: 0,0,1,2,3,4
3118,0,0,0,1,0
6259,0,0,1,0,0
3802,0,0,0,1,0
410,0,0,0,0,1
1252,0,0,0,0,1


In [26]:
from keras.models import Sequential
from keras.layers import Dense,BatchNormalization, Dropout, Activation
from keras.optimizers import RMSprop
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold

X_ary = X.as_matrix()
def nn_model(lr, hid_units, decay, reg):
    model = Sequential()
    model.add(Dense(hid_units[0], input_dim=D, kernel_initializer='random_uniform', 
                    kernel_regularizer=regularizers.l2(reg), activation='relu'))
    for u in hid_units[1:]:
        model.add(Dense(u, kernel_initializer='random_uniform', 
                        kernel_regularizer=regularizers.l2(reg),use_bias=False))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.5))
    model.add(Dense(5, activation='relu'))
    adam = keras.optimizers.adam(lr=lr, decay=decay)
    #rmsprop = RMSprop(lr=lr)
    model.compile(loss = 'categorical_crossentropy', metrics=['accuracy'], optimizer=adam)
    return model


max_accu = 0
best_model = None
records = []
epochs = 200
num_trials = 10
for i in range(num_trials):
    # random search of hyperparameters
    lr = 10**(-3*np.random.rand()) 
    decay = 10**(-4*np.random.rand()-1) # 1e-1~-5
    batch_size = 2**(np.random.randint(6,10)) # 64~1024
    reg = np.random.rand()/10 # regularization strength
    # layers and number of hidden units
    num_layers = np.random.randint(1,5)
    hid_units = []
    for l in range(num_layers):
        hid_units.append(2**(np.random.randint(4,9)))
    hypers = [lr, decay, reg, batch_size, hid_units]
    # define 10-fold cross validation test harness
    kfold = StratifiedKFold(n_splits=10)
    cv_accu = []
    # Model setup
    model = nn_model(lr,hid_units, decay, reg)
    # create estimator for sklearn to train
    estimator = KerasClassifier(build_fn=nn_model, 
                                lr=lr, hid_units=hid_units, decay=decay, 
                                reg = reg, epochs=epochs, batch_size=batch_size, verbose=0)
    kfold = KFold(n_splits=10)
    results = cross_val_score(estimator, X_ary, dummy_y, cv=kfold)
    accu = results.mean()
    print("Baseline: %.2f%% (%.2f%%)" % (accu*100, results.std()*100))
    hypers.append(accu)
    if accu > max_accu:
        max_accu = accu
        best_model = model
        print('max accuracy: ', hypers)
    hypers.append(history.history['categorical_accuracy'])
    records.append(hypers)
    print('---->', i, accu)
# get the index of ranking
accu_records = [i[5] for i in records]
accu_index = [i[0] for i in sorted(enumerate(accu_records), key=lambda x:x[1])]
# list top 3 minimal accuracy and records
accu_index = [i[0] for i in sorted(enumerate(accu_records), key=lambda x:x[1])]

print(' ')
print('The top 10 accuracy are:')
for i in accu_index[:10]:
    print(records[i][:6])



Baseline: 23.33% (5.57%)
max accuracy:  [0.49215565689486696, 0.028226569347106337, 0.08127063750150053, 512, [32], 0.23329411771367575]


NameError: name 'history' is not defined

### SVM