In [1]:
import numpy as np
from scipy.io import loadmat

import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.ensemble import RandomForestClassifier

# confusion matrix
from sklearn import metrics
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html




#from tqdm import tqdm # for progressive bar

from sklearn.externals import joblib # to save model

In [2]:
def load_one_epoch(subject, data_type, epoch_num):
    if data_type is 'train':
        epoch_data=loadmat("data/{}{}-allfilt10.mat".format(subject, epoch_num))

        x=epoch_data['x']
        y=epoch_data['y']
        code=epoch_data['code']

    elif data_type is 'test':
        epoch_data=loadmat("data/{}t{}-allfilt10.mat".format(subject, epoch_num))

        x=epoch_data['x']
        #y=epoch_data['y']
        code=epoch_data['code']
        
        # decode y 
        #print('decode y')
        true_code = np.loadtxt("data/{}t_{}_true_code.txt".format(subject, epoch_num))

        y = -np.ones(code.shape)
        idx = (code == true_code[0]) | (code == true_code[1])
        y[idx] = 1

    else:
        pass
            


    return x, y, code

In [3]:
def load_data(subject, data_type, num_epoches):
    X = list()
    Y = list()
    C = list()
    for epoch_num in range(num_epoches):
        #print(epoch_num)
        x, y, code = load_one_epoch(subject, data_type, 1+epoch_num)

        X.append(x)
        Y.append(y)
        C.append(code)
        


    X = np.array(X)
    Y = np.array(Y)
    C = np.array(C)

    print('loaded:')
    print(X.shape)
    print(Y.shape)
    print(C.shape)    

    num_trials = X.shape[1]
    data_dim = X.shape[2]

    # stack epoches
    X = X.reshape(-1,data_dim)
    Y = Y.ravel()
    C = C.ravel()    

    print('stacked:')
    print(X.shape)
    print(Y.shape)
    print(C.shape)    
    
    return X, Y, C

## Random Forests classifier

In [4]:
subject = 'A'
X_train, Y_train, Code_train = load_data(subject, 'train', 85)
X_test, Y_test, Code_test = load_data(subject, 'test', 100)

loaded:
(85, 180, 896)
(85, 180, 1)
(85, 180, 1)
stacked:
(15300, 896)
(15300,)
(15300,)
loaded:
(100, 180, 896)
(100, 180, 1)
(100, 180, 1)
stacked:
(18000, 896)
(18000,)
(18000,)


In [5]:
clf = RandomForestClassifier(class_weight='balanced', n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=16, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [6]:
num_ch = 64
joblib.dump(clf, "{}_RF_ch{}.model".format(subject,num_ch))

['A_RF_ch64.model']

In [7]:
# predict
Y_est=clf.predict(X_test)

#accuracy = 
sum(Y_test==Y_est) / len(Y_test)

0.6615

In [8]:
# confusion matrix
from sklearn import metrics
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

metrics.confusion_matrix(Y_test,Y_est,labels=(1,-1))

array([[ 1576,  1424],
       [ 4669, 10331]])

## subject B

In [9]:
subject = 'B'
X_train, Y_train, Code_train = load_data(subject, 'train', 85)
X_test, Y_test, Code_test = load_data(subject, 'test', 100)

loaded:
(85, 180, 896)
(85, 180, 1)
(85, 180, 1)
stacked:
(15300, 896)
(15300,)
(15300,)
loaded:
(100, 180, 896)
(100, 180, 1)
(100, 180, 1)
stacked:
(18000, 896)
(18000,)
(18000,)


In [10]:
# train
clf = RandomForestClassifier(class_weight='balanced', n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=16, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [11]:
# save model
num_ch = 64
joblib.dump(clf, "{}_RF_ch{}.model".format(subject,num_ch))

['B_RF_ch64.model']

In [12]:
# test
Y_est=clf.predict(X_test)

#accuracy = 
sum(Y_test==Y_est) / len(Y_test)

0.6856666666666666

In [13]:
# confusion matrix
metrics.confusion_matrix(Y_test,Y_est,labels=(1,-1))

array([[ 1498,  1502],
       [ 4156, 10844]])

## Joint

In [14]:
subject = 'A'
X_train_A, Y_train_A, Code_train_A = load_data(subject, 'train', 85)
X_test_A, Y_test_A, Code_test_A = load_data(subject, 'test', 100)

subject = 'B'
X_train_B, Y_train_B, Code_train_B = load_data(subject, 'train', 85)
X_test_B, Y_test_B, Code_test_B = load_data(subject, 'test', 100)


loaded:
(85, 180, 896)
(85, 180, 1)
(85, 180, 1)
stacked:
(15300, 896)
(15300,)
(15300,)
loaded:
(100, 180, 896)
(100, 180, 1)
(100, 180, 1)
stacked:
(18000, 896)
(18000,)
(18000,)
loaded:
(85, 180, 896)
(85, 180, 1)
(85, 180, 1)
stacked:
(15300, 896)
(15300,)
(15300,)
loaded:
(100, 180, 896)
(100, 180, 1)
(100, 180, 1)
stacked:
(18000, 896)
(18000,)
(18000,)


In [15]:
# combine
X_train = np.vstack((X_train_A,X_train_B))
print(X_train.shape)
Y_train = np.concatenate((Y_train_A, Y_train_B))
print(Y_train.shape)

X_test = np.vstack((X_test_A, X_test_B))
print(X_test.shape)
Y_test = np.concatenate((Y_test_A, Y_test_B))
print(Y_test.shape)




(30600, 896)
(30600,)
(36000, 896)
(36000,)


In [16]:
# train
clf = RandomForestClassifier(class_weight='balanced', n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=16, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [17]:
# save model
subject = 'AandB'
num_ch = 64
joblib.dump(clf, "{}_RF_ch{}.model".format(subject,num_ch)) 
    

['AandB_RF_ch64.model']

In [18]:
# predict
Y_est=clf.predict(X_test)

#accuracy = 
sum(Y_test==Y_est) / len(Y_test)

0.62225

In [19]:
# confusion matrix
metrics.confusion_matrix(Y_test,Y_est,labels=(1,-1))

array([[ 3160,  2840],
       [10759, 19241]])

In [20]:
# predict A using joint model
Y_est_A=clf.predict(X_test_A)

#accuracy = 
sum(Y_test_A==Y_est_A) / len(Y_test_A)



0.6362777777777778

In [21]:
# predict B using joint model

Y_est_B=clf.predict(X_test_B)

#accuracy = 
sum(Y_test_B==Y_est_B) / len(Y_test_B)



0.6082222222222222