In [5]:
# import libraries
import numpy as np
import pickle
import os
import pandas as pd
import pathlib
import matplotlib.pyplot as plt
%matplotlib inline
import random
import librosa
from scipy.stats import kurtosis, skew
from sklearn import svm
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score
from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings('ignore')


In [6]:
# get file list for 1 patient
def get_file_list(path):
    file_list = []
    label = []
    for i in os.listdir(path):
        file_list.append(i)
        if i.endswith("001.csv"):
            label.append(0)
        elif i.endswith("002.csv"):
            label.append(1)
        elif i.endswith("003.csv"):
            label.append(2)
        elif i.endswith("004.csv"):
            label.append(3)
        elif i.endswith("005.csv"):
            label.append(4)
        elif i.endswith("006.csv"):
            label.append(5)
        elif i.endswith("007.csv"):
            label.append(6)
        elif i.endswith("008.csv"):
            label.append(7)
        elif i.endswith("009.csv"):
            label.append(8)
        elif i.endswith("010.csv"):
            label.append(9)
        elif i.endswith("011.csv"):
            label.append(10)
        elif i.endswith("012.csv"):
            label.append(11)

    
    return label,file_list

In [7]:
# get feature from 1 file and normalize
def get_feature(path):
    f = []
    scaler = MinMaxScaler()
    a = pd.read_csv(path,names=["vertical","horizontal"])
    a = np.array(a)
    a = scaler.fit_transform(a)
    for j in a[:,0]:
        f.append(j)        
    for j in a[:,1]:
        f.append(j)
    return f
    
    

In [8]:
# self identified test_split
def my_train_test_split_user_dependent(path,test_split,val_split,file_list,label):
    X_test = []
    X_train = []
    X_val = []
    y_val = []
    y_train = []
    y_test = []
    for f in range(len(file_list)):
        file = file_list[f]
        file_label = label[f]
        feature = get_feature(str(path+file))
        #print(file.split('_')[2],file_label)
        if file.split('_')[2] == test_split:
            X_test.append(feature)
            y_test.append(file_label)
            #print(file,len(feature))
        elif file.split('_')[2] == val_split:
            X_val.append(feature)
            y_val.append(file_label)
        else:
            X_train.append(feature)
            y_train.append(file_label)
            #print(file)

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    X_val = np.array(X_val)
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    y_test = np.array(y_test)
    
    return X_train,X_test,X_val,y_val, y_train,y_test

In [9]:
def evaluate(y_true, y_pred,y_prob):
    f1 = f1_score(y_true, y_pred,average = 'micro')
    precision = precision_score(y_true, y_pred, average='micro')
    recall = recall_score(y_true, y_pred, average='micro')
    acc = accuracy_score(y_true, y_pred)
    #cm = confusion_matrix(y_true, y_pred)
    #disp = ConfusionMatrixDisplay(confusion_matrix=cm,
     #                         display_labels=[0,1,2,3,4,5,6,7,8,9,10,11])
    #disp.plot()
    #plt.show()
    #class_accuracy = cm.diagonal()/cm.sum(axis=1)
    #specificity = class_accuracy[1]
    #sensitivity = class_accuracy[0]
    auc = roc_auc_score(y_true,y_prob,multi_class="ovr",average="micro")

    return f1,precision,recall,acc,auc

## User dependent

In [15]:
patient = ["001","002","003","004","005","006"]
test_split = ["01","02","03","04","05","06","07","08","09"]
val_split = ['02','03',"04","05","06","07","08","09",'01']

In [16]:
def grid_search(X_train,X_test, X_val,y_val,y_train,y_test):
    C = [0.01,1,100]
    kernel= ['linear', 'poly', 'rbf', 'sigmoid']
    grid_search_output = []
    for c in C:
        for k in kernel:
            output = dict()
            output["C"] = c
            output["kernal"] = k
            clf = svm.SVC(C=c,kernel = k,probability=True,random_state=42)
            clf.fit(X_train, y_train)
            
            y_pred = clf.predict(X_val)
            y_prob = clf.predict_proba(X_val)
            f1,precision,recall,acc,auc = evaluate(y_val, y_pred,y_prob)
            output["model"] = clf
            output["val_Accuracy"] = acc
            output["val_F1"] = f1
            output["val_AUC"] = auc
            output["val_precision"]= precision
            output["val_recall"]= recall
            
            
            y_pred = clf.predict(X_test)
            y_prob = clf.predict_proba(X_test)
            f1,precision,recall,acc,auc = evaluate(y_test, y_pred,y_prob)
            output["test_Accuracy"] = acc
            output["test_F1"] = f1
            output["test_AUC"] = auc
            output["test_precision"]= precision
            output["test_recall"]= recall
            grid_search_output.append(output)
    return grid_search_output

In [17]:
for p in patient:
    print(p)
    output_pck = []
    path = path = str("../../data/isolated 2/"+p+"/isolated_strokes/")
    for t in range(len(test_split)):
        label,file_list = get_file_list(path)
        X_train,X_test,X_val,y_val, y_train,y_test = my_train_test_split_user_dependent(path,test_split[t],val_split[t],file_list,label)
        print(X_val.shape,y_val.shape)
         # build model
        # set parameter 
        output = dict()
        output["test_split"] = t
        grid_search_output = grid_search(X_train,X_test, X_val,y_val,y_train,y_test)
        output["grid_search"] = grid_search_output
        output_pck.append(output)
        
    with open(str("./User_dependent/patient_"+p+"_SVC.pck"), "wb") as output_file:
        pickle.dump(output_pck, output_file)
        
    
    

001
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
002
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
003
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
004
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
005
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
006
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)
(12, 2500) (12,)


In [18]:
parameter = dict()
for p in patient:
    #print(p)
    with open(str("./User_dependent/patient_"+p+"_SVC.pck"), "rb") as input_file:
        pck = pickle.load(input_file)
    C = [0.01,1,100]
    kernel= ['linear', 'poly', 'rbf', 'sigmoid']
    grid_search_output = []

    
    for c in C:
        for k in kernel:
            #print("C:",c," kernel:",k)

            acc = 0
            f1 = 0
            auc = 0
            recall = 0
            precision = 0
            count = 0
            for i in range(len(pck)):
                tmp = pck[i]["grid_search"]
                for j in range(len(tmp)):
                    tmp_tesult = tmp[j]
                    if tmp_tesult['C'] == c and tmp_tesult['kernal'] == k:
                        acc += tmp_tesult['val_Accuracy']
                        f1 += tmp_tesult['val_F1']
                        auc+=tmp_tesult['val_AUC']
                        recall += tmp_tesult['val_recall']
                        precision += tmp_tesult['val_precision']
                        count+=1
#             print("Accuracy:",round(acc/count,2),
#                   "F1:",round(f1/count,2),
#                  "auc:",round(auc/count,2),
#                  "precision:",round(precision/count,2),
#                  "recall:",round(recall/count,2),)
            if str("C:"+str(c)+" kernel:"+k) in parameter:
                parameter[str("C:"+str(c)+" kernel:"+k)] += f1/count
            else:
                #print(str("C:"+str(c)+" kernel:"+k))
                parameter[str("C:"+str(c)+" kernel:"+k)] = f1/count

print()




In [19]:
parameter

{'C:0.01 kernel:linear': 5.333333333333333,
 'C:0.01 kernel:poly': 4.231481481481482,
 'C:0.01 kernel:rbf': 2.1388888888888893,
 'C:0.01 kernel:sigmoid': 0.7037037037037037,
 'C:1 kernel:linear': 5.3425925925925934,
 'C:1 kernel:poly': 5.287037037037036,
 'C:1 kernel:rbf': 5.259259259259259,
 'C:1 kernel:sigmoid': 0.8518518518518519,
 'C:100 kernel:linear': 5.3425925925925934,
 'C:100 kernel:poly': 5.296296296296296,
 'C:100 kernel:rbf': 5.379629629629629,
 'C:100 kernel:sigmoid': 0.6296296296296297}

In [20]:
print("bester parameter",list(parameter.keys())[list(parameter.values()).index(max(parameter.values()))])
print("best f1 score", max(parameter.values())/6)

bester parameter C:100 kernel:rbf
best f1 score 0.8966049382716048


In [21]:
# report test data
count = 0
acc = 0
f1 = 0
auc = 0
recall = 0
precision = 0
for p in patient:
    with open(str("./User_dependent/patient_"+p+"_SVC.pck"), "rb") as input_file:
        pck = pickle.load(input_file)
    for i in range(len(pck)):
        tmp = pck[i]["grid_search"]
        for j in range(len(tmp)):
            tmp_tesult = tmp[j]
            if tmp_tesult['C'] == 1 and tmp_tesult['kernal'] == 'linear':
                #print(tmp_tesult['test_Accuracy'],tmp_tesult['test_F1'],tmp_tesult['test_recall'], tmp_tesult['test_precision'])
                acc += tmp_tesult['test_Accuracy']
                f1 += tmp_tesult['test_F1']
                auc+=tmp_tesult['test_AUC']
                recall += tmp_tesult['test_recall']
                precision += tmp_tesult['test_precision']
                count+=1

print("acc",acc/count)
print("f1",f1/count)
print("auc",auc/count)
print("recall",recall/count)
print("precision",precision/count)


acc 0.9012345679012344
f1 0.9012345679012344
auc 0.9896534792368128
recall 0.9012345679012344
precision 0.9012345679012344


## User independent

In [23]:
# self identified test_split
def my_train_test_split_user_independent(test_patient,val_patient,train_patient):
    X_test = []
    X_train = []
    X_val = []
    y_val = []
    y_train = []
    y_test = []
    for p in train_patient:
        path = str("../../data/isolated 2/"+p+"/isolated_strokes/")
        label,file_list = get_file_list(path)
        for i in range(len(file_list)):
            file = file_list[i]
            file_label = label[i]
            feature = get_feature(str(path+file))
            X_train.append(feature)
            y_train.append(file_label)
        
    path = str("../../data/isolated 2/"+test_patient+"/isolated_strokes/")
    label,file_list = get_file_list(path)
    for i in range(len(file_list)):
        file = file_list[i]
        file_label = label[i]
        feature = get_feature(str(path+file))
        X_test.append(feature)
        y_test.append(file_label)
    
    path = str("../../data/isolated 2/"+val_patient+"/isolated_strokes/")
    label,file_list = get_file_list(path)
    for i in range(len(file_list)):
        file = file_list[i]
        file_label = label[i]
        feature = get_feature(str(path+file))
        X_val.append(feature)
        y_val.append(file_label)

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    X_val = np.array(X_val)
    y_val = np.array(y_val)
    return X_train,X_test, X_val,y_val,y_train,y_test

In [27]:
test_patien_list = ["001","002","003","004","005","006"]
val_patient_list = ["002","003","004","005","006","001"]
output_pck = []
F1 = 0
 
for t in range(len(test_patien_list)):
    train_patient= ["001","002","003","004","005","006"]
    test_patient = test_patien_list[t]
    val_patient = val_patient_list[t]
    train_patient.remove(test_patient)
    train_patient.remove(val_patient)
    print(test_patient,val_patient,train_patient)
    X_train,X_test, X_val,y_val,y_train,y_test = my_train_test_split_user_independent(test_patient,val_patient,train_patient)

    output = dict()
    output["test_split"] = t
    grid_search_output = grid_search(X_train,X_test, X_val,y_val,y_train,y_test)
    output["grid_search"] = grid_search_output
    output_pck.append(output)
    

with open(str("./User_independent/User-independent-SVC.pck"), "wb") as output_file:
    pickle.dump(output_pck, output_file)


001 002 ['003', '004', '005', '006']
002 003 ['001', '004', '005', '006']
003 004 ['001', '002', '005', '006']
004 005 ['001', '002', '003', '006']
005 006 ['001', '002', '003', '004']
006 001 ['002', '003', '004', '005']


In [30]:
# repor best paramter
parameter = dict()
with open(str("./User_independent/User-independent-SVC.pck"), "rb") as input_file:
        pck = pickle.load(input_file)
C = [0.01,1,100]
kernel= ['linear', 'poly', 'rbf', 'sigmoid']
for c in C:
    for k in kernel:

        f1 = 0
        count = 0
        for i in range(len(pck)):
            tmp = pck[i]["grid_search"]
            for j in range(len(tmp)):
                tmp_tesult = tmp[j]
                if tmp_tesult['C'] == c and tmp_tesult['kernal'] == k:

                    f1 += tmp_tesult['val_F1']
#                         auc+=tmp_tesult['train_AUC']
#                         recall += tmp_tesult['train_recall']
#                         precision += tmp_tesult['train_precision']
                    count+=1
#             print("Accuracy:",round(acc/count,2),
#                   "F1:",round(f1/count,2),
#                  "auc:",round(auc/count,2),
#                  "precision:",round(precision/count,2),
#                  "recall:",round(recall/count,2),)
        if str("C:"+str(c)+" kernel:"+k) in parameter:
            parameter[str("C:"+str(c)+" kernel:"+k)] += f1/count
        else:
                #print(str("C:"+str(c)+" kernel:"+k))
            parameter[str("C:"+str(c)+" kernel:"+k)] = f1/count


In [31]:
parameter

{'C:0.01 kernel:linear': 0.6700780156257057,
 'C:0.01 kernel:poly': 0.5305762543467462,
 'C:0.01 kernel:rbf': 0.27098800975477577,
 'C:0.01 kernel:sigmoid': 0.11479775399298499,
 'C:1 kernel:linear': 0.598255280073462,
 'C:1 kernel:poly': 0.6438940447696035,
 'C:1 kernel:rbf': 0.699232827530145,
 'C:1 kernel:sigmoid': 0.10064693131012059,
 'C:100 kernel:linear': 0.5941115702479339,
 'C:100 kernel:poly': 0.6286279335832242,
 'C:100 kernel:rbf': 0.6702031492269942,
 'C:100 kernel:sigmoid': 0.042780826145207666}

In [33]:
print("bester parameter",list(parameter.keys())[list(parameter.values()).index(max(parameter.values()))])
print("best f1 score", max(parameter.values()))

bester parameter C:1 kernel:rbf
best f1 score 0.699232827530145


In [34]:
# report test accuracy
count = 0
acc = 0
f1 = 0
auc = 0
recall = 0
precision = 0
with open(str("./User_independent/User-independent-SVC.pck"), "rb") as input_file:
    pck = pickle.load(input_file)
for i in range(len(pck)):
    tmp = pck[i]["grid_search"]
    for j in range(len(tmp)):
        tmp_tesult = tmp[j]
        if tmp_tesult['C'] == 1 and tmp_tesult['kernal'] == 'linear':
                #print(tmp_tesult['test_Accuracy'],tmp_tesult['test_F1'],tmp_tesult['test_recall'], tmp_tesult['test_precision'])
            acc += tmp_tesult['test_Accuracy']
            f1 += tmp_tesult['test_F1']
            auc+=tmp_tesult['test_AUC']
            recall += tmp_tesult['test_recall']
            precision += tmp_tesult['test_precision']
            count+=1

print("acc",acc/count)
print("f1",f1/count)
print("auc",auc/count)
print("recall",recall/count)
print("precision",precision/count)

acc 0.6357191512742929
f1 0.6357191512742929
auc 0.9362457162462082
recall 0.6357191512742929
precision 0.6357191512742929
