In [1]:
# import libraries
import numpy as np
import pickle
import os
import pandas as pd
import pathlib
import matplotlib.pyplot as plt
%matplotlib inline
import random
import librosa
from scipy.stats import kurtosis, skew
from sklearn import svm
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score
from preprocess import preprocess
warnings.filterwarnings('ignore')


In [3]:
# get the labelled file list for 1 patient
def get_file_list(path):
    """List the entries of ``path`` together with their class labels.

    The label is parsed from each name: the text between the last ``_``
    and the next ``.`` is read as an integer and shifted down by one
    (so a name ending in ``_1.csv`` gets label 0).

    Entries are returned in sorted name order. ``os.listdir`` makes no
    promise about ordering, so without the sort the sample order -- and
    anything order-sensitive downstream -- could differ between machines
    or runs.

    Parameters
    ----------
    path : str
        Directory to list.

    Returns
    -------
    label : list of int
        Zero-based label for every entry.
    file_list : list of str
        Entry names (not full paths), aligned with ``label``.

    Raises
    ------
    ValueError
        If the text between the last ``_`` and the next ``.`` of an entry
        name is not an integer.
    """
    file_list = sorted(os.listdir(path))
    label = [int(name.split("_")[-1].split(".")[0]) - 1 for name in file_list]
    return label,file_list

In [4]:
# load 1 recording, preprocess it and flatten it into a feature vector
def get_feature(path):
    """Build the feature vector for a single recording.

    The CSV at ``path`` is read with the column names ``vertical`` and
    ``horizontal``, converted to an array and handed to ``preprocess``.
    The result's column 0 is then laid end to end with its column 1.

    Returns
    -------
    list
        All values of column 0 followed by all values of column 1.
    """
    frame = pd.read_csv(path, names=["vertical", "horizontal"])
    processed = preprocess(np.array(frame))
    # Column 0 first, then column 1 -- the order downstream models were fit on.
    return list(processed[:, 0]) + list(processed[:, 1])
    
    

In [8]:
# split 1 patient's recordings into train / test / validation by file-name token
def my_train_test_split_user_dependent(path,test_split,val_split,file_list,label):
    """Partition one patient's recordings into train, test and validation sets.

    The third ``_``-separated token of every file name decides where the
    recording goes: tokens contained in ``test_split`` go to the test set,
    a token equal to ``val_split`` goes to the validation set, and
    everything else is used for training.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated directly with each file name,
        so it must end with a path separator.
    test_split : container of str
        Tokens that mark test recordings.
    val_split : str
        Token that marks validation recordings.
    file_list : list of str
        File names inside ``path``.
    label : list of int
        Labels aligned with ``file_list``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, X_val, y_val, y_train, y_test)`` -- note the
        unusual ordering, which callers unpack positionally.
    """
    features = {"train": [], "test": [], "val": []}
    targets = {"train": [], "test": [], "val": []}

    for idx, file in enumerate(file_list):
        file_label = label[idx]
        feature = get_feature(str(path+file))
        token = file.split('_')[2]
        if token in test_split:
            subset = "test"
        elif token == val_split:
            subset = "val"
        else:
            subset = "train"
        features[subset].append(feature)
        targets[subset].append(file_label)

    return (
        np.array(features["train"]),
        np.array(features["test"]),
        np.array(features["val"]),
        np.array(targets["val"]),
        np.array(targets["train"]),
        np.array(targets["test"]),
    )

In [9]:
def evaluate(y_true, y_pred,y_prob):
    """Score predictions with accuracy and micro/macro F1, precision and recall.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like
        Unused. Kept in the signature so existing callers, which pass the
        class probabilities positionally, keep working. A ROC-AUC used to
        be computed from it here but was never returned, so that call was
        only extra work and an extra way for scoring to fail.

    Returns
    -------
    tuple of float
        ``(f1_micro, f1_macro, precision_micro, precision_macro,
        recall_micro, recall_macro, acc)``.
    """
    f1_micro = f1_score(y_true, y_pred,average = 'micro')
    f1_macro = f1_score(y_true, y_pred,average = 'macro')
    precision_micro = precision_score(y_true, y_pred, average='micro')
    precision_macro = precision_score(y_true, y_pred, average='macro')
    recall_micro = recall_score(y_true, y_pred, average='micro')
    recall_macro = recall_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)

    return f1_micro,f1_macro,precision_micro,precision_macro,recall_micro,recall_macro,acc

## User dependent

In [10]:
# Patient folder names: "001" .. "006".
patient = [f"{number:03d}" for number in range(1, 7)]
# Fold t tests on the recordings whose third file-name token is in test_split[t]:
# ["01", "02"], ["03", "04"], ... ["09", "10"].
test_split = [[f"{first:02d}", f"{first + 1:02d}"] for first in range(1, 10, 2)]
# Fold t validates on the token val_split[t]; none of them falls inside the
# matching test_split[t] pair.
val_split = ["03", "01", "04", "05", "06"]

In [11]:
def grid_search(X_train,X_test, X_val,y_val,y_train,y_test):
    """Fit one SVC per (C, kernel) combination and score it on val and test.

    Every combination of ``C`` in ``[0.01, 1, 100]`` and ``kernel`` in
    ``['linear', 'poly', 'rbf', 'sigmoid']`` is trained on the training
    data and evaluated on both the validation and the test data. No
    selection happens here; the caller chooses a setting afterwards.

    Returns
    -------
    list of dict
        One record per combination holding the hyper-parameters, the fitted
        model and the ``val_*`` / ``test_*`` metrics. The kernel is stored
        under the (misspelled) key ``"kernal"``, which downstream cells
        read, so it must not be renamed here.
    """
    records = []
    for c in [0.01,1,100]:
        for k in ['linear', 'poly', 'rbf', 'sigmoid']:
            clf = svm.SVC(C=c,kernel = k,probability=True,random_state=42)
            clf.fit(X_train, y_train)

            # validation scores
            f1_micro,f1_macro,precision_micro,precision_macro,recall_micro,recall_macro,acc = evaluate(
                y_val, clf.predict(X_val), clf.predict_proba(X_val))
            record = {
                "C": c,
                "kernal": k,
                "model": clf,
                "val_Accuracy": acc,
                "val_f1_micro": f1_micro,
                "val_f1_macro": f1_macro,
                "val_precision_micro": precision_micro,
                "val_precision_macro": precision_macro,
                "val_recall_micro": recall_micro,
                "val_recall_macro": recall_macro,
            }

            # test scores
            f1_micro,f1_macro,precision_micro,precision_macro,recall_micro,recall_macro,acc = evaluate(
                y_test, clf.predict(X_test), clf.predict_proba(X_test))
            record.update({
                "test_Accuracy": acc,
                "test_f1_micro": f1_micro,
                "test_f1_macro": f1_macro,
                "test_precision_micro": precision_micro,
                "test_precision_macro": precision_macro,
                "test_recall_micro": recall_micro,
                "test_recall_macro": recall_macro,
            })
            records.append(record)
    return records


In [12]:
# Grid-search an SVM for every patient and every fold, then pickle the
# per-patient results under ./User_dependent/.
# open(..., "wb") does not create missing directories, so make sure the
# output folder exists before any (slow) training starts.
os.makedirs("./User_dependent", exist_ok=True)

for p in patient:
    print(p)
    output_pck = []
    path = str("../../../../data/isolated 2/"+p+"/isolated_strokes/")
    # The directory listing is the same for every fold: read it once per patient.
    label,file_list = get_file_list(path)
    for t in range(len(test_split)):
        print(test_split[t])
        X_train,X_test,X_val,y_val, y_train,y_test = my_train_test_split_user_dependent(path,test_split[t],val_split[t],file_list,label)
        print(X_val.shape,y_val.shape)
        # one record per fold: the fold index plus every grid-search result
        output = dict()
        output["test_split"] = t
        output["grid_search"] = grid_search(X_train,X_test, X_val,y_val,y_train,y_test)
        output_pck.append(output)

    with open(str("./User_dependent/patient_"+p+"_SVC.pck"), "wb") as output_file:
        pickle.dump(output_pck, output_file)
        
    
    

001
['01', '02']
(12, 200) (12,)
['03', '04']
(12, 200) (12,)
['05', '06']
(12, 200) (12,)
['07', '08']
(12, 200) (12,)
['09', '10']
(12, 200) (12,)
002
['01', '02']
(12, 200) (12,)
['03', '04']
(12, 200) (12,)
['05', '06']
(12, 200) (12,)
['07', '08']
(12, 200) (12,)
['09', '10']
(12, 200) (12,)
003
['01', '02']
(12, 200) (12,)
['03', '04']
(12, 200) (12,)
['05', '06']
(12, 200) (12,)
['07', '08']
(12, 200) (12,)
['09', '10']
(12, 200) (12,)
004
['01', '02']
(12, 200) (12,)
['03', '04']
(12, 200) (12,)
['05', '06']
(12, 200) (12,)
['07', '08']
(12, 200) (12,)
['09', '10']
(12, 200) (12,)
005
['01', '02']
(12, 200) (12,)
['03', '04']
(12, 200) (12,)
['05', '06']
(12, 200) (12,)
['07', '08']
(12, 200) (12,)
['09', '10']
(12, 200) (12,)
006
['01', '02']
(12, 200) (12,)
['03', '04']
(12, 200) (12,)
['05', '06']
(12, 200) (12,)
['07', '08']
(12, 200) (12,)
['09', '10']
(12, 200) (12,)


In [13]:
# For every (C, kernel) setting, average the validation macro-F1 over one
# patient's folds and add that average to the setting's running total.
# After the loop each value in `parameter` is therefore a SUM over patients
# (it can exceed 1); divide by the number of patients to get a mean.
parameter = dict()
# The grid is the same for every patient, so define it once.
C = [0.01,1,100]
kernel= ['linear', 'poly', 'rbf', 'sigmoid']

for p in patient:
    with open(str("./User_dependent/patient_"+p+"_SVC.pck"), "rb") as input_file:
        pck = pickle.load(input_file)

    for c in C:
        for k in kernel:
            f1 = 0
            count = 0
            for fold in pck:
                for result in fold["grid_search"]:
                    # "kernal" (sic) is the key the grid search stored.
                    if result['C'] == c and result['kernal'] == k:
                        f1 += result['val_f1_macro']
                        count += 1
            key = str("C:"+str(c)+" kernel:"+k)
            parameter[key] = parameter.get(key, 0) + f1/count




In [14]:
# Validation macro-F1 per setting. Each value is the sum over patients of that
# patient's fold-averaged score, so it is not bounded by 1.
parameter

{'C:0.01 kernel:linear': 3.646190476190476,
 'C:0.01 kernel:poly': 3.705,
 'C:0.01 kernel:rbf': 1.6328133903133901,
 'C:0.01 kernel:sigmoid': 0.3834110334110334,
 'C:1 kernel:linear': 4.965,
 'C:1 kernel:poly': 4.896666666666667,
 'C:1 kernel:rbf': 4.7316666666666665,
 'C:1 kernel:sigmoid': 0.509066674066674,
 'C:100 kernel:linear': 4.9816666666666665,
 'C:100 kernel:poly': 4.930000000000001,
 'C:100 kernel:rbf': 5.042777777777777,
 'C:100 kernel:sigmoid': 0.46675925925925915}

In [15]:
# Setting with the highest accumulated validation macro-F1 (first one on ties).
best_setting = max(parameter, key=parameter.get)
print("bester parameter", best_setting)
# `parameter` holds per-patient means summed over all patients, so divide by
# the number of patients (instead of a hard-coded 6) to report the mean.
print("best f1 score", parameter[best_setting]/len(patient))

bester parameter C:100 kernel:rbf
best f1 score 0.8404629629629629


In [17]:
# Report the test metrics of the selected setting (C=100, rbf), averaged over
# every patient and fold.
report = [
    ("acc", 'test_Accuracy'),
    ("f1_macro", 'test_f1_macro'),
    ("f1_micro", 'test_f1_micro'),
    ("recall_micro", 'test_recall_micro'),
    ("recall_macro", 'test_recall_macro'),
    ("precision_micro", 'test_precision_micro'),
    ("precision_macro", 'test_precision_macro'),
]
totals = {name: 0 for name, _ in report}
count = 0

for p in patient:
    with open(str("./User_dependent/patient_"+p+"_SVC.pck"), "rb") as input_file:
        pck = pickle.load(input_file)
    for fold in pck:
        for result in fold["grid_search"]:
            if result['C'] == 100 and result['kernal'] == 'rbf':
                for name, key in report:
                    totals[name] += result[key]
                count += 1

for name, _ in report:
    print(name, totals[name]/count)


acc 0.8735507246376808
f1_macro 0.865952380952381
f1_micro 0.8735507246376808
recall_micro 0.8735507246376808
recall_macro 0.8736111111111107
precision_micro 0.8735507246376808
precision_macro 0.8941203703703703


In [21]:
# Fit the selected per-patient model (C=100, rbf) on all of that patient's
# recordings and pickle it.
for p in patient:
    path = str("../../../../data/isolated 2/"+p+"/isolated_strokes/")
    label,file_list = get_file_list(path)
    # one feature vector per file, in listing order, with the matching labels
    X = [get_feature(str(path+file)) for file in file_list]
    y = list(label)
    print(len(X),len(y))

    clf = svm.SVC(C=100,kernel = 'rbf',probability=True,random_state=42)
    clf.fit(X, y)
    with open(str("../Isolated Model/SVC__sub"+p+".pck"), "wb") as output_file:
        pickle.dump(clf, output_file)
        
    
    

120 120
120 120
122 122
121 121
120 120
121 121


## User independent

In [22]:
# split by patient: separate patients for training, validation and testing
def my_train_test_split_user_independent(test_patient,val_patient,train_patient,
                                         data_root="../../../../data/isolated 2/"):
    """Build train / test / validation sets from disjoint groups of patients.

    Every recording found in ``<data_root><patient>/isolated_strokes/`` is
    turned into a feature vector with ``get_feature`` and labelled by
    ``get_file_list``.

    Parameters
    ----------
    test_patient : str
        Folder name of the patient whose recordings form the test set.
    val_patient : str
        Folder name of the patient whose recordings form the validation set.
    train_patient : iterable of str
        Folder names of the patients whose recordings form the training set.
    data_root : str, optional
        Prefix of the per-patient folders. It is concatenated directly with
        the patient name, so it must end with a path separator. Defaults to
        the location the notebook has always used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, X_val, y_val, y_train, y_test)`` -- note the
        unusual ordering, which callers unpack positionally.
    """
    def _load(patient_ids):
        # Features and labels of every recording of the given patients.
        features, labels = [], []
        for p in patient_ids:
            path = str(data_root+p+"/isolated_strokes/")
            label,file_list = get_file_list(path)
            for i in range(len(file_list)):
                features.append(get_feature(str(path+file_list[i])))
                labels.append(label[i])
        return np.array(features), np.array(labels)

    X_train, y_train = _load(train_patient)
    X_test, y_test = _load([test_patient])
    X_val, y_val = _load([val_patient])
    return X_train,X_test, X_val,y_val,y_train,y_test

In [24]:
# Patient-wise folds: fold t tests on test_patien_list[t], validates on
# val_patient_list[t] and trains on the four remaining patients.
test_patien_list = ["001","002","003","004","005","006"]
val_patient_list = ["002","003","004","005","006","001"]
output_pck = []

# open(..., "wb") does not create missing directories, so make sure the
# output folder exists before any (slow) training starts.
os.makedirs("./User_independent", exist_ok=True)

for t in range(len(test_patien_list)):
    test_patient = test_patien_list[t]
    val_patient = val_patient_list[t]
    # everyone who is neither the test nor the validation patient
    train_patient = [
        candidate
        for candidate in ["001","002","003","004","005","006"]
        if candidate not in (test_patient, val_patient)
    ]
    print(test_patient,val_patient,train_patient)
    X_train,X_test, X_val,y_val,y_train,y_test = my_train_test_split_user_independent(test_patient,val_patient,train_patient)

    # one record per fold: the fold index plus every grid-search result
    output = dict()
    output["test_split"] = t
    output["grid_search"] = grid_search(X_train,X_test, X_val,y_val,y_train,y_test)
    output_pck.append(output)


with open(str("./User_independent/User-independent-SVC.pck"), "wb") as output_file:
    pickle.dump(output_pck, output_file)


001 002 ['003', '004', '005', '006']
002 003 ['001', '004', '005', '006']
003 004 ['001', '002', '005', '006']
004 005 ['001', '002', '003', '006']
005 006 ['001', '002', '003', '004']
006 001 ['002', '003', '004', '005']


In [26]:
# Mean validation macro-F1 of every (C, kernel) setting over all folds.
parameter = dict()
with open(str("./User_independent/User-independent-SVC.pck"), "rb") as input_file:
    pck = pickle.load(input_file)
C = [0.01,1,100]
kernel= ['linear', 'poly', 'rbf', 'sigmoid']
for c in C:
    for k in kernel:
        # validation scores of this setting, one per fold that contains it
        scores = [
            result['val_f1_macro']
            for fold in pck
            for result in fold["grid_search"]
            if result['C'] == c and result['kernal'] == k
        ]
        # every (c, k) pair is visited once, so a plain assignment is enough
        parameter[str("C:"+str(c)+" kernel:"+k)] = sum(scores)/len(scores)


In [27]:
# Validation macro-F1 per setting, averaged over the folds stored in the pickle.
parameter

{'C:0.01 kernel:linear': 0.5790401756834708,
 'C:0.01 kernel:poly': 0.48929764285345806,
 'C:0.01 kernel:rbf': 0.13830490171123214,
 'C:0.01 kernel:sigmoid': 0.07318836870338591,
 'C:1 kernel:linear': 0.6111696348018266,
 'C:1 kernel:poly': 0.6114089186586383,
 'C:1 kernel:rbf': 0.6635423362540734,
 'C:1 kernel:sigmoid': 0.08005649637699712,
 'C:100 kernel:linear': 0.5744299301257022,
 'C:100 kernel:poly': 0.6033309393524217,
 'C:100 kernel:rbf': 0.6371213638672343,
 'C:100 kernel:sigmoid': 0.024189134478842564}

In [28]:
# Setting with the highest mean validation macro-F1 (first one on ties).
best_setting = max(parameter, key=parameter.get)
print("bester parameter", best_setting)
print("best f1 score", parameter[best_setting])

bester parameter C:1 kernel:rbf
best f1 score 0.6635423362540734


In [29]:
# Report the test metrics of the selected setting (C=1, rbf), averaged over
# every fold.
report = [
    ("acc", 'test_Accuracy'),
    ("f1_macro", 'test_f1_macro'),
    ("f1_micro", 'test_f1_micro'),
    ("recall_micro", 'test_recall_micro'),
    ("recall_macro", 'test_recall_macro'),
    ("precision_micro", 'test_precision_micro'),
    ("precision_macro", 'test_precision_macro'),
]
totals = {name: 0 for name, _ in report}
count = 0

with open(str("./User_independent/User-independent-SVC.pck"), "rb") as input_file:
    pck = pickle.load(input_file)
for fold in pck:
    for result in fold["grid_search"]:
        if result['C'] == 1 and result['kernal'] == 'rbf':
            for name, key in report:
                totals[name] += result[key]
            count += 1

for name, _ in report:
    print(name, totals[name]/count)

acc 0.6620284439025125
f1_macro 0.633232952168275
f1_micro 0.6620284439025125
recall_micro 0.6620284439025125
recall_macro 0.6626964085297419
precision_micro 0.6620284439025125
precision_macro 0.7014261530499276


In [36]:
# Fit the selected user-independent model (C=1, rbf) on the recordings of
# all patients pooled together and pickle it.
X = []
y = []
for p in patient:
    path = str("../../../../data/isolated 2/"+p+"/isolated_strokes/")
    label,file_list = get_file_list(path)
    for idx, file in enumerate(file_list):
        X.append(get_feature(str(path+file)))
        y.append(label[idx])
print(len(X),len(y))

clf = svm.SVC(C=1,kernel = 'rbf',probability=True,random_state=42)
clf.fit(X, y)

with open(str("../Isolated Model/SVC__subAll.pck"), "wb") as output_file:
    pickle.dump(clf, output_file)
        

724 724
