In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import KFold
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [2]:
def seq_encoding(path, outfile):
    dataMat = []; labelMat = []
    dataList = []; labelList = []
    feature_val = []
    encoded_20D = []
    result_1D = []
    
    files = os.listdir(path)
    print(files)
    for file in files:
        if not os.path.isdir(file):  #open when it is a folder
            f = open(path+'/'+file,'r')
            lines = f.readlines()  # return a list
            print(lines)
            for line in lines:
                #separate the sequences and labels
                l_line = line.strip()
                line_list = l_line.split(' ')
                while '' in line_list:
                    line_list.remove('')

                dataList.append(line_list[0])
                labelList.append(line_list[1])
    '''
    for seq in dataList:
        for i in range(len(seq)):
            # Encode each amino acid in the sequence into a sequence of 20 as 0，1 binaries
            binary_str_20D = dic_encoded[seq[i]]
            for j in range(len(binary_str_20D)):
                temp_v = int(binary_str_20D[j])
                feature_val.append(temp_v)
    step = 180
    encoded_20D = [feature_val[i:i+step] for i in range(0,len(feature_val),step)]

    '''
    
    for i in range(len(labelList)):
        result_1D.append(int(labelList[i]))
    # dataMat = np.array(encoded_20D) #dataMat as a X feature matrix
    
    dataMat = np.array(dataList) #dataMat as a X feature matrix
    labelMat = np.asarray(result_1D) #labelMat as a y label matrix
    # Combine the feature matrix and the label matrix into one matrix
    window_Mat = np.column_stack((dataMat, labelMat))
    np.random.shuffle(window_Mat)
    print(window_Mat.shape)

    np.savetxt(outfile,window_Mat,fmt='%s')
    f.close()
    return window_Mat

In [3]:
def splitDataSetbyKFold(window_Mat,split_size,outdir):
    if not os.path.exists(outdir): #if not outdir, makedir
        os.makedirs(outdir)
    train_all = [];
    test_all = []
    each_split_tr = []
    each_split_te = []
    count_split = 0
    kf = KFold(n_splits=split_size)
    for train_index, test_index in kf.split(window_Mat):
        count_split += 1
        for index in train_index:
            each_split_tr.append(list(window_Mat[index]))
        array_ = np.array(each_split_tr)
        np.savetxt(outdir + "/train_" + str(count_split) + '.txt',array_, fmt="%s", delimiter='\t')  # output each piece of data
        train_all.append(each_split_tr)  # Add each piece of data to a list '[[[],[],...[]]]' 3-D list
        each_split_tr = []

        for index in test_index:
            each_split_te.append(list(window_Mat[index]))
        array_ = np.array(each_split_te)
        np.savetxt(outdir + "/test_" + str(count_split) + '.txt',array_, fmt="%s", delimiter='\t')  # output each piece of data
        test_all.append(each_split_te)  # Add each piece of data to a list
        each_split_te = []

    #train_all = train_all[0]
    #test_all = test_all[0]
    return train_all, test_all

In [4]:
def performance(labelArr, predictArr):#类标签为int类型
    #labelArr[i] is actual value,predictArr[i] is predict value
    TP = 0.0; TN = 0.0; FP = 0.0; FN = 0.0
    for i in range(len(labelArr)):
        if labelArr[i] == 1 and predictArr[i] == 1:
            TP += 1.0
        if labelArr[i] == 1 and predictArr[i] == 0:
            FN += 1.0
        if labelArr[i] == 0 and predictArr[i] == 1:
            FP += 1.0
        if labelArr[i] == 0 and predictArr[i] == 0:
            TN += 1.0
    print(TP)
    print(FN)
    print(TN)
    print(FP)
    
    if ((TP+FN) == 0):
        SN = 0
        SP = 0
    elif ((FP+FN) == 0):
        SN = 0
        SP = 0
    else:
        SN = TP/(TP + FN) #Sensitivity = TP/P  and P = TP + FN
        SP = TN/(FP + TN) #Specificity = TN/N  and N = TN + FP

    #MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    return SN,SP

In [5]:
def classifier(clf,clfname,train_X, train_y, test_X, test_y,i):#X:feature matrix，y:label matrix
    # train with train set
    print(" training begin...")
    clf = clf.fit(train_X,train_y)
    print(" training end.")
    #==========================================================================
    # test with validation set
    print(" test begin.")
    predict_ = clf.predict(test_X) #return type is float64
    proba = clf.predict_proba(test_X)[:,1] #return type is float64
    score_ = clf.score(test_X, test_y)
    
    # Report
    sk_report = classification_report(
    digits=6,
    y_true=test_y, 
    y_pred=clf.predict(test_X))
    print(sk_report)
    
    print(" test end.")
    
    #==========================================================================
    
    ACC = accuracy_score(test_y, predict_)
    SN, SP = performance(test_y, predict_)
    MCC = matthews_corrcoef(test_y, predict_)
    #AUC = roc_auc_score(test_y, proba)
    AUC = 0
    
    # Model Evaluation
    #==========================================================================
    #save output
    
    eval_output = []
    eval_output.append(ACC);eval_output.append(SN);eval_output.append(AUC)
    eval_output.append(SP);eval_output.append(MCC)
    eval_output.append(score_)
    eval_output = np.array(eval_output,dtype=float)
    
    np.savetxt("proba.data",proba,fmt="%f",delimiter="\t")
    np.savetxt("test_y.data",test_y,fmt="%f",delimiter="\t")
    np.savetxt("predict.data",predict_,fmt="%f",delimiter="\t")
    #np.savetxt("eval_output.data",eval_output,fmt="%f",delimiter="\t")
    print("Wrote results to output.data...EOF...")
    # ==========================================================================
    # save Model
    os.chdir("/Users/oscarkuan/coding/fyp_ecgid/ML_Model")

    joblib.dump(clf,'train_'+clfname+str(i)+'.model')
    return ACC,SN,SP,MCC,AUC

In [6]:
# mean_fun used to find the average value of the values in the list,
# mainly ACC mean,SP mean and SN mean, to evaluate the model
def mean_fun(onelist):
    count = 0
    for i in onelist:
        count += i
    return float(count/len(onelist))

In [7]:
def crossValidation(clf, clfname, curdir, train_all, test_all):
    os.chdir(curdir)
    cur_path = curdir
    ACCs = [];SNs = [];SPs = [];MCCs = [];AUCs = []

    for i in range(len(train_all)):
        os.chdir(cur_path)
        train_data = train_all[i]; train_X = []; train_y = []
        test_data = test_all[i]; test_X = []; test_y = []

        #Divide train_all into train_X and train_y
        for eachline_train in train_data:
            one_train = eachline_train
            one_train_format = []
            for index in range(0, len(one_train) - 1):
                one_train_format.append(float(one_train[index]))
            train_X.append(one_train_format)
            train_y.append(int(one_train[-1]))

        #Divide test_all into test_X and test_y
        for eachline_test in test_data:
            one_test = eachline_test
            one_test_format = []
            for index in range(0, len(one_test) - 1):
                one_test_format.append(float(one_test[index]))
            test_X.append(one_test_format)
            test_y.append(int(one_test[-1]))
        # ======================================================================
        # classifier start here
        if not os.path.exists(clfname):
            os.mkdir(clfname)
        out_path = clfname + "/" + clfname + "_00" + str(i)  # the folder that save result of each fold
        if not os.path.exists(out_path):
            os.mkdir(out_path)
        os.chdir(out_path)
        ACC, SN, SP, MCC, AUC = classifier(clf, clfname, train_X, train_y, test_X, test_y,i)
        ACCs.append(ACC);
        SNs.append(SN);
        SPs.append(SP);
        MCCs.append(MCC);
        AUCs.append(AUC)
    # ======================================================================
    ACC_mean = mean_fun(ACCs)
    SN_mean = mean_fun(SNs)
    SP_mean = mean_fun(SPs)
    MCC_mean = mean_fun(MCCs)
    AUC_mean = mean_fun(AUCs)
    # ==========================================================================
    # output experiment result
    ("/Users/oscarkuan/coding/fyp_ecgid/")
    os.system("echo `date`'" + str(clf) + "' >> log.out")
    os.system("echo ACC_mean=" + str(ACC_mean) + " >> log.out")
    os.system("echo SN_mean=" + str(SN_mean) + " >> log.out")
    os.system("echo SP_mean=" + str(SP_mean) + " >> log.out")
    os.system("echo MCC_mean=" + str(MCC_mean) + " >> log.out")
    os.system("echo AUC_mean=" + str(AUC_mean) + " >> log.out")
    return ACC_mean, SN_mean, SP_mean, MCC_mean, AUC_mean

In [None]:
if __name__ == '__main__':
    path = 'LABELED_DATASET'
    outfile = 'seq_encoded.txt'
    outdir = 'KFold'
    a = []
    # encode the original dataset
    window_Mat = seq_encoding(path,outfile)

    # split the feature matrix into N fold
    train_all, test_all = splitDataSetbyKFold(window_Mat, 100, outdir)

    print("generate Dataset end and cross validation start")

    clf = svm.SVC(C=1, kernel='rbf', gamma=0.05, probability=True)
    curdir = '/Users/oscarkuan/coding/fyp_ecgid/'
    clfname = 'SVM'
    #ACC_mean, SN_mean, SP_mean, MCC_mean, AUC_mean = crossValidation(clf, clfname, curdir, train_all, test_all)
    crossValidation(clf, clfname, curdir, train_all, test_all)

    # performace_list = [ACC_mean, SN_mean, SP_mean, MCC_mean, AUC_mean]
    # performace_set = ['ACC_mean', 'SN_mean', 'SP_mean', 'MCC_mean', 'AUC_mean']
    # plt.plot(performace_set, performace_list, 'r-o', label='Performance')
    # plt.legend()
    # plt.title('MHC Prediction by SVM')
    # plt.xlabel('Name of Evaluation')
    # plt.ylabel('Performance')
    # plt.show()
    # plt.savefig('SVM_10fold.png')
    #print('MCC_mean', '\t', 'AUC_mean', '\t', 'ACC_mean', '\t', 'SN_mean', '\t', 'SP_mean')
    #print(MCC_mean, AUC_mean, ACC_mean, SN_mean, SP_mean)  # 将ACC均值，SP均值，SN均值都输出到控制台



['person_id_1.csv', 'person_id_2.csv', 'person_id_3.csv', 'person_id_4.csv']
['-8.500000000000000611e-02 1\n', '-8.000000000000000167e-02 1\n', '-7.000000000000000666e-02 1\n', '-7.499999999999999722e-02 1\n', '-9.500000000000000111e-02 1\n', '-8.999999999999999667e-02 1\n', '-1.000000000000000056e-01 1\n', '-1.000000000000000056e-01 1\n', '-1.000000000000000056e-01 1\n', '-8.500000000000000611e-02 1\n', '-5.999999999999999778e-02 1\n', '-4.499999999999999833e-02 1\n', '-5.500000000000000028e-02 1\n', '-5.999999999999999778e-02 1\n', '-7.000000000000000666e-02 1\n', '-8.000000000000000167e-02 1\n', '-8.500000000000000611e-02 1\n', '-8.000000000000000167e-02 1\n', '-7.000000000000000666e-02 1\n', '-4.000000000000000083e-02 1\n', '-5.000000000000000278e-02 1\n', '-4.499999999999999833e-02 1\n', '-4.000000000000000083e-02 1\n', '-4.000000000000000083e-02 1\n', '-5.500000000000000028e-02 1\n', '-5.999999999999999778e-02 1\n', '-5.500000000000000028e-02 1\n', '-5.500000000000000028e-02 1\n'

(40000, 2)
generate Dataset end and cross validation start
 training begin...
 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.435484  0.540000  0.482143       100
           2   0.789474  0.681818  0.731707       110
           3   0.812500  0.611765  0.697987        85
           4   0.529915  0.590476  0.558559       105

    accuracy                       0.607500       400
   macro avg   0.641843  0.606015  0.617599       400
weighted avg   0.637735  0.607500  0.616699       400

 test end.
54.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
 training begin...
 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.486486  0.540000  0.511848       100
           2   0.647727  0.600000  0.622951        95
           3   0.851351  0.572727  0.684783       110
           4   0.551181  0.736842  0.630631        95

    accuracy                       0.610000       400
   macro avg   0.634

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.425000  0.510000  0.463636       100
           2   0.700000  0.673077  0.686275       104
           3   0.857143  0.606061  0.710059        99
           4   0.518182  0.587629  0.550725        97

    accuracy                       0.595000       400
   macro avg   0.625081  0.594192  0.602674       400
weighted avg   0.626052  0.595000  0.603631       400

 test end.
51.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
 training begin...
 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.472000  0.567308  0.515284       104
           2   0.717949  0.565657  0.632768        99
           3   0.854839  0.569892  0.683871        93
           4   0.518519  0.673077  0.585774       104

    accuracy                       0.595000       400
   macro avg   0.640826  0.593983  0.604424       400
weighted avg   0.633977  0.595000  0.601885 