## Multi-level DWT method for feature extraction 

Think-count

Features: DWT coefficients, RMS of each set of coefficients, variance of each set of coefficients, ratio between bands

In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np
from scipy import signal
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pickle as pkl
import itertools 
import glob
from sklearn import svm 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.decomposition import PCA
from spectrum import data_cosine, dpss, pmtm
import pywt 


# %matplotlib inline 
%matplotlib qt


In [5]:
# Directories to load filtered datasets (theta and alpha bands, "0.05s" folder).
# Every backslash is escaped explicitly: the previous "F:\E..." spelling relied on
# Python passing the unrecognised escape "\E" through unchanged, which emits an
# invalid-escape-sequence warning on current interpreters. The resulting path
# strings are identical.
theta5 = "F:\\EEG-data\\think-count\\filtered\\0.05s\\theta/"
alpha5 = "F:\\EEG-data\\think-count\\filtered\\0.05s\\alpha/"

# One pickled object per band, ordered [theta, alpha], for the "think" and
# "count" classes respectively.
T_TA_list = [pd.read_pickle(theta5 + "think.pkl"), pd.read_pickle(alpha5 + "think.pkl")]
C_TA_list = [pd.read_pickle(theta5 + "count.pkl"), pd.read_pickle(alpha5 + "count.pkl")]


## Feature Extraction

In [238]:
def dwt_bands_list(bands_list, wavelet='bior6.8', level=10):
    """Apply a multi-level DWT to every cell of each band DataFrame.

    Parameters
    ----------
    bands_list : list of pandas.DataFrame
        One DataFrame per frequency sub-band. Each cell is handed to
        ``pywt.wavedec`` as the signal to decompose.
    wavelet : str, optional
        Wavelet name forwarded to ``pywt.wavedec``. Defaults to
        ``'bior6.8'``, the value that used to be hard-coded here.
    level : int, optional
        Decomposition level forwarded to ``pywt.wavedec``. Defaults to 10,
        the value that used to be hard-coded here.

    Returns
    -------
    list of pandas.DataFrame
        Same length and order as ``bands_list``; every cell is replaced by
        the value returned by ``pywt.wavedec`` for that cell.

    Notes
    -----
    Earlier experiments were selected by (un)commenting lines in this
    function. They can now be reproduced through the arguments:
    ``pywt.wavedec`` with 'db4', 'db3', 'db2', 'db1' at level 4, 'db20' at
    level 10 and 'db4' at level 1; and single-level ``pywt.dwt`` with 'db4',
    'db20', 'db1', 'sym4', 'haar', 'coif4', 'rbio4.4', 'bior4.4', 'dmey',
    'bior6.8' and 'bior1.1' (a single-level transform should correspond to
    ``level=1`` here -- confirm before comparing against old result files).
    """
    def _decompose(sig):
        return pywt.wavedec(sig, wavelet, level=level)

    dwt_df_list = []
    for df in bands_list:
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; use the new name when this pandas provides it.
        elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
        dwt_df_list.append(elementwise(_decompose))
    return dwt_df_list

def dwt_oneArr_bands_list(dwt_bands_list):
    """Flatten the per-level coefficient arrays of every cell into one array.

    Parameters
    ----------
    dwt_bands_list : list of pandas.DataFrame
        One DataFrame per frequency sub-band; each cell is an iterable of
        coefficient sequences (e.g. the list returned by ``pywt.wavedec``).

    Returns
    -------
    list of pandas.DataFrame
        Same length and order as the input; each cell becomes a 1-D
        ``numpy`` array holding all coefficients back to back, in the order
        the sub-sequences appear in the cell.
    """
    def _flatten(coeff_arrays):
        return np.array([value for coeffs in coeff_arrays for value in coeffs])

    dwt_oneArr_df_list = []
    for df in dwt_bands_list:
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; use the new name when this pandas provides it.
        elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
        dwt_oneArr_df_list.append(elementwise(_flatten))
    return dwt_oneArr_df_list

def dwt_rms_bands_list(dwt_bands_list):
    """Compute the RMS of each coefficient array in every cell.

    Parameters
    ----------
    dwt_bands_list : list of pandas.DataFrame
        One DataFrame per frequency sub-band; each cell is an iterable of
        coefficient arrays.

    Returns
    -------
    list of pandas.DataFrame
        Same length and order as the input; each cell becomes a 1-D
        ``numpy`` array with one root-mean-square value per coefficient
        array, in the original order.
    """
    def _rms_per_level(coeff_arrays):
        return np.array([np.sqrt(np.mean(np.square(y))) for y in coeff_arrays])

    dwt_rms_df_list = []
    for df in dwt_bands_list:
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; use the new name when this pandas provides it.
        elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
        dwt_rms_df_list.append(elementwise(_rms_per_level))
    return dwt_rms_df_list

def dwt_var_bands_list(dwt_bands_list):
    """Compute the variance of each coefficient array in every cell.

    Parameters
    ----------
    dwt_bands_list : list of pandas.DataFrame
        One DataFrame per frequency sub-band; each cell is an iterable of
        coefficient arrays.

    Returns
    -------
    list of pandas.DataFrame
        Same length and order as the input; each cell becomes a 1-D
        ``numpy`` array with one ``np.var`` value per coefficient array,
        in the original order.
    """
    def _var_per_level(coeff_arrays):
        return np.array([np.var(y) for y in coeff_arrays])

    dwt_var_df_list = []
    for df in dwt_bands_list:
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; use the new name when this pandas provides it.
        elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
        dwt_var_df_list.append(elementwise(_var_per_level))
    return dwt_var_df_list


# Build every feature representation for the two classes; each statement
# yields the (think, count) pair, think first.
# DWT coefficients
T_dwt_list, C_dwt_list = [dwt_bands_list(bands) for bands in (T_TA_list, C_TA_list)]
# Coefficients flattened into a single array per cell
T_dwt_1_list, C_dwt_1_list = [dwt_oneArr_bands_list(coeffs) for coeffs in (T_dwt_list, C_dwt_list)]
# RMS of each coefficient array
T_dwt_rms_list, C_dwt_rms_list = [dwt_rms_bands_list(coeffs) for coeffs in (T_dwt_list, C_dwt_list)]
# Variance of each coefficient array
T_dwt_var_list, C_dwt_var_list = [dwt_var_bands_list(coeffs) for coeffs in (T_dwt_list, C_dwt_list)]


In [239]:
# Sanity checks: number of band frames in the flattened and RMS lists, then the
# length of the flattened coefficient vector in the first cell of the second
# band frame.
print(len(T_dwt_1_list))
print(len(T_dwt_rms_list))
print(len(T_dwt_1_list[1].iloc[0,0]))

2
2
168


In [240]:
#Expand all lists in cells to their own variables

#Expands for a single feature_df_list
def expand_PSD_df_list(PSD_df_list):
    """Expand array-valued cells into one scalar column per array element.

    Parameters
    ----------
    PSD_df_list : list of pandas.DataFrame
        Each DataFrame holds one sequence per cell; columns are treated as
        channels in positional order.

    Returns
    -------
    list of pandas.DataFrame
        One expanded DataFrame per input frame. For channel ``c`` (1-based
        column position) and element ``k`` the column is named
        ``"Ch<c>_<v>"`` where ``v`` is the ``k``-th value of
        ``np.linspace(0, 64, n)`` and ``n`` is the sequence length.

    Notes
    -----
    Differences from the previous implementation:

    * The per-frame column buffer is no longer pre-sized to 32 entries, so
      frames with any number of channels can be expanded (fewer than 32 used
      to leave integer placeholders that broke ``pd.concat``).
    * The sequence length ``n`` is read from the first cell of the frame
      being expanded rather than always from the first frame of the list.
    * Columns are selected by position, so non-integer column labels work.
    """
    e_PSD_df_list = []
    for band_df in PSD_df_list:
        # All cells of a frame are assumed to share the length of its first
        # cell -- TODO confirm for new feature types.
        no_PSD = len(band_df.iloc[0, 0])
        # Label suffixes computed once per frame instead of once per renamed column.
        suffixes = [str(v) for v in np.linspace(0, 64, no_PSD)]

        channel_frames = []
        for channel in range(len(band_df.columns)):
            # Expand each cell's sequence into its own row of scalar columns (0..n-1)
            new_col = band_df.iloc[:, channel].apply(pd.Series)
            prefix = "Ch" + str(channel + 1) + '_'
            new_col = new_col.rename(
                columns=lambda x, prefix=prefix, suffixes=suffixes: prefix + suffixes[x]
            )
            channel_frames.append(new_col)

        # Side-by-side concatenation keeps channel order
        e_PSD_df_list.append(pd.concat(channel_frames, axis=1))
    return e_PSD_df_list

# Expand every feature representation into scalar columns; each statement
# yields the (think, count) pair, think first.
# Flattened coefficient arrays
e_T_dwt_1_list, e_C_dwt_1_list = [expand_PSD_df_list(feats) for feats in (T_dwt_1_list, C_dwt_1_list)]
# RMS
e_T_dwt_rms_list, e_C_dwt_rms_list = [expand_PSD_df_list(feats) for feats in (T_dwt_rms_list, C_dwt_rms_list)]
# Variance
e_T_dwt_var_list, e_C_dwt_var_list = [expand_PSD_df_list(feats) for feats in (T_dwt_var_list, C_dwt_var_list)]


In [241]:
#Apply to three expanded feature df lists of one class (here: flattened DWT coefficients, RMS, variance)
def get_3F_combos_df_list(feat1_df_list,feat2_df_list,feat3_df_list):
    """Join three feature types column-wise for each band combination.

    The only combination currently defined is bands ``(0, 1)``. For it the
    frames are concatenated along ``axis=1`` in the order feat1[0], feat1[1],
    feat2[0], feat2[1], feat3[0], feat3[1].

    Returns a list with one combined DataFrame per band combination.
    """
    band_combos = [(0, 1)]
    feature_lists = (feat1_df_list, feat2_df_list, feat3_df_list)

    merged_frames = []
    for band_ids in band_combos:
        # Feature type varies slowest, band index fastest
        pieces = [feats[band] for feats in feature_lists for band in band_ids]
        merged_frames.append(pd.concat(pieces, axis=1))
    return merged_frames

In [242]:
#List of dataframes combining all three features (flattened coefficients, RMS, variance)
# Combined feature frames for (think, count), think first.
T_combos_df_list, C_combos_df_list = [
    get_3F_combos_df_list(coeffs, rms, var)
    for coeffs, rms, var in (
        (e_T_dwt_1_list, e_T_dwt_rms_list, e_T_dwt_var_list),
        (e_C_dwt_1_list, e_C_dwt_rms_list, e_C_dwt_var_list),
    )
]


In [243]:
#Feature scaling all dataframes 

#Applies feature scaling to one combos_df_list
def featureScaling_df(combos_df_list):
    """Standardise every DataFrame in the list with ``StandardScaler``.

    Each frame is fitted and transformed on its own (``fit_transform`` is
    called once per frame on a shared scaler instance). Column labels are
    kept; the returned frames get a fresh default index.

    NOTE(review): callers in this notebook pass the "think" and "count"
    frames in separate calls, so each class is scaled with its own
    statistics and before any train/test split -- confirm this is intended.
    """
    scaler = StandardScaler()
    return [
        pd.DataFrame(scaler.fit_transform(frame), columns=frame.columns)
        for frame in combos_df_list
    ]

In [244]:
# Scale the think and count feature frames (in that order).
sc_T_combos_df_list, sc_C_combos_df_list = map(
    featureScaling_df, (T_combos_df_list, C_combos_df_list)
)

In [245]:
#Append action type columns to all dataframes 

#Add action column for a list of dataframes
def add_action_col(df_list,action_type):
    """Return copies of the frames with a constant ``'Action'`` label column.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to label; they are not modified.
    action_type : object
        Value written to every row of the new ``'Action'`` column.

    Returns
    -------
    list of pandas.DataFrame
        New frames, same order as ``df_list``, with ``'Action'`` added.
    """
    # DataFrame.assign builds a new frame. The previous ``df[:]`` only made a
    # slice of the input, so adding a column to it raised pandas'
    # SettingWithCopyWarning.
    return [df.assign(Action=action_type) for df in df_list]

# Label the scaled frames: 'T' for think, 'C' for count.
sc_T_combosA_df_list, sc_C_combosA_df_list = [
    add_action_col(frames, label)
    for frames, label in ((sc_T_combos_df_list, 'T'), (sc_C_combos_df_list, 'C'))
]

In [246]:
#Combine different class types to form full datasets
def concatSave_df_list(T_df_list,C_df_list,sampLenStrings,filename,savedir,save=False):
    """Stack matching think/count frames row-wise and write them to disk.

    For position ``i`` the think frame is placed above the count frame
    (``pd.concat`` along ``axis=0``). The output path is
    ``savedir + filename + '_' + sampLenStrings[i]`` plus an extension:
    a ``.pkl`` is written for every position when ``save`` is true, and a
    ``.csv`` is written for position 0.

    NOTE(review): the CSV for position 0 is written even when ``save`` is
    false -- confirm this is intended.

    If the two lists differ in length a message is printed and nothing is
    written. Returns ``None``.
    """
    if len(C_df_list) != len(T_df_list):
        print("Lists are of unequal lengths.")
        return

    for idx, (think_df, count_df) in enumerate(zip(T_df_list, C_df_list)):
        stacked = pd.concat([think_df, count_df], axis=0)
        if save:
            # Save to external HDD as pkl files
            stacked.to_pickle(savedir + filename + '_' + sampLenStrings[idx] + '.pkl')
        if idx == 0:
            stacked.to_csv(savedir + filename + '_' + sampLenStrings[idx] + '.csv')

sampLenStrings = ["0.05s"]

# Output folder and file stem for the wavelet configuration currently in use.
# Backslashes are escaped explicitly: the previous spelling relied on the
# unrecognised escapes "\E" and "\d" being passed through unchanged, which
# emits an invalid-escape-sequence warning on current interpreters. The
# resulting path string is identical.
#
# Earlier runs used these sub-folders of featureScaled/ (one at a time):
#   dwtAdd, db1, db2, db3, db20          (each annotated "Level 4")
#   db4-lvl1, db20-lvl1, db1-lvl1, sym4-lvl1, haar-lvl1, coiflets4-lvl1,
#   rbiortho4.4-lvl1, biortho4.4-lvl1, dMeyer-lvl1, biortho6.8-lvl1,
#   biortho1.1-lvl1
# with the file stems, in the same order:
#   dwtAdd_df, db1_df, db2_df, db3_df, db20_df,
#   db4-lvl1_df, db20-lvl1_df, db1-lvl1_df, sym4-lvl1_df, haar-lvl1_df,
#   coiflets4-lvl1_df, rbiortho4-lvl1_df, biortho4-lvl1_df, dMeyer-lvl1_df,
#   biortho68-lvl1_df, biortho11-lvl1_df
dwt_savedir = 'F:\\EEG-data\\think-count\\dwt\\featureScaled/biortho6.8-lvl10/'
dwt_filename = "biortho68-lvl10_df"

#DWT
concatSave_df_list(sc_T_combosA_df_list,sc_C_combosA_df_list,sampLenStrings,dwt_filename,dwt_savedir,save=True)

## SVM, Cross Validation, Evaluation

In [247]:
# Directory to load the feature-scaled datasets from, and directory to save
# the PCA / cross-validation results to, for the wavelet configuration
# currently in use.
# Backslashes are escaped explicitly: the previous spelling relied on the
# unrecognised escapes "\E" and "\d" being passed through unchanged, which
# emits an invalid-escape-sequence warning on current interpreters. The
# resulting path strings are identical.
#
# Earlier runs used these sub-folders (same name under featureScaled/ and
# PCA_CV_results/, one at a time):
#   dwtAdd, db1, db2, db3, db20, db4-lvl1, db20-lvl1, db1-lvl1, sym4-lvl1,
#   haar-lvl1, coiflets4-lvl1, rbiortho4.4-lvl1, biortho4.4-lvl1,
#   dMeyer-lvl1, biortho6.8-lvl1, biortho1.1-lvl1
loaddir = 'F:\\EEG-data\\think-count\\dwt\\featureScaled/biortho6.8-lvl10/'

# Directory to save results
savedir = 'F:\\EEG-data\\think-count\\dwt\\PCA_CV_results/biortho6.8-lvl10/'





In [248]:
# Cross-validation for SVM 
## Fits PCA on the training set only, retaining 99% of the variance
## 80% training set, 20% test set
## 5-fold cross-validation (inside the grid search)
def apply_PCA_CV_SVM(loaddir,savedir):
    """Grid-search an RBF SVM on PCA-reduced features for every dataset.

    For each ``*.pkl`` DataFrame in ``loaddir`` (last column = class label,
    all other columns = features):

    1. Split 80/20 with a stratified shuffle split.
    2. Fit PCA (99% variance, full SVD) on the training part and apply it to
       both parts.
    3. Run ``GridSearchCV`` (5-fold, scored on accuracy and macro F1, refit
       on macro F1) on the training part and save ``cv_results_`` as pickle
       and CSV in ``savedir``.
    4. Fit an SVM with the best-ranked parameters, evaluate it on the test
       part, and save the classification report and a normalised confusion
       matrix plot in ``savedir + 'testResults/'``.

    After all datasets, the best macro-F1 entry and the table of all test
    results are written to ``savedir + 'testResults/'``.

    Parameters
    ----------
    loaddir : str
        Folder prefix (with trailing separator) containing the datasets.
    savedir : str
        Folder prefix (with trailing separator) for the outputs.

    Raises
    ------
    FileNotFoundError
        If ``loaddir`` contains no ``.pkl`` file. Previously this ended in a
        ``NameError`` after a stray ``.csv`` had been written.

    Notes
    -----
    Fixes relative to the previous version: ``test_results.pkl`` is now a
    real pickle (it was written with ``to_csv``); the classification report
    is given ``labels`` so that the rows named 'T' and 'C' refer to those
    classes (without it scikit-learn pairs ``target_names`` with the sorted
    labels, i.e. 'C' first); the dataset name is derived by removing the
    ``.pkl`` suffix instead of ``rstrip('.pkl')``; the ``testResults``
    folder is created when missing; ``scoring`` is a list rather than a set.
    """
    files = glob.glob(loaddir+'*.pkl')
    if not files:
        raise FileNotFoundError("No .pkl datasets found in {!r}".format(loaddir))

    testdir = savedir+'testResults/'
    os.makedirs(testdir, exist_ok=True)

    sss = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
    bestF1 = {'F1 Score':0,'dataset':'','params':''}
    test_results = {'F1 Score':[],'dataset':[],'params':[]}
    #Create PCA instance
    pca99 = PCA(n_components=0.99, svd_solver='full')
    #Instantiate SVM gridsearch (a linear-kernel grid over the same C values
    #was tried earlier and is disabled)
    C_range = [1,3,10,30,100,300,1000]
    param_grid = [
        {'C': C_range, 'gamma': [0.001, 0.0001, 'auto', 'scale'], 'kernel': ['rbf']},
    ]
    scoring = ['accuracy','f1_macro']
    grid = GridSearchCV(svm.SVC(),param_grid=param_grid,scoring=scoring,cv=5,refit='f1_macro')

    target_names = ['T','C']
    pca_tag = '99'

    for file in files:
        stem = _dataset_stem(file)
        cv99_filename = stem+'_PCA99_CV_results.pkl'
        df = pd.read_pickle(file)
        X = df.iloc[:,:-1].values
        y = df.iloc[:,-1].values

        #The splitter draws 5 splits; as before, only the last one is used
        #for the train/test partition.
        train_index, test_index = list(sss.split(X,y))[-1]
        X_train, X_test = X[train_index],X[test_index]
        y_train, y_test = y[train_index],y[test_index]

        #Fit PCA on training data only, then project the test data
        x_99_train = pca99.fit_transform(X_train)
        x_99_test = pca99.transform(X_test)

        #Gridsearch
        grid99 = grid.fit(x_99_train,y_train)
        results_df99 = pd.DataFrame(grid99.cv_results_)
        #Record the number of principal components kept
        results_df99['No. of PC'] = x_99_train.shape[1]
        results_df99.to_pickle(savedir+cv99_filename)
        results_df99.to_csv(savedir+cv99_filename+'.csv')

        #Fit SVM on the best-ranked (macro F1) parameters
        best_params99 = results_df99.loc[results_df99['rank_test_f1_macro'].idxmin()]['params']
        clf99 = svm.SVC(**best_params99)
        clf99.fit(x_99_train,y_train)
        y_99_pred = clf99.predict(x_99_test)

        df_name = stem+'_'+pca_tag
        report_name = df_name+'_PCA_CV_SVM_clf_report'

        ##Classification report; labels fixes the row order to match target_names
        report = classification_report(y_test, y_99_pred, labels=target_names,
                                       target_names=target_names, output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_df.to_pickle(testdir+report_name+'.pkl')

        ##Macro-avg f1 score
        f1Score = report['macro avg']['f1-score']
        test_results['F1 Score'].append(f1Score)
        test_results['dataset'].append(df_name)
        test_results['params'].append(best_params99)
        if bestF1['F1 Score']<f1Score:
            bestF1['F1 Score'] = f1Score
            bestF1['dataset'] = df_name
            bestF1['params'] = str(best_params99)

        ##Confusion matrix
        #NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
        #newer versions need ConfusionMatrixDisplay.from_estimator instead.
        fig,ax = plt.subplots()
        ax.set_title(df_name+' CM')
        plot_confusion_matrix(clf99,x_99_test,y_test,labels=target_names,ax=ax,normalize='true')
        plt.savefig(testdir+df_name+'.png')
        plt.close()

    bestF1_df = pd.DataFrame(bestF1,index=[0],columns=['F1 Score','dataset','params'])
    bestF1_df.to_csv(testdir+bestF1['dataset']+'.csv')
    test_results_df = pd.DataFrame(test_results)
    test_results_df.to_csv(testdir+'test_results.csv')
    test_results_df.to_pickle(testdir+'test_results.pkl')


def _dataset_stem(path):
    """Return the file name of *path* without directories and '.pkl' suffix."""
    name = path.replace("\\", "/").split("/")[-1]
    # str.rstrip('.pkl') strips a character set, so it could also remove
    # trailing '.', 'p', 'k' or 'l' characters that belong to the name.
    if name.endswith('.pkl'):
        name = name[:-len('.pkl')]
    return name
        


# Run the PCA + grid-search SVM evaluation on every dataset found in loaddir;
# all outputs are written under savedir.
apply_PCA_CV_SVM(loaddir,savedir)