## EEG Music compare effects of channels

Using theta-alpha bands, welch's method features, using linear SVM kernel to determine feature importance 

In [101]:
#Import necessary libraries

import pandas as pd
import numpy as np
from scipy import signal
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pickle as pkl
import itertools 
import glob
from sklearn import svm 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.decomposition import PCA
from spectrum import data_cosine, dpss, pmtm


# %matplotlib inline 
%matplotlib qt



In [2]:
# Directories to load feature scaled datasets
loaddir = 'F:\EEG-data\music\channel_comparison\\featureScaled/'
loadAUC_dir = loaddir+'AUC/'
loadPSD_dir = loaddir+'PSD/'
loadAUC_PSD_dir = loaddir+'AUC_PSD/'

# Directories to save results
savedir = "F:\EEG-data\music\channel_comparison\PCA_CV_results/"
saveAUC_dir = savedir+'AUC/'
savePSD_dir = savedir+'PSD/'
saveAUC_PSD_dir = savedir+'AUC_PSD/'

In [3]:
# Cross-validation for SVM 
## Should not do PCA because we want to understand channel information
## 80% training set, 20% test set
## 5 fold cross validation
def apply_PCA_CV_SVM(loaddir,savedir):
    files = glob.glob(loaddir+'*.pkl')
    sss = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
    testdir = savedir+'testResults/'
    bestF1 = {'F1 Score':0,'dataset':'','params':''}
    test_results = {'F1 Score':[],'dataset':[],'params':[]}
    coeff_results = {'dataset':[],'coeff':[]}
    #Instantiate SVM gridsearch
    C_range = [1,10]
    param_grid= [
                {'C': C_range, 'kernel': ['linear']},
                # {'C': C_range, 'gamma': [0.001, 0.0001, 'auto', 'scale'], 'kernel': ['rbf']},
                ]
    scoring = {'accuracy','f1_macro'}
    clf = svm.SVC()
    grid = GridSearchCV(clf,param_grid=param_grid,scoring=scoring,cv=5,refit='f1_macro')
    #Apply PCA and Gridsearch
    for file in files:
        #Create filenames for saving
        cv_filename = file.split("\\")[-1].rstrip('.pkl') +'_CV_results.pkl'
        df = pd.read_pickle(file)
        X = df.iloc[:,:-1].values
        y = df.iloc[:,-1].values
        #Split into training and test sets
        for train_index, test_index in sss.split(X,y):
            X_train, X_test = X[train_index],X[test_index]
            y_train, y_test = y[train_index],y[test_index]
        #Gridsearch
        grid = grid.fit(X_train,y_train)
        #Get results
        results_df = pd.DataFrame(grid.cv_results_)
        #Save results
        results_df.to_pickle(savedir+cv_filename)
        #Export as csv 
        results_df.to_csv(savedir+cv_filename+'.csv')
        #Fit SVM on best parameters 
        best_params = results_df.loc[results_df['rank_test_f1_macro'].idxmin()]['params']
        
        clf = svm.SVC(**best_params)
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)

        df_name = file.split("\\")[-1].rstrip('.pkl')
        coeff_results['dataset'].append(df_name)
        coeff_results['coeff'].append(clf.coef_)


        y_pred_list = [y_pred]
        x_test_list = [X_test]
        clf_list = [clf]
        best_params_list =[best_params]

        #Generate and save classification report, macro-avg f1 score, confusion matrix
        
        target_names = ['C','H','W']
        names_list = ['1']
        for i in range(len(y_pred_list)):
            df_name = file.split("\\")[-1].rstrip('.pkl')+'_'+names_list[i]
            report_name = file.split("\\")[-1].rstrip('.pkl') + '_'+names_list[i]+'_PCA_CV_SVM_clf_report'
            cm_name = file.split("\\")[-1].rstrip('.pkl') + '_'+names_list[i]+'_PCA_CV_SVM_ConfusionMatrix'
            ##Classification report 
            report = classification_report(y_test, y_pred_list[i], target_names=target_names, output_dict=True)
            report_df = pd.DataFrame(report).transpose()
            report_df.to_pickle(testdir+report_name+'.pkl')
            ##Macro-avg f1 score 
            f1Score = report['macro avg']['f1-score']
            test_results['F1 Score'].append(f1Score)
            test_results['dataset'].append(df_name)
            test_results['params'].append(best_params_list[i])
            test_results_df = pd.DataFrame(test_results)
            if bestF1['F1 Score']<f1Score:
                bestF1['F1 Score'] = f1Score
                bestF1['dataset'] = df_name
                bestF1['params'] = str(best_params_list[i])
            ##Confusion matrix
            fig,ax = plt.subplots()
            ax.set_title(df_name+' CM')
            plot_confusion_matrix(clf_list[i],x_test_list[i],y_test,labels=target_names,ax=ax,normalize='true')
            plt.savefig(testdir+df_name+'.png')
            plt.close()

    coeff_df = pd.DataFrame(coeff_results,columns=['dataset','coeff'])
    coeff_df.to_csv(testdir+'coefficients.csv')

    bestF1_df = pd.DataFrame(bestF1,index=[0],columns=['F1 Score','dataset','params'])
    bestF1_df.to_csv(testdir+bestF1['dataset']+'.csv')
    test_results_df.to_csv(testdir+'test_results.csv')
    test_results_df.to_csv(testdir+'test_results.pkl')
        


apply_PCA_CV_SVM(loadAUC_dir,saveAUC_dir)
apply_PCA_CV_SVM(loadPSD_dir,savePSD_dir)
apply_PCA_CV_SVM(loadAUC_PSD_dir,saveAUC_PSD_dir)


In [6]:
# Directories for channel comparison results
resdir = "F:\EEG-data\music\channel_comparison\PCA_CV_results/"
resAUC_dir = resdir+'AUC/testResults/'
resPSD_dir = resdir+'PSD/testResults/'
resAUC_PSD_dir = resdir+'AUC_PSD/testResults/'

In [98]:
#AUC only
auc_coeff = pd.read_csv(resAUC_dir+'coefficients.csv')
auc_coeff_array = auc_coeff['coeff'].values
no_features = 32
auc_coeff_split_list = auc_coeff_array[0].strip("[]").split(" ")
auc_coeff_dict = {"CH_feat":[],"CW_feat":[],"HW_feat":[]}
count = 0
for i in range(len(auc_coeff_split_list)):
    if len(auc_coeff_split_list[i])>1:
        num = ''
        for j in auc_coeff_split_list[i]:
            if j in '.e0123456789-':
                num+=j
        num = abs(float(num))
        if len(auc_coeff_dict['CH_feat'])<64:
            auc_coeff_dict['CH_feat'].append(num)
        elif len(auc_coeff_dict['CW_feat'])<64:
            auc_coeff_dict['CW_feat'].append(num)
        else:
            auc_coeff_dict['HW_feat'].append(num)

feature_names_theta = ["Ch"+str(x+1)+"_theta" for x in range(32)]
feature_names_alpha = ["Ch"+str(x+1)+"_alpha" for x in range(32)]
feature_names = np.array(feature_names_theta+feature_names_alpha)

top_CH_coeff = np.argsort(auc_coeff_dict["CH_feat"])[:10]
top_CW_coeff = np.argsort(auc_coeff_dict["CW_feat"])[:10]
top_HW_coeff = np.argsort(auc_coeff_dict["HW_feat"])[:10]

In [103]:
print(feature_names[top_CH_coeff])
plt.bar(feature_names[top_CH_coeff],np.array(auc_coeff_dict["CH_feat"])[top_CH_coeff])
plt.title("Music: Classical vs Hip-Hop Top 10 features, AUC only")

['Ch7_alpha' 'Ch2_alpha' 'Ch10_alpha' 'Ch2_theta' 'Ch29_theta' 'Ch3_theta'
 'Ch11_alpha' 'Ch15_alpha' 'Ch1_alpha' 'Ch6_alpha']


Text(0.5, 1.0, 'Music: Classical vs Hip-Hop Top 10 features, AUC only')

In [105]:
print(feature_names[top_CW_coeff])
plt.bar(feature_names[top_CW_coeff],np.array(auc_coeff_dict["CW_feat"])[top_CW_coeff])
plt.title("Music: Classical vs Whitenoise Top 10 features, AUC only")

['Ch11_alpha' 'Ch26_alpha' 'Ch25_theta' 'Ch24_theta' 'Ch6_theta'
 'Ch25_alpha' 'Ch3_theta' 'Ch13_theta' 'Ch5_theta' 'Ch24_alpha']


Text(0.5, 1.0, 'Music: Classical vs Whitenoise Top 10 features, AUC only')

In [106]:
print(feature_names[top_HW_coeff])
plt.bar(feature_names[top_HW_coeff],np.array(auc_coeff_dict["HW_feat"])[top_HW_coeff])
plt.title("Music: Hip-Hop vs Whitenoise Top 10 features, AUC only")

['Ch16_alpha' 'Ch30_theta' 'Ch10_alpha' 'Ch23_alpha' 'Ch5_alpha'
 'Ch9_alpha' 'Ch21_theta' 'Ch28_theta' 'Ch6_alpha' 'Ch8_alpha']


Text(0.5, 1.0, 'Music: Hip-Hop vs Whitenoise Top 10 features, AUC only')

In [46]:
print(auc_coeff_arrays[0].strip("[]").split(" "))

['', '3.05534262e-05', '', '1.62994630e-06', '', '1.94907005e-06', '', '4.19017998e-05\n', '', '-5.18407729e-05', '-2.70846634e-05', '', '5.69198970e-06', '', '2.11566881e-05\n', '', '-1.84258801e-05', '-5.11040217e-05', '-9.69757833e-06', '', '1.94537522e-05\n', '', '', '5.05741847e-05', '-1.14444394e-05', '', '1.45508843e-05', '', '5.00372714e-05\n', '', '-2.60584952e-05', '', '2.88357531e-05', '-5.31262064e-05', '', '4.05577525e-05\n', '', '', '2.38680088e-05', '', '1.03046200e-05', '-9.71248473e-06', '', '5.54147150e-05\n', '', '-4.33371290e-05', '', '4.66305935e-05', '', '2.87340000e-05', '-1.26510949e-05\n', '', '', '1.80935521e-06', '', '1.36339819e-05', '', '9.59430429e-05', '-9.26609671e-05\n', '', '-3.61819861e-06', '-3.35852434e-07', '-5.73200873e-05', '', '9.73929130e-06\n', '', '', '4.33890609e-05', '', '4.94115062e-06', '-2.19272195e-07', '-1.11081518e-05\n', '', '', '1.50689974e-05', '-8.54784759e-07', '-2.77525101e-06', '', '3.26678433e-05\n', '', '', '7.47756229e-05', 