# EEG Music PCA 

Goal: Classify types of music by EEG features 

3 types of music: White Noise, Classical, Hip-Hop 
Lengths of music (raw): 
    White Noise: 5934 recorded samples 
    Classical:   6128 recorded samples 
    Hip-Hop:     7674 recorded samples 
Data collection: 128 samples/s, 32 Channels 

No. of 0.1 s samples in filtered data: 
    White Noise: 494 
    Classical:   510
    Hip-Hop:     639 

Steps:
(Generate datasets for different channels)
1. Train test split
2. Apply PCA
3. Cross Validation
4. SVM classification

In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np
from scipy import signal
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pickle as pkl
import itertools 
import glob
from sklearn import svm 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.decomposition import PCA

# %matplotlib inline 
%matplotlib qt


In [2]:
# Root folder of the feature-scaled pickles, plus one sub-folder per feature set
loaddir = 'F:/EEG-data/pkl/featureScaled/'
loadAUC_dir, loadPSD_dir, loadAUC_PSD_dir = [
    loaddir + subfolder for subfolder in ('AUC_only/', 'PSD_only/', 'AUC_PSD/')
]

In [9]:
# Smoke test: load the first combined AUC+PSD dataset and inspect its shape
files = glob.glob(loadAUC_PSD_dir + '*.pkl')
test_data = pd.read_pickle(files[0])

# Every column except the last is a feature; the last column is the label
X, y = test_data.iloc[:, :-1].values, test_data.iloc[:, -1].values

print(X.shape)


(1643, 160)


In [48]:
# Apply PCA at four levels, 0.99, 0.95, 0.9, 0.85, then perform cross-validation for SVM
def apply_PCA_CV_SVM(loaddir,savedir):
    """Grid-search an RBF SVM on PCA-reduced versions of every dataset in ``loaddir``.

    For each ``*.pkl`` DataFrame found in ``loaddir`` (all columns but the last
    are features, the last column is the label) the function:

    1. takes a stratified 80/20 train/test split,
    2. fits PCA on the training part at 99/95/90/85 % retained variance,
    3. runs a 5-fold ``GridSearchCV`` over ``C`` and ``gamma`` for each level,
    4. refits an SVM with the best (macro-F1) parameters and scores it on the
       held-out test part.

    Files written:
        ``savedir``: ``<stem>_PCA<level>_CV_results.pkl`` for every level (plus a
        CSV copy for the 99 % level).
        ``savedir + 'testResults/'``: one classification report (pickle) and one
        confusion-matrix PNG per dataset/level, ``test_results.csv``,
        ``test_results.pkl`` and a one-row CSV named after the best dataset.

    Args:
        loaddir: Folder containing the pickled datasets; must end with a path
            separator because it is concatenated with ``'*.pkl'``.
        savedir: Output folder; must end with a path separator. It is created
            if it does not exist.

    Raises:
        FileNotFoundError: If ``loaddir`` contains no ``.pkl`` file.
    """
    files = glob.glob(loaddir+'*.pkl')
    if not files:
        raise FileNotFoundError('No .pkl datasets found in ' + loaddir)

    testdir = savedir+'testResults/'
    # Creates savedir as well, so none of the writes below can fail on a missing folder
    os.makedirs(testdir, exist_ok=True)

    sss = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
    bestF1 = {'F1 Score':0,'dataset':'','params':''}
    test_results = {'F1 Score':[],'dataset':[],'params':[]}

    # (file-name tag, fraction of variance to retain)
    variance_levels = [('99', 0.99), ('95', 0.95), ('90', 0.90), ('85', 0.85)]

    # SVM grid
    C_range = [1,3,10,30,100,300,1000]
    param_grid= [
                # {'C': C_range, 'kernel': ['linear']},
                {'C': C_range, 'gamma': [0.001, 0.0001, 'auto', 'scale'], 'kernel': ['rbf']},
                ]
    # A list (not a set) keeps the cv_results_ column order deterministic
    scoring = ['accuracy', 'f1_macro']
    # classification_report pairs these names with the sorted class labels;
    # assumes the label column holds exactly 'C', 'H', 'W' -- TODO confirm
    target_names = ['C','H','W']

    for file in files:
        # Dataset name without folder or extension. str.rstrip('.pkl') would
        # strip any trailing '.', 'p', 'k', 'l' characters, not the suffix.
        stem = os.path.splitext(os.path.basename(file))[0]

        df = pd.read_pickle(file)
        X = df.iloc[:,:-1].values
        y = df.iloc[:,-1].values

        # Only one split is evaluated: the last of the five shuffled splits
        *_, (train_index, test_index) = sss.split(X,y)
        X_train, X_test = X[train_index],X[test_index]
        y_train, y_test = y[train_index],y[test_index]

        for tag, variance in variance_levels:
            # Fit PCA on the training data only, then project the test data
            pca = PCA(n_components=variance, svd_solver='full')
            x_train = pca.fit_transform(X_train)
            x_test = pca.transform(X_test)

            # A fresh search per level: GridSearchCV.fit returns the estimator
            # itself, so reusing one instance would leave every level pointing
            # at the results of the last fit.
            grid = GridSearchCV(svm.SVC(),param_grid=param_grid,scoring=scoring,cv=5,refit='f1_macro')
            grid.fit(x_train,y_train)

            results_df = pd.DataFrame(grid.cv_results_)
            # Number of principal components kept at this variance level
            results_df['No. of PC'] = x_train.shape[1]

            cv_filename = stem + '_PCA' + tag + '_CV_results.pkl'
            results_df.to_pickle(savedir+cv_filename)
            # A CSV copy is kept for the 99 % level only
            if tag == '99':
                results_df.to_csv(savedir+cv_filename+'.csv')

            # Refit an SVM with the parameters ranked first on macro F1
            best_params = results_df.loc[results_df['rank_test_f1_macro'].idxmin()]['params']
            clf = svm.SVC(**best_params)
            clf.fit(x_train,y_train)
            y_pred = clf.predict(x_test)

            df_name = stem + '_' + tag
            report_name = df_name + '_PCA_CV_SVM_clf_report'

            ##Classification report
            report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
            pd.DataFrame(report).transpose().to_pickle(testdir+report_name+'.pkl')

            ##Macro-avg f1 score
            f1Score = report['macro avg']['f1-score']
            test_results['F1 Score'].append(f1Score)
            test_results['dataset'].append(df_name)
            test_results['params'].append(best_params)
            if bestF1['F1 Score']<f1Score:
                bestF1['F1 Score'] = f1Score
                bestF1['dataset'] = df_name
                bestF1['params'] = str(best_params)

            ##Confusion matrix
            # NOTE: plot_confusion_matrix was removed in scikit-learn 1.2
            # (ConfusionMatrixDisplay.from_estimator replaces it); it is kept
            # because it is the helper this file imports.
            fig,ax = plt.subplots()
            ax.set_title(df_name+' CM')
            plot_confusion_matrix(clf,x_test,y_test,labels=target_names,ax=ax,normalize='true')
            fig.savefig(testdir+df_name+'.png')
            plt.close(fig)

    bestF1_df = pd.DataFrame(bestF1,index=[0],columns=['F1 Score','dataset','params'])
    bestF1_df.to_csv(testdir+bestF1['dataset']+'.csv')
    test_results_df = pd.DataFrame(test_results)
    test_results_df.to_csv(testdir+'test_results.csv')
    # Written as a real pickle (previously CSV text was saved under a .pkl name)
    test_results_df.to_pickle(testdir+'test_results.pkl')
        

        
    
        

# Output root for the PCA + CV results. A raw string keeps the Windows
# backslashes literal; '\E', '\p' and '\P' are invalid escape sequences in a
# normal string literal.
savedir = r'F:\EEG-data\pkl\0.1s_results\PCA_CV_results/'
saveAUC_dir = savedir+'AUC_only/'
savePSD_dir = savedir+'PSD_only/'
saveAUC_PSD_dir = savedir+'AUC_PSD/'

# Run the PCA + grid-search pipeline once per feature set, in the same order
for load_path, save_path in (
    (loadAUC_dir, saveAUC_dir),
    (loadPSD_dir, savePSD_dir),
    (loadAUC_PSD_dir, saveAUC_PSD_dir),
):
    apply_PCA_CV_SVM(load_path, save_path)





## Compare PCA results

In [100]:
#Create dataframes for comparing PCA results
# Raw strings keep the Windows backslashes literal ('\E', '\p', '\P', '\A'
# are invalid escape sequences in a normal string literal).
AUC_PCA_dir = r"F:\EEG-data\pkl\0.1s_results\PCA_CV_results\AUC_only/"
PSD_PCA_dir = r"F:\EEG-data\pkl\0.1s_results\PCA_CV_results\PSD_only/"
AP_PCA_dir = r"F:\EEG-data\pkl\0.1s_results\PCA_CV_results\AUC_PSD/"

def compare_PCA_results(PCA_dir,AP=False):
    """Summarise, per dataset and PCA level, the component count and test F1 score.

    Reads every ``<stem>_PCA<level>_CV_results.pkl`` table in ``PCA_dir`` and
    looks up the matching ``<stem>_<level>`` row in
    ``PCA_dir/testResults/test_results.csv``. The summary is written to
    ``PCA_dir/testResults/PCA_results.pkl`` and ``PCA_results.csv``, sorted by
    dataset name and retained variance.

    Args:
        PCA_dir: Folder holding the CV result pickles and the ``testResults``
            sub-folder.
        AP: Kept for backward compatibility. The dataset name is now taken as
            everything before the trailing ``_PCA<level>`` token, so the number
            of ``_``-separated tokens in the name no longer matters.

    Raises:
        IndexError: If a CV result file has no matching row in
            ``test_results.csv``.
    """
    suffix = '_CV_results.pkl'
    results_dir = os.path.join(PCA_dir, 'testResults')
    #Load test set results csv
    test_df = pd.read_csv(os.path.join(results_dir, 'test_results.csv'))
    #Create 1 dataframe
    PCA_results = {'Dataset':[],'Variance retained':[],'No. of components':[],'Best mean F1 score':[]}
    for file in glob.glob(os.path.join(PCA_dir, '*.pkl')):
        filename = os.path.basename(file)
        if not filename.endswith(suffix):
            # Not a CV results table
            continue
        # Remove the exact suffix; str.rstrip/lstrip strip character sets,
        # which only worked for names that happen to end in a digit.
        stem, _, pca_tag = filename[:-len(suffix)].rpartition('_')
        level = pca_tag[len('PCA'):] if pca_tag.startswith('PCA') else pca_tag
        dfname = stem + '_' + level
        variance = level + '%'

        matches = test_df.loc[test_df['dataset']==dfname, 'F1 Score']
        if matches.empty:
            raise IndexError("No test result found for dataset '{}'".format(dfname))
        f1Score = matches.values[0]

        pca_df = pd.read_pickle(file)
        # Every row carries the same component count; prefer the named column
        # and fall back to the last column for tables without it.
        if 'No. of PC' in pca_df.columns:
            no_components = pca_df['No. of PC'].iloc[0]
        else:
            no_components = pca_df.iloc[0,-1]

        PCA_results['Dataset'].append(dfname)
        PCA_results['Best mean F1 score'].append(f1Score)
        PCA_results['No. of components'].append(no_components)
        PCA_results['Variance retained'].append(variance)

    PCA_results_df = pd.DataFrame(PCA_results).sort_values(by=['Dataset','Variance retained'])
    PCA_results_df.to_pickle(os.path.join(results_dir, 'PCA_results.pkl'))
    PCA_results_df.to_csv(os.path.join(results_dir, 'PCA_results.csv'))
    # print(PCA_results_df)


# Build the comparison table for one results folder at a time; only the
# combined AUC+PSD folder is currently enabled.
# compare_PCA_results(AUC_PCA_dir)
# compare_PCA_results(PSD_PCA_dir)
compare_PCA_results(AP_PCA_dir,AP=True)

## Investigating Channel effects

In [2]:
#Load the 3 feature scaled datasets with best F1 scores
# Raw strings keep the Windows backslashes literal ('\E', '\p', '\A', '\P'
# are invalid escape sequences in a normal string literal).

AUC_df = pd.read_pickle(r"F:\EEG-data\pkl\featureScaled\AUC_only/AUC_df_0124.pkl")
PSD_df = pd.read_pickle(r"F:\EEG-data\pkl\featureScaled\PSD_only/PSD_df_01234.pkl")
AP_df = pd.read_pickle(r"F:\EEG-data\pkl\featureScaled\AUC_PSD/AUC_PSD_df_01234.pkl")


In [15]:
#For each dataset, split into 32 channels
def testChannels(df,best_params,
                 save_path=r'F:\EEG-data\pkl\0.1s_results\testResults\channels/AUC_channels.csv',
                 min_combo_size=62):
    """Score an SVM on every large subset of the feature columns 32-95 of ``df``.

    One stratified 80/20 split is taken. For each combination of
    ``min_combo_size`` up to all selected feature columns, PCA (99 % variance)
    is fitted on the training part, an ``svm.SVC(**best_params)`` is trained on
    the projected data, and the macro F1 score on the test part is recorded.
    The table of combinations and scores is written to ``save_path`` as CSV.

    Args:
        df: Feature-scaled DataFrame; columns 32-95 are used as features (the
            original note describes these as the theta and alpha bands) and
            the last column as the label.
        best_params: Keyword arguments passed to ``svm.SVC``.
        save_path: Destination CSV file. Defaults to the path that was
            previously hard-coded.
        min_combo_size: Smallest number of feature columns per combination.
    """
    #Choose theta and alpha bands
    X = df.iloc[:,32:96].values
    print(X.shape)
    y = df.iloc[:,-1].values
    sss = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
    pca99 = PCA(n_components=0.99, svd_solver='full')
    Ch_results = {'Channel combinations':[], "F1 score":[]}

    # Only one split is evaluated: the last of the five shuffled splits
    *_, (train_index, test_index) = sss.split(X,y)
    X_train, X_test = X[train_index],X[test_index]
    y_train, y_test = y[train_index],y[test_index]

    # Take the column count from the data instead of assuming 64
    n_features = X.shape[1]
    for size in range(min_combo_size, n_features+1):
        # Iterate lazily; the combinations do not need to be held in memory
        for combo in itertools.combinations(range(n_features), size):
            cols = list(combo)

            x_99_train = pca99.fit_transform(X_train[:,cols])
            x_99_test = pca99.transform(X_test[:,cols])

            clf = svm.SVC(**best_params)
            clf.fit(x_99_train,y_train)
            y_pred = clf.predict(x_99_test)

            f1Score = f1_score(y_test,y_pred,average='macro')
            Ch_results['Channel combinations'].append(cols)
            Ch_results['F1 score'].append(f1Score)
            # print(f1Score)

    pd.DataFrame(Ch_results).to_csv(save_path)
# AUC_df.to_csv("F:\EEG-data\pkl\\featureScaled\AUC_only/AUC_df_0124.csv")
# X = AUC_df.iloc[:,:-1].values
# print(X.shape)
# X = X[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]]
# print(X.shape)
# y = AUC_df.iloc[:,-1].values
# sss = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
# for train_index, test_index in sss.split(X,y):
#             X_train, X_test = X[train_index],X[test_index]
#             y_train, y_test = y[train_index],y[test_index]
# pca99 = PCA(n_components=0.99, svd_solver='full')
# x_99_train = pca99.fit_transform(X_train)
# x_99_test = pca99.transform(X_test)
# best_params = {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
# clf = svm.SVC(**best_params)
# clf.fit(x_99_train,y_train)
# y_pred = clf.predict(x_99_test)
# f1Score = f1_score(y_test,y_pred,average='macro')
# print(f1Score)

            

# SVM hyper-parameters used for the channel study on the AUC dataset
auc_svm_params = dict(C=10, gamma='auto', kernel='rbf')
testChannels(AUC_df, auc_svm_params)

# AUC_df.iloc[:,[0,2]]

# channels = [str(x) for x in range(32)]
# combos_iter = itertools.combinations(channels,2)
# combos_list = [ [int(x) for x in combo] for combo in combos_iter]
# combos_list = [int(j) for i in combos_iter for j in i]
# a = itertools.combinations(channels,32)
# for i in a:
#     print(''.join(i))

# print(510 +639 +494)
# print(AUC_df.shape)
# print(PSD_df.shape)
# print(PSD_df.shape)

# print(combos_list)
# AUC_df.iloc[:,-1]

(1643, 64)
0.7569601882518802
0.7690465455638456
0.7729330907620781
0.7613956906526566
0.7610981476680206
0.787348788354875
0.7660602385284011
0.7602281621080893
0.7676190864523779
0.7750305592975103
0.7554388701707713
0.7592426828726727
0.7514660885941868
0.7601240248243307
0.7611374074808354
0.7526808768187743
0.7601620600602979
0.7786493200286304
0.7646878211394341
0.7550356926446905
0.7556444781505486
0.7481281422168121
0.7474705779563964
0.766467777950875
0.7473181537886028
0.7600997356054556
0.7451942583152734
0.7631638500490233
0.7483791093127579
0.7500374807910944
0.7700493641670113
0.7571454550177955
0.7487324615280953
0.74256772151509
0.7574227290797059
0.7333038608900678
0.7687894834633964


KeyboardInterrupt: 

In [23]:
# # View results
# pd.DataFrame(grid.cv_results_)[['mean_test_f1_macro', 'std_test_f1_macro', 'params']]
# df = pd.DataFrame(grid.cv_results_)
# print(df)

# pca99 = PCA(n_components=0.99, svd_solver='full')
# principalComponents = pca99.fit(X)
# print(principalComponents.explained_variance_ratio_)
# principalDf = pd.DataFrame(data = principalComponents
#              , columns = ['principal component 1', 'principal component 2'])
# finalDf = pd.concat([principalDf, y], axis = 1)



# fig = plt.figure(figsize = (8,8))
# ax = fig.add_subplot(1,1,1) 
# ax.set_xlabel('Principal Component 1', fontsize = 15)
# ax.set_ylabel('Principal Component 2', fontsize = 15)
# ax.set_title('2 component PCA', fontsize = 20)
# targets = ['C', 'H', 'W']
# colors = ['r', 'g', 'b']
# for target, color in zip(targets,colors):
#     indicesToKeep = finalDf.iloc[:,-1] == target
#     ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
#                , finalDf.loc[indicesToKeep, 'principal component 2']
#                , c = color
#                , s = 50)
# ax.legend(targets)
# ax.grid()

In [34]:
# Scratch cell: attach a constant 28-entry column to the table, then display it
df['AAA'] = [32 for _ in range(28)]
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_f1_macro,split1_test_f1_macro,...,rank_test_f1_macro,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,AAA
0,0.151003,0.015046,0.028916,0.00303,1,0.001,rbf,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}",0.379273,0.304764,...,25,0.452471,0.422053,0.425856,0.425856,0.431298,0.431507,0.010889,25,32
1,0.160371,0.017767,0.028325,0.004789,1,0.0001,rbf,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.210615,0.195558,...,28,0.395437,0.391635,0.395437,0.391635,0.389313,0.392691,0.002397,28,32
2,0.141828,0.007284,0.024934,0.001766,1,auto,rbf,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.508618,0.525238,...,6,0.509506,0.528517,0.570342,0.513308,0.526718,0.529678,0.021627,8,32
3,0.132637,0.006306,0.026535,0.001946,1,scale,rbf,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.496138,0.490916,...,21,0.498099,0.505703,0.520913,0.539924,0.522901,0.517508,0.014555,21,32
4,0.138023,0.006363,0.027527,0.001849,3,0.001,rbf,"{'C': 3, 'gamma': 0.001, 'kernel': 'rbf'}",0.452861,0.48394,...,23,0.471483,0.505703,0.513308,0.551331,0.496183,0.507602,0.026013,24,32
5,0.156977,0.01399,0.028124,0.002778,3,0.0001,rbf,"{'C': 3, 'gamma': 0.0001, 'kernel': 'rbf'}",0.252717,0.229562,...,27,0.410646,0.395437,0.410646,0.414449,0.412214,0.408678,0.006766,27,32
6,0.151201,0.01028,0.026535,0.002641,3,auto,rbf,"{'C': 3, 'gamma': 'auto', 'kernel': 'rbf'}",0.505268,0.52965,...,1,0.509506,0.532319,0.596958,0.524715,0.557252,0.54415,0.030588,1,32
7,0.13524,0.005784,0.026735,0.002482,3,scale,rbf,"{'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}",0.523083,0.489002,...,5,0.520913,0.501901,0.547529,0.551331,0.549618,0.534258,0.01964,4,32
8,0.132436,0.004471,0.026131,0.00213,10,0.001,rbf,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}",0.498114,0.494127,...,18,0.505703,0.513308,0.520913,0.555133,0.526718,0.524355,0.016939,15,32
9,0.141214,0.008533,0.026747,0.001722,10,0.0001,rbf,"{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}",0.383751,0.296192,...,26,0.452471,0.414449,0.425856,0.429658,0.427481,0.429983,0.012413,26,32
