# EEG Music SVM Classifier

Goal: Classify types of music by EEG features 

3 types of music: White Noise, Classical, Hip-Hop
Lengths of music (raw): 
    White Noise: 5934 recorded samples
    Classical:   6128 recorded samples
    Hip-Hop:     7674 recorded samples
Data collection: 128 samples/s, 32 Channels 

Number of 0.1 s samples in filtered data:
    White Noise: 494
    Classical:   510
    Hip-Hop:     639 
    
Steps:
1. Feature Scaling
- Save feature scaled datasets 

2. Split dataset into Train and Test sets
- 80%-20% Train test stratified shuffle split (keeps same proportion of each class in each set)

3. SVM classification 
- Perform grid search for SVM parameters on each dataset
- Determine the best feature parameters and features 

4. SVM evaluation
- Using the best parameters found by the grid search, fit on the training set and evaluate on the held-out test set

Work done: 1, 2, 3, 4

Problems: 
1. Grid search takes too long to run when C = 100

Future work: 

In [10]:
#Import necessary libraries

import pandas as pd
import numpy as np
from scipy import signal
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pickle as pkl
import itertools 
import glob
from sklearn import svm 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

# %matplotlib inline 
%matplotlib qt


In [3]:
#Feature Scaling

# Source folders: one per feature family produced by the feature-extraction step.
loaddir = 'F:/EEG-data/pkl/featureExtraction/'
loadAUC_dir, loadPSD_dir, loadAUC_PSD_dir = (
    loaddir + subfolder for subfolder in ('AUC_only/', 'PSD_only/', 'AUC_PSD/')
)

# Destination folders for the scaled copies, mirroring the source layout.
savedir = 'F:/EEG-data/pkl/featureScaled/'
saveAUC_dir, savePSD_dir, saveAUC_PSD_dir = (
    savedir + subfolder for subfolder in ('AUC_only/', 'PSD_only/', 'AUC_PSD/')
)

#Function for feature scaling
def featureScaling_df(loaddir,savedir):
    """Standardise the feature columns of every pickled DataFrame in ``loaddir``.

    Each ``*.pkl`` file is read as a DataFrame whose last column is treated as
    the label and left untouched; all other columns are passed through
    ``StandardScaler.fit_transform``. The result is pickled under the same file
    name in ``savedir``.

    Parameters
    ----------
    loaddir : str
        Folder (with trailing separator) containing the input ``.pkl`` files.
    savedir : str
        Folder (with trailing separator) that receives the scaled files;
        created if it does not exist.

    Returns
    -------
    None
    """
    # NOTE(review): the scaler is fitted on the whole file, i.e. before the
    # train/test split performed later by SVM_CV / SVM_test, so test-set
    # statistics leak into the scaling. Left as is to keep the pipeline
    # unchanged; consider scaling inside the split instead.
    sc = StandardScaler()
    os.makedirs(savedir, exist_ok=True)
    for file in glob.glob(loaddir+'*.pkl'):
        # basename copes with both "/" and "\\" separators on Windows,
        # unlike splitting on a hard-coded backslash.
        filename = os.path.basename(file)
        df = pd.read_pickle(file)
        features = df.iloc[:, :-1]
        labels = df.iloc[:, -1].reset_index(drop=True)
        # fit_transform refits the scaler, so every file is scaled independently.
        scaled_df = pd.DataFrame(sc.fit_transform(features), columns=features.columns)
        # Concatenate instead of np.hstack: stacking floats with string labels
        # would coerce every feature column to object dtype.
        new_df = pd.concat([scaled_df, labels], axis=1)
        new_df.to_pickle(savedir+filename)
        # new_df.to_csv(savefile+'.csv')

# featureScaling_df(loadPSD_dir,savePSD_dir)
# featureScaling_df(loadAUC_dir,saveAUC_dir)
# featureScaling_df(loadAUC_PSD_dir,saveAUC_PSD_dir)





In [3]:
# Sanity check: open one feature-scaled AUC+PSD dataset and inspect its shape.
loaddir = 'F:/EEG-data/pkl/featureScaled/'
loadAUC_dir = f'{loaddir}AUC_only/'
loadPSD_dir = f'{loaddir}PSD_only/'
loadAUC_PSD_dir = f'{loaddir}AUC_PSD/'

files = glob.glob(f'{loadAUC_PSD_dir}*.pkl')
test_data = pd.read_pickle(files[0])

# Every column but the last is a feature; the last column holds the label.
X, y = test_data.iloc[:, :-1].values, test_data.iloc[:, -1].values

print(X.shape, y.shape, sep='\n')

(1643, 160)
(1643,)


In [3]:
#Run grid-search cross validation on the training split of each dataset and save the CV results

def SVM_CV(loaddir,savedir):
    """Grid-search SVM hyper-parameters on every dataset in ``loaddir``.

    For each ``*.pkl`` DataFrame (features in all but the last column, label in
    the last column) an 80/20 stratified split is drawn, a 5-fold
    ``GridSearchCV`` over linear and RBF kernels is run on the training part
    only, and ``cv_results_`` is pickled to
    ``savedir + <data file name> + 'CV_results.pkl'``.

    Parameters
    ----------
    loaddir : str
        Folder (with trailing separator) holding the feature-scaled datasets.
    savedir : str
        Folder (with trailing separator) receiving one results file per
        dataset; created if it does not exist.

    Returns
    -------
    None
        Evaluation on the held-out split is done separately by ``SVM_test``.
    """
    files = glob.glob(loaddir+'*.pkl')
    sss = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
    # Parameter values to be searched. C = 100 was tried in an earlier grid
    # and is left out here.
    C_range = [1,3,10,30]
    param_grid = [
        {'C': C_range, 'kernel': ['linear']},
        {'C': C_range, 'gamma': [0.001, 0.0001, 'auto', 'scale'], 'kernel': ['rbf']},
    ]
    # A list rather than a set: recent scikit-learn releases only accept
    # str / callable / list / tuple / dict for multi-metric scoring.
    scoring = ['accuracy', 'f1_macro']
    # SVM classifier (GridSearchCV clones it for every candidate)
    clf = svm.SVC()
    os.makedirs(savedir, exist_ok=True)
    for file in files:
        df = pd.read_pickle(file)
        # basename handles "/" as well as "\\"; the file name pattern itself
        # is unchanged so existing result files keep matching.
        cv_filename = os.path.basename(file) + 'CV_results.pkl'
        X = df.iloc[:,:-1].values
        y = df.iloc[:,-1].values
        # Only the LAST of the five stratified splits is used. This must stay
        # in sync with SVM_test, which takes the same split as its test set.
        *_, (train_index, _test_index) = sss.split(X,y)
        X_train, y_train = X[train_index], y[train_index]
        # Run the grid search on the training part only
        grid = GridSearchCV(clf,param_grid=param_grid,scoring=scoring,cv=5,refit='f1_macro')
        grid.fit(X_train,y_train)
        # Export the full cross-validation table
        pd.DataFrame(grid.cv_results_).to_pickle(savedir+cv_filename)


# Inputs: the feature-scaled datasets, one folder per feature family.
loaddir = 'F:/EEG-data/pkl/featureScaled/'
loadAUC_dir, loadPSD_dir, loadAUC_PSD_dir = (
    loaddir + subfolder for subfolder in ('AUC_only/', 'PSD_only/', 'AUC_PSD/')
)

# Outputs: cross-validation tables for the 0.1 s windows, same folder layout.
savedir = 'F:/EEG-data/pkl/0.1s_results/'
saveAUC_dir, savePSD_dir, saveAUC_PSD_dir = (
    savedir + subfolder for subfolder in ('AUC_only/', 'PSD_only/', 'AUC_PSD/')
)

# Run the grid search for each feature family in turn.
SVM_CV(loaddir=loadAUC_dir, savedir=saveAUC_dir)
SVM_CV(loaddir=loadPSD_dir, savedir=savePSD_dir)
SVM_CV(loaddir=loadAUC_PSD_dir, savedir=saveAUC_PSD_dir)


KeyboardInterrupt: 

In [4]:
#Evaluation on test sets 

#Data directories (feature-scaled datasets)
datadir = 'F:/EEG-data/pkl/featureScaled/'
dataAUC_dir = datadir+'AUC_only/'
dataPSD_dir = datadir+'PSD_only/'
dataAUC_PSD_dir = datadir+'AUC_PSD/'
#Results directories (grid-search CV tables)
# Forward slashes are used throughout: the previous literals contained "\E"
# and "\p", which are invalid escape sequences and only worked by accident.
resdir = 'F:/EEG-data/pkl/0.1s_results/'
resAUC_dir = resdir+'AUC_only/'
resPSD_dir = resdir+'PSD_only/'
resAUC_PSD_dir = resdir+'AUC_PSD/'
#Save directories (test-set reports, scores and confusion matrices)
testdir = 'F:/EEG-data/pkl/0.1s_results/testResults/'
testAUC_dir = testdir+'AUC_only/'
testPSD_dir = testdir+'PSD_only/'
testAUC_PSD_dir = testdir+'AUC_PSD/'


def SVM_test(datadir,resdir,testdir):
    """Refit the best grid-search SVM for each dataset and score it on the test split.

    For every dataset in ``datadir`` the matching cross-validation table in
    ``resdir`` is read, the parameter set ranked best by macro F1 is used to
    fit an ``SVC`` on the training split, and predictions on the held-out
    split are evaluated. Per dataset, a classification report (``.pkl``) and a
    confusion-matrix figure (``.png``) are written to ``testdir``; a summary of
    all datasets (``test_results.csv`` / ``test_results.pkl``) and the single
    best dataset (``<dataset>.csv``) are written at the end.

    Parameters
    ----------
    datadir : str
        Folder (with trailing separator) of feature-scaled ``.pkl`` datasets.
    resdir : str
        Folder (with trailing separator) of the CV result ``.pkl`` files.
    testdir : str
        Output folder (with trailing separator); created if missing.

    Returns
    -------
    pandas.DataFrame
        One row per dataset with columns ``'F1 Score'``, ``'dataset'`` and
        ``'params'``.

    Raises
    ------
    ValueError
        If no dataset is found, or the number of datasets and result files
        differ.
    """
    sss = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
    # Files are paired by position, so sort both listings to make the pairing
    # independent of directory listing order.
    # NOTE(review): this assumes result file names sort like their datasets
    # (true for "<data file name>CV_results.pkl") -- confirm for other layouts.
    datafiles = sorted(glob.glob(datadir+'*.pkl'))
    resfiles = sorted(glob.glob(resdir+'*.pkl'))
    if len(datafiles) != len(resfiles):
        raise ValueError(
            "Number of files do not match. "
            "(%d data files, %d result files)" % (len(datafiles), len(resfiles))
        )
    if not datafiles:
        raise ValueError("No .pkl datasets found in " + datadir)
    os.makedirs(testdir, exist_ok=True)

    bestF1 = {'F1 Score':0,'dataset':'','params':''}
    test_results = {'F1 Score':[],'dataset':[],'params':[]}
    # NOTE(review): assumes the labels are exactly 'C', 'H', 'W' (they are also
    # passed as ``labels=`` to the confusion matrix) -- confirm against the data.
    target_names = ['C','H','W']
    for datafile, resfile in zip(datafiles, resfiles):
        #Read files
        data = pd.read_pickle(datafile)
        # splitext removes the extension only; rstrip('.pkl') would also eat
        # trailing 'p', 'k', 'l' or '.' characters of the dataset name.
        dfname = os.path.splitext(os.path.basename(datafile))[0]
        CVres = pd.read_pickle(resfile)
        #Split into train and test sets
        X = data.iloc[:,:-1].values
        y = data.iloc[:,-1].values
        # Use the LAST of the five splits, i.e. the same split SVM_CV trained
        # on, so the test samples were never seen by the grid search.
        *_, (train_index, test_index) = sss.split(X,y)
        X_train, X_test = X[train_index],X[test_index]
        y_train, y_test = y[train_index],y[test_index]
        #Fit SVM on best parameters (rank 1 = best macro F1 in CV)
        best_params = CVres.loc[CVres['rank_test_f1_macro'].idxmin()]['params']
        clf = svm.SVC(**best_params)
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        ##Classification report
        report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
        pd.DataFrame(report).transpose().to_pickle(testdir+dfname+'.pkl')
        ##Macro-avg f1 score
        f1Score = report['macro avg']['f1-score']
        test_results['F1 Score'].append(f1Score)
        test_results['dataset'].append(dfname)
        test_results['params'].append(best_params)
        if bestF1['F1 Score']<f1Score:
            bestF1['F1 Score'] = f1Score
            bestF1['dataset'] = dfname
            bestF1['params'] = str(best_params)
        ##Confusion matrix
        # NOTE: plot_confusion_matrix was removed in scikit-learn 1.2;
        # ConfusionMatrixDisplay.from_estimator is its replacement.
        fig,ax = plt.subplots()
        ax.set_title(dfname+' CM')
        plot_confusion_matrix(clf,X_test,y_test,labels=target_names,ax=ax)
        fig.savefig(testdir+dfname+'.png')
        plt.close(fig)

    # Build the summary once, after all datasets have been processed.
    test_results_df = pd.DataFrame(test_results)
    bestF1_df = pd.DataFrame(bestF1,index=[0],columns=['F1 Score','dataset','params'])
    bestF1_df.to_csv(testdir+bestF1['dataset']+'.csv')
    test_results_df.to_csv(testdir+'test_results.csv')
    # The .pkl file is now a real pickle (it used to be CSV text).
    test_results_df.to_pickle(testdir+'test_results.pkl')
    print(bestF1_df)
    return test_results_df

In [6]:
# Evaluate the AUC-only datasets and keep the per-dataset scores.
AUC_results_df = SVM_test(datadir=dataAUC_dir, resdir=resAUC_dir, testdir=testAUC_dir)

F1 Score      dataset                                      params
0  0.795252  AUC_df_0124  {'C': 3, 'gamma': 'auto', 'kernel': 'rbf'}


In [7]:
# Evaluate the PSD-only datasets and keep the per-dataset scores.
PSD_results_df = SVM_test(datadir=dataPSD_dir, resdir=resPSD_dir, testdir=testPSD_dir)

F1 Score       dataset                                     params
0  0.816176  PSD_df_01234  {'C': 3, 'gamma': 0.001, 'kernel': 'rbf'}


In [8]:
# Evaluate the combined AUC+PSD datasets and keep the per-dataset scores.
AUC_PSD_results_df = SVM_test(
    datadir=dataAUC_PSD_dir,
    resdir=resAUC_PSD_dir,
    testdir=testAUC_PSD_dir,
)

F1 Score         dataset                                       params
0  0.811716  AUC_PSD_df_123  {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}


In [53]:
#Plot results

def plot_barF1_barFreqBands(results_df,testdir):
    """Plot test F1 scores per dataset and band usage among the top ten.

    Two figures are drawn and saved to ``testdir``:

    1. A bar chart of ``'F1 Score'`` for every dataset, sorted from best to
       worst and coloured by the number of frequency bands in the dataset.
    2. A bar chart counting how often each band appears in the ten
       best-scoring datasets.

    Dataset names are split on ``"_"``: the last part is used as the band
    label (one digit per band, 0-4), and the parts before ``'df'`` form the
    figure title.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs the columns ``'F1 Score'`` and ``'dataset'``.
    testdir : str
        Output folder (with trailing separator) for the two PNG files.

    Raises
    ------
    ValueError
        If ``results_df`` has no rows.
    """
    if len(results_df) == 0:
        raise ValueError("results_df is empty; nothing to plot.")
    sorted_df = results_df.sort_values(by=['F1 Score'],ascending=False)
    datasets = sorted_df['dataset'].tolist()

    # Positional access: the old label lookup ``[...][1]`` raised KeyError when
    # fewer than two datasets were present.
    splitList = datasets[0].split("_")
    # Take the text after the last "_" as the band label. str.lstrip removes a
    # *set of characters*, not a prefix, so it was only correct by luck.
    sorted_labels = [name.rsplit("_", 1)[-1] for name in datasets]

    # One colour per bar, chosen by the number of bands in the label. Sizing
    # the list to the data avoids the hard-coded 31 entries, which left invalid
    # ``0`` colours (or overflowed) for any other dataset count.
    color_range = ['tab:blue','tab:orange','tab:green','tab:purple','tab:red']
    color = [color_range[len(label)-1] for label in sorted_labels]

    fig1,ax1 = plt.subplots(figsize=(50,10))
    ax1.bar(sorted_labels,sorted_df['F1 Score'], color=color)

    # Title prefix: every name part before 'df', each followed by a space.
    figtitle = ''
    for part in splitList:
        if part=='df':
            break
        figtitle+=part+" "

    # Legend built from the same palette as the bars, so the "5 bands" entry
    # is 'tab:red' like its bars (it used to be plain 'red').
    legend_labels = ['1 band','2 bands','3 bands','4 bands','5 bands']
    patches = [
        mpatches.Patch(color=band_color, label=band_label)
        for band_color, band_label in zip(color_range, legend_labels)
    ]
    ax1.legend(handles=patches)
    ax1.set_title(figtitle+'results,showing different frequency band combinations',fontsize=20)
    ax1.set_xlabel('Datasets: 0-Delta,1-Theta,2-Alpha,3-Beta,4-Gamma',fontsize=20)
    ax1.set_ylabel('F1 Score',fontsize=20)
    fig1.savefig(testdir+figtitle+' F1score_barchart.png')

    #Figure for frequency band counts
    fig2,ax2 = plt.subplots(figsize=(50,10))
    counts10 = [0]*5
    for label in sorted_labels[0:10]:
        # Each character of the label is a band index (0-4).
        for digit in label:
            counts10[int(digit)]+=1
    ax2.bar(['Delta','Theta','Alpha','Beta','Gamma'],counts10)
    ax2.set_title(figtitle+' results,frequency band counts in 10 highest F1 scores',fontsize=20)
    ax2.set_xlabel('Frequency Bands',fontsize=20)
    ax2.set_ylabel('Counts',fontsize=20)

    fig2.savefig(testdir+figtitle+'FrequencyBandCounts_barchart.png')
    plt.show()



In [35]:
# Charts for the AUC-only results, saved next to the AUC test reports.
plot_barF1_barFreqBands(results_df=AUC_results_df, testdir=testAUC_dir)

In [38]:
# Charts for the PSD-only results, saved next to the PSD test reports.
plot_barF1_barFreqBands(results_df=PSD_results_df, testdir=testPSD_dir)

In [54]:
# Charts for the combined AUC+PSD results, saved next to their test reports.
plot_barF1_barFreqBands(results_df=AUC_PSD_results_df, testdir=testAUC_PSD_dir)

In [16]:
###########Code snippets for future use ##############

#Cross Validation Score 
# from sklearn.model_selection import cross_val_score
# cv = StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
# cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')

#Predict on test results
# from sklearn.metrics import accuracy_score
# y_pred = clf.predict(X_test)
# accuracy_score(y_test, y_pred)

#Randomized Grid Search 
# from sklearn.utils.fixes import loguniform 
# from sklearn.model_selection import RandomizedSearchCV
#Parameter values to be searched
# param_grid = {'C': loguniform(1e0, 1e3),
#  'gamma': [loguniform(1e-4, 1e-3),'auto','scale'],
#  'kernel': ['rbf'],
#  }

# scoring = {'accuracy','f1_macro'}
#SVM classifier
# clf = svm.SVC()

#Instantiate grid search
# grid = RandomizedSearchCV(clf,param_distributions=param_grid,scoring=scoring,cv=5,refit='f1_macro')
# grid.fit(X_train,y_train)

#View results
# pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
# df = pd.DataFrame(grid.cv_results_)

# examine the best model
# print(grid.best_score_)
# print(grid.best_params_)