# EEG Music Feature Extraction

Goal: Classify types of music by EEG features 

3 types of music: White Noise, Classical, Hip-Hop 
Lengths of music (raw): 
    White Noise: 5934 recorded samples 
    Classical:   6128 recorded samples 
    Hip-Hop:     7674 recorded samples 
Data collection: 128 samples/s, 32 Channels 

Number of 0.1 s segments (12 recorded samples each) in filtered data: 
    White Noise: 494 
    Classical:   510
    Hip-Hop:     639 
    
    
Feature extraction steps:<br>
1. Get Welch's PSD estimate for each channel and plot them 
2. Create features from PSD estimates:
    - PSD estimates for each channel
    - Area under PSD for each channel
    - Polynomial features
3. Create and save datasets 
    - Combinations of datasets by frequency bands 
    

Work done: steps 1, 2 (except polynomial features), and 3

Problems: Creating Polynomial features took up too much memory to run

Future work: Using the multi-taper method, Creating other features, wavelet transform, ratio of bands 

In [5]:
#Import necessary libraries

import pandas as pd
import numpy as np
from scipy import signal
import os
import matplotlib.pyplot as plt
import pickle as pkl
import itertools 
from sklearn.preprocessing import StandardScaler
# %matplotlib inline 
%matplotlib qt


In [2]:
#Load preprocessed data
#Each pickle holds a list of dataframes (C = Classical, H = Hip-Hop, W = White Noise)
cwd = os.getcwd() #current directory
C_filename = cwd+"/pkl/preprocessing/C_bands_0.1s_list.pkl"
H_filename = cwd+"/pkl/preprocessing/H_bands_0.1s_list.pkl"
W_filename = cwd+"/pkl/preprocessing/W_bands_0.1s_list.pkl"

#NOTE(review): unpickling can execute arbitrary code - only load files written by the preprocessing step
with open(C_filename,"rb") as pkl_file:
    C_bands_split_list = pkl.load(pkl_file)

with open(H_filename,"rb") as pkl_file:
    H_bands_split_list = pkl.load(pkl_file)

with open(W_filename,"rb") as pkl_file:
    W_bands_split_list = pkl.load(pkl_file)

#Checking loaded data: every list entry must expose a tuple-valued .shape
#(checked in C, H, W order); nothing is printed otherwise
if all(
    isinstance(band_df.shape,tuple)
    for band_list in (C_bands_split_list,H_bands_split_list,W_bands_split_list)
    for band_df in band_list
):
    #Shape of the first Classical dataframe
    print(C_bands_split_list[0].shape)
    #Shape of the object stored in the first cell of each music type
    print(C_bands_split_list[0].iloc[0,0].shape)
    print(H_bands_split_list[0].iloc[0,0].shape)
    print(W_bands_split_list[0].iloc[0,0].shape)



(510, 32)
(12,)
(12,)
(12,)


In [3]:
#Apply Welch's method for each item in the dataframes

def welch_bands_split_list(bands_list,fs,win,check=False):
    """Apply Welch's PSD estimate to every cell of every dataframe in a list.

    Parameters
    ----------
    bands_list : list of pandas.DataFrame
        Dataframes whose cells each hold a 1-D signal segment.
    fs : float
        Sampling frequency, forwarded to scipy.signal.welch.
    win : int
        Number of recorded samples per Welch segment (forwarded as ``nperseg``).
        Using this method, freq resolution = 1/t = fs/N (N = no. of points in window).
    check : bool, optional
        When True, print the shape of the first PSD dataframe.

    Returns
    -------
    freqs : numpy.ndarray
        Frequency axis computed from the first cell of the first dataframe
        (used for plotting, same for all). Empty array if there is no data.
    PSD_df_list : list of pandas.DataFrame
        One dataframe per input dataframe, same layout, with every cell
        replaced by its array of PSD estimates.
    """
    def psd_of(segment):
        #signal.welch returns (freqs, psd); only the PSD values are kept per cell
        return signal.welch(segment,fs,nperseg=win)[1]

    #DataFrame.applymap was renamed DataFrame.map in pandas 2.1 and applymap is
    #deprecated; use whichever elementwise method this pandas version provides
    elementwise = "map" if hasattr(pd.DataFrame,"map") else "applymap"
    PSD_df_list = [getattr(band_df,elementwise)(psd_of) for band_df in bands_list]

    #Frequency axis for plotting, same for all; guard against empty input
    #instead of failing with an IndexError on bands_list[0]
    if len(bands_list) > 0 and bands_list[0].size > 0:
        freqs,_ = signal.welch(bands_list[0].iloc[0,0],fs,nperseg=win)
    else:
        freqs = np.array([])

    if check and PSD_df_list:
        print(PSD_df_list[0].shape)

    return freqs, PSD_df_list

#Sampling frequency in Hz (data was collected at 128 samples/s)
fs = 128 
#Welch segment length in recorded samples; with fs/N resolution this gives bins ~21.3 Hz apart
win = 6 #Half of sample length (12)

#Get PSD estimates, freqs is the same for all
#check=True prints the shape of the first PSD dataframe of each music type
freqs, C_PSD_df_list = welch_bands_split_list(C_bands_split_list,fs,win,check=True)
_, H_PSD_df_list = welch_bands_split_list(H_bands_split_list,fs,win,check=True)
_, W_PSD_df_list = welch_bands_split_list(W_bands_split_list,fs,win,check=True)


(510, 32)
(639, 32)
(494, 32)


In [4]:
#Create features from PSD 

def get_AUC_PSD_df_list(PSD_df_list, dx=1.0):
    """Replace every PSD array in every dataframe by its area under the curve.

    Parameters
    ----------
    PSD_df_list : list of pandas.DataFrame
        Dataframes whose cells each hold a 1-D sequence of PSD estimates.
    dx : float, optional
        Spacing between consecutive PSD points used by the trapezoidal rule.
        The default of 1.0 integrates over bin index (the original behaviour);
        pass the frequency step to get an area in PSD-units x Hz.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per input dataframe, same layout, each cell holding the
        scalar trapezoidal area of the corresponding PSD array.
    """
    #np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    #`or` short-circuits so the old name is only touched when the new one is absent
    integrate = getattr(np,"trapezoid",None) or np.trapz
    #DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map
    elementwise = "map" if hasattr(pd.DataFrame,"map") else "applymap"

    def area(psd):
        return integrate(psd,dx=dx)

    return [getattr(PSD_df,elementwise)(area) for PSD_df in PSD_df_list]

#Area-under-PSD features, one list of dataframes per music type (C, H, W)
C_AUC_PSD_df_list = get_AUC_PSD_df_list(C_PSD_df_list)
H_AUC_PSD_df_list = get_AUC_PSD_df_list(H_PSD_df_list)
W_AUC_PSD_df_list = get_AUC_PSD_df_list(W_PSD_df_list)


In [5]:
#Expand all lists in cells to their own variables

def expand_PSD_df_list(PSD_df_list, f_max=64):
    """Expand every PSD array cell into one scalar column per PSD bin.

    Parameters
    ----------
    PSD_df_list : list of pandas.DataFrame
        Dataframes with one column per channel; every cell holds a 1-D
        sequence of PSD estimates.
    f_max : float, optional
        Upper end of the frequency axis used to label the bins. Labels are
        evenly spaced from 0 to f_max; the default 64 keeps the original labels.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per input dataframe with the same row index. Column j of
        channel c (1-based, by position) is named "Ch<c>_<freq_j>".
    """
    e_PSD_df_list = []
    for PSD_df in PSD_df_list:
        #Expanded per-channel dataframes, sized by the actual channel count
        #rather than a fixed 32
        e_PSD_df_cols_list = []

        for channel in range(PSD_df.shape[1]):
            #Positional access, so column labels do not have to be 0..n-1
            channel_col = PSD_df.iloc[:,channel]
            #Expand each column into its own dataframe (one column per PSD bin)
            new_col = pd.DataFrame(channel_col.tolist(), index=channel_col.index)
            #Frequency labels are derived from the expanded width of this column
            bin_freqs = np.linspace(0,f_max,new_col.shape[1])
            #Rename every variable in the new column
            new_col.columns = ["Ch"+str(channel+1)+'_'+str(freq) for freq in bin_freqs]
            e_PSD_df_cols_list.append(new_col)

        #Create new dataframe; pd.concat rejects an empty list, so a dataframe
        #without channels maps to an empty frame that keeps the row index
        if e_PSD_df_cols_list:
            e_PSD_df = pd.concat(e_PSD_df_cols_list, axis=1)
        else:
            e_PSD_df = pd.DataFrame(index=PSD_df.index)

        e_PSD_df_list.append(e_PSD_df)
    return e_PSD_df_list

#Expanded PSD features, one list of dataframes per music type (C, H, W)
C_e_PSD_df_list = expand_PSD_df_list(C_PSD_df_list)
H_e_PSD_df_list = expand_PSD_df_list(H_PSD_df_list)
W_e_PSD_df_list = expand_PSD_df_list(W_PSD_df_list)

#Shape of the first expanded dataframe of each music type
print(C_e_PSD_df_list[0].shape)
print(H_e_PSD_df_list[0].shape)
print(W_e_PSD_df_list[0].shape)

(510, 128)
(639, 128)
(494, 128)


In [6]:
#Create datasets with all possible combinations of frequency bands

indices = [0,1,2,3,4] #indices representing delta, theta, alpha, beta, gamma bands in order

def all_combinations(any_list):
    """Return an iterator over every non-empty combination of any_list.

    Combinations are produced as tuples, grouped by size (1, 2, ...,
    len(any_list)) and, within a size, in itertools.combinations order.
    """
    subset_sizes = range(1, len(any_list) + 1)
    return itertools.chain.from_iterable(
        map(lambda size: itertools.combinations(any_list, size), subset_sizes))

combos = list(all_combinations(indices)) #2^(len(indices))-1 combinations

print(combos)
print(type(combos[0]))
print(len(combos))

[(0,), (1,), (2,), (3,), (4,), (0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), (0, 1, 2), (0, 1, 3), (0, 1, 4), (0, 2, 3), (0, 2, 4), (0, 3, 4), (1, 2, 3), (1, 2, 4), (1, 3, 4), (2, 3, 4), (0, 1, 2, 3), (0, 1, 2, 4), (0, 1, 3, 4), (0, 2, 3, 4), (1, 2, 3, 4), (0, 1, 2, 3, 4)]
<class 'tuple'>
31


In [7]:
#Concatenate to form all possible dataframes (PSD and AUC features)

#Only PSD features
#Only AUC features 
#Both PSD and AUC features 

def get_1F_combos_df_list(e_PSD_df_list,combos): 
    """Build one dataframe per combination from a single feature type.

    For every combination in combos, the dataframes of e_PSD_df_list at the
    indices listed in that combination are concatenated column-wise, in the
    order given. Returns the resulting dataframes as a list ordered like combos.

    pd.concat(axis=1) keeps the column labels of its inputs unchanged, so
    inputs that share labels produce duplicate column names in the result.
    """
    return [
        pd.concat([e_PSD_df_list[band] for band in combo],axis=1)
        for combo in combos
    ]

def get_2F_combos_df_list(PSD_df_list,AUC_df_list,combos):
    """Build one dataframe per combination from two feature types.

    For every combination in combos, the selected PSD_df_list dataframes come
    first, followed by the AUC_df_list dataframes at the same indices; all of
    them are concatenated column-wise. Returns a list ordered like combos.
    """
    combos_df_list = []
    for combo in combos:
        selected_psd = [PSD_df_list[band] for band in combo]
        selected_auc = [AUC_df_list[band] for band in combo]
        #PSD columns first, AUC columns after
        combos_df_list.append(pd.concat(selected_psd + selected_auc,axis=1))
    return combos_df_list

#One dataframe per band combination in combos, for each music type (C, H, W)

#List of dataframes with only PSD features
C_PSD_combos_df_list = get_1F_combos_df_list(C_e_PSD_df_list,combos)
H_PSD_combos_df_list = get_1F_combos_df_list(H_e_PSD_df_list,combos)
W_PSD_combos_df_list = get_1F_combos_df_list(W_e_PSD_df_list,combos)

#List of dataframes with only AUC features
C_AUC_combos_df_list = get_1F_combos_df_list(C_AUC_PSD_df_list,combos)
H_AUC_combos_df_list = get_1F_combos_df_list(H_AUC_PSD_df_list,combos)
W_AUC_combos_df_list = get_1F_combos_df_list(W_AUC_PSD_df_list,combos)

#List of dataframes with both features
C_2F_combos_df_list = get_2F_combos_df_list(C_e_PSD_df_list,C_AUC_PSD_df_list,combos)
H_2F_combos_df_list = get_2F_combos_df_list(H_e_PSD_df_list,H_AUC_PSD_df_list,combos)
W_2F_combos_df_list = get_2F_combos_df_list(W_e_PSD_df_list,W_AUC_PSD_df_list,combos)

#Sanity check: number of combinations, and shape of the entry at index 30
#(the last of the 31 combinations, i.e. all five bands)
print(len(C_2F_combos_df_list))
print(C_2F_combos_df_list[30].shape)

# combos_df_list = [0]*len(combos)
# concat_list = [C_e_PSD_df_list[x] for x in combos[15]]
# combos_df_list = pd.concat(concat_list,axis=1)

# H_e_PSD_df_list
# W_e_PSD_df_list

31
(510, 800)


In [11]:
#Append music type columns to all dataframes 

def add_music_col(df_list,music_type):
    """Return copies of the dataframes with a constant 'Music' label column.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Feature dataframes belonging to one music type.
    music_type : str
        Label written to every row of the new 'Music' column (e.g. 'C').

    Returns
    -------
    list of pandas.DataFrame
        New dataframes, in the same order as df_list, each with an added
        'Music' column. The input dataframes are left unmodified.
    """
    new_list = []
    for df in df_list:
        #df[:] is only a slice of the original frame; assigning a column to it
        #raises SettingWithCopyWarning and does not guarantee an independent
        #object, so take an explicit copy before adding the label
        new_df = df.copy()
        new_df['Music'] = pd.Series(music_type,index=df.index)
        new_list.append(new_df)
    return new_list

#Lists
#Label every dataframe with its music type: 'C', 'H' or 'W'
#List of dataframes with only PSD features
C_PSD_combosM_df_list = add_music_col(C_PSD_combos_df_list,'C')
H_PSD_combosM_df_list = add_music_col(H_PSD_combos_df_list,'H')
W_PSD_combosM_df_list = add_music_col(W_PSD_combos_df_list,'W')

#List of dataframes with only AUC features
C_AUC_combosM_df_list = add_music_col(C_AUC_combos_df_list,'C')
H_AUC_combosM_df_list = add_music_col(H_AUC_combos_df_list,'H')
W_AUC_combosM_df_list = add_music_col(W_AUC_combos_df_list,'W')

#List of dataframes with both features
C_2F_combosM_df_list = add_music_col(C_2F_combos_df_list,'C')
H_2F_combosM_df_list = add_music_col(H_2F_combos_df_list,'H')
W_2F_combosM_df_list = add_music_col(W_2F_combos_df_list,'W')



In [21]:
#Combine different music types to form full datasets

#Create list of strings representing combinations of frequency bands to append to saved files
def get_combo_strings(combos):
    """Turn each combination into a string of its members joined without a separator.

    For example (0, 1, 4) becomes '014'. Returns a list of strings ordered
    like combos.
    """
    return [''.join(str(member) for member in combo) for combo in combos]
#One filename suffix per band combination, in the same order as combos
combo_strings = get_combo_strings(combos)

def concatSave_df_list(C_df_list,H_df_list,W_df_list,combo_strings,filename,savedir):
    """Stack the three music types row-wise, pickle each result and return them.

    Parameters
    ----------
    C_df_list, H_df_list, W_df_list : list of pandas.DataFrame
        Per-combination dataframes of each music type; entry i of each list is
        concatenated with entry i of the other two (C first, then H, then W).
    combo_strings : list of str
        Suffix for the i-th output file.
    filename : str
        File name stem.
    savedir : str
        Prefix prepended verbatim to the file name, so it must end with a
        path separator to denote a directory.

    Returns
    -------
    list of pandas.DataFrame
        The concatenated dataframes in order, so callers that bind the result
        get the datasets instead of None. Empty list if the three input lists
        differ in length (a message is printed and nothing is saved).

    Row index labels are kept as-is by the concatenation.
    """
    if not (len(C_df_list) == len(H_df_list) == len(W_df_list)):
        print("Lists are of unequal lengths.")
        return []

    saved_df_list = []
    for i in range(len(C_df_list)):
        new_df = pd.concat([C_df_list[i],H_df_list[i],W_df_list[i]],axis=0)
        savepath = savedir+filename+'_'+ combo_strings[i]+'.pkl'
        #Create the target directory on first use so to_pickle does not fail
        target_dir = os.path.dirname(savepath)
        if target_dir:
            os.makedirs(target_dir,exist_ok=True)
        #Save as pkl file
        new_df.to_pickle(savepath)
        saved_df_list.append(new_df)
    return saved_df_list


#Output directories, one per feature set
#NOTE(review): absolute paths on drive F: - adjust for the machine running the notebook
PSDsavedir = 'F:/EEG-data/pkl/featureExtraction/PSD_only/'
AUCsavedir = 'F:/EEG-data/pkl/featureExtraction/AUC_only/'
AUC_PSDsavedir = 'F:/EEG-data/pkl/featureExtraction/AUC_PSD/'

#File name stems; the band-combination string and '.pkl' are appended on save
PSD_filename = 'PSD_df'
AUC_filename = 'AUC_df'
AUC_PSDfilename = 'AUC_PSD_df'

#"combosM" means "Music" column has been added
#List of dataframes with only PSD features
PSD_combosM_df_list = concatSave_df_list(C_PSD_combosM_df_list,H_PSD_combosM_df_list,W_PSD_combosM_df_list, combo_strings,PSD_filename,PSDsavedir)


#List of dataframes with only AUC features
AUC_combosM_df_list = concatSave_df_list(C_AUC_combosM_df_list,H_AUC_combosM_df_list,W_AUC_combosM_df_list , combo_strings,AUC_filename,AUCsavedir)

#List of dataframes with both features
AUC_PSD_combosM_df_list = concatSave_df_list(C_2F_combosM_df_list,H_2F_combosM_df_list,W_2F_combosM_df_list , combo_strings,AUC_PSDfilename,AUC_PSDsavedir)




In [8]:
####### Don't do polynomial features for now - too much memory ################

# #Generate polynomial features for each dataset 
# #Storing these as generators to save space, each iterable in the generator is a np array 

# from sklearn.preprocessing import PolynomialFeatures 
# poly = PolynomialFeatures(2)

# #Generators with only PSD features, polynomial
# C_poly_PSD_combos_df_gen = (poly.fit_transform(x) for x in C_PSD_combos_df_list)
# H_poly_PSD_combos_df_gen = (poly.fit_transform(x) for x in H_PSD_combos_df_list)
# W_poly_PSD_combos_df_gen = (poly.fit_transform(x) for x in W_PSD_combos_df_list)

# #Generators with only AUC features, polynomial
# C_poly_AUC_combos_df_gen = (poly.fit_transform(x) for x in C_AUC_combos_df_list)
# H_poly_AUC_combos_df_gen = (poly.fit_transform(x) for x in H_AUC_combos_df_list)
# W_poly_AUC_combos_df_gen = (poly.fit_transform(x) for x in W_AUC_combos_df_list)

# #Generators with both features, polynomial
# C_poly_2F_combos_df_gen = (poly.fit_transform(x) for x in C_2F_combos_df_list)
# H_poly_2F_combos_df_gen = (poly.fit_transform(x) for x in H_2F_combos_df_list)
# W_poly_2F_combos_df_gen = (poly.fit_transform(x) for x in W_2F_combos_df_list)

# savedir = 'F:/EEG-data/numpy/featureExtraction/'
# def get_poly_combos_df(combos_df_list,poly,music_type,filename,savedir):
#     for i in range(len(combos_df_list)):
#         savepath = savedir+filename+"_"+str(i)
#         df = poly.fit_transform(combos_df_list[i])
#         # music_array = np.array([music_type]*df.shape[0])
#         # music_array = music_array.reshape(len(music_array),1)
#         # df_M = np.hstack((df,music_array))
#         np.savez(savepath,df_M)

# get_df_coget_poly_combos_dfs_df_list,poly,'C','C_poly_2F_combos',savedir)


In [None]:
####### Don't do polynomial features for now - too much memory ################

# #Append music type columns to all dataframes in generators

# #Create list of strings representing combinations of frequency bands to append to saved files
# def get_combo_strings(combos):
#     combo_strings = []
#     for combo in combos:
#         strings = [str(x) for x in combo]
#         string = ''
#         for i in range(len(strings)):
#             string += strings[i]
#         combo_strings.append(string)
#     return combo_strings
# combo_strings = get_combo_strings(combos)

# #Adds music type column to each dataframe generated
# def add_music_col_gen(generator,music_type,combo_strings,filename,savedir):
#     i=0
#     while i<len(combo_strings):
#         df = next(generator)
#         music_array = np.array([music_type]*df.shape[0])
#         music_array = music_array.reshape(len(music_array),1)
#         df_M = np.hstack((df,music_array))
#         # print(df.shape)
#         # print(music_array.shape)
#         savepath = savedir+filename+'_'+ combo_strings[i]
#         # print(df_M.shape)
#         np.savez_compressed(savepath,df_M)
#         # print(i)
#         i+=1


# #Save to external HDD as compressed npz files 
# savedir = 'F:/EEG-data/numpy/featureExtraction/'

# #Generators with only PSD features, polynomial
# add_music_col_gen(C_poly_PSD_combos_df_gen,'C',combo_strings,'C_poly_PSD_combosM',savedir)
# add_music_col_gen(H_poly_PSD_combos_df_gen,'H',combo_strings,'H_poly_PSD_combosM',savedir)
# add_music_col_gen(W_poly_PSD_combos_df_gen,'W',combo_strings,'W_poly_PSD_combosM',savedir)

# #Generators with only AUC features, polynomial
# add_music_col(C_poly_AUC_combos_df_gen,'C',combo_strings,'C_poly_AUC_combosM', savedir)
# add_music_col(H_poly_AUC_combos_df_gen,'H',combo_strings,'H_poly_AUC_combosM', savedir)
# add_music_col(W_poly_AUC_combos_df_gen,'W',combo_strings,'W_poly_AUC_combosM', savedir)

# #Generators with both features, polynomial
# add_music_col_gen(C_poly_2F_combos_df_gen,'C',combo_strings,'C_poly_2F_combosM',savedir)
# add_music_col_gen(H_poly_2F_combos_df_gen,'H',combo_strings,'H_poly_2F_combosM',savedir)
# add_music_col_gen(W_poly_2F_combos_df_gen,'W',combo_strings,'W_poly_2F_combosM',savedir)


In [31]:
#Number of rows in the first combined-feature dataframe of each music type (C, H, W)
print(len(C_2F_combosM_df_list[0].index))
print(len(H_2F_combosM_df_list[0].index))
print(len(W_2F_combosM_df_list[0].index))

510
639
494


In [None]:
#Combine different music types 

In [None]:
#Split training, cross-validation, test datasets

In [None]:
#Save 