In [9]:
import math
import csv
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import antropy as ant
from sklearn.feature_selection import SelectFdr, SelectPercentile, SelectKBest, chi2


In [10]:
# Row window kept from every recording: rows begin .. end-1
# (begin is inclusive, end is exclusive).
begin = 1
end = 61

# Number of people whose recordings are loaded (person ids run 1..num_people).
num_people = 14

# Number of recordings per person for each state.
count_samples = dict.fromkeys(("active", "meditate", "neutral"), 8)

class Sample:
    """One recording, stored as parallel per-channel lists of floats.

    ``self.data`` maps a channel name to the list of values recorded for that
    channel; every call to ``recordDataPoint`` appends one value to each list.
    """

    def __init__(self):
        # NOTE: the key order matters. Code that iterates over ``self.data``
        # (e.g. to build feature vectors) relies on this exact ordering.
        self.data = {
            'RawEEG': [],
            'Alpha': [],
            'Low Beta': [],
            'High Beta': [],
            'Gamma': [],
            'Theta': [],
            'Delta': [],
            'Meditation': [],
            'Attention': []
        }

    def recordDataPoint(self, RawEEG, Attention, Meditation, Alpha, Delta, Theta, LowBeta, HighBeta, Gamma):
        """Append one reading per channel; every argument is converted with ``float``.

        Raises:
            ValueError: if any argument cannot be converted to ``float``.
        """
        self.data['RawEEG'].append(float(RawEEG))
        self.data['Attention'].append(float(Attention))
        self.data['Meditation'].append(float(Meditation))
        self.data['Alpha'].append(float(Alpha))
        self.data['Delta'].append(float(Delta))
        self.data['Theta'].append(float(Theta))
        self.data['Low Beta'].append(float(LowBeta))
        self.data['High Beta'].append(float(HighBeta))
        self.data['Gamma'].append(float(Gamma))

    def recordDataLine(self, line):
        """Record a line of data from the CSV output.

        ``line`` is indexed positionally and takes the form
        RawEEG, Alpha, Delta, Gamma, Low Beta, High Beta, Theta, Attention, Meditation.
        """
        self.recordDataPoint(line[0], line[7], line[8], line[1], line[2], line[6], line[4], line[5], line[3])

    def getEEG(self):
        """Return the list of raw EEG values."""
        return self.data['RawEEG']

    def getAttention(self):
        """Return the list of attention values."""
        return self.data["Attention"]

    def getMeditation(self):
        """Return the list of meditation values."""
        return self.data["Meditation"]

    def getAlpha(self):
        """Return the list of alpha values."""
        return self.data["Alpha"]

    def getDelta(self):
        """Return the list of delta values."""
        return self.data["Delta"]

    def getTheta(self):
        """Return the list of theta values."""
        return self.data["Theta"]

    def getLowBeta(self):
        """Return the list of low-beta values."""
        return self.data["Low Beta"]

    def getHighBeta(self):
        """Return the list of high-beta values."""
        return self.data["High Beta"]

    def getGamma(self):
        """Return the list of gamma values."""
        return self.data["Gamma"]

    def get(self, key):
        """Return the list stored under ``key`` (raises ``KeyError`` if unknown)."""
        return self.data[key]

    def filter_outliers(self):
        """Replace outliers in each signal channel with that channel's median.

        A value is treated as an outlier when it lies more than 1.5 * IQR away
        from the channel median, or more than 2 standard deviations away from
        the channel mean. All statistics are computed once per channel from
        the unfiltered values. 'Attention' and 'Meditation' are not filtered.

        Returns:
            Always ``False``. No "bad sample" detection is implemented; the
            return value is kept so existing callers keep working.
        """
        for key in ['RawEEG', 'Alpha', 'Theta', 'Low Beta', 'High Beta', "Gamma", 'Delta']:
            series = self.data[key]
            if not series:
                # Nothing to filter, and the statistics below are undefined
                # for an empty channel.
                continue

            # Loop-invariant statistics: compute once per channel instead of
            # once per element.
            q75, q25 = np.percentile(series, [75, 25])
            iqr = q75 - q25
            med = np.median(series)
            lower, upper = med - 1.5 * iqr, med + 1.5 * iqr
            mean = np.mean(series)
            max_dev = 2 * np.std(series)

            self.data[key] = [
                med if (x < lower or x > upper or abs(x - mean) > max_dev) else x
                for x in series
            ]
        return False

In [11]:
# Manually curated exclusion table: {personNum : {state: [sampleNums]}}
# A sample number listed here is skipped when the recordings are loaded.
# 0 = key for throwing away all samples of that state (for that person).

badSamples = {
    1: {"active": [5], "neutral": [2], "meditate": []},
    2: {"active": [0], "neutral": [0], "meditate": [0]},
    3: {"active": [1, 4], "neutral": [1], "meditate": [5, 6, 7, 8]},
    4: {"active": [2], "neutral": [7], "meditate": [1, 8]}, # maybe n1
    5: {"active": [], "neutral": [], "meditate": []}, # i love you person 5 
    6: {"active": [], "neutral": [2, 6], "meditate": []},
    7: {"active": [5], "neutral": [4, 6, 7], "meditate": [1, 3, 4, 8]}, # think about killing some of this data
    8: {"active": [5], "neutral": [1], "meditate": []}, # maybe m5 and m8
    9: {"active": [], "neutral": [], "meditate": []}, 
    10: {"active": [6, 8], "neutral": [4, 5, 6], "meditate": []},
    11: {"active": [4], "neutral": [4, 8], "meditate": [1, 2, 3, 5, 7]},
    12: {"active": [2, 3, 8], "neutral": [0], "meditate": [6]}, # maybe n0
    13: {"active": [], "neutral": [8], "meditate": []},
    14: {"active": [4, 5, 8], "neutral": [0], "meditate": [1, 2, 8]}
}

In [12]:
# Accumulators filled by transcribeFileToSample(): data[i] is a Sample and
# dataLabels[i] is the state label of that sample.
data = []
dataLabels = []

def transcribeFileToSample(personN: int, sampleN: int, state: str):
    """Load one recording and append it to ``data`` / ``dataLabels``.

    The recording is read from ``data/all_data/<state>_<personN>_<sampleN>.csv``.
    Its first line is skipped as a header, only rows ``begin`` .. ``end - 1``
    of every channel are kept, and outliers are filtered before the sample is
    stored.

    Recordings listed in ``badSamples`` (either by sample number, or via the
    ``0`` "discard everything" marker) are skipped without touching the file.

    Args:
        personN: person number, used as a key of ``badSamples`` and in the file name.
        sampleN: sample number, used in the file name.
        state: state label, used in the file name and stored as the label.

    Raises:
        KeyError: if ``badSamples`` has no entry for ``personN`` / ``state``.
        OSError: if the CSV file of a non-excluded recording cannot be opened.
    """
    excluded = badSamples[personN][state]
    if 0 in excluded or sampleN in excluded:
        # Known-bad recording: do not spend time opening and parsing it.
        return

    sample_data = Sample()

    # newline="" is what the csv module expects for files it reads.
    with open(f"data/all_data/{state}_{personN}_{sampleN}.csv", newline="") as f:
        reader = csv.reader(f)

        next(reader, None)  # skip the header line; tolerate an empty file

        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            sample_data.recordDataLine(row)

    # Keep only the [begin, end) window of every channel.
    for key in sample_data.data:
        sample_data.data[key] = sample_data.data[key][begin:end]

    sample_data.filter_outliers()
    data.append(sample_data)
    dataLabels.append(state)

# Load every recording of every person and state; person and sample numbers
# in the file names are 1-based.
for person in range(num_people):
    person_id = person + 1
    for state, n_samples in count_samples.items():
        for sample_id in range(1, n_samples + 1):
            transcribeFileToSample(person_id, sample_id, state)

In [13]:
# One feature vector (a list of numbers) per entry of `data`.
dataExtracted = []

def safety_check(x):
    """Replace a non-finite feature value with a finite stand-in.

    NaN becomes 0, +inf becomes 99999999999 and -inf becomes -99999999999
    (the sign of an infinite value is preserved). Any finite value is
    returned unchanged.
    """
    if math.isnan(x):
        return 0
    if math.isinf(x):
        # math.isinf is true for both infinities; keep the sign.
        return 99999999999 if x > 0 else -99999999999
    return x

# Build one feature vector per sample: four statistics for every signal
# channel (in the key order of point.data, skipping 'Meditation' and
# 'Attention'), followed by the spectral entropy of the raw EEG.
for point in data:
    extractedPoint = []

    for key in point.data:
        if key not in ('Meditation', 'Attention'):
            extractedPoint.extend(
                safety_check(func(point.get(key)))
                for func in (np.mean, np.std, ant.sample_entropy, ant.higuchi_fd)
            )

    raw_spectral_entropy = ant.spectral_entropy(point.getEEG(), sf=1)
    extractedPoint.append(safety_check(raw_spectral_entropy))

    dataExtracted.append(extractedPoint)

  se = -(psd_norm * np.log2(psd_norm)).sum(axis=axis)
  se = -(psd_norm * np.log2(psd_norm)).sum(axis=axis)


In [14]:
# Shallow copies, so rebinding or appending to X / y leaves the extracted
# features and labels untouched.
X = list(dataExtracted)
y = list(dataLabels)
# Human-readable names for the feature columns: four statistics per band,
# band by band, then the single spectral-entropy feature.
bands = ['Raw EEG', 'Alpha', 'Low Beta', 'High Beta', 'Gamma', 'Theta', 'Delta']
statistic_labels = ("mean", "std", "sample entropy", "higuchi fd")

feature_names = []
for band in bands:
    feature_names.extend(label + " " + band for label in statistic_labels)
feature_names.append("spectral entropy")

In [15]:
# Keep the 5 best-scoring features (SelectKBest with its default score
# function), then show which named features were retained.
percentile = SelectKBest(k=5).fit(X, y)
X_new = percentile.transform(X)
percentile.get_feature_names_out(feature_names)

array(['mean Low Beta', 'mean High Beta', 'std High Beta', 'mean Gamma',
       'std Gamma'], dtype=object)

In [16]:
# Fit a random forest on all samples and list the features from most to
# least important.
forest = RandomForestClassifier(n_estimators=1800, max_depth=70, random_state=0)
forest.fit(X, y)

ranked_importances = sorted(
    zip(feature_names, forest.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, importance in ranked_importances:
    print(name, importance)

std Gamma 0.08092091215971141
mean Theta 0.07414497719509928
mean Gamma 0.06991423535926411
mean High Beta 0.06009976707123903
std High Beta 0.05394636983455957
std Theta 0.04820730060929454
mean Low Beta 0.046156882504495955
std Delta 0.043713862780865226
mean Alpha 0.039576546560637076
mean Delta 0.035673375716230256
std Low Beta 0.03562598938599488
sample entropy Delta 0.03232278362972651
std Raw EEG 0.029204513086664795
higuchi fd Gamma 0.028663091528539734
std Alpha 0.023891149935124405
sample entropy Low Beta 0.02362338017327561
higuchi fd Alpha 0.023482040674867886
higuchi fd Low Beta 0.02265263664387047
sample entropy High Beta 0.02204225775246362
higuchi fd Delta 0.021789294600848
sample entropy Alpha 0.021578207765799592
spectral entropy 0.02157277456645085
higuchi fd High Beta 0.021322295593403724
sample entropy Raw EEG 0.020935850323483586
higuchi fd Theta 0.020892955686202804
sample entropy Gamma 0.020492509526418762
mean Raw EEG 0.019905093844658196
sample entropy Theta 0