In [1]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import numpy as np
import csv

In [2]:
begin, end = 1, 61 # (begin is inclusive, end is exclusive)
num_people = 12
count_samples = {
    "active": 8,
    "meditate": 8,
    "neutral": 8
}

class Sample:
    def __init__(self):
        self.data = {
            'RawEEG': [],
            'Alpha': [],
            'Low Beta': [],
            'High Beta': [],
            'Gamma': [],
            'Theta': [],
            'Delta': [],
            'Meditation': [],
            'Attention': []
        }

    def recordDataPoint(self, RawEEG, Attention, Meditation, Alpha, Delta, Theta, LowBeta, HighBeta, Gamma):
        self.data['RawEEG'].append(float(RawEEG))
        self.data['Attention'].append(float(Attention))
        self.data['Meditation'].append(float(Meditation))
        self.data['Alpha'].append(float(Alpha))
        self.data['Delta'].append(float(Delta))
        self.data['Theta'].append(float(Theta))
        self.data['Low Beta'].append(float(LowBeta))
        self.data['High Beta'].append(float(HighBeta))
        self.data['Gamma'].append(float(Gamma))

    '''
    Record a line of data from the CSV output, which takes form RawEEG, Alpha, Delta, Gamma, Low Beta, High Beta, Theta, Attention, Meditation

    '''
    def recordDataLine(self, line):
        self.recordDataPoint(line[0], line[7], line[8], line[1], line[2], line[6], line[4], line[5], line[3])
    
    def getEEG(self):
        return self.data['RawEEG']
    
    def getAttention(self):
        return self.data["Attention"]
    
    def getMeditation(self):
        return self.data["Meditation"]
    
    def getAlpha(self):
        return self.data["Alpha"]
    
    def getDelta(self):
        return self.data["Delta"]
    
    def getTheta(self):
        return self.data["Theta"]
    
    def getLowBeta(self):
        return self.data["Low Beta"]
    
    def getHighBeta(self):
        return self.data["High Beta"]
    
    def getGamma(self):
        return self.data["Gamma"]

    def get(self, key):
        return self.data[key]

    '''
    Filter out all outliers, as defined by being outside 3*std from the mean, and replace with mean of the samples around them
    '''
    def filter_outliers(self):
        sampleBad = False
        for key in ['RawEEG', 'Alpha', 'Theta', 'Low Beta', 'High Beta', "Gamma", 'Delta']:
            data = self.data[key]
            
            filtered = []
            for i, x in enumerate(data):
                if (key != "RawEEG" and (x > 0.8)):
                    sampleBad = True
                    break
                elif abs(x - np.mean(data)) < 2 * np.std(data):
                    filtered.append(x)
                else:
                    filtered.append(np.mean(data[max(0, i-5):i] + data[i+1:min(len(data), i+5)])) # Mean of the four values in closest time to the outlier, but avoiding index out of bounds
                    # filtered.append(np.median(data))
                    
            self.data[key] = filtered
        return sampleBad

In [3]:
# {personNum : {state: [sampleNums]}}
# 0 = key for throwing away all samples of that state

badSamples = {
    1: {"active": [5], "neutral": [2], "meditate": []},
    2: {"active": [0], "neutral": [0], "meditate": [0]},
    3: {"active": [1, 4], "neutral": [1], "meditate": [5, 6, 7, 8]},
    4: {"active": [2], "neutral": [7], "meditate": [1, 8]}, # maybe n1
    5: {"active": [], "neutral": [], "meditate": []}, # i love you person 5 
    6: {"active": [], "neutral": [2, 6], "meditate": []},
    7: {"active": [5], "neutral": [4, 6, 7], "meditate": [1, 3, 4, 8]}, # think about killing some of this data
    8: {"active": [5], "neutral": [1], "meditate": []}, # maybe m5 and m8
    9: {"active": [], "neutral": [], "meditate": []}, 
    10: {"active": [6, 8], "neutral": [4, 5, 6], "meditate": []},
    11: {"active": [4], "neutral": [4, 8], "meditate": [1, 2, 3, 5, 7]},
    12: {"active": [2, 3, 8], "neutral": [0], "meditate": [6]}, # maybe n0
}

In [4]:
data = []
dataLabels = []

def transcribeFileToSample(personN: int, sampleN: int, state: str):
    sample_data = Sample()

    with open("data/all_data/" + state + "_" + str(personN) + "_" + str(sampleN) + ".csv") as f:
        reader = csv.reader(f)

        header = next(reader)
        
        for row in reader:
            sample_data.recordDataLine(row)
            
        if (0 not in badSamples[personN][state] and sampleN not in badSamples[personN][state]):

            for key in sample_data.data:
                sample_data.data[key] = sample_data.data[key][begin:end]

            sampleBad = sample_data.filter_outliers()

            if (not sampleBad):   
                sample_data.filter_outliers()
                data.append(sample_data)
                dataLabels.append(state)

for person in range(num_people):

    for state in count_samples:
        for i in range(count_samples[state]):
            transcribeFileToSample(person + 1, i + 1, state)

In [5]:
dataExtracted = []

for point in data:
    extractedPoint = []

    extractedPoint.append(np.mean(point.getAlpha()))
    extractedPoint.append(np.mean(point.getLowBeta()))
    extractedPoint.append(np.std(point.getAlpha()))
    extractedPoint.append(np.mean(point.getTheta()))
    extractedPoint.append(np.std(point.getGamma()))
    extractedPoint.append(np.std(point.getDelta()))

    dataExtracted.append(extractedPoint)

In [6]:
cvclf = make_pipeline(StandardScaler(), PCA(n_components=3), RandomForestClassifier(n_estimators=700))
scores = cross_val_score(cvclf, dataExtracted, dataLabels, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.48 accuracy with a standard deviation of 0.06


In [7]:
total = 0
n = 100
errorLabels = {"active": {"neutral": 0, "meditate": 0}, 
            "neutral": {"active": 0, "meditate": 0}, 
            "meditate": {"active": 0, "neutral": 0}}
            # first key = actual, second key = prediction

for _ in range(n):
    train, test, trainLabels, testLabels = train_test_split(dataExtracted, dataLabels, test_size=0.20)

    clf = make_pipeline(StandardScaler(), PCA(n_components=3), RandomForestClassifier(n_estimators=700))
    clf.fit(train, trainLabels)

    predictions = clf.predict(test)
    for i in range(len(predictions)):
        if predictions[i] != testLabels[i]:
            errorLabels[testLabels[i]][predictions[i]] += 1 
            
    total += clf.score(test, testLabels) 

print(total/n)


0.5918604651162793


In [8]:
print(errorLabels)

{'active': {'neutral': 347, 'meditate': 267}, 'neutral': {'active': 413, 'meditate': 254}, 'meditate': {'active': 257, 'neutral': 217}}
