In [15]:
import pandas as pd
import numpy as np
from numpy import array
import os
from pathlib import Path
import tensorflow as tflow

In [16]:
# Sanity check: list the devices TensorFlow can see (the output below shows one CPU and one GPU).
tflow.config.experimental.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [17]:
def clean_training_files():
    """
    Filter "Trainingfiles.csv" down to the usable training files.

    A row is discarded when any of the following holds:
      - 'final_rows' is 0
      - 'no_para' is 0 (empty JSON) or 4 (only four paragraphs)
      - 'are_timestamps_distinct' is False
      - 'user' is 16 or 21 (they only have 2 and 1 test respectively)

    The surviving rows are written to "clean_trainingfiles.csv" (without the
    index column) and returned. The original row index is kept on the
    returned frame.

    :return: DataFrame holding the rows that passed every filter
    """
    training_files = pd.read_csv("Trainingfiles.csv")
    print("Initial length of training files: {}".format(len(training_files)))

    # Build every rejection mask against the same, unmodified frame and apply
    # them in a single step. Dropping sequentially while reusing masks that
    # were computed on the original frame forces pandas to re-align a stale
    # boolean mask on each pass (and to warn about it).
    zero_final = training_files['final_rows'] == 0  # files with zero final rows
    empty_json = training_files['no_para'] == 0  # files with empty JSON's
    four_para = training_files['no_para'] == 4  # files with only four paragraphs
    # `== False` is kept on purpose: it behaves the same for bool and 0/1 columns
    indistinct_timestamp = training_files['are_timestamps_distinct'] == False
    excluded_user = (training_files['user'] == 21) | (training_files['user'] == 16)

    rejected = zero_final | empty_json | four_para | indistinct_timestamp | excluded_user
    cleaned = training_files[~rejected]

    print("Final length of training files: {}".format(len(cleaned)))
    cleaned.to_csv("clean_trainingfiles.csv", index=False)
    return cleaned
    

In [18]:
def get_samples_from_test(df, slider, sample_size, bandpass):
    """
    Cut one test recording into fixed-length windows, paragraph by paragraph.

    Rows are grouped by the 'para' column. Inside every paragraph a window of
    `sample_size` rows is moved forward `slider` rows at a time; windows never
    span two paragraphs. Each window contributes one input matrix (the first 8
    columns that remain once the timestamp columns are dropped) and one integer
    label per target, taken as the maximum of that target inside the window.

    :df: test that is being sampled. Must contain the columns "Timestamp",
         " AdjustedUnix", "para", "effort", "attention" and "interest"
    :slider: the amount by which the window slides during sampling. The lower
             the number, the more samples. Must be a positive integer.
    :sample_size: number of rows in every window. Must be a positive integer.
    :bandpass: when True every window is passed through filter_sample before
               it is stored
    :return: dict with the keys "inputs" (all windows stacked along a new first
             axis), "effort", "attention" and "interest" (one integer per
             window). When no paragraph is long enough all four arrays have a
             first dimension of 0.
    :raises ValueError: if slider or sample_size is not positive (the window
                        would otherwise never advance)
    """
    if slider <= 0 or sample_size <= 0:
        raise ValueError("slider and sample_size must be positive integers")

    df = df.drop(["Timestamp", " AdjustedUnix"], axis=1)  # remove unnecessary columns
    labels = ["effort", "attention", "interest"]
    windows = []
    window_labels = {label: [] for label in labels}

    # Sample each paragraph on its own so a window never mixes two paragraphs
    for _, para in df.groupby('para'):

        # NOTE(review): a paragraph of exactly sample_size rows is skipped even
        # though it could yield one window. Kept as-is so existing datasets stay
        # reproducible; confirm whether `<` was intended.
        if len(para) <= sample_size:
            continue

        # Sliding window: every start position that still leaves room for a
        # full window of sample_size rows
        for start in range(0, len(para) - sample_size + 1, slider):
            window = para.iloc[start:start + sample_size]

            # Extract the window's input columns and apply band pass filtering if requested
            if bandpass == True:
                inputs = filter_sample(np.array(window.iloc[:, :8]))
            else:
                inputs = np.array(window.iloc[:, :8])

            windows.append(inputs)
            for label in labels:
                window_labels[label].append(int(max(window[label].values)))

    print("This sampled test has {0} samples".format(len(windows)))

    if windows:
        stacked_inputs = np.stack(windows, axis=0)  # combine all the inputs into a 3D array
    else:
        # No paragraph was long enough: hand back correctly-ranked empty arrays
        # instead of failing, so callers can still append/concatenate them.
        # assumes filter_sample keeps the window shape -- TODO confirm
        stacked_inputs = np.empty((0, sample_size, df.iloc[:, :8].shape[1]))

    inputs_and_labels = {'inputs': stacked_inputs}
    for label in labels:
        inputs_and_labels[label] = np.array(window_labels[label], dtype=np.int64)

    # Summarise instead of dumping the whole array into the cell output
    print("Shape of inputs: {0}".format(stacked_inputs.shape))
    return inputs_and_labels

In [19]:
def generate_all_samples_or_tests(slider, sample_size, agg, bandpass):
    """
    Generate windowed samples for every test listed in "clean_trainingfiles.csv".

    For each user, every test's "annotated_EEG.csv" is read from the test's
    'path' and passed to get_samples_from_test. Tests whose CSV has no rows are
    skipped. The per-test results are collected under the keys "inputs",
    "attention", "effort" and "interest".

    :slider: step of the sliding window, forwarded to get_samples_from_test
    :sample_size: window length, forwarded to get_samples_from_test
    :agg: when True, the per-test arrays of a user are concatenated along the
          first axis into one array per key; otherwise every key keeps a list
          with one entry per processed test
    :bandpass: forwarded to get_samples_from_test
    :return: dict mapping each user to its {"inputs", "attention", "effort",
             "interest"} dictionary
    """
    clean_tf = pd.read_csv("clean_trainingfiles.csv")
    users = set(clean_tf['user'])
    user_tests = {}
    file = "annotated_EEG.csv"

    for user in users:

        # all test directories that belong to this user
        user_test_paths = array(clean_tf[clean_tf['user'] == user]["path"])

        # Loop through all the tests, generate samples and collect them per key
        inputs_and_labels = {"inputs":[], "attention":[], "effort":[], "interest":[]}
        for test_path in user_test_paths:
            print("Processing user {0} , test {1}".format(extract_user_number(test_path), extract_test_number(test_path)))
            test_dataset = pd.read_csv(test_path + "/" + file)
            if len(test_dataset) == 0:
                continue

            # convert the test into windowed format with samples
            sampled_test_dataset = get_samples_from_test(test_dataset, slider, sample_size, bandpass)

            # add this test's inputs and labels to the user's collection
            for key in inputs_and_labels:
                inputs_and_labels[key].append(sampled_test_dataset[key])

        if agg == True:
            # merge the per-test arrays of every key into a single array
            for key in inputs_and_labels:
                per_test = inputs_and_labels[key]
                if per_test:
                    inputs_and_labels[key] = np.concatenate(per_test, axis=0)
                else:
                    # every test of this user was skipped as empty;
                    # np.concatenate would raise on an empty list
                    inputs_and_labels[key] = np.empty((0,))
                print("Shape of {0}: {1}".format(key,inputs_and_labels[key].shape ))

        user_tests[user] = inputs_and_labels
        print(user_tests.keys())

    print("Adding dictionary...")
    return user_tests

In [20]:
def save_datasets(window_size, slider, bandpass):
    """
    Build the sampled datasets twice -- once aggregated per user, once kept
    per test -- and store each result with save_file.

    :window_size: window length passed on as the sample size
    :slider: step of the sliding window
    :bandpass: passed on to the sample generation; also part of the file names
    """
    # Generate the aggregated variant first, then the per-test variant
    aggregated, per_test = [
        generate_all_samples_or_tests(slider, window_size, agg=flag, bandpass=bandpass)
        for flag in (True, False)
    ]

    name_fields = (window_size, bandpass, slider)
    targets = (
        "HackX/Datasets/saved_user_and_test_data/all_users_sampled_{0}_window_annotated_EEG_agg_bandpass_{1}_slider_{2}.pickle".format(*name_fields),
        "HackX/Datasets/saved_user_and_test_data/all_users_sampled_{0}_window_annotated_EEG_no_agg_bandpass_{1}_slider_{2}.pickle".format(*name_fields),
    )

    for target, dataset in zip(targets, (aggregated, per_test)):
        save_file(target, dataset)

In [21]:
def combine_test_per_user(agg=False):
    """
    Combines all tests per user without sampling.

    Reads "clean_trainingfiles.csv", loads the "annotated_EEG.csv" found under
    every test's 'path' and groups the resulting DataFrames by user. The
    dictionary is pickled to
    "HackX/Datasets/saved_user_and_test_data/all_tests_EEG_<agg>.pickle"
    (the directory must already exist) and returned.

    :agg: when True, the tests of a user are concatenated into a single
          DataFrame; otherwise each user maps to a list with one DataFrame
          per test
    :return: dict mapping each user to its DataFrame (agg=True) or list of
             DataFrames (agg=False)
    """
    # Imported here because the notebook's import cell does not provide pickle,
    # which would make this function fail with a NameError on a fresh kernel.
    import pickle

    df = pd.read_csv("clean_trainingfiles.csv")
    users = set(array(df['user']))
    all_users = {}

    file = "annotated_EEG.csv"
    for user in users:
        test_paths = array(df[df['user'] == user]["path"])
        test_list = [pd.read_csv(test_path + "/" + file) for test_path in test_paths]

        if agg == True:
            test_list = pd.concat(test_list)

        all_users[user] = test_list
        # with agg=False the reported size is the number of tests, not rows
        print("Processed user {0}:\tDataframe size: {1}".format(user, len(test_list)))

    print("Saving dictionary...")
    saved_file = "HackX/Datasets/saved_user_and_test_data/all_tests_EEG_{0}.pickle".format(agg)
    with open(saved_file, 'wb') as handle:
        pickle.dump(all_users, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return all_users