<font size="6" face="Times"> <b>Create Participant File</b> </font>

<b>Variables</b>

In [1]:
# Participant identifier; it selects the raw-data folder and names the output file.
participant = "P091"
# NOTE(review): the paths below are absolute and machine-specific; adjust them
# before running the notebook on another machine.
# Folder that is listed to find this participant's raw recordings.
read_raw_data = "D:/Unisinos/Bitalino/01 - Raw Data/" + participant + "/"
# Annotation CSV (read later with ';' as separator).
read_annotations = "D:/Unisinos/Bitalino/02 - Data Control/annotations.csv"
# Destination CSV written by the last cell of the notebook.
write_data = "D:/Unisinos/Bitalino/05 - Data CSV for ML/Participants/" + participant + "_File.csv"
# Folder holding the per-category complement CSVs read by the data_generator_* helpers.
reading_folder = "D:/Unisinos/Bitalino/03 - Data CSV/"

<b>Libraries</b>

In [2]:
import pickle
import pytz
import datetime
import itertools
import pandas as pd
import numpy as np
from os import listdir
from collections import Counter
from os.path import isfile, join

<b>Read All the Files and Process the Data </b>

In [3]:
# Root folder whose sub-folders are listed by the loading loop below.
_pwd = read_raw_data

In [4]:
# One DataFrame per pickle file is collected here; a later cell concatenates them.
data = []
for participant_folder in listdir(_pwd):
    # The participant id is the second space-separated token of the folder name.
    _participant = participant_folder.split(' ')[1]
    participant_path = _pwd + participant_folder
    for folder in listdir(participant_path):
        print(folder)
        # Folder names look like "<id> - <unix timestamp>"; the second part is
        # the recording start, converted to a naive local-time datetime.
        data_timestamp = int(folder.split('-')[1].lstrip())
        data_timestamp = datetime.datetime.fromtimestamp(data_timestamp)

        folder_path = participant_path + '/' + folder
        files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]

        for f in files:
            # File names are "<integer index>.<extension>".
            file_index = int(f.split('.')[0])
            # SECURITY: pickle.load can execute arbitrary code; only load files
            # from a trusted source.
            # The context manager closes the handle (the original leaked it).
            with open(folder_path + '/' + f, "rb") as pickled_file:
                data_values = pickle.load(pickled_file).tolist()
            data_values = pd.DataFrame(data_values)
            data_values.columns = ['DIGITAL{0}'.format(x) for x in range(0,len(data_values.columns))]
            data_values.rename(columns={'DIGITAL6':'ECG','DIGITAL7':'EMG','DIGITAL8':'EDA'}, inplace=True)
            data_values['PARTICIPANT'] = _participant

            # Rescale the raw ECG value: ((x / 1024 - 1/2) * 3.3 / 1100) * 1000.
            # NOTE(review): presumably the sensor transfer function for a 10-bit
            # ADC, VCC = 3.3 V, gain = 1100, result in mV - confirm.
            data_values['ECG'] = [((((x/1024)-(1/2))*3.3)/1100)*1000 for x in data_values['ECG']]

            data_values['FILEINDEX'] = file_index

            # Each row is stamped 1000 ms after the previous one, starting
            # 1000 ms after the folder timestamp.
            # NOTE(review): the original comment said one millisecond per row
            # (@1000Hz), but the step used here is 1000 ms; also every file of a
            # folder starts from the same folder timestamp (FILEINDEX is not
            # used as an offset). Behaviour kept as-is - confirm the intent.
            row_objects = [1000]*len(data_values)
            row_objects = list(np.cumsum(row_objects))
            row_objects = [data_timestamp + datetime.timedelta(milliseconds=int(x)) for x in row_objects]

            data.append(pd.concat([pd.DataFrame(row_objects, columns=['TIMESTAMP']), data_values], axis=1))

201806130369 - 1568307210
201806130369 - 1568308186
201806130369 - 1568309371
201806130369 - 1568310201


<b>Merge All Data into a Same Data Frame </b>

In [5]:
# Stack the per-file frames into one DataFrame. This rebinds `data` from a list
# to a DataFrame, so the cell is not meant to be re-run on its own.
data = pd.concat(data)

Attach the America/Sao_Paulo time zone to the TIMESTAMP column

In [6]:
# Mark every timestamp as America/Sao_Paulo local time.
# pytz zones must be attached with `localize()`: passing a pytz zone through
# `datetime.replace(tzinfo=...)` attaches the zone's historical LMT offset
# instead of the offset valid at that date, which shifts the instant by
# several minutes once the values are converted.
# Any existing tzinfo is dropped first so the cell can be re-run safely.
sao_paulo_tz = pytz.timezone('America/Sao_Paulo')
data['TIMESTAMP'] = data['TIMESTAMP'].apply(lambda x: sao_paulo_tz.localize(x.to_pydatetime().replace(tzinfo=None)))

In [7]:
# Order all rows chronologically.
data = data.sort_values(['TIMESTAMP'])

Delete unused columns

In [8]:
# Drop the listed DIGITAL* columns and FILEINDEX; every listed column must
# exist in `data` (assumes the raw files have 11 columns - TODO confirm).
data=data.drop(['DIGITAL0','DIGITAL1','DIGITAL2','DIGITAL3','DIGITAL4','DIGITAL5','DIGITAL9','DIGITAL10','FILEINDEX'], axis=1)

<b>Read the Annotations and Preprocess</b>

Read the File

In [9]:
# Load the annotation table (semicolon-separated, UTF-8).
annotations = pd.read_csv(read_annotations, sep=';', encoding='utf-8')

Replace NaN with None

In [10]:
# Keep non-null cells and substitute None for the missing ones, so later cells
# can test `is None`.
annotations = annotations.where((pd.notnull(annotations)), None)

In [11]:
# Trim surrounding whitespace from participant ids; None entries are left untouched.
annotations['PARTICIPANTE'] = annotations['PARTICIPANTE'].apply(lambda x: x.lstrip().rstrip() if x is not None else x)


List the Unique Participants in the Data

Method to Combine <b>Date</b> and <b>Time</b> into a <b>Datetime</b>

In [12]:
def combine_date_time(xdate, xtime):
    """Join parallel date and time strings into datetime objects.

    NOTE: the next cell defines a function with the same name, so this version
    is shadowed once the notebook is run top to bottom.

    Parameters
    ----------
    xdate : iterable of str or None
        Dates formatted as ``DD/MM/YYYY``.
    xtime : iterable of str or None
        Times formatted as ``HH:MM``.

    Returns
    -------
    list
        One ``datetime.datetime`` per pair, or ``None`` when either element of
        the pair is ``None``.

    Raises
    ------
    ValueError
        If a non-None pair does not match ``'%d/%m/%Y %H:%M'``.
    """
    return [
        None if (day is None or clock is None)
        else datetime.datetime.strptime(day + ' ' + clock, '%d/%m/%Y %H:%M')
        for day, clock in zip(xdate, xtime)
    ]

In [13]:
def combine_date_time(xdate, xtime):
    """Parse full ``DD-MM-YYYY HH:MM`` strings into datetime objects.

    This definition replaces the one from the previous cell. Only ``xtime`` is
    parsed; ``xdate`` is consulted solely to decide whether the row is missing.

    Parameters
    ----------
    xdate : iterable
        Date column; an element equal to ``None`` marks the row as missing.
    xtime : iterable of str or None
        Strings carrying both date and time, formatted ``DD-MM-YYYY HH:MM``.

    Returns
    -------
    list
        One ``datetime.datetime`` per pair, or ``None`` when either element of
        the pair is ``None``.

    Raises
    ------
    ValueError
        If a non-None ``xtime`` element does not match ``'%d-%m-%Y %H:%M'``.
    """
    return [
        None if (day is None or stamp is None)
        else datetime.datetime.strptime(stamp, '%d-%m-%Y %H:%M')
        for day, stamp in zip(xdate, xtime)
    ]

Cast <b>Start Time</b> and <b>End Time</b> into <b>Datetime</b>

In [14]:
# Parse the start/end columns into datetimes with the combine_date_time
# definition currently in scope; rows where DATA or the time value is None
# become None.
annotations['HORAINICIO'] = combine_date_time(annotations['DATA'],annotations['HORAINICIO'])
annotations['HORAFINAL'] = combine_date_time(annotations['DATA'],annotations['HORAFINAL'])

Cast <b>timer</b> into <b>datetime.time</b>

In [15]:
# Parse 'HH:MM:SS' strings into datetime.time objects; any non-string value
# (e.g. None) becomes None.
annotations['CRONOMETROINICIO'] = annotations['CRONOMETROINICIO'].apply(lambda x: datetime.datetime.strptime(x, '%H:%M:%S').time() if isinstance(x, str) else None)
annotations['CRONOMETROFINAL'] = annotations['CRONOMETROFINAL'].apply(lambda x: datetime.datetime.strptime(x, '%H:%M:%S').time() if isinstance(x, str) else None)

Fill In Missing <b>HORAFINAL</b> Information

In [16]:
import itertools

def get_next(some_iterable, window=1):
    """Pair every item with the item ``window`` positions after it.

    Parameters
    ----------
    some_iterable : iterable
        Source sequence; it is consumed lazily.
    window : int, default 1
        Distance between the two members of each pair.

    Returns
    -------
    zip
        Iterator of ``(item, later_item)`` tuples; it stops when the shifted
        copy runs out, so the last ``window`` items never appear first.
    """
    leading, trailing = itertools.tee(some_iterable, 2)
    # Skip the first `window` elements of the second copy to create the offset.
    shifted = itertools.islice(trailing, window, None)
    return zip(leading, shifted)

# Materialise the rows so the final row is still addressable after the
# pairwise loop (the loop only visits rows 0 .. n-2 as `_current`).
annotation_rows = list(zip(annotations['PARTICIPANTE'], annotations['HORAINICIO'], annotations['HORAFINAL']))

new_feature = []

for _current, _next in get_next(annotation_rows):
    if _next[0] == _current[0]:
        if (_current[1] is None) or (isinstance(_current[1],pd._libs.tslibs.nattype.NaTType)):
            # Start is missing: the end is set to that same missing value.
            new_feature.append(_current[1])
        elif (_current[2] is None) or (isinstance(_current[2],pd._libs.tslibs.nattype.NaTType)):
            # End is missing: borrow the next row's HORAINICIO (same participant).
            new_feature.append(_next[1])
        else:
            # End is present: keep the current HORAFINAL.
            new_feature.append(_current[2])
    else:
        # Next row belongs to another participant: keep the current HORAFINAL.
        new_feature.append(_current[2])

# The last row has no successor, so it keeps its own HORAFINAL.
# (The original appended `_current[2]`, which after the loop is the
# second-to-last row, and raised NameError with fewer than two rows.)
if annotation_rows:
    new_feature.append(annotation_rows[-1][2])

In [17]:
# Store the completed end times (one value per annotation row).
annotations['HORAFINAL'] = new_feature

Fill In Missing <b>CRONOMETROFINAL</b> Information

In [18]:
import itertools

def get_next(some_iterable, window=1):
    """Yield ``(item, item window positions later)`` pairs from an iterable.

    Same helper as the one defined in the HORAFINAL cell, redefined here.

    Parameters
    ----------
    some_iterable : iterable
        Source sequence; it is consumed lazily.
    window : int, default 1
        Offset between the two members of each pair.

    Returns
    -------
    zip
        Iterator of pairs, ending when the shifted copy is exhausted.
    """
    current_items, later_items = itertools.tee(some_iterable, 2)
    # Advance the second copy by `window` elements.
    later_items = itertools.islice(later_items, window, None)
    return zip(current_items, later_items)

# Materialise the rows so the final row is still addressable after the
# pairwise loop (the loop only visits rows 0 .. n-2 as `_current`).
timer_rows = list(zip(annotations['PARTICIPANTE'], annotations['CRONOMETROINICIO'], annotations['CRONOMETROFINAL']))

new_feature = []

for _current, _next in get_next(timer_rows):
    if _next[0] == _current[0]:
        if (_current[1] is None) or (isinstance(_current[1],pd._libs.tslibs.nattype.NaTType)):
            # Timer start is missing: the end is set to that same missing value.
            new_feature.append(_current[1])
        elif (_current[2] is None) or (isinstance(_current[2],pd._libs.tslibs.nattype.NaTType)):
            # Timer end is missing: borrow the next row's CRONOMETROINICIO.
            new_feature.append(_next[1])
        else:
            # Timer end is present: keep the current CRONOMETROFINAL.
            new_feature.append(_current[2])
    else:
        # Next row belongs to another participant: keep the current CRONOMETROFINAL.
        new_feature.append(_current[2])

# The last row has no successor, so it keeps its own CRONOMETROFINAL.
# (The original appended `_current[2]`, which after the loop is the
# second-to-last row, and raised NameError with fewer than two rows.)
if timer_rows:
    new_feature.append(timer_rows[-1][2])

In [19]:
# Store the completed timer end values (one value per annotation row).
annotations['CRONOMETROFINAL'] = new_feature

Create a New Feature from the <b>start time difference</b> and <b>end time difference</b>

In [20]:
# Duration of each annotation from its clock times: end - start, or None when
# either side is None/NaT.
new_feature = []
for start, end in zip(annotations['HORAINICIO'],annotations['HORAFINAL']):   
    if start is None or isinstance(start,pd._libs.tslibs.nattype.NaTType) or end is None or isinstance(end,pd._libs.tslibs.nattype.NaTType):
        new_feature.append(None)
    else:
        # Only the time-of-day parts are compared (both are re-anchored to
        # today's date), so an end earlier in the day than the start yields a
        # negative timedelta.
        start = start.time()
        end = end.time()
        new_feature.append(datetime.datetime.combine(datetime.date.today(), end) - datetime.datetime.combine(datetime.date.today(), start))

In [21]:
# Store the clock-time durations computed in the previous cell.
annotations['HORADIFERENCA'] = new_feature

Create a New Feature from the <b>start timer difference</b> and <b>end timer difference</b>

In [22]:
# Duration of each annotation from its timer values: end - start, or None when
# either side is None/NaT.
new_feature = []
for start, end in zip(annotations['CRONOMETROINICIO'],annotations['CRONOMETROFINAL']):
    if start is None or isinstance(start,pd._libs.tslibs.nattype.NaTType) or end is None or isinstance(end,pd._libs.tslibs.nattype.NaTType):
        new_feature.append(None)
    else:
        # Both values are anchored to today's date so they can be subtracted as
        # datetimes.
        new_feature.append(datetime.datetime.combine(datetime.date.today(), end) - datetime.datetime.combine(datetime.date.today(), start))

In [23]:
# Store the timer durations computed in the previous cell.
annotations['CRONOMETRODIFERENCA'] = new_feature

Cast the Timestamp type to datetime

In [24]:
# Call to_pydatetime() on every start/end value.
# NOTE(review): assumes each element exposes to_pydatetime() (pandas
# Timestamp/NaT); a plain None element would raise AttributeError - confirm.
annotations['HORAINICIO'] = annotations['HORAINICIO'].apply(lambda x: x.to_pydatetime())
annotations['HORAFINAL'] = annotations['HORAFINAL'].apply(lambda x: x.to_pydatetime())

Unify the many Annotations into Specified Categories

In [25]:
# Maps each unified category name to the raw annotation labels that belong to
# it. mergeConfusingAnnotations() walks the keys in this order and matches
# labels case-insensitively by substring, so key order matters when a label
# could match more than one category.
unifyAnnotation = {
    'baseline':[
        'Linha de base sensores (5min)',
        'Linha de base sensores'
    ],
    
    'tsst':[
        'Fala livre participante',
        'Instruções fala livre'
    ],
    
    'arithmetic':[
        'Instruções aritmética',
        'Tarefa de aritmética'
    ],
    
    'post_test_sensors_1':[
        'Pós-teste sensores 1'
    ],
    
    'post_test_sensors_2':[
        'Pós-teste sensores 2'
        
    ],
    
    # Everything below is discarded later when "no_category" rows are removed.
    # Several labels appear in hyphen, en-dash and no-dash spellings, and
    # 'Trajeto sala Pós-teste' is listed twice.
    'no_category':[
        'None',
        'Ligou Polar',
        'Ligou Esense',
        'Ligou Bewell',
        'Ligou Bewell 1 – mão',
        'Ligou Bewell 2 – peito',
        'Instruções para coleta da linha de base',
        'Pré-teste instrumentos',
        'Trajeto sala TSST',
        'Instruções para o participante',
        'Saída da sala pesquisadora',
        'Trajeto sala Pós-teste',
        'Saída da sala participante',
        'Pós-teste Instrumentos',
        'Trajeto sala Pós-teste',
        'Coleta Saliva',
        'Coleta Saliva I (-1min)',
        'Coleta Saliva II (+1min)',
        'Coleta Saliva III',
        'Coleta Saliva IV (+30min)',
        'Final – participante liberado',
        'Final - participante liberado',
        'Fim desligar sensores',
        'Fim - desligar sensores',
        'Preparação participante',
        'Fim – desligar sensores'
    ]

}

In [26]:
def mergeConfusingAnnotations(x):
    """Map a raw annotation label to its unified category.

    The label is lower-cased and stripped, then compared against every key of
    ``unifyAnnotation`` and every label listed under that key. A candidate
    matches when it is a substring of the label or the label is a substring of
    it. Categories are tried in dictionary order and the first match wins.

    Parameters
    ----------
    x : str or None
        Raw annotation text.

    Returns
    -------
    str or None
        The matching ``unifyAnnotation`` key; ``None`` when ``x`` is ``None``
        or nothing matches.
    """
    if x is None:
        return x

    needle = x.lower().strip()
    for category, aliases in unifyAnnotation.items():
        # The category name itself is tried before its aliases.
        candidates = [category.lower()] + [alias.lower() for alias in aliases]
        if any((candidate in needle) or (needle in candidate) for candidate in candidates):
            return category
    return None

In [27]:
# Keep the unified category in a separate column so it can be compared with
# the raw CATEGORIA text.
annotations['CATEGORIACOMPARACAO'] = annotations['CATEGORIA'].apply(lambda x: mergeConfusingAnnotations(x))

In [28]:
# Side-by-side view of raw vs. unified category for manual inspection
# (`b` is not referenced by any other visible cell).
b = annotations[['CATEGORIA','CATEGORIACOMPARACAO']]

In [29]:
# Overwrite the raw CATEGORIA text with the unified category.
annotations['CATEGORIA'] = annotations['CATEGORIA'].apply(lambda x: mergeConfusingAnnotations(x))

<b>Relates the Data and Annotations</b>

Relates the Annotation <b>category</b> with Data by <b>participant</b>, <b>start time</b>, and <b>end time</b>

In [30]:
# Start with an empty CATEGORY column; it is filled in a later cell.
data['CATEGORY'] = None

In [31]:
# _categories[participant][category] = (earliest HORAINICIO, latest HORAFINAL)
# over all annotation rows of that participant and category.
_categories = {}

for pat in annotations['PARTICIPANTE'].unique():
    # NOTE: `_participant` is reused here as a dict; the loading cell used the
    # same name for the participant id string.
    _participant = {}
    for cat in annotations['CATEGORIA'].unique():      
        # Rows of this participant that carry this category.
        condition = ((annotations['PARTICIPANTE'] == pat) & (annotations['CATEGORIA'] == cat))
        pat_min = annotations[condition]['HORAINICIO'].min()
        pat_max = annotations[condition]['HORAFINAL'].max()

        _participant[cat] = (pat_min,pat_max)
        
    _categories[pat] = _participant

In [32]:
# Replace the index with 0..len(data)-1 so each row has a unique label.
data.index = list(range(len(data)))

In [33]:
# Remove the time zone again so the timestamps can be compared with the
# annotation datetimes in the next cell.
data['TIMESTAMP'] = data['TIMESTAMP'].apply(lambda x: x.tz_localize(None))

In [34]:
# Row label -> category for every sample whose timestamp falls inside a
# category window of its participant. Rows matching no window get no entry.
new_feature = {}
for x in zip(data.index, data['PARTICIPANT'], data['TIMESTAMP']):
    # x = (row label, participant id, timestamp)
    if x[1] in _categories.keys():
        for cat in _categories[x[1]]:
            pat_min = _categories[x[1]][cat][0]
            pat_max = _categories[x[1]][cat][1]

            # Inclusive window test. There is no break, so when several
            # windows contain the timestamp the last category iterated wins.
            if (x[2] >= pat_min) & (x[2] <= pat_max):
                new_feature[x[0]] =  cat

In [35]:
# Turn the mapping into a list aligned with data.index, using None for rows
# without a category. This rebinds `new_feature` from dict to list.
new_feature = [new_feature[x] if x in new_feature else None for x in data.index]

In [36]:
# Write the per-row categories into the DataFrame.
data['CATEGORY'] = new_feature

Delete unused columns

In [37]:
# TIMESTAMP and PARTICIPANT were only needed to assign categories.
data=data.drop(['TIMESTAMP','PARTICIPANT'], axis=1)

Delete the <b>"no_category"</b> data

In [38]:
# Keep every row whose CATEGORY is not the string "no_category".
# NOTE(review): rows with CATEGORY None also pass this filter - confirm that
# is intended.
new_data = data.loc[data['CATEGORY'] != "no_category"].copy()

<b>Method to Complement the Data Category - "baseline"</b>

In [39]:
def data_generator_baseline(df):
    """Return ``df`` followed by the rows of the stored baseline complement file.

    Reads ``baseline_file.csv`` (semicolon-separated, UTF-8) from
    ``reading_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to be complemented.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the file's rows added after it; original index labels are
        kept.
    """
    file = "baseline_file.csv"
    new_df = pd.read_csv(reading_folder + file, sep=';', encoding='utf-8')
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([df, new_df])

<b>Method to Complement the Data Category - "tsst"</b>

In [40]:
def data_generator_tsst(df):
    """Return ``df`` followed by the rows of the stored tsst complement file.

    Reads ``tsst_file.csv`` (semicolon-separated, UTF-8) from
    ``reading_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to be complemented.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the file's rows added after it; original index labels are
        kept.
    """
    file = "tsst_file.csv"
    new_df = pd.read_csv(reading_folder + file, sep=';', encoding='utf-8')
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([df, new_df])

<b>Method to Complement the Data Category - "arithmetic"</b>

In [41]:
def data_generator_arithmetic(df):
    """Return ``df`` followed by the rows of the stored arithmetic complement file.

    Reads ``arithmetic_file.csv`` (semicolon-separated, UTF-8) from
    ``reading_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to be complemented.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the file's rows added after it; original index labels are
        kept.
    """
    file = "arithmetic_file.csv"
    new_df = pd.read_csv(reading_folder + file, sep=';', encoding='utf-8')
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([df, new_df])

<b>Method to Complement the Data Category - "post_test_sensors_1"</b>

In [42]:
def data_generator_post_test_sensors_1(df):
    """Return ``df`` followed by the rows of the stored post_test_sensors_1 file.

    Reads ``post_test_sensors_1_file.csv`` (semicolon-separated, UTF-8) from
    ``reading_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to be complemented.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the file's rows added after it; original index labels are
        kept.
    """
    file = "post_test_sensors_1_file.csv"
    new_df = pd.read_csv(reading_folder + file, sep=';', encoding='utf-8')
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([df, new_df])

<b>Method to Complement the Data Category - "post_test_sensors_2"</b>

In [43]:
def data_generator_post_test_sensors_2(df):
    """Return ``df`` followed by the rows of the stored post_test_sensors_2 file.

    Reads ``post_test_sensors_2_file.csv`` (semicolon-separated, UTF-8) from
    ``reading_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to be complemented.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the file's rows added after it; original index labels are
        kept.
    """
    file = "post_test_sensors_2_file.csv"
    new_df = pd.read_csv(reading_folder + file, sep=';', encoding='utf-8')
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([df, new_df])

<b>Method to Compare the Size of Each Category</b>

In [44]:
def comparator (new_data):
    """Print the row count of each category and return the smallest usable one.

    A category with 25000 rows or fewer is treated as too small to set the
    common size: its count is replaced by a large sentinel (100000000) before
    the minimum is taken. If every category is that small, the sentinel itself
    is returned.

    The original hand-written comparison tree left the result unassigned for
    one ordering (tsst <= arithmetic, tsst <= post_test_sensors_1, but
    tsst > post_test_sensors_2) and raised UnboundLocalError; ``min`` covers
    every ordering.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Frame with a ``CATEGORY`` column.

    Returns
    -------
    int
        Smallest row count among categories with more than 25000 rows, or
        100000000 when no category qualifies.
    """
    min_rows = 25000          # categories at or below this size are ignored
    ignored_size = 100000000  # sentinel that can never be the minimum in practice

    labelled_categories = (
        ("Baseline", "baseline"),
        ("TSST", "tsst"),
        ("Arithmetic", "arithmetic"),
        ("Post Test Sensors I", "post_test_sensors_1"),
        ("Post Test Sensors II", "post_test_sensors_2"),
    )

    sizes = []
    for label, category in labelled_categories:
        size = len(new_data.loc[new_data["CATEGORY"] == category])
        print(label + ": " + str(size))
        sizes.append(size)

    smaller = min(ignored_size if size <= min_rows else size for size in sizes)
    print (smaller)

    return smaller

<b>Method to Create File for Each Category</b>

In [45]:
def data_generator(new_data, minimum_size):
    """Build a frame holding the first ``minimum_size`` rows of every category.

    Categories are processed in the fixed order baseline, tsst, arithmetic,
    post_test_sensors_1, post_test_sensors_2. A category with 25000 rows or
    fewer is first extended with its stored complement file through the
    matching ``data_generator_<category>`` helper.

    ``DataFrame.append`` (used by the original) was removed in pandas 2.0, so
    the pieces are collected and joined with a single ``pd.concat``.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Frame with a ``CATEGORY`` column.
    minimum_size : int
        Number of leading rows to keep per category.

    Returns
    -------
    pandas.DataFrame
        The per-category slices stacked in the order above. A category that
        still has fewer than ``minimum_size`` rows contributes all its rows.
    """
    min_rows = 25000  # at or below this size a category is complemented from file

    # The lambdas defer the lookup of each helper until it is actually needed.
    complements = (
        ("baseline", lambda frame: data_generator_baseline(frame)),
        ("tsst", lambda frame: data_generator_tsst(frame)),
        ("arithmetic", lambda frame: data_generator_arithmetic(frame)),
        ("post_test_sensors_1", lambda frame: data_generator_post_test_sensors_1(frame)),
        ("post_test_sensors_2", lambda frame: data_generator_post_test_sensors_2(frame)),
    )

    pieces = []
    for category, complement in complements:
        subset = new_data.loc[new_data["CATEGORY"] == category].copy()
        if len(subset) <= min_rows:
            subset = complement(subset)
        # Positional slice: the first `minimum_size` rows of this category.
        pieces.append(subset.iloc[0:minimum_size])

    return pd.concat(pieces)

Call the <b>"comparator"</b> Method to Find the Smaller Category

In [46]:
# Common per-category row count used to balance the output file.
minimum_size = comparator(new_data)

Baseline: 139230
TSST: 73931
Arithmetic: 346907
Post Test Sensors I: 584527
Post Test Sensors II: 741600
73931


Call the <b> "data_generator" </b> Method to Generate the File with Standardized Data

In [47]:
# Build the frame with `minimum_size` leading rows per category.
data_save = data_generator(new_data, minimum_size)

Call the <b>"comparator"</b> Method to Verify the DataFrame

In [48]:
# Sanity check: print the per-category row counts of the generated frame.
comparator(data_save)

Baseline: 73931
TSST: 73931
Arithmetic: 73931
Post Test Sensors I: 73931
Post Test Sensors II: 73931
73931


73931

Save the standardized file

In [147]:
# Write the frame as a semicolon-separated UTF-8 CSV with a header row and no
# index column.
data_save.to_csv(write_data, index=None, header=True, sep=';', encoding='utf-8')