In [None]:
import os
import csv
import pandas as pd

from tqdm import tqdm
from collections import Counter

In [None]:
file_list = os.listdir('./Keystrokes/files/')
file_list.remove('readme.txt')
file_list.remove('metadata_participants.txt')
len(file_list)

In [None]:
for file in tqdm(file_list):

    try: 
        df = pd.read_csv('./Keystrokes/files/' + file, sep='\t', encoding="windows-1252", quotechar='|', quoting=csv.QUOTE_NONE, usecols=["PARTICIPANT_ID", "TEST_SECTION_ID", "PRESS_TIME", "RELEASE_TIME", "KEYCODE"])
    except:
        print(file)
        break


In [None]:
df.head()

In [40]:
def session_to_features(session):
    typing_features = []
    for idx, (tstamp, event, key) in enumerate(session):

        if event == 1:
            continue

        # get the release event
        for idx_rel, (tstamp_rel, event_rel, key_rel) in enumerate(session[idx + 1:]):
            if event_rel == 1 and key_rel == key:

                hl = tstamp_rel - tstamp
                
                # next pressed key
                for idx_next, (tstamp_next, event_next, key_next) in enumerate(session[idx + 1:]):
                    if event_next == 0 and key_next != key:

                        il = tstamp_next - tstamp_rel
                        pl = tstamp_next - tstamp

                        # next release key
                        for idx_next_rel, (tstamp_next_rel, event_next_rel, key_next_rel) in enumerate(session[idx + 1:]):
                            if event_next_rel == 1 and key_next_rel == key_next:
                                rl = tstamp_next_rel - tstamp_rel
                                # print("key: {}, hold: {}, inter: {}, press: {}, release: {}".format(key, hl, il, pl, rl))
                                # transform to s
                                typing_features.append([key, hl / 1000, il / 1000, pl / 1000, rl / 1000])
                                break
                        break
                break
    return typing_features

In [41]:
# Transform to sequence dataset
# Path: 136M_eda.ipynb
grp_df = df.groupby(['PARTICIPANT_ID', 'TEST_SECTION_ID'])

# iterate over each group
for name, seq_group in grp_df:
    print("Participant ID: ", name[0])
    print("Test Section ID: ", name[1])
    # iterate through seq_group
    sequence = []
    press_code = 0
    release_code = 1
    for index, row in seq_group.iterrows():
        press_time = row['PRESS_TIME']
        release_time = row['RELEASE_TIME']
        keycode = row['KEYCODE']
        sequence.append([press_time, press_code, keycode])
        sequence.append([release_time, release_code, keycode])

    # sort sequence by time
    sequence.sort(key=lambda x: x[0])

    print(sequence)

    # get typing features
    typing_features = session_to_features(sequence)

    # print sequence
    print(typing_features)

    break



Participant ID:  426279
Test Section ID:  4594212
[[1477859450648, 0, 16], [1477859450880, 0, 65], [1477859451072, 1, 65], [1477859451088, 0, 78], [1477859451144, 1, 16], [1477859451200, 1, 78], [1477859451240, 0, 68], [1477859451384, 1, 68], [1477859451408, 0, 32], [1477859451528, 1, 32], [1477859452056, 0, 8], [1477859452152, 1, 8], [1477859452232, 0, 8], [1477859452336, 1, 8], [1477859452640, 0, 8], [1477859452696, 1, 8], [1477859452968, 0, 78], [1477859453104, 1, 78], [1477859453105, 0, 68], [1477859453256, 1, 68], [1477859453256, 0, 32], [1477859453368, 1, 32], [1477859453520, 0, 16], [1477859454312, 0, 73], [1477859454400, 1, 73], [1477859454456, 1, 16], [1477859454552, 0, 32], [1477859454688, 1, 32], [1477859454720, 0, 67], [1477859454855, 1, 67], [1477859454912, 0, 65], [1477859455040, 1, 65], [1477859455168, 0, 78], [1477859455304, 1, 78], [1477859455488, 0, 84], [1477859455591, 1, 84], [1477859456791, 0, 8], [1477859456840, 1, 8], [1477859457288, 0, 222], [1477859457376, 1, 2

#### Ways to improve
Use a different feature selection/feature engineering method.