In [1]:
import os
import csv
import pandas as pd

from tqdm import tqdm
from collections import Counter

In [2]:
# Collect the keystroke files to process. Two entries of the folder
# (readme.txt and metadata_participants.txt) are excluded from the list.
# Filtering instead of calling list.remove() keeps the cell from raising
# ValueError when one of them is absent, and sorting makes the otherwise
# arbitrary os.listdir() order (and therefore file_list[0]) reproducible.
EXCLUDED_FILES = {'readme.txt', 'metadata_participants.txt'}
file_list = sorted(
    entry for entry in os.listdir('./Keystrokes/files/')
    if entry not in EXCLUDED_FILES
)
len(file_list)

168593

In [3]:
# Preprocess every raw keystroke file and write the result, under the same
# file name, to ./Keystroke_prc/.
#
# A newline or a tab character that is enclosed by tab characters is replaced
# by the placeholder word "newline" / "tab" -- presumably so that every record
# stays on one line with a fixed number of fields when the file is parsed
# later (TODO confirm against the dataset description).

# The output folder must exist before the first file is written.
os.makedirs('./Keystroke_prc/', exist_ok=True)

for file in tqdm(file_list):

    # read file; `with` closes the handle even if reading/decoding raises
    with open('./Keystrokes/files/' + file, 'r', encoding='windows-1252') as f:
        content = f.read()

    # replace the literal newline / tab fields by placeholder words
    content = content.replace('\t\n\t', '\tnewline\t')
    content = content.replace('\t\t\t', '\ttab\t')

    # save file
    with open('./Keystroke_prc/' + file, 'w', encoding='windows-1252') as f:
        f.write(content)


100%|██████████| 168593/168593 [03:27<00:00, 811.22it/s]


In [4]:

def keystrokes_to_session(key_df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform a keystroke dataframe into a session dataframe.

    Rows are grouped by (PARTICIPANT_ID, TEST_SECTION_ID). Every row of a
    group contributes two events, ``[PRESS_TIME, 1, KEYCODE]`` and
    ``[RELEASE_TIME, 0, KEYCODE]``; the events of a group are then ordered by
    their time stamp.

    Parameters
    ----------
    key_df : pd.DataFrame
        Keystroke table with the columns PARTICIPANT_ID, TEST_SECTION_ID,
        PRESS_TIME, RELEASE_TIME and KEYCODE.

    Returns
    -------
    pd.DataFrame
        One row per group with the columns PARTICIPANT_ID, TEST_SECTION_ID
        and SEQUENCE, where SEQUENCE is the time-ordered list of
        ``[time, code, keycode]`` events. The frame has these three columns
        and no rows when ``key_df`` has no rows.

    Raises
    ------
    KeyError
        If a grouping column is missing, or if PRESS_TIME, RELEASE_TIME or
        KEYCODE is missing when a group is processed.
    """
    # event codes stored in the second slot of every event
    press_code = 1
    release_code = 0

    session_data = []

    grp_df = key_df.groupby(['PARTICIPANT_ID', 'TEST_SECTION_ID'])
    for (participant_id, section_id), seq_group in grp_df:
        sequence = []

        # Walk the three columns in lockstep instead of using iterrows():
        # iterrows() builds one Series per row, which is slow and coerces all
        # values of the row to one common dtype (e.g. integer times turn into
        # floats as soon as KEYCODE is a float column). Iterating the columns
        # keeps each column's own type and yields plain Python scalars.
        for press_time, release_time, keycode in zip(
            seq_group['PRESS_TIME'],
            seq_group['RELEASE_TIME'],
            seq_group['KEYCODE'],
        ):
            sequence.append([press_time, press_code, keycode])
            sequence.append([release_time, release_code, keycode])

        # Sort the events by time. list.sort() is stable, so events with equal
        # time stamps keep their insertion order (a row's press event stays
        # ahead of its own release event).
        sequence.sort(key=lambda event: event[0])

        session_data.append([participant_id, section_id, sequence])

    sessions_df = pd.DataFrame(
        session_data,
        columns=['PARTICIPANT_ID', 'TEST_SECTION_ID', 'SEQUENCE'],
    )

    return sessions_df


In [5]:
from collections import defaultdict

# Scan every preprocessed file once to
#   * collect the sentence text belonging to each test section id,
#   * count how many (file, sentence) groups each section id appears in,
#   * record files with missing KEYCODE values and files that fail to load.

failed_files = []       # names of files that raised while being read/grouped
failure_reasons = {}    # file name -> repr() of the exception that was raised
failed_due_to_nan = []  # (file, number of missing KEYCODE values, number of rows)

# test_section_id: test_section_text
test_section_dict = defaultdict(str)
# test_section_id: number of (TEST_SECTION_ID, SENTENCE) groups seen over all files
unique_count = Counter()


for file in tqdm(file_list):
    try:
        df = pd.read_csv('./Keystroke_prc/' + file, sep='\t', encoding="windows-1252", quoting=csv.QUOTE_NONE)

        gr_df = df.groupby(['TEST_SECTION_ID', 'SENTENCE'])

        # Only the group keys are needed here; the group frames are ignored.
        for (section_id, sentence), _ in gr_df:
            test_section_dict[section_id] = sentence
            unique_count[section_id] += 1

        # Session export is currently disabled:
        # # transform to session
        # sessions_df = keystrokes_to_session(df)
        # sessions_df.to_csv('./Keystroke_sessions/' + file, index=False)

        # Count the missing key codes once and reuse the number.
        n_missing = df['KEYCODE'].isnull().sum()
        if n_missing:
            failed_due_to_nan.append((file, n_missing, len(df)))
    except Exception as exc:
        # `except Exception` (instead of a bare `except:`) lets
        # KeyboardInterrupt / SystemExit stop the long-running loop, and the
        # reason for the failure is kept for later inspection.
        failed_files.append(file)
        failure_reasons[file] = repr(exc)

100%|██████████| 168593/168593 [05:51<00:00, 479.54it/s]


In [None]:
# Build a table indexed by test_section_id with the section text in the
# SENTENCE column, keep its first 100 rows and export them to Excel.
all_sections_df = pd.DataFrame.from_dict(
    test_section_dict,
    orient='index',
    columns=['SENTENCE'],
)
test_section_df = all_sections_df[:100]
test_section_df.to_excel('./Keystroke_test_section.xlsx', index=True)

In [16]:
# Number of entries in the section-text mapping and in the section counter,
# printed one per line.
print(len(test_section_dict), len(unique_count), sep='\n')


In [7]:
# Summarise how many files were flagged for missing key codes and how many
# could not be processed at all.
for label, flagged in (
    ("Failed due to nan", failed_due_to_nan),
    ("Failed due to other", failed_files),
):
    print(label, len(flagged))

Failed due to nan 0
Failed due to other 0


In [11]:
# Convert the session CSV of the first listed file into an Excel workbook in
# the current directory.
# NOTE(review): this reads from ./Keystroke_sessions/ -- the file must already
# exist there; confirm which step of the notebook produces it.
first_file = file_list[0]
session_csv_path = './Keystroke_sessions/' + first_file
df = pd.read_csv(session_csv_path, encoding="windows-1252")
df.to_excel('./' + first_file + '.xlsx', index=True)