In [1]:
import pandas as pd
import os

In [3]:
# Load the original (reference) dataset that the recorded data is checked against.

original_csv = './../../FreeText-Dataset-31-USERS.csv'
original = pd.read_csv(original_csv)

In [4]:
DIR = "./firefox_unisolated_resistFP_120ms_NJ"

# One recording directory per batch of users; the suffix is the user-id range.
dirs = [
    f"{DIR}_freetext_{user_span}/"
    for user_span in ("1-7", "8-14", "15-21", "22-28", "29-31")
]

In [5]:
# read in timings
#
# Timing files are numbered from 0 inside each directory. Every directory but
# the last contributes 105 files and the last one 45, which adds up to the
# 31 * 15 recordings asserted below.
files_per_dir = [105] * (len(dirs) - 1) + [45]

timings_list = []

# Loop variables deliberately avoid the names `dir` and `set`, so the builtins
# stay usable in the rest of the notebook.
for timing_dir, file_count in zip(dirs, files_per_dir):
    for file_no in range(file_count):
        timing_path = os.path.join(timing_dir, f'./text-timing-data({file_no}).csv')
        timings_list.append(pd.read_csv(timing_path))


assert len(timings_list) == 31 * 15, \
    f"expected {31 * 15} timing files, loaded {len(timings_list)}"


In [6]:
# read in keys
#
# Each directory holds the key files of up to 7 consecutive users (directory 0
# -> users 1-7, directory 1 -> users 8-14, ...), 15 sets per user. The last
# directory is only partially filled: it stops at user 31.
users = range(1, 8)
keys_list = []


# Loop variables deliberately avoid the names `dir` and `set`, so the builtins
# stay usable in the rest of the notebook.
for group_idx, keys_dir in enumerate(dirs):
    for user_offset in users:
        user_id = user_offset + (group_idx * 7)
        if user_id > 31:
            # no recordings beyond user 31 (last directory covers 29-31 only)
            break
        for set_i in range(1, 16):
            keys_path = os.path.join(
                keys_dir, f'free-text-output/user{user_id}-{set_i}.csv')
            keys_list.append(pd.read_csv(keys_path))


# keys and timings are paired by position later on, so the counts must agree
assert len(keys_list) == len(timings_list) == 31 * 15, \
    f"expected {31 * 15} key and timing files, " \
    f"got {len(keys_list)} keys / {len(timings_list)} timings"


In [7]:
# combine keys and timings
#
# For every (user, set) pair the recorded timings are stripped of their warm-up
# events, re-based on the first remaining event, joined with the typed keys and
# cross-checked against the original dataset. The result is one DataFrame per
# user, collected in `dataframes`.

dataframes = []

# for each user, create an individual dataset
for user in range(1, 32):
    user_dataframes = []

    # for each set, create an individual dataset
    # (`set_no` rather than `set`, so the builtin is not shadowed)
    for set_no in range(1, 16):
        # timings_list / keys_list are indexed as 15 consecutive entries per user
        idx = ((user - 1) * 15) + (set_no - 1)
        timings = timings_list[idx]
        keys = keys_list[idx]

        # trim warmups: the first 8 events must all be the same warm-up key
        warmups = timings.iloc[:8]['key'].to_list()
        assert warmups == ['Control'] * 8 or warmups == ['Backspace'] * 8, \
            f"user {user}, set {set_no}: unexpected warm-up keys {warmups}"
        timings = timings.iloc[8:]

        # ground timings on first event
        timestamps = []  # offset of each event from the first event
        distances = []   # offset of each event from the event before it
        first = None
        previous = None
        for stamp in timings['timestamp']:
            if first is None:
                # the first event is the origin for both series
                first = stamp
                previous = stamp
                timestamps.append(0)
                distances.append(0)
                continue

            grounded = stamp - first
            distance = stamp - previous
            previous = stamp

            # timestamps must never run backwards
            assert grounded >= 0, \
                f"user {user}, set {set_no}: event precedes the first event"
            assert distance >= 0, \
                f"user {user}, set {set_no}: timestamps are not non-decreasing"

            timestamps.append(grounded)
            distances.append(distance)

        assert len(timings['timestamp']) == len(timestamps)
        assert len(timings['timestamp']) == len(distances)

        # get original data for user and set
        originals = original.loc[(original['user'] == user) & (original['set'] == set_no)]

        assert len(timestamps) == len(keys) == len(originals), \
            f"user {user}, set {set_no}: {len(timestamps)} timings, " \
            f"{len(keys)} keys, {len(originals)} original rows"

        # create dataset
        df = pd.DataFrame({
            'user': user,
            'set': set_no,
            'timestamp': timestamps,
            'distance': distances,
            'key': keys['key'].to_list(),
            'pressed_key': timings['key'].to_list()
        })

        # check dimensions
        assert len(df) == len(originals)

        # check keys match
        assert df['key'].to_list() == originals['key'].to_list(), \
            f"user {user}, set {set_no}: keys differ from the original dataset"

        user_dataframes.append(df)

    # combine all datasets to one user dataset
    assert len(user_dataframes) == 15
    user_dataframe = pd.concat(user_dataframes)

    dataframes.append(user_dataframe)


In [9]:
# Stitch the per-user frames together into the final dataset.
complete = pd.concat(dataframes)

# Spell the space key as "Space" so the column can be compared with
# original['to_press'] below; every other value is left as it is.
complete['pressed_key'] = complete['pressed_key'].replace({' ': "Space"})

# The result must line up row for row with the original dataset.
assert len(original) == len(complete)
assert complete['pressed_key'].to_list() == original['to_press'].to_list()

complete.to_csv('./complete.csv', index=False)