### Preprocessing Aalto Mobile Keystrokes Dataset for Keystroke Biometric Authentication

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [2]:
RAW_KEY = "./raw_data/csv_raw_and_processed/Data_Raw/keystrokes.csv"
RAW_KEY_HEAD = "./raw_data/csv_raw_and_processed/Data_Raw/keystrokes_header.csv"
RAW_TEST = "./raw_data/csv_raw_and_processed/Data_Raw/test_sections.csv"
RAW_TEST_HEAD = "./raw_data/csv_raw_and_processed/Data_Raw/test_sections_header.csv"

PRC_PATH = "./prc_data/mobile_users/"
PRC_BENCH = "./prc_data/mobile_bench/"
PRC_BENCH_REST = "./prc_data/mobile_bench_rest/"

In [3]:
# empty folders
prc_files = os.listdir(PRC_PATH)
for file in prc_files:
  os.remove(PRC_PATH + file)

# empty clean folder
prc_files = os.listdir(PRC_BENCH)
for file in prc_files:
  os.remove(PRC_BENCH + file)

In [4]:
# assemble the keystrokes dataframe
keystrokes_df = pd.read_csv(RAW_KEY, header=None, escapechar='\\', encoding='ISO-8859-1')
key_header = pd.read_csv(RAW_KEY_HEAD, header=None)

keystrokes_df.columns = key_header.iloc[1:, 0].values
keystrokes_df = keystrokes_df[['TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE']]
keystrokes_df.head()

Unnamed: 0,TEST_SECTION_ID,PRESS_TIME,RELEASE_TIME,KEYCODE
0,1,1536208819372,1536208819391,229
1,1,1536208819590,1536208819599,229
2,1,1536208819699,1536208819707,229
3,1,1536208819733,1536208819741,229
4,1,1536208820102,1536208820113,229


In [5]:
# groupby TEST_SECTION_ID and transform to sequence
seq_df = keystrokes_df.groupby('TEST_SECTION_ID').apply(lambda x: list(zip(x["PRESS_TIME"], x["RELEASE_TIME"], x["KEYCODE"]))).reset_index()
seq_df = seq_df.rename(columns={0: "SEQUENCE"})
seq_df.head()

Unnamed: 0,TEST_SECTION_ID,SEQUENCE
0,1,"[(1536208819372, 1536208819391, 229), (1536208..."
1,2,"[(1536208830934, 1536208830950, 229), (1536208..."
2,3,"[(1536208854834, 1536208854852, 229), (1536208..."
3,4,"[(1536208882492, 1536208882502, 229), (1536208..."
4,5,"[(1536208891467, 1536208891478, 229), (1536208..."


In [6]:
# transform the sequence into (timestamp, event_type, keycode)

def event_func(seqe):
  new_seq = []
  # for s in seqe:
  #   new_seq.append((s[0], 1, s[2]))
  #   new_seq.append((s[1], 0, s[2]))
  new_seq = sorted(seqe, key=lambda x: x[0])
  return new_seq

seq_df["SEQUENCE"] = seq_df["SEQUENCE"].apply(event_func)

In [7]:
# assemble the test_section df
mobile_test_section_df = pd.read_csv(RAW_TEST, escapechar='\\', quotechar='"', encoding='ISO-8859-1', header=None)
header_df = pd.read_csv(RAW_TEST_HEAD)
# put header on test sections df
mobile_test_section_df.columns = header_df.iloc[:, 0].values
mobile_test_section_df.head()

Unnamed: 0,TEST_SECTION_ID,SENTENCE_ID,PARTICIPANT_ID,USER_INPUT,INPUT_TIME,EDIT_DISTANCE,ERROR_RATE,WPM,INPUT_LENGTH,ERROR_LEN,POTENTIAL_WPM,POTENTIAL_LENGTH,DEVICE
0,1,901,1,It's not looking too good is it?,8174,0,0.0,45.51015414729631,31,32,48.117966627861854,31,N
1,2,1348,1,"Once state owned, Telecom is now half owned by...",20182,0,0.0,41.02665741750074,69,70,41.25644631973746,66,N
2,3,1252,1,These factors dictate creak limits to the conc...,23969,4,6.25,30.539446785431185,61,64,33.90567267985221,65,N
3,4,978,1,I have forwarded to Kelly,5230,1,3.8461538461538463,57.36137667304016,25,26,61.52584085315833,25,N
4,5,517,1,"Thank to you soon, Gerrard",5210,9,33.33333333333333,59.88483685220729,26,27,64.06570841889116,26,N


In [8]:
# join by test_section_id
joined_df = seq_df.merge(mobile_test_section_df[['TEST_SECTION_ID', "PARTICIPANT_ID"]], on='TEST_SECTION_ID', how='left')
joined_part_df = joined_df[['PARTICIPANT_ID', 'TEST_SECTION_ID', "SEQUENCE"]].groupby(['PARTICIPANT_ID'])
joined_part_df.head()

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SEQUENCE
0,1,1,"[(1536208819372, 1, 229), (1536208819391, 0, 2..."
1,1,2,"[(1536208830934, 1, 229), (1536208830950, 0, 2..."
2,1,3,"[(1536208854834, 1, 229), (1536208854852, 0, 2..."
3,1,4,"[(1536208882492, 1, 229), (1536208882502, 0, 2..."
4,1,5,"[(1536208891467, 1, 229), (1536208891478, 0, 2..."
...,...,...,...
1648877,274176,1811849,"[(1547802187014, 1, 229), (1547802187027, 0, 2..."
1648881,274174,1811853,"[(1547802192975, 1, 229), (1547802192983, 0, 2..."
1648885,274178,1811858,"[(1547802201308, 1, 229), (1547802201317, 0, 2..."
1648886,274169,1811859,"[(1547802206625, 1, 76), (1547802206631, 0, 76..."


In [9]:
for name, group in joined_part_df:
    if len(group) >= 15:
      valid = True
      for i, row in group.iterrows():
          if len(row['SEQUENCE']) < 50:
              valid = False
              break
      if valid:
        group.to_csv(f'{PRC_PATH}/{name}.csv', index=False)

#### Create the Typeformer Benchmark

In [10]:
import json
test_users = json.load(open('typeformer_bench.json'))
# transform keys into ints
test_users = {int(k): list(map(int, v)) for k, v in test_users.items()}

In [11]:
for user, test_id_list in test_users.items():
  curr_group = joined_part_df.get_group(user)
  curr_group = curr_group[curr_group["TEST_SECTION_ID"].isin(test_id_list)]
  curr_group.to_csv(f"{PRC_BENCH}/{user}.csv", index=False)

In [12]:
len(os.listdir(PRC_BENCH))

1000

In [14]:
# all mobile users
all_mobile = os.listdir(PRC_PATH)
bench_mobile = os.listdir(PRC_BENCH)
bench_rest_mobile = list(set(all_mobile) - set(bench_mobile))

# copy the files into PRC_BENCH_REST
for file in bench_rest_mobile:
  os.system(f"cp {PRC_PATH}/{file} {PRC_BENCH_REST}/{file}")

In [15]:
len(os.listdir(PRC_BENCH_REST))

14726

In [16]:
len(os.listdir(PRC_PATH))

14964