# Importing Libraries and Preprocessing Function (add `INDEX`)

In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.1f' % x)
import matplotlib.pyplot as plt

import re
import os

import sys

from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, models

In [14]:
def processing_folder(folder_path, train_size=0.8):
    path = os.path.join(os.getcwd(), folder_path)
    files = sorted(os.listdir(path), key=lambda x: int(re.findall(r'\d+', x)[0]))
    train_samples = []
    test_samples = []
    samples = []
    cols = []
    for i, path in enumerate(files):
        path = os.path.join(os.getcwd(), folder_path, path)
        with open(path) as f:
            lines = f.readlines()
        ## extract column names (once)
        if i == 1:
            ls = lines[0].split('\t')
            if re.findall(r'\w+|\d+', ls[-1]):
                ls[-1] = re.findall(r'\w+|\d+', ls[-1])[0]
                cols = ls
        ## extracting all samples from the current file
        sample = []
        curr_text_id = ''
        curr_index = -1
        for line in lines[1:]:
            ls = line.split('\t')
            if re.findall(r'\w+|\d+', ls[-1]):
                ls[-1] = re.findall(r'\w+|\d+', ls[-1])[0]
                if ls[1] != curr_text_id:
                    curr_index = 0
                    curr_text_id = ls[1]
                else:
                    curr_index += 1
                ls.append(curr_index)
                sample.append(ls)
        ##  split the current data into train-test-sets
        split_index = int(train_size * len(sample))
        train_samples = train_samples + sample[:split_index]
        test_samples = test_samples + sample[split_index:]
        samples = samples + sample
    ## forming dataframes
    df_all = pd.DataFrame(samples)
    df_train = pd.DataFrame(train_samples)
    df_test = pd.DataFrame(test_samples)
    ## renaming columns
    cols = cols + ['INDEX']
    df_all.columns, df_train.columns, df_test.columns = cols, cols, cols
    return df_all, df_train, df_test

# The Dataset

In [33]:
folder_path = "../data/keystroke_sample_old"
data, train_data, test_data = processing_folder(folder_path)

In [20]:
## number of users used in this dataset
print(f"Entire dataset: {len(data['PARTICIPANT_ID'].unique())} users; \
\nTrain dataset: {len(train_data['PARTICIPANT_ID'].unique())} users; \
\nTest dataset: {len(test_data['PARTICIPANT_ID'].unique())} users.")

Entire dataset: 149 users; 
Train dataset: 149 users; 
Test dataset: 149 users.


In [22]:
train_data.head(3)

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,PRESS_TIME,RELEASE_TIME,LETTER,KEYCODE,INDEX
0,5,7,The others raise their eyebrows.,The others raise their eyebrows,204,1471934383592,1471934383760,SHIFT,16,0
1,5,7,The others raise their eyebrows.,The others raise their eyebrows,203,1471934383701,1471934383760,T,84,1
2,5,7,The others raise their eyebrows.,The others raise their eyebrows,205,1471934383838,1471934383910,h,72,2


# Keyboard Layout Encoding (QWERTY)

In [23]:
first_row = [27, 27, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 0, 0, 145, 126, 0, 0, 0, 0, 0]
space = [0] * 23
second_row = [192, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 189, 187, 8, 0, 45, 36, 33, 0, 144, 111, 106, 109]
third_row = [9, 81, 87, 69, 82, 84, 89, 85, 73, 79, 80, 219, 221, 220, 0, 46, 35, 34, 0, 103, 104, 105, 107]
fourth_row = [20, 65, 83, 68, 70, 71, 72, 74, 75, 76, 186, 222, 13, 13, 0, 0, 0, 0, 0, 100, 101, 102, 107]
fifth_row = [16, 16, 90, 88, 67, 86, 66, 78, 77, 188, 190, 191, 16, 16, 0, 0, 38, 0, 0, 97, 98, 99, 13]
sixth_row = [17, 17, 191, 18, 32, 32, 32, 32, 32, 18, 92, 93, 17, 17, 0, 37, 40, 39, 0, 96, 96, 110, 13]
qwerty_keyboard = pd.DataFrame({'1st': first_row,
                                'space': space,
                                '2nd': second_row,
                                '3rd': third_row,
                                '4th': fourth_row,
                                '5th': fifth_row,
                                '6th': sixth_row}).transpose()
qwerty_keyboard.index = list(range(7))
qwerty_keyboard

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,27,27,112,113,114,115,116,117,118,119,...,123,0,0,145,126,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,192,49,50,51,52,53,54,55,56,57,...,8,0,45,36,33,0,144,111,106,109
3,9,81,87,69,82,84,89,85,73,79,...,220,0,46,35,34,0,103,104,105,107
4,20,65,83,68,70,71,72,74,75,76,...,13,0,0,0,0,0,100,101,102,107
5,16,16,90,88,67,86,66,78,77,188,...,16,0,0,38,0,0,97,98,99,13
6,17,17,191,18,32,32,32,32,32,18,...,17,0,37,40,39,0,96,96,110,13


In [24]:
## function turning the pandas dataframe keyboard into dictionary
def keycode_position(keyboard):
    keyboard_dict = {}
    for row in keyboard.index:
        for col, entry in enumerate(keyboard.iloc[row, :]):
            if entry in keyboard_dict:
                keyboard_dict[entry].append([row, col])
            else:
                keyboard_dict[entry] = [[row, col]]
    return keyboard_dict

## function determining the keycode distances (if multiple entry exists, assumes the shortest one)
def keycode_distance(keyboard_pos, keycode1, keycode2):
    def manhattan_dist(arr1, arr2):
        return abs(arr1[0] - arr2[0]) + abs(arr1[1] - arr2[1])
    distance = 30 ## any integer larger than 22+6
    if keycode1 in keyboard_pos and keycode2 in keyboard_pos:
        for arr1 in keyboard_pos[keycode1]:
            for arr2 in keyboard_pos[keycode2]:
                curr_dist = manhattan_dist(arr1, arr2)
                if curr_dist < distance:
                    distance = curr_dist
        if distance < 5:
            return distance
    return 5

## function determining the distance of a list of keys to the home keys (ASDF JKL;)
def home_distance(keyboard_pos, keycode_list):
    '''
    In QWERTY keyboard, F and H are the home keys with keycodes 70 and 74 resp.
    '''
    sum = 0
    for key in keycode_list:
        sum += min([keycode_distance(keyboard_pos, 70, key), keycode_distance(keyboard_pos, 74, key)])
    return sum/len(keycode_list)

In [25]:
qwerty_keyboard_pos = keycode_position(qwerty_keyboard)
keycode_distance(qwerty_keyboard_pos, 85, 117), home_distance(qwerty_keyboard_pos, [72])

(3, 1.0)

# Preprocessing - CNN

In [26]:
def feature_extractor(data, keycode_distance, home_distance, keyboard_pos):
    df = data[['PARTICIPANT_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
    df = df.astype('int64')
    df = df.rename(columns={'PARTICIPANT_ID': 'USER'})

    df['K1'] = df['KEYCODE']
    df['K2'] = pd.concat([df['KEYCODE'][1:], pd.Series([0])], ignore_index=True)
    df['I1'] = df['INDEX']
    df['I2'] = pd.concat([df['INDEX'][1:], pd.Series([0])], ignore_index=True)
    df['HL1'] = df['RELEASE_TIME'] - df['PRESS_TIME']
    df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
    df['HL2'] = pd.concat([df['HL1'][1:], pd.Series([0])], ignore_index=True)

    keycode_dist = []
    home_dist = []
    for row in df.index:
        keycode_dist.append(keycode_distance(keyboard_pos, df['K1'][row], df['K2'][row]))
        home_dist.append(home_distance(keyboard_pos, [df['K1'][row], df['K2'][row]]))
    df['KD'] = keycode_dist
    df['HD'] = home_dist

    df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX'])
    df = df.iloc[:-1, :]
    return df


def generate_keycode_dict(top=42, add_UNK=True):
    keycode_dict = {keycode: i for i, keycode in enumerate(data['KEYCODE'].astype('int32').value_counts()[:top].to_dict().keys())}
    if add_UNK:
        keycode_dict[0] = len(keycode_dict)
    return keycode_dict


def single_kdi_image(curr_chunk, mat_length, keycode_dict):
    mat_kd = np.zeros((mat_length, mat_length))
    mat_hd = np.zeros((mat_length, mat_length))
    mat_index = np.zeros((mat_length, mat_length))
    mat_hl1 = np.zeros((mat_length, mat_length))
    mat_il = np.zeros((mat_length, mat_length))
    mat_hl2 = np.zeros((mat_length, mat_length))
    count = np.zeros((mat_length, mat_length))

    ## form input image
    for row in curr_chunk.index[:-1]:
        i = curr_chunk['K1'][row]
        j = curr_chunk['K2'][row]
        if i in keycode_dict:
            i = keycode_dict[i]
        else:
            i = keycode_dict[0]
        if j in keycode_dict:
            j = keycode_dict[j]
        else:
            j = keycode_dict[0]

        mat_kd[i, j] += curr_chunk['KD'][row]
        mat_hd[i, j] += curr_chunk['HD'][row]
        mat_index[i, j] += curr_chunk['I1'][row]
        mat_hl1[i, j] += curr_chunk['HL1'][row]
        mat_il[i, j] += curr_chunk['IL'][row]
        mat_hl2[i, j] += curr_chunk['HL2'][row]
        count[i, j] += 1
    mat_kd = np.divide(mat_kd, count, out=np.zeros_like(mat_kd), where=count!=0)
    mat_hd = np.divide(mat_hd, count, out=np.zeros_like(mat_hd), where=count!=0)
    mat_index = np.divide(mat_index, count, out=np.zeros_like(mat_index), where=count!=0)
    mat_hl1 = np.divide(mat_hl1, count, out=np.zeros_like(mat_hl1), where=count!=0)
    mat_il = np.divide(mat_il, count, out=np.zeros_like(mat_il), where=count!=0)
    mat_hl2 = np.divide(mat_hl2, count, out=np.zeros_like(mat_hl2), where=count!=0)

    kdi_image = np.stack([mat_kd, mat_hd, mat_index, mat_hl1, mat_il, mat_hl2], axis=-1)

    ## form output vector
    row = curr_chunk.index[-1]
    b1_a1 = curr_chunk['IL'][row]
    b2_b1 = curr_chunk['HL2'][row]
    
    return kdi_image, np.array([b1_a1, b2_b1])


def generate_kdi_images(data, mat_length, window, shift):
    keycode_dict = generate_keycode_dict(top=mat_length-1)    ## -1 to offset the 0:mat_length in dict
    window_length = window + 1
    input_arr = []
    output_arr = []
    for user in data['USER'].unique():
        curr_ds = data[data['USER'] == user]
        i = 0
        while i + window_length < len(curr_ds.index):
            curr_chunk = curr_ds.loc[curr_ds.index[i:i+window_length]]
            curr_image, curr_output = single_kdi_image(curr_chunk, mat_length, keycode_dict)
            input_arr.append(curr_image)
            output_arr.append(curr_output)
            i = i + shift
        if i < len(curr_ds.index) - 1:
            curr_chunk = curr_ds.loc[curr_ds.index[i:]]
            curr_image, curr_output = single_kdi_image(curr_chunk, mat_length, keycode_dict)
            input_arr.append(curr_image)
            output_arr.append(curr_output)
    return np.stack(input_arr, axis=0), np.stack(output_arr, axis=0)

# Generate `tf.data.Dataset` Object for Training

In [27]:
mat_length = 40
window = 30
shift = 5
batch_size = 128

train_df = feature_extractor(train_data, keycode_distance, home_distance, qwerty_keyboard_pos)
test_df = feature_extractor(test_data, keycode_distance, home_distance, qwerty_keyboard_pos)

train_kdi, train_output = generate_kdi_images(train_df, mat_length=mat_length, window=window, shift=shift)
test_kdi, test_output = generate_kdi_images(test_df, mat_length=mat_length, window=window, shift=shift)

train_kdi_ds = tf.data.Dataset.from_tensor_slices(train_kdi).batch(batch_size)
train_output_ds = tf.data.Dataset.from_tensor_slices(train_output).batch(batch_size)

test_kdi_ds = tf.data.Dataset.from_tensor_slices(test_kdi).batch(batch_size)
test_output_ds = tf.data.Dataset.from_tensor_slices(test_output).batch(batch_size)

trainset = tf.data.Dataset.zip((train_kdi_ds, train_output_ds))
testset = tf.data.Dataset.zip((test_kdi_ds, test_output_ds))

2022-12-11 15:37:59.281568: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-11 15:37:59.281950: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Pro


In [28]:
for a in test_kdi_ds.take(1):
    tensor = a
for a in test_output_ds.take(1):
    output = a
tensor.shape, output.shape

(TensorShape([128, 40, 40, 6]), TensorShape([128, 2]))

# Training: CNN Model

In [29]:
## CNN model structure
model = models.Sequential()
model.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(mat_length, mat_length, 6)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(2, activation='tanh'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 38, 38, 64)        3520      
                                                                 
 conv2d_1 (Conv2D)           (None, 36, 36, 64)        36928     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 18, 18, 64)       0         
 )                                                               
                                                                 
 conv2d_2 (Conv2D)           (None, 16, 16, 128)       73856     
                                                                 
 conv2d_3 (Conv2D)           (None, 14, 14, 128)       147584    
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 7, 7, 128)        0         
 2D)                                                    

In [30]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss='mse', metrics=['mse'])
EPOCH = 5
history = model.fit(trainset, epochs=EPOCH, validation_data=testset)

Epoch 1/5


2022-12-11 15:38:56.193454: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-11 15:38:56.194104: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-12-11 15:39:05.602139: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss='mae', metrics=['mae'])
EPOCH = 5
history = model.fit(trainset, epochs=EPOCH, validation_data=testset)

Epoch 1/5


2022-12-11 15:39:35.822281: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-12-11 15:39:41.684231: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
train_df.describe()

Unnamed: 0,USER,K1,K2,I1,I2,HL1,IL,HL2,KD,HD
count,88037.0,88037.0,88037.0,88037.0,88037.0,88037.0,88037.0,88037.0,88037.0,88037.0
mean,258.6,66.3,66.3,27.3,27.3,118.6,-30.4,118.6,3.4,2.2
std,143.2,32.2,32.2,19.4,19.4,266.6,827577.3,266.6,1.5,0.9
min,5.0,8.0,8.0,0.0,0.0,-63.0,-86613766.0,-63.0,0.0,0.0
25%,124.0,59.0,59.0,12.0,12.0,80.0,11.0,80.0,2.0,1.5
50%,274.0,72.0,72.0,24.0,24.0,104.0,80.0,104.0,4.0,2.0
75%,382.0,80.0,80.0,40.0,40.0,133.0,187.0,133.0,5.0,2.5
max,500.0,226.0,226.0,135.0,135.0,44252.0,86190442.0,44252.0,5.0,5.0


# Conclusion

Using MSE loss function exagerated the effect from outliers; changing to MAE, the loss looks much more reasonable. However, the training loss and validation loss is not improving at all, it might be because of the large standard deviation in the `IL` time latency feature.

NEXT: 
* Try different model
* Try looking into discarding outliers