In [None]:
!nvidia-smi

Sat Dec  3 16:08:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   27C    P0    49W / 400W |   1418MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Importing Libraries --> `train_data`, `test_data`

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import os
import datetime

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, models

Mounted at /content/drive/


In [None]:
def processing_folder(folder_path, train_size=0.8):
    """Read every keystroke log in ``folder_path`` and build all/train/test frames.

    Each file is a tab-separated table whose first line is the header. Files
    are visited in ascending order of the first number found in their name.
    For every data row the last field is reduced to its first ``\\w+`` token
    (which strips the trailing newline), and an ``INDEX`` value is appended:
    the 0-based position of the row inside the current run of identical values
    in the second column.

    Parameters
    ----------
    folder_path : str
        Directory holding the raw files.
    train_size : float, default 0.8
        Fraction of each file's rows (taken from the top) that goes to the
        train frame; the remainder goes to the test frame.

    Returns
    -------
    (df_all, df_train, df_test) : tuple of pandas.DataFrame
        All columns hold the raw strings read from disk, except ``INDEX``.

    Raises
    ------
    ValueError
        If no usable header line is found in any file.
    """
    # Kept for backward compatibility with the original implementation, which
    # switched the working directory; files are opened by full path below.
    os.chdir(folder_path)

    # Keep only regular files whose name contains a number; anything else
    # (sub-folders, hidden files) used to crash the numeric sort key.
    numbered = []
    for name in os.listdir(folder_path):
        digits = re.findall(r'\d+', name)
        if digits and os.path.isfile(os.path.join(folder_path, name)):
            numbered.append((int(digits[0]), name))
    files = [name for _, name in sorted(numbered, key=lambda item: item[0])]

    train_samples = []
    test_samples = []
    samples = []
    cols = []
    for name in files:
        with open(os.path.join(folder_path, name), encoding='utf-8',  ##https://stackoverflow.com/questions/12468179/unicodedecodeerror-utf8-codec-cant-decode-byte-0x9c
                  errors='ignore') as f:
            lines = f.readlines()
        if not lines:
            continue

        ## extract column names (once): take them from the first file that has
        ## a usable header (previously only the *second* file was inspected,
        ## so a single-file folder never got column names)
        if not cols:
            header = lines[0].split('\t')
            last_token = re.findall(r'\w+|\d+', header[-1])
            if last_token:
                header[-1] = last_token[0]
                cols = header

        ## extracting all samples from the current file
        sample = []
        curr_text_id = ''
        curr_index = -1
        for line in lines[1:]:
            ls = line.split('\t')
            last_token = re.findall(r'\w+|\d+', ls[-1])
            # Skip blank/garbled rows: no token in the last field, or no
            # second column to read the text id from.
            if not last_token or len(ls) < 2:
                continue
            ls[-1] = last_token[0]
            if ls[1] != curr_text_id:
                curr_index = 0
                curr_text_id = ls[1]
            else:
                curr_index += 1
            ls.append(curr_index)
            sample.append(ls)

        ##  split the current data into train-test-sets
        split_index = int(train_size * len(sample))
        train_samples.extend(sample[:split_index])
        test_samples.extend(sample[split_index:])
        samples.extend(sample)

    if not cols:
        raise ValueError(f"No header line could be read from any file in {folder_path!r}")

    ## forming dataframes (columns passed up front so empty splits stay valid)
    cols = cols + ['INDEX']
    df_all = pd.DataFrame(samples, columns=cols)
    df_train = pd.DataFrame(train_samples, columns=cols)
    df_test = pd.DataFrame(test_samples, columns=cols)
    return df_all, df_train, df_test

In [None]:
## Dataset selection. NOTE: the three `folder_path` assignments below overwrite
## each other, so only the LAST one ("train-dev-test") is actually passed to
## processing_folder(); comment out the ones you do not want.
## --> 7001
folder_path = "/content/drive/MyDrive/COMP576/keystroke-samples"
## baby sample
folder_path = "/content/drive/MyDrive/COMP576/raw-keystroke"
## mini sample
folder_path = "/content/drive/MyDrive/COMP576/train-dev-test"
## Returns the full frame plus the per-file 80/20 (default) train/test split.
data, train_data, test_data = processing_folder(folder_path)

> Baby sample takes about 10 seconds
> Large sample takes about 5 minutes (30X)

In [None]:
data.head(3)

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,PRESS_TIME,RELEASE_TIME,LETTER,KEYCODE,INDEX
0,2004,20212,I think that will help.,I think that will help,964481,1471960160759,1471960161309,SHIFT,16,0
1,2004,20212,I think that will help.,I think that will help,964485,1471960161197,1471960161324,I,73,1
2,2004,20212,I think that will help.,I think that will help,964490,1471960161356,1471960161479,,32,2


In [None]:
## NOTE(review): `feature_extractor` is only defined in later cells of this
## notebook (two different versions), so on a fresh kernel this cell raises
## NameError unless one of those cells has been run first.
df = feature_extractor(data)
## Cast every derived column to integers.
df = df.astype(int)

In [None]:
df['IL'].std()

830783.5129967456

In [None]:
## Sanity check: distinct participants in each split and the total number of
## keystroke rows in the full frame.
print(f"There are in total {len(train_data['PARTICIPANT_ID'].unique())} many users contained in the train dataset")
print(f"There are in total {len(test_data['PARTICIPANT_ID'].unique())} many users contained in the test dataset")
print(f"There are in total {len(data['PARTICIPANT_ID'].unique())} many users contained in the entire dataset")
print(f"There are in total {len(data)} many keystrokes contained in the dataset")

There are in total 360 many users contained in the train dataset
There are in total 360 many users contained in the test dataset
There are in total 360 many users contained in the entire dataset
There are in total 263807 many keystrokes contained in the dataset


> Baby sample contains 360 users with 263807 keystrokes

> Main sample contains 2329 users with 1701206 keystrokes



# Feature Engineering (unit token: 2 keycodes) --> `train_df`, `test_df`, `data_df`

> Expensive preprocessing step: for baby sample $\approx 500$ users, takes 5-6 min 

In [None]:
def feature_extractor(data, keyboard=None, user_int=True, keycode_int=True):
    """Turn raw keystroke rows into features for pairs of consecutive keys.

    Row ``t`` of the result describes the pair (keystroke ``t``, keystroke
    ``t+1``):

    * ``USER``      - participant id of keystroke ``t``
    * ``K1``/``K2`` - keycodes of the first / second key
    * ``I1``/``I2`` - ``INDEX`` of the first / second key
    * ``HL1``/``HL2`` - hold latency (release - press) of each key
    * ``IL``        - inter-key latency (press of key 2 - release of key 1)
    * ``KD``/``HD`` - only when ``keyboard`` is given (see below)

    The last input row has no successor and is dropped.

    NOTE: pairs are formed between consecutive rows of ``data`` regardless of
    participant or sentence, so the last key of one sentence is paired with
    the first key of the next one.

    NOTE(review): a later cell of this notebook defines another function with
    the same name (single-key features) which shadows this one once executed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain PARTICIPANT_ID, PRESS_TIME, RELEASE_TIME, KEYCODE and
        INDEX; values must be castable to float.
    keyboard : dict, optional
        Mapping with keys ``'pos'``, ``'keycode'`` and ``'home'``. Called as
        ``keyboard['keycode'](pos, k1, k2)`` and
        ``keyboard['home'](pos, [k1, k2])`` for every pair.
    user_int, keycode_int : bool
        Cast USER / K1, K2 to int64.

    Returns
    -------
    pandas.DataFrame
    """
    df = data[['PARTICIPANT_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']].astype('float64')
    if user_int:
        df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
    df = df.rename(columns={'PARTICIPANT_ID': 'USER'})

    def _next(series):
        # Value of the following row, 0 for the last row. shift() keeps the
        # frame's own index, so this also works when `data` does not carry a
        # default 0..n-1 index (concat(..., ignore_index=True) did not).
        return series.shift(-1, fill_value=0)

    df['K1'] = df['KEYCODE']
    if keycode_int:
        df['K1'] = df['K1'].astype('int64')
    df['K2'] = _next(df['KEYCODE'])
    if keycode_int:
        df['K2'] = df['K2'].astype('int64')
    df['I1'] = df['INDEX']
    df['I2'] = _next(df['INDEX'])
    df['HL1'] = df['RELEASE_TIME'] - df['PRESS_TIME']
    df['IL'] = _next(df['PRESS_TIME']) - df['RELEASE_TIME']
    df['HL2'] = _next(df['HL1'])

    if keyboard:
        pos = keyboard['pos']
        keycode_dist = []
        home_dist = []
        # Iterate positionally instead of via label look-ups on df.index.
        for k1, k2 in zip(df['K1'], df['K2']):
            keycode_dist.append(keyboard['keycode'](pos, k1, k2))
            home_dist.append(keyboard['home'](pos, [k1, k2]))
        df['KD'] = keycode_dist
        df['HD'] = home_dist

    df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX'])
    # The last row was paired with the 0 filler, not with a real keystroke.
    df = df.iloc[:-1, :]
    return df

def extract_avg_pair(df, drop_origin=True, rename_avg=True, round_avg=True):
    """Replace per-row latencies by their mean over identical (K1, K2) pairs.

    For every distinct key pair the mean of ``HL1``, ``IL`` and ``HL2`` over
    all rows with that pair is computed and written to ``HL1_avg``,
    ``IL_avg`` and ``HL2_avg``.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain K1, K2, HL1, IL and HL2.
    drop_origin : bool
        Drop the per-row HL1 / IL / HL2 columns and the helper ``K1_K2``
        tuple column from the result.
    rename_avg : bool
        Only used together with ``drop_origin``: rename the ``*_avg``
        columns back to HL1 / IL / HL2.
    round_avg : bool
        Round the averages to whole numbers.

    Returns
    -------
    pandas.DataFrame
    """
    timing_cols = ['HL1', 'IL', 'HL2']

    # Work on a copy so the caller's frame does not silently gain columns.
    out = df.copy()
    out['K1_K2'] = list(zip(out['K1'], out['K2']))

    # One grouped pass instead of one boolean mask per unique pair.
    pair_means = out.groupby(['K1', 'K2'], sort=False)[timing_cols].transform('mean')
    if round_avg:
        pair_means = pair_means.round()
    for col in timing_cols:
        out[col + '_avg'] = pair_means[col]

    if drop_origin:
        out = out.drop(columns=['HL1', 'IL', 'HL2', 'K1_K2'])
    if drop_origin and rename_avg:
        out = out.rename(columns={'HL1_avg':'HL1', 'IL_avg':'IL', 'HL2_avg':'HL2'})
    return out

def general_preprocess_pair(data, return_avg=False):
    """Build key-pair features from raw keystrokes.

    Runs ``feature_extractor`` on ``data``; when ``return_avg`` is true the
    result is additionally passed through ``extract_avg_pair`` (with its
    default arguments) before being returned.
    """
    pair_features = feature_extractor(data)
    return extract_avg_pair(pair_features) if return_avg else pair_features

In [None]:
## Key-pair features for each split. `return_avg` is left at its default
## (False), so the frames carry the per-row HL1 / IL / HL2 columns.
train_df = general_preprocess_pair(train_data)
test_df = general_preprocess_pair(test_data)
# data_df = general_preprocess_pair(data)

# Structure ONE

## Structure ONE: Hyperparameters

In [None]:
## Hyperparameters
n_steps = 30
window_length = n_steps
shift = 1
batch_size = 128
unique_keycode = 82

onehot_encoder = OneHotEncoder().fit(pd.concat([train_df[['K1', 'K2']], test_df[['K1', 'K2']]]).astype(str))
# onehot_encoder.transform(train_df[['K1', 'K2']].astype(str)).toarray().shape

unit_time_depth = unique_keycode * 2 + 4    ## 4 is for ['I1', 'I2', 'HL1_avg', 'HL2_avg']

EPOCH = 10

In [None]:
onehot_encoder.transform(train_df[['K1', 'K2']].astype(str)).toarray().shape

(210898, 164)

## Structure ONE: Preparation --> `train_ds`, `test_ds`

> shape: 
* Input: (`batch_size`, `window_length`, `num_features`), e.g. (128, 30, 168)
* Output: (`batch_size`, `window_length`), e.g. (128, 30)

In [None]:
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def structure_one(preprocessed_data, encoder):
    """Build the sliding-window ``tf.data`` pipeline for Structure ONE.

    Each row becomes ``[one-hot(K1, K2), I1, I2, HL1, IL, HL2]``. Rows are
    cut into windows of ``window_length`` steps (stride ``shift``), batched by
    ``batch_size`` and split into ``(inputs, targets)`` where the target is
    the LAST feature column and the inputs are all the others.

    The latency columns may be named either ``HL1_avg`` / ``IL_avg`` /
    ``HL2_avg`` or plain ``HL1`` / ``IL`` / ``HL2``. The plain names are what
    ``general_preprocess_pair`` returns with its default arguments; the
    original code only accepted the ``*_avg`` names and raised ``KeyError``
    on that output.

    NOTE(review): the last column is HL2, so the model is trained to predict
    HL2 with IL as an input, whereas the hyperparameter cell lists the four
    numeric inputs as I1, I2, HL1_avg, HL2_avg (which implies IL as target).
    Confirm which target is intended.

    Relies on the notebook-level ``window_length``, ``shift`` and
    ``batch_size``.

    Parameters
    ----------
    preprocessed_data : pandas.DataFrame
        Key-pair features.
    encoder : fitted one-hot encoder
        Must provide ``transform(...)`` returning a sparse matrix for the
        string-cast K1 / K2 columns.

    Returns
    -------
    tf.data.Dataset of ``(inputs, targets)`` batches.
    """
    numeric_cols = ['I1', 'I2', 'HL1_avg', 'IL_avg', 'HL2_avg']
    if not set(numeric_cols).issubset(preprocessed_data.columns):
        # Fall back to the un-suffixed names (same column order).
        numeric_cols = ['I1', 'I2', 'HL1', 'IL', 'HL2']

    onehot = encoder.transform(preprocessed_data[['K1', 'K2']].astype(str)).toarray()
    features = np.concatenate([onehot, preprocessed_data[numeric_cols].to_numpy()], axis=1)

    dataset = tf.data.Dataset.from_tensor_slices(features).window(size=window_length, shift=shift, drop_remainder=True)
    # window() yields nested datasets; batch each one into a single tensor.
    dataset = dataset.flat_map(lambda window: window.batch(window_length)).batch(batch_size)
    # inputs: every column but the last; target: the last column, per step.
    dataset = dataset.map(lambda windows: (windows[:, :, :-1], windows[:, :, -1]))

    return dataset

In [None]:
train_ds = structure_one(train_df, onehot_encoder)
test_ds = structure_one(test_df, onehot_encoder)

In [None]:
for a, b in train_ds.take(1):
    print(a.shape, b.shape)

(128, 30, 168) (128, 30)


## Structure ONE: Model

In [None]:
## Baseline sequence-to-sequence regressor: two stacked GRUs that return the
## full sequence, followed by a per-time-step Dense(1) output.
model_ONE_base = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, unit_time_depth], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(1))
])
model_ONE_base.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_2 (GRU)                 (None, None, 128)         114432    
                                                                 
 gru_3 (GRU)                 (None, None, 128)         99072     
                                                                 
 time_distributed_1 (TimeDis  (None, None, 1)          129       
 tributed)                                                       
                                                                 
Total params: 213,633
Trainable params: 213,633
Non-trainable params: 0
_________________________________________________________________


> overfitting: test error significantly higher than train error.

In [None]:
## Variant 1: baseline architecture plus an L2(0.01) activity regularizer on
## the output layer.
## NOTE(review): `activity_regularizer` penalises the layer's OUTPUT (here the
## predictions themselves), not its weights -- confirm `kernel_regularizer`
## was not the intended argument.
model_ONE_1 = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, unit_time_depth], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(1, activity_regularizer=tf.keras.regularizers.L2(0.01)))
])
model_ONE_1.summary()



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_4 (GRU)                 (None, None, 128)         114432    
                                                                 
 gru_5 (GRU)                 (None, None, 128)         99072     
                                                                 
 time_distributed_2 (TimeDis  (None, None, 1)          129       
 tributed)                                                       
                                                                 
Total params: 213,633
Trainable params: 213,633
Non-trainable params: 0
_________________________________________________________________


> Fixed overfitting, but training is too slow: we need to either (1) reduce the regularization strength (`model_ONE_2`) or (2) add more layers (`model_ONE_3`).

In [None]:
## Variant 2: same as variant 1 but with a 10x weaker activity regularizer
## (L2 factor 0.001 instead of 0.01) on the output layer.
model_ONE_2 = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, unit_time_depth], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(1, activity_regularizer=tf.keras.regularizers.L2(0.001)))
])
model_ONE_2.summary()



Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_6 (GRU)                 (None, None, 128)         114432    
                                                                 
 gru_7 (GRU)                 (None, None, 128)         99072     
                                                                 
 time_distributed_3 (TimeDis  (None, None, 1)          129       
 tributed)                                                       
                                                                 
Total params: 213,633
Trainable params: 213,633
Non-trainable params: 0
_________________________________________________________________


In [None]:
## Variant 3: baseline GRUs plus an extra 32-unit per-time-step hidden layer
## (activity-regularized) in front of the Dense(1) output.
model_ONE_3 = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, unit_time_depth], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    # ReLU added: Dense defaults to a linear activation, so Dense(32) -> Dense(1)
    # without a non-linearity collapses into a single linear map and adds no
    # modelling capacity. The parameter count is unchanged.
    keras.layers.TimeDistributed(keras.layers.Dense(32, activation='relu', activity_regularizer=tf.keras.regularizers.L2(0.001))),
    keras.layers.TimeDistributed(keras.layers.Dense(1))
])
model_ONE_3.summary()



Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_8 (GRU)                 (None, None, 128)         114432    
                                                                 
 gru_9 (GRU)                 (None, None, 128)         99072     
                                                                 
 time_distributed_4 (TimeDis  (None, None, 32)         4128      
 tributed)                                                       
                                                                 
 time_distributed_5 (TimeDis  (None, None, 1)          33        
 tributed)                                                       
                                                                 
Total params: 217,665
Trainable params: 217,665
Non-trainable params: 0
_________________________________________________________________


In [None]:
## Variant 4: baseline GRUs plus two per-time-step hidden layers (64 and 32
## units, each with an L2(0.01) activity regularizer) before the Dense(1) output.
model_ONE_4 = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, unit_time_depth], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    # ReLU added to both hidden layers: Dense defaults to a linear activation,
    # so stacking them without a non-linearity is equivalent to one linear
    # layer. The parameter count is unchanged.
    keras.layers.TimeDistributed(keras.layers.Dense(64, activation='relu', activity_regularizer=tf.keras.regularizers.L2(0.01))),
    keras.layers.TimeDistributed(keras.layers.Dense(32, activation='relu', activity_regularizer=tf.keras.regularizers.L2(0.01))),
    keras.layers.TimeDistributed(keras.layers.Dense(1))
])
model_ONE_4.summary()



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_10 (GRU)                (None, None, 128)         114432    
                                                                 
 gru_11 (GRU)                (None, None, 128)         99072     
                                                                 
 time_distributed_6 (TimeDis  (None, None, 64)         8256      
 tributed)                                                       
                                                                 
 time_distributed_7 (TimeDis  (None, None, 32)         2080      
 tributed)                                                       
                                                                 
 time_distributed_8 (TimeDis  (None, None, 1)          33        
 tributed)                                                       
                                                      

### Callbacks, Metrics

In [None]:
def structure_one_metric(loss='poisson'):
    """Return a Keras metric that scores only the LAST time step of each window.

    ``loss`` selects the error function: ``'mse'`` -> ``mse_metric``,
    ``'mae'`` -> ``mae_metric``; any other value (including the default
    ``'poisson'``) -> ``poisson_metric``. The name of the returned function
    is what Keras shows in the training logs.
    """
    def mse_metric(Y_true, Y_pred):
        # [:, -1] keeps the final step of every sequence in the batch.
        return keras.metrics.mean_squared_error(Y_true[:, -1], Y_pred[:, -1])

    def mae_metric(Y_true, Y_pred):
        return keras.metrics.mean_absolute_error(Y_true[:, -1], Y_pred[:, -1])

    def poisson_metric(Y_true, Y_pred):
        return tf.keras.metrics.poisson(Y_true[:, -1], Y_pred[:, -1])

    if loss == 'mse':
        return mse_metric
    if loss == 'mae':
        return mae_metric
    return poisson_metric

In [None]:
## functionalize callbacks

def create_checkpoint_callback(experiment_name,
                               save_weights_only=True,
                               monitor='val_loss',
                               mode='min',
                               save_best_only=True):
    """Create a ``ModelCheckpoint`` callback writing to a timestamped path.

    The target is
    ``<training-logs>/checkpoints/<experiment_name>/<YYYYmmdd-HHMMSS>``; the
    remaining arguments are forwarded unchanged to
    ``tf.keras.callbacks.ModelCheckpoint``. The chosen path is printed.
    """
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    checkpoint_filepath = "/".join([
        '/content/drive/MyDrive/COMP576/training-logs',
        "checkpoints",
        experiment_name,
        run_stamp,
    ])
    callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=save_weights_only,
        monitor=monitor,
        mode=mode,
        save_best_only=save_best_only,
    )
    print(f"Saving Model Checkpoint files to :{checkpoint_filepath}")
    return callback

def create_tensorboard_callback(experiment_name):
    """Create a ``TensorBoard`` callback logging to a timestamped directory.

    Logs go to
    ``<training-logs>/tensorboard/<experiment_name>/<YYYYmmdd-HHMMSS>``; the
    chosen directory is printed.
    """
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = "/".join([
        '/content/drive/MyDrive/COMP576/training-logs',
        "tensorboard",
        experiment_name,
        run_stamp,
    ])
    callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    print(f"Saving TensorBoard log files to :{log_dir}")
    return callback

def create_earlystopping_callback(monitor='val_loss', patience=5):
    """Create an ``EarlyStopping`` callback on ``monitor`` with ``patience`` epochs."""
    stopper = tf.keras.callbacks.EarlyStopping(monitor=monitor, patience=patience)
    return stopper

def get_callbacks(experiment_name):
    """Return ``[early stopping, model checkpoint, TensorBoard]`` callbacks.

    The checkpoint and TensorBoard callbacks are created for
    ``experiment_name``; early stopping uses its default settings.
    """
    callbacks = [create_earlystopping_callback()]
    callbacks.append(create_checkpoint_callback(experiment_name=experiment_name))
    callbacks.append(create_tensorboard_callback(experiment_name=experiment_name))
    return callbacks

### LOSS + Metric: `mse`

In [None]:
## Train the baseline with MSE loss; the extra metric reports MSE on the last
## time step of each window only.
model_ONE_base.compile(optimizer='adam',
              loss='mse',
              metrics=[structure_one_metric('mse')])

## NOTE(review): `test_ds` is used as validation data, so early stopping and
## best-checkpoint selection (both monitoring val_loss) are driven by the test
## split -- the reported test score is therefore not an unbiased estimate.
history = model_ONE_base.fit(train_ds, epochs=10, 
                              validation_data=test_ds,
                              callbacks=get_callbacks('ONE_base'))

Saving Model Checkpoint files to :/content/drive/MyDrive/COMP576/training-logs/checkpoints/ONE_base/20221203-033649
Saving TensorBoard log files to :/content/drive/MyDrive/COMP576/training-logs/tensorboard/ONE_base/20221203-033649
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model_ONE_base.evaluate(test_ds)



[21468.765625, 21424.38671875]

In [None]:
for a, b in test_ds.take(1):
    X, y_true = a, b
model_ONE_base.predict(X), y_true

In [None]:
## Reload saved weights and re-evaluate on the test split.
## NOTE(review): create_checkpoint_callback() writes to
## `<training-logs>/checkpoints/<experiment>/<timestamp>`, whereas this loads
## `<training-logs>/checkpoint` -- confirm this path really points at the
## checkpoint of the run above.
model_ONE_base.load_weights('/content/drive/MyDrive/COMP576/training-logs/checkpoint')
model_ONE_base.evaluate(test_ds)



[21468.765625, 21424.38671875]

### LOSS + Metric: `mae`

In [None]:
## Switch the baseline to MAE loss with the matching last-step MAE metric.
## NOTE(review): this re-compiles the SAME `model_ONE_base` object that was
## trained with MSE above; the model is not re-created here, so training
## presumably continues from those weights rather than from scratch -- confirm
## this is intended.
model_ONE_base.compile(optimizer='adam',
              loss='mae',
              metrics=[structure_one_metric('mae')])

history = model_ONE_base.fit(train_ds, epochs=10, 
                    validation_data=test_ds,
                    callbacks=get_callbacks('ONE_base_mae'))

Saving Model Checkpoint files to :/content/drive/MyDrive/COMP576/training-logs/checkpoints/ONE_base_mae/20221203-042738
Saving TensorBoard log files to :/content/drive/MyDrive/COMP576/training-logs/tensorboard/ONE_base_mae/20221203-042738
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 223/1648 [===>..........................] - ETA: 4:18 - loss: 5.1092 - mae_metric: 5.1259

In [None]:
## Train variant 1 (output activity regularizer 0.01) with MAE loss and the
## last-step MAE metric; logs/checkpoints go under experiment 'ONE_1_mae'.
model_ONE_1.compile(optimizer='adam',
              loss='mae',
              metrics=[structure_one_metric('mae')])

history1 = model_ONE_1.fit(train_ds, epochs=10, 
                      validation_data=test_ds,
                      callbacks=get_callbacks('ONE_1_mae'))

Saving Model Checkpoint files to :/content/drive/MyDrive/COMP576/training-logs/checkpoints/ONE_1_mae/20221203-051829
Saving TensorBoard log files to :/content/drive/MyDrive/COMP576/training-logs/tensorboard/ONE_1_mae/20221203-051829
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
## Train variant 2 (output activity regularizer 0.001) with MAE loss and the
## last-step MAE metric; logs/checkpoints go under experiment 'ONE_2_mae'.
## NOTE(review): the recorded output of this cell shows "Epoch 1/3", which does
## not match `epochs=10` below -- the output looks stale.
model_ONE_2.compile(optimizer='adam',
              loss='mae',
              metrics=[structure_one_metric('mae')])

history2 = model_ONE_2.fit(train_ds, epochs=10, 
                      validation_data=test_ds,
                      callbacks=get_callbacks('ONE_2_mae'))

Saving Model Checkpoint files to :/content/drive/MyDrive/COMP576/training-logs/checkpoints/ONE_2_mae/20221203-060925
Saving TensorBoard log files to :/content/drive/MyDrive/COMP576/training-logs/tensorboard/ONE_2_mae/20221203-060925
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
## Train variant 3 (extra 32-unit hidden layer) with MAE loss and the
## last-step MAE metric; logs/checkpoints go under experiment 'ONE_3_mae'.
## NOTE(review): the recorded output of this cell shows "Epoch 1/3", which does
## not match `epochs=10` below -- the output looks stale.
model_ONE_3.compile(optimizer='adam',
              loss='mae',
              metrics=[structure_one_metric('mae')])

history3 = model_ONE_3.fit(train_ds, epochs=10, 
                      validation_data=test_ds,
                      callbacks=get_callbacks('ONE_3_mae'))

Saving Model Checkpoint files to :/content/drive/MyDrive/COMP576/training-logs/checkpoints/ONE_3_mae/20221203-062451
Saving TensorBoard log files to :/content/drive/MyDrive/COMP576/training-logs/tensorboard/ONE_3_mae/20221203-062451
Epoch 1/3
Epoch 2/3
Epoch 3/3
 130/1648 [=>............................] - ETA: 4:37 - loss: 12.7055 - mae_metric: 11.9840

In [None]:
## Train variant 4 (64- and 32-unit hidden layers) with MAE loss and the
## last-step MAE metric; logs/checkpoints go under experiment 'ONE_4_mae'.
model_ONE_4.compile(optimizer='adam',
              loss='mae',
              metrics=[structure_one_metric('mae')])

history4 = model_ONE_4.fit(train_ds, epochs=10, 
                      validation_data=test_ds,
                      callbacks=get_callbacks('ONE_4_mae'))

Saving Model Checkpoint files to :/content/drive/MyDrive/COMP576/training-logs/checkpoints/ONE_4_mae/20221203-064022
Saving TensorBoard log files to :/content/drive/MyDrive/COMP576/training-logs/tensorboard/ONE_4_mae/20221203-064022
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

### LOSS + Metric: Huber

In [None]:
## Train the baseline with a Huber loss (delta=10) and report plain MAE over
## all time steps (the built-in 'mae', not the last-step custom metric).
## NOTE(review): `model_ONE_base` has already been fitted in earlier cells and
## is not re-created here, so this run presumably starts from those weights.
## `metrics` is passed as a bare string here but as a list in the other cells.
model_ONE_base.compile(optimizer='adam',
              loss=tf.keras.losses.Huber(delta=10),
              metrics='mae')

history = model_ONE_base.fit(train_ds, epochs=10, 
                    validation_data=test_ds,
                    callbacks=get_callbacks('ONE_base_huber'))

Saving Model Checkpoint files to :/content/drive/MyDrive/COMP576/training-logs/checkpoints/ONE_base_huber/20221203-073147
Saving TensorBoard log files to :/content/drive/MyDrive/COMP576/training-logs/tensorboard/ONE_base_huber/20221203-073147
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


# Feature Engineering (unit token: 1 keycode) --> `train_df`, `test_df`

In [None]:
def feature_extractor(data, keyboard=None, user_int=True, keycode_int=True, drop_user=True):
    """Turn raw keystroke rows into per-keystroke timing features.

    NOTE: this redefines (and from here on shadows) the key-pair
    ``feature_extractor`` defined earlier in the notebook.

    Row ``t`` of the result holds, for keystroke ``t``:

    * ``USER``    - participant id (only when ``drop_user`` is False)
    * ``KEYCODE`` / ``INDEX`` - copied from the input
    * ``HL`` - hold latency:    release[t]   - press[t]
    * ``IL`` - inter-key:       press[t+1]   - release[t]
    * ``PL`` - press latency:   press[t+1]   - press[t]
    * ``RL`` - release latency: release[t+1] - release[t]
    * ``KD`` / ``HD`` - only when ``keyboard`` is given (see below)

    The last input row has no successor and is dropped. Successors are taken
    from the next row regardless of participant or sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain PARTICIPANT_ID, PRESS_TIME, RELEASE_TIME, KEYCODE and
        INDEX; values must be castable to float.
    keyboard : dict, optional
        Mapping with keys ``'pos'``, ``'keycode'`` and ``'home'``. Called as
        ``keyboard['keycode'](pos, k1, k2)`` and
        ``keyboard['home'](pos, [k1, k2])`` where ``k1`` is the row's keycode
        and ``k2`` the next row's keycode. (The original body read
        non-existent ``K1`` / ``K2`` columns here and raised ``KeyError``;
        current/next keycode mirrors the key-pair extractor -- confirm.)
    user_int, keycode_int : bool
        Cast USER / KEYCODE to int64.
    drop_user : bool
        Drop the USER column from the result.

    Returns
    -------
    pandas.DataFrame
    """
    df = data[['PARTICIPANT_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']].astype('float64')
    if user_int:
        df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
    df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
    if drop_user:
        df = df.drop(columns=['USER'])

    if keycode_int:
        df['KEYCODE'] = df['KEYCODE'].astype('int64')

    # shift() keeps the frame's own index, so the "next row" values stay
    # aligned even when `data` does not carry a default 0..n-1 index
    # (concat(..., ignore_index=True) did not). The last row gets filler 0.
    next_press = df['PRESS_TIME'].shift(-1, fill_value=0)
    next_release = df['RELEASE_TIME'].shift(-1, fill_value=0)
    df['HL'] = df['RELEASE_TIME'] - df['PRESS_TIME']
    df['IL'] = next_press - df['RELEASE_TIME']
    df['PL'] = next_press - df['PRESS_TIME']
    df['RL'] = next_release - df['RELEASE_TIME']

    if keyboard:
        pos = keyboard['pos']
        next_keycode = df['KEYCODE'].shift(-1, fill_value=0)
        keycode_dist = []
        home_dist = []
        for k1, k2 in zip(df['KEYCODE'], next_keycode):
            keycode_dist.append(keyboard['keycode'](pos, k1, k2))
            home_dist.append(keyboard['home'](pos, [k1, k2]))
        df['KD'] = keycode_dist
        df['HD'] = home_dist

    df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME'])
    # The last row was computed against the 0 filler, not a real keystroke.
    df = df.iloc[:-1, :]
    return df

In [None]:
train_df = feature_extractor(train_data)
test_df = feature_extractor(test_data)

In [None]:
train_df.head(3)

Unnamed: 0,KEYCODE,INDEX,HL,IL,PL,RL
0,16,0.0,550.0,-112.0,438.0,15.0
1,73,1.0,127.0,32.0,159.0,155.0
2,32,2.0,123.0,0.0,123.0,196.0


# Structure TypeNet

## Structure TypeNet: Hyperparameters

In [None]:
## Hyperparameters
n_steps = 30
window_length = n_steps + 1
shift = 1
batch_size = 128
unique_keycode = 82

onehot_encoder = OneHotEncoder().fit(train_df[['KEYCODE']].astype(str))

unit_time_depth = unique_keycode + 5    ## 5 is for ['INDEX', 'HL', 'IL', 'PL', 'RL']

## Structure TypeNet: Preparation --> `train_ds`, `test_ds`

In [None]:
def structure_typenet(df, encoder, zip_in_out=True):
    """Build the sliding-window ``tf.data`` pipeline for the TypeNet-style models.

    Each row becomes ``[one-hot(KEYCODE), INDEX, HL, IL, PL, RL]``. Rows are
    cut into windows of ``window_length`` steps (stride ``shift``) and batched
    by ``batch_size``. From every batch of windows:

    * input 1: the first ``n_steps`` steps with all features;
    * input 2: the last step without its final four columns (HL, IL, PL, RL),
      i.e. one-hot keycode + INDEX;
    * target:  columns ``-4:-2`` of the last step, i.e. its (HL, IL).

    Returns a single dataset of ``((input 1, input 2), target)`` when
    ``zip_in_out`` is true, otherwise the pair ``(inputs, targets)`` as two
    separate datasets.

    Relies on the notebook-level ``window_length``, ``shift``, ``batch_size``
    and ``n_steps``.

    NOTE(review): with the features produced by the single-key
    ``feature_extractor`` (IL = next press - release, RL = next release -
    release), the target HL of the last step equals RL - IL of the preceding
    step, which is part of input 1 -- the HL target can be read off the input.
    """
    keycode_onehot = encoder.transform(df[['KEYCODE']].astype(str)).toarray()
    timing = df[['INDEX', 'HL', 'IL', 'PL', 'RL']]
    features = np.concatenate([keycode_onehot, timing], axis=1)

    windows = tf.data.Dataset.from_tensor_slices(features).window(size=window_length, shift=shift, drop_remainder=True)
    # window() yields nested datasets; batch each one into a single tensor.
    windows = windows.flat_map(lambda window: window.batch(window_length))
    batches = windows.batch(batch_size)

    ds_in = batches.map(lambda window: (window[:, :n_steps, :], window[:, -1, :-4]))
    ds_out = batches.map(lambda window: window[:, -1, -4:-2])

    if not zip_in_out:
        return ds_in, ds_out
    return tf.data.Dataset.zip((ds_in, ds_out))

In [None]:
train_ds = structure_typenet(train_df, onehot_encoder)
test_ds = structure_typenet(test_df, onehot_encoder)

## Structure TypeNet: Model

In [None]:
## Two-input model:
##   input_1 - sequence of past keystrokes with all features
##   input_2 - the keystroke to predict, without its timing columns
input_1 = keras.layers.Input(shape=[None, unit_time_depth], name='0-N char with timestamps')
input_2 = keras.layers.Input(shape=[unit_time_depth-4], name='N+1 char without timestamps')
## Recurrent encoder of the history: BatchNorm -> LSTM -> Dropout -> BatchNorm -> LSTM
batch_1 = keras.layers.BatchNormalization()(input_1)
lstm_1 = keras.layers.LSTM(128, return_sequences=True, recurrent_dropout=0.2)(batch_1)
dropout_1 = keras.layers.Dropout(0.5)(lstm_1)
batch_2 = keras.layers.BatchNormalization()(dropout_1)
lstm_2 = keras.layers.LSTM(128, recurrent_dropout=0.2)(batch_2)
## Join the history encoding (128 values) with the next-key features.
concat = keras.layers.concatenate([lstm_2, input_2])
## The concatenated vector is reshaped to (length, 1), i.e. the GRU below reads
## it as a sequence of scalars, one vector element per step.
reshape = keras.layers.Reshape((128+unit_time_depth-4, 1))(concat)
gru_1 = keras.layers.GRU(64, recurrent_dropout=0.2)(reshape)
## Two regression outputs (the pipeline's targets are HL and IL of the next key).
output = keras.layers.Dense(2)(gru_1)
model = keras.Model(inputs=[input_1, input_2], outputs=[output])

model.summary()



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 0-N char with timestamps (Inpu  [(None, None, 87)]  0           []                               
 tLayer)                                                                                          
                                                                                                  
 batch_normalization_2 (BatchNo  (None, None, 87)    348         ['0-N char with timestamps[0][0]'
 rmalization)                                                    ]                                
                                                                                                  
 lstm_2 (LSTM)                  (None, None, 128)    110592      ['batch_normalization_2[0][0]']  
                                                                                            

In [None]:
## Train the TypeNet-style model with MAE as both loss and metric.
## NOTE(review): no callbacks are passed here (no early stopping, checkpoints
## or TensorBoard logs, unlike the Structure ONE runs), and `test_ds` doubles
## as the validation set.
model.compile(optimizer='adam',
              loss='mae',
              metrics=['mae'])

history = model.fit(train_ds, epochs=3, 
                    validation_data=test_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


> `model1`: updated from `model` by
  1. replacing the final GRU layer with two dense layers
  2. changing the custom metric to `mae` (since we are no longer doing sequence-to-sequence prediction here)

In [None]:
## Two-input model, variant 1: same recurrent encoder as `model`, but the
## GRU head is replaced by two fully connected hidden layers.
##   input_1 - sequence of past keystrokes with all features
##   input_2 - the keystroke to predict, without its timing columns
input_1 = keras.layers.Input(shape=[None, unit_time_depth], name='0-N char with timestamps')
input_2 = keras.layers.Input(shape=[unit_time_depth-4], name='N+1 char without timestamps')
batch_1 = keras.layers.BatchNormalization()(input_1)
lstm_1 = keras.layers.LSTM(128, return_sequences=True, recurrent_dropout=0.2)(batch_1)
dropout_1 = keras.layers.Dropout(0.5)(lstm_1)
batch_2 = keras.layers.BatchNormalization()(dropout_1)
lstm_2 = keras.layers.LSTM(128, recurrent_dropout=0.2)(batch_2)
concat = keras.layers.concatenate([lstm_2, input_2])
## ReLU added to the hidden layers: Dense defaults to a linear activation, so
## Dense(100) -> Dense(30) -> Dense(2) without non-linearities is equivalent to
## a single linear layer. The parameter count is unchanged.
dense_1 = keras.layers.Dense(100, activation='relu')(concat)
dense_2 = keras.layers.Dense(30, activation='relu')(dense_1)
output = keras.layers.Dense(2)(dense_2)
model1 = keras.Model(inputs=[input_1, input_2], outputs=[output])

model1.summary()



Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 0-N char with timestamps (Inpu  [(None, None, 87)]  0           []                               
 tLayer)                                                                                          
                                                                                                  
 batch_normalization_6 (BatchNo  (None, None, 87)    348         ['0-N char with timestamps[0][0]'
 rmalization)                                                    ]                                
                                                                                                  
 lstm_6 (LSTM)                  (None, None, 128)    110592      ['batch_normalization_6[0][0]']  
                                                                                            

In [None]:
## Train `model1` with MAE as both loss and metric (no callbacks; `test_ds`
## doubles as the validation set).
model1.compile(optimizer='adam',
              loss='mae',
              metrics=['mae'])

history1 = model1.fit(train_ds, epochs=3, 
                      validation_data=test_ds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


> Update from `model1` by
  1. change loss to Poisson (`model2`)
  2. add regularization to Dense layer

In [None]:
## Two-input model with a deeper recurrent head: GRU(128, sequences) ->
## per-step Dense(64) with activity regularization -> GRU(32) -> Dense(2).
## NOTE(review): the variable is called `model3`, while the note above this
## cell only names `model2` -- confirm the intended naming.
input_1 = keras.layers.Input(shape=[None, unit_time_depth], name='0-N char with timestamps')
input_2 = keras.layers.Input(shape=[unit_time_depth-4], name='N+1 char without timestamps')
batch_1 = keras.layers.BatchNormalization()(input_1)
lstm_1 = keras.layers.LSTM(128, return_sequences=True, recurrent_dropout=0.2)(batch_1)
dropout_1 = keras.layers.Dropout(0.5)(lstm_1)
batch_2 = keras.layers.BatchNormalization()(dropout_1)
lstm_2 = keras.layers.LSTM(128, recurrent_dropout=0.2)(batch_2)
concat = keras.layers.concatenate([lstm_2, input_2])
## The concatenated vector is reshaped to (length, 1) so the GRU reads it as a
## sequence of scalars.
reshape = keras.layers.Reshape((128+unit_time_depth-4, 1))(concat)
gru_1 = keras.layers.GRU(128, return_sequences=True, recurrent_dropout=0.2)(reshape)
## NOTE(review): here `activity_regularizer` is passed to the TimeDistributed
## wrapper, whereas the model_ONE_* cells pass it to the inner Dense layer --
## confirm the placement is intentional.
dense_1 = keras.layers.TimeDistributed(keras.layers.Dense(64), activity_regularizer=tf.keras.regularizers.L2(0.01))(gru_1)
gru_2 = keras.layers.GRU(32, recurrent_dropout=0.2)(dense_1)
output = keras.layers.Dense(2)(gru_2)
model3 = keras.Model(inputs=[input_1, input_2], outputs=[output])

model3.summary()



Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 0-N char with timestamps (Inpu  [(None, None, 87)]  0           []                               
 tLayer)                                                                                          
                                                                                                  
 batch_normalization_14 (BatchN  (None, None, 87)    348         ['0-N char with timestamps[0][0]'
 ormalization)                                                   ]                                
                                                                                                  
 lstm_14 (LSTM)                 (None, None, 128)    110592      ['batch_normalization_14[0][0]'] 
                                                                                            