In [5]:
from google.colab import drive
drive.mount('/content/drive/')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import os
import datetime
from pytz import timezone

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
import tensorflow.keras as keras

Mounted at /content/drive/


# Import Dataset

In [10]:
def processing_folder(folder_path, sample_size, train_size, dev_size):
    '''
    Reads tab-separated keystroke files from a folder and splits every file
    into train/dev/test parts.
    Input:
        folder_path: str, folder holding the keystroke files; only regular files
                     whose name contains a number are read, ordered by the first
                     number in the name
        sample_size: number, maximum number of files to read
        train_size: float, fraction of each file's rows put in the train set
        dev_size: float, fraction of each file's rows put in the dev set
                  (the remaining rows of the file form the test set)
    Output:
        (data, train_data, dev_data, test_data,
         KEYCODE_enc, K1_K2_enc, K1_enc, K2_enc)
        - the four dataframes carry the header of the first readable file plus
          an 'INDEX' column (0-based position of the row inside its run of
          consecutive rows sharing the same value in the second field)
        - the four encoders are OneHotEncoder(handle_unknown='ignore') objects
          fitted on the train KEYCODE values only
    '''
    ## NOTE: the working directory is still switched to folder_path, exactly as
    ## before, in case later cells rely on it; the files themselves are opened
    ## through their full path.
    os.chdir(folder_path)
    ## keep only regular files that can be ordered by a number in their name
    ## (a readme / metadata file without digits used to crash the sort key)
    numbered_files = []
    for name in os.listdir(folder_path):
        number = re.search(r'\d+', name)
        full_path = os.path.join(folder_path, name)
        if number and os.path.isfile(full_path):
            numbered_files.append((int(number.group()), full_path))
    numbered_files.sort(key=lambda pair: pair[0])

    train_samples = []
    dev_samples = []
    test_samples = []
    samples = []
    cols = []
    for i, (_, path) in enumerate(numbered_files):
        if i >= sample_size:
            break
        with open(path, encoding='utf-8',           ##https://stackoverflow.com/questions/12468179/unicodedecodeerror-utf8-codec-cant-decode-byte-0x9c
                  errors='ignore') as f:
            lines = f.readlines()
        if not lines:
            continue
        ## extract column names (once, from the first file that provides them)
        if not cols:
            header = lines[0].split('\t')
            last_token = re.findall(r'\w+|\d+', header[-1])
            if last_token:
                header[-1] = last_token[0]
                cols = header
        ## extracting all samples from the current file
        sample = []
        curr_text_id = ''
        curr_index = -1
        for line in lines[1:]:
            ls = line.split('\t')
            ## rows whose last field holds no word character are skipped;
            ## a line without any tab has no second field to group on
            last_token = re.findall(r'\w+|\d+', ls[-1])
            if not last_token or len(ls) < 2:
                continue
            ls[-1] = last_token[0]       ## also strips the trailing newline
            if ls[1] != curr_text_id:
                curr_index = 0
                curr_text_id = ls[1]
            else:
                curr_index += 1
            ls.append(curr_index)
            sample.append(ls)
        ##  split the current data into train-dev-test sets
        split_index_1 = int(train_size * len(sample))
        split_index_2 = split_index_1 + int(dev_size * len(sample))
        ## extend() appends in place instead of re-copying the growing lists
        train_samples.extend(sample[:split_index_1])
        dev_samples.extend(sample[split_index_1:split_index_2])
        test_samples.extend(sample[split_index_2:])
        samples.extend(sample)
    ## forming dataframes: total data, train, dev, test
    ## (passing the names to the constructor also works when a split is empty)
    cols = cols + ['INDEX']
    data = pd.DataFrame(samples, columns=cols)
    train_data = pd.DataFrame(train_samples, columns=cols)
    dev_data = pd.DataFrame(dev_samples, columns=cols)
    test_data = pd.DataFrame(test_samples, columns=cols)
    ## construct onehot encoders from train data; the K1/K2 copies of KEYCODE
    ## live in a scratch frame so that train_data itself is never modified
    key_pairs = pd.DataFrame({'K1': train_data['KEYCODE'], 'K2': train_data['KEYCODE']})
    KEYCODE_enc = OneHotEncoder(handle_unknown='ignore').fit(train_data[['KEYCODE']])
    K1_enc = OneHotEncoder(handle_unknown='ignore').fit(key_pairs[['K1']])
    K2_enc = OneHotEncoder(handle_unknown='ignore').fit(key_pairs[['K2']])
    K1_K2_enc = OneHotEncoder(handle_unknown='ignore').fit(key_pairs[['K1', 'K2']])
    return data, train_data, dev_data, test_data, KEYCODE_enc, K1_K2_enc, K1_enc, K2_enc

In [37]:
## DEPRECATED: added dev_size to the newest function

# def processing_folder(folder_path, sample_size, train_size):
#     os.chdir(folder_path)
#     files = sorted(os.listdir(folder_path), key=lambda x: int(re.findall(r'\d+', x)[0]))
#     train_samples = []
#     test_samples = []
#     samples = []
#     cols = []
#     # text_count_id = 0
#     for i, path in enumerate(files):
#         if i >= sample_size:
#             break
#         with open(path, encoding='utf-8',           ##https://stackoverflow.com/questions/12468179/unicodedecodeerror-utf8-codec-cant-decode-byte-0x9c
#                  errors='ignore') as f:
#             lines = f.readlines()
#         ## extract column names (once)
#         if i == 0:
#             ls = lines[0].split('\t')
#             if re.findall(r'\w+|\d+', ls[-1]):
#                 ls[-1] = re.findall(r'\w+|\d+', ls[-1])[0]
#                 cols = ls
#         ## extracting all samples from the current file
#         sample = []
#         curr_text_id = ''
#         curr_index = -1
#         for line in lines[1:]:
#             ls = line.split('\t')
#             # ## debugdding block (to detect misaligned rows in files)
#             # if len(ls) != 9:
#             #     print(path, line)
#             if re.findall(r'\w+|\d+', ls[-1]):
#                 ls[-1] = re.findall(r'\w+|\d+', ls[-1])[0]
#                 if ls[1] != curr_text_id:
#                     curr_index = 0
#                     curr_text_id = ls[1]
#                     # text_count_id += 1
#                 else:
#                     curr_index += 1
#                 # ls.extend([curr_index, text_count_id])
#                 ls.append(curr_index)
#                 sample.append(ls)
#         ##  split the current data into train-test-sets
#         split_index = int(train_size * len(sample))
#         train_samples = train_samples + sample[:split_index]
#         test_samples = test_samples + sample[split_index:]
#         samples = samples + sample
#     ## forming dataframes
#     df_all = pd.DataFrame(samples)
#     df_train = pd.DataFrame(train_samples)
#     df_test = pd.DataFrame(test_samples)
#     ## renaming columns
#     # cols = cols + ['INDEX'] + ['TEXT_COUNT_ID']
#     cols = cols + ['INDEX']
#     df_all.columns, df_train.columns, df_test.columns = cols, cols, cols
#     ## construct onehot encoders from train data
#     df_train['K1'], df_train['K2'] = df_train['KEYCODE'], df_train['KEYCODE']
#     KEYCODE_enc = OneHotEncoder(handle_unknown='ignore').fit(df_train[['KEYCODE']])
#     K1_enc = OneHotEncoder(handle_unknown='ignore').fit(df_train[['K1']])
#     K2_enc = OneHotEncoder(handle_unknown='ignore').fit(df_train[['K2']])
#     K1_K2_enc = OneHotEncoder(handle_unknown='ignore').fit(df_train[['K1', 'K2']])
#     df_train = df_train.drop(columns=['K1', 'K2'])
#     return df_all, df_train, df_test, KEYCODE_enc, K1_K2_enc, K1_enc, K2_enc

In [22]:
# def processing_folder(folder_path, sample_size, train_size):
#     os.chdir(folder_path)
#     files = sorted(os.listdir(folder_path), key=lambda x: int(re.findall(r'\d+', x)[0]))
#     train_samples = []
#     test_samples = []
#     samples = []
#     cols = []
#     for i, path in enumerate(files):
#         if i >= sample_size:
#             break
#         with open(path, encoding='utf-8',           ##https://stackoverflow.com/questions/12468179/unicodedecodeerror-utf8-codec-cant-decode-byte-0x9c
#                  errors='ignore') as f:
#             lines = f.readlines()
#         ## extract column names (once)
#         if i == 0:
#             ls = lines[0].split('\t')
#             if re.findall(r'\w+|\d+', ls[-1]):
#                 ls[-1] = re.findall(r'\w+|\d+', ls[-1])[0]
#                 cols = ls
#         ## extracting all samples from the current file
#         sample = []
#         curr_text_id = ''
#         curr_index = -1
#         for line in lines[1:]:
#             ls = line.split('\t')
#             # if len(ls) != 9:
#             #     print(path, line)
#             if re.findall(r'\w+|\d+', ls[-1]):
#                 ls[-1] = re.findall(r'\w+|\d+', ls[-1])[0]
#                 if ls[1] != curr_text_id:
#                     curr_index = 0
#                     curr_text_id = ls[1]
#                 else:
#                     curr_index += 1
#                 ls.append(curr_index)
#                 sample.append(ls)
#         ##  split the current data into train-test-sets
#         split_index = int(train_size * len(sample))
#         train_samples = train_samples + sample[:split_index]
#         test_samples = test_samples + sample[split_index:]
#         samples = samples + sample
#     ## forming dataframes
#     df_all = pd.DataFrame(samples)
#     df_train = pd.DataFrame(train_samples)
#     df_test = pd.DataFrame(test_samples)
#     ## renaming columns
#     cols = cols + ['INDEX']
#     df_all.columns, df_train.columns, df_test.columns = cols, cols, cols
#     ## construct onehot encoders
#     df_all['K1'], df_all['K2'] = df_all['KEYCODE'], df_all['KEYCODE']
#     uni_encoder = OneHotEncoder().fit(df_all[['KEYCODE']])
#     di_encoder = OneHotEncoder().fit(df_all[['K1', 'K2']]) 
#     return df_all, df_train, df_test, uni_encoder, di_encoder

# Keyboard Layout Encoding

In [3]:
def get_qwerty_keyboard():
    '''
    Builds the QWERTY keyboard layout as a grid of javascript keycodes.
          - 7 rows x 23 columns, one cell per key slot
          - 0 marks a slot without a key (gaps between key clusters)
          - a keycode repeated in neighbouring cells marks a key that spans
            several slots (e.g. 32 for the space bar, 16 for shift)
    Return: pandas DataFrame indexed 0..6 (rows) with columns 0..22
    '''
    rows = [
        ## function-key row
        [27, 27, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 0, 0, 145, 126, 0, 0, 0, 0, 0],
        ## spacer between the function keys and the main block
        [0] * 23,
        ## number row
        [192, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 189, 187, 8, 0, 45, 36, 33, 0, 144, 111, 106, 109],
        ## QWERTY row
        [9, 81, 87, 69, 82, 84, 89, 85, 73, 79, 80, 219, 221, 220, 0, 46, 35, 34, 0, 103, 104, 105, 107],
        ## home row
        [20, 65, 83, 68, 70, 71, 72, 74, 75, 76, 186, 222, 13, 13, 0, 0, 0, 0, 0, 100, 101, 102, 107],
        ## bottom letter row
        [16, 16, 90, 88, 67, 86, 66, 78, 77, 188, 190, 191, 16, 16, 0, 0, 38, 0, 0, 97, 98, 99, 13],
        ## modifier row: the third slot is the left Windows/Meta key (91), the
        ## counterpart of 92/93 on the right; it used to hold 191 ('/'), which
        ## gave '/' a bogus second position and left keycode 91 unknown
        [17, 17, 91, 18, 32, 32, 32, 32, 32, 18, 92, 93, 17, 17, 0, 37, 40, 39, 0, 96, 96, 110, 13],
    ]
    qwerty_keyboard = pd.DataFrame(rows, index=list(range(len(rows))))
    return qwerty_keyboard

class Keyboard:
    '''
    Keyboard layout helper: maps keycodes to their position(s) on a layout grid
    and measures distances between keys.
    Input:
        keyboard_df: pandas DataFrame whose cells hold keycodes (one cell per
                     key slot); when omitted, get_qwerty_keyboard() is used
    '''
    ## distances are capped at this value; it is also the distance returned
    ## when a keycode is not on the layout
    MAX_DISTANCE = 5

    def __init__(self, keyboard_df=None):
        ## the default layout is built per instance instead of once at
        ## definition time, so instances never share one DataFrame
        if keyboard_df is None:
            keyboard_df = get_qwerty_keyboard()
        self.keyboard = keyboard_df
        self.keycode_pos = self.get_keycode_pos()

    def get_keycode_pos(self):
        '''
        Generates Python dictionary encoding the keyboard keycode positions, i.e.
              - keys = javascript keycode
              - values = list of [i, j] positions of the keycode on the keyboard
                         (a key spanning several slots has several positions)
        Return: Python dict
        '''
        ## NOTE(review): the gap placeholder 0 is registered like any other
        ## keycode, so keycode 0 resolves to the nearest gap -- confirm whether
        ## it should be treated as "not on the layout" instead
        keyboard_dict = {}
        ## positions are enumerated from the cell values, so the layout may
        ## carry any index/column labels
        for row, row_values in enumerate(self.keyboard.to_numpy().tolist()):
            for col, entry in enumerate(row_values):
                keyboard_dict.setdefault(entry, []).append([row, col])
        return keyboard_dict

    def keycode_distance(self, keycode1, keycode2):
        '''
        Given a pair of keycodes, return their relative distance on the keyboard:
        the smallest Manhattan distance between any of their positions, capped
        at MAX_DISTANCE. Keycodes missing from the layout yield MAX_DISTANCE.
        Keycodes may be given as anything int() accepts (e.g. '65').
        '''
        first = self.keycode_pos.get(int(keycode1))
        second = self.keycode_pos.get(int(keycode2))
        if not first or not second:
            return self.MAX_DISTANCE
        nearest = min(abs(row1 - row2) + abs(col1 - col2)
                      for row1, col1 in first
                      for row2, col2 in second)
        return min(nearest, self.MAX_DISTANCE)

    def home_distance(self, keycode_list, home_keys=(70, 74)):
        '''
        Computes the AVERAGE distance of a list of keycodes to the home keys, where
        In QWERTY keyboard, F and J are the home keys with keycodes 70 and 74 resp.
        Input:
            keycode_list: non-empty list of keycodes
            home_keys: keycodes of the home keys (default F and J)
        Raises: ValueError if keycode_list is empty
        '''
        if len(keycode_list) == 0:
            raise ValueError('keycode_list must contain at least one keycode')
        total = 0
        for key in keycode_list:
            ## each key counts with its distance to the closest home key
            total += min(self.keycode_distance(home, key) for home in home_keys)
        return total / len(keycode_list)

    def keyboard_dict(self):
        '''
        Returns the two distance functions under the keys 'keycode' and 'home'
        '''
        return {'keycode': self.keycode_distance, 'home': self.home_distance}

In [24]:
# def get_qwerty_keyboard():
#     first_row = [27, 27, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 0, 0, 145, 126, 0, 0, 0, 0, 0]
#     space = [0] * 23
#     second_row = [192, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 189, 187, 8, 0, 45, 36, 33, 0, 144, 111, 106, 109]
#     third_row = [9, 81, 87, 69, 82, 84, 89, 85, 73, 79, 80, 219, 221, 220, 0, 46, 35, 34, 0, 103, 104, 105, 107]
#     fourth_row = [20, 65, 83, 68, 70, 71, 72, 74, 75, 76, 186, 222, 13, 13, 0, 0, 0, 0, 0, 100, 101, 102, 107]
#     fifth_row = [16, 16, 90, 88, 67, 86, 66, 78, 77, 188, 190, 191, 16, 16, 0, 0, 38, 0, 0, 97, 98, 99, 13]
#     sixth_row = [17, 17, 191, 18, 32, 32, 32, 32, 32, 18, 92, 93, 17, 17, 0, 37, 40, 39, 0, 96, 96, 110, 13]
#     qwerty_keyboard = pd.DataFrame({'1st': first_row,
#                                     'space': space,
#                                     '2nd': second_row,
#                                     '3rd': third_row,
#                                     '4th': fourth_row,
#                                     '5th': fifth_row,
#                                     '6th': sixth_row}).transpose()
#     qwerty_keyboard.index = list(range(7))
#     return qwerty_keyboard

# class Keyboard:
#     def __init__(self, keyboard_df):
#         self.keyboard = keyboard_df
#         self.keycode_pos = self.get_keycode_pos()
    
#     def get_keycode_pos(self):
#         '''
#         Generates Python dictionary encoding the keyboard keycode positions, i.e.
#               - keys = javascript keycode
#               - values = [i, j] of the corresponding keycode position on the keyboard
#         Return: Python dict
#         '''
#         keyboard_dict = {}
#         for row in self.keyboard.index:
#             for col, entry in enumerate(self.keyboard.iloc[row, :]):
#                 if entry in keyboard_dict:
#                     keyboard_dict[entry].append([row, col])
#                 else:
#                     keyboard_dict[entry] = [[row, col]]
#         return keyboard_dict
        
#     def keycode_distance(self, keycode1, keycode2):
#         '''
#         Given a pair of keycodes, return their relative distance on the keyboard
#         '''
#         def manhattan_dist(arr1, arr2):
#             return abs(arr1[0] - arr2[0]) + abs(arr1[1] - arr2[1])
#         distance = 30 ## any integer larger than 22+6
#         if keycode1 in self.keycode_pos and keycode2 in self.keycode_pos:
#             for arr1 in self.keycode_pos[keycode1]:
#                 for arr2 in self.keycode_pos[keycode2]:
#                     curr_dist = manhattan_dist(arr1, arr2)
#                     if curr_dist < distance:
#                         distance = curr_dist
#             if distance < 5:
#                 return distance
#         return 5
    
#     def home_distance(self, keycode_list):
#         '''
#         Computes the AVERAGE distance of a list of keycodes to the home keys, where
#         In QWERTY keyboard, F and J are the home keys with keycodes 70 and 74 resp.
#         '''
#         sum = 0
#         for key in keycode_list:
#             sum += min([self.keycode_distance(70, key), self.keycode_distance(74, key)])
#         return sum/len(keycode_list)
    
#     def keyboard_dict(self):
#         return {'keycode': self.keycode_distance, 'home': self.home_distance}

# Extractor

In [None]:
class Extractor:
    '''
    Builds unigraph (single key) and digraph (pair of consecutive keys) timing
    features from a table of raw keystrokes.
    Input:
        sub_data: pandas DataFrame with the columns PARTICIPANT_ID,
                  TEST_SECTION_ID, PRESS_TIME, RELEASE_TIME, KEYCODE and INDEX,
                  all convertible to float; rows are taken in their given order
                  and may carry any index labels
        keyboard_dict: python dictionary with the keys 'keycode' (distance
                       between two keycodes) and 'home' (average home-key
                       distance of a list of keycodes); when omitted,
                       Keyboard().keyboard_dict() is used
        latencies: list of latency names to compute, taken from
                   'HL' (release - press of the same key),
                   'PL' (next press - press),
                   'IL' (next press - release),
                   'RL' (next release - release);
                   when omitted, all four are computed
    Attributes:
        unigraph: result of unigraph_extractor(sub_data)
        digraph: result of digraph_extractor(sub_data)
    '''
    ## columns read from the raw keystroke table
    _RAW_COLUMNS = ['PARTICIPANT_ID', 'TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']

    def __init__(self, sub_data, keyboard_dict=None, latencies=None):
        ## defaults are resolved per instance: nothing is evaluated at class
        ## definition time and no list is shared between instances
        if keyboard_dict is None:
            keyboard_dict = Keyboard().keyboard_dict()
        if latencies is None:
            latencies = ['HL', 'PL', 'IL', 'RL']
        self.keyboard_dict = keyboard_dict
        self.latencies = list(latencies)

        self.unigraph = self.unigraph_extractor(sub_data)
        self.digraph = self.digraph_extractor(sub_data)

    ## ------------------------------------------------------------------
    ## private helpers
    ## ------------------------------------------------------------------
    def _prepare(self, df, user_str, drop_user):
        ## Selects the raw columns as float64 and turns PARTICIPANT_ID into USER.
        df = df[self._RAW_COLUMNS].astype('float64')
        if user_str:
            df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64').astype(str)
        df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
        if drop_user:
            df = df.drop(columns=['USER'])
        return df

    def _add_transition_latencies(self, df):
        ## Adds the requested IL/RL/PL columns (current row vs. next row) in place.
        ## shift(-1) pairs rows by position, so the index labels do not matter;
        ## the last row is padded with 0 and removed later with the transitions.
        next_press = df['PRESS_TIME'].shift(-1, fill_value=0)
        next_release = df['RELEASE_TIME'].shift(-1, fill_value=0)
        if 'IL' in self.latencies:
            df['IL'] = next_press - df['RELEASE_TIME']
        if 'RL' in self.latencies:
            df['RL'] = next_release - df['RELEASE_TIME']
        if 'PL' in self.latencies:
            df['PL'] = next_press - df['PRESS_TIME']

    @staticmethod
    def _within_section(df):
        ## True where the NEXT row belongs to the same TEST_SECTION_ID; False on
        ## the last row of every section and on the last row of the table.
        section = df['TEST_SECTION_ID']
        return section.shift(-1) == section

    def _digraph_latency_columns(self):
        ## Latency column names of the digraph table: 'HL' becomes HL1 and HL2.
        columns = list(self.latencies)
        if 'HL' in columns:
            columns.remove('HL')
            columns[:0] = ['HL1', 'HL2']
        return columns

    @staticmethod
    def _replace_with_average(df, keys, latencies, avg_mode, drop_origin, rename_avg, round_avg):
        ## Adds <latency>_avg columns holding the per-group mean/median of every
        ## latency (groups = rows sharing the same values in `keys`), then
        ## optionally drops the originals and gives the averages their names.
        if latencies:
            how = 'mean' if avg_mode == 'mean' else 'median'
            averaged = df.groupby(keys, sort=False)[latencies].transform(how)
            if round_avg:
                averaged = averaged.round()
            for XL in latencies:
                df[XL+'_avg'] = averaged[XL]
        if drop_origin:
            df = df.drop(columns=latencies)
            if rename_avg:
                df = df.rename(columns={XL+'_avg': XL for XL in latencies})
        return df

    @staticmethod
    def _layout_before_latencies(df, layout_cols, latency_cols):
        ## Reorders columns to: everything else, layout features, latencies.
        trailing = [col for col in df.columns if col in latency_cols]
        leading = [col for col in df.columns if col not in trailing and col not in layout_cols]
        return df[leading + layout_cols + trailing]

    def _filter_by_user(self, df, filter, bounds_dict):
        ## Removes, user by user and latency by latency, the rows whose latency
        ## falls outside the bounds computed from that user's own rows.
        for latency, bound in bounds_dict.items():
            for user in df['USER'].unique():
                mask_user = df['USER'] == user
                user_values = df.loc[mask_user, latency]
                if filter == 'IQR':
                    lower, upper = self.IQR_filter(user_values, bound)
                else:
                    lower, upper = self.ABS_filter(user_values, bound)
                in_bounds = (df[latency] >= lower) & (df[latency] <= upper)
                ## rows of other users pass through untouched
                df = df.loc[(mask_user & in_bounds) | ~mask_user]
        return df

    ## ------------------------------------------------------------------
    ## feature extraction
    ## ------------------------------------------------------------------
    def unigraph_extractor(self, df, user_str=True, keycode_str=True, drop_user=False):
        '''
        Generates unigraph related features and returns the dataframe
        Input:
            df: raw keystroke dataframe (see class docstring)
            user_str: boolean, store USER as str instead of float
            keycode_str: boolean, store KEYCODE as str instead of float
            drop_user: boolean, drop the USER column
        Output:
            dataframe with USER, KEYCODE, INDEX and the requested latencies
            (column order HL, IL, RL, PL); the last keystroke of every test
            section is dropped because it has no following key
        '''
        df = self._prepare(df, user_str, drop_user)
        if keycode_str:
            df['KEYCODE'] = df['KEYCODE'].astype('int64').astype(str)
        ## construct new features
        if 'HL' in self.latencies:
            df['HL'] = df['RELEASE_TIME'] - df['PRESS_TIME']
        self._add_transition_latencies(df)
        ## dropping rows whose NEXT row starts another sentence (or does not exist)
        df = df.loc[self._within_section(df)]
        ## cleaning irrelevant info
        return df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'TEST_SECTION_ID'])

    def digraph_extractor(self, df, user_str=True, keycode_str=True, drop_user=False):
        '''
        Generates digraph related features and returns the dataframe
        Input:
            df: raw keystroke dataframe (see class docstring)
            user_str: boolean, store USER as str instead of float
            keycode_str: boolean, store K1 and K2 as str instead of float
            drop_user: boolean, drop the USER column
        Output:
            dataframe with USER, K1, K2 (keycodes of the pair), I1, I2 (their
            INDEX values) and the requested latencies (HL1, HL2, IL, RL, PL);
            pairs that would span two test sections are dropped
        '''
        df = self._prepare(df, user_str, drop_user)
        ## construct new features
        df['K1'] = df['KEYCODE']
        df['K2'] = df['KEYCODE'].shift(-1, fill_value=0)
        if keycode_str:
            df['K1'] = df['K1'].astype('int64').astype(str)
            df['K2'] = df['K2'].astype('int64').astype(str)
        df['I1'] = df['INDEX']
        df['I2'] = df['INDEX'].shift(-1, fill_value=0)
        if 'HL' in self.latencies:
            df['HL1'] = df['RELEASE_TIME'] - df['PRESS_TIME']
            df['HL2'] = df['HL1'].shift(-1, fill_value=0)
        self._add_transition_latencies(df)
        ## dropping pairs whose second key starts another sentence (or does not exist)
        df = df.loc[self._within_section(df)]
        ## cleaning irrelevant info
        return df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX', 'TEST_SECTION_ID'])

    ## ------------------------------------------------------------------
    ## average behavior
    ## ------------------------------------------------------------------
    def unigraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
        '''
        Generates unigraph with values replaced by average behavior
        Input:
            avg_mode: str, takes value in 'mean', 'median' (if not == 'mean', default to 'median')
            data: unigraph dataframe to average (default: self.unigraph)
            drop_origin: boolean, drop the original latency columns
            rename_avg: boolean, with drop_origin, rename <latency>_avg to <latency>
            round_avg: boolean, round the averages to whole numbers
        Output:
            unigraph dataframe with Average User data (averages per KEYCODE)
        '''
        ## `data is None` instead of `if data:` -- the truth value of a
        ## dataframe is ambiguous and raises
        df = (self.unigraph if data is None else data).copy()
        return self._replace_with_average(df, ['KEYCODE'], list(self.latencies),
                                          avg_mode, drop_origin, rename_avg, round_avg)

    def digraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
        '''
        Generates digraph with values replaced by average behavior
        Input:
            avg_mode: str, takes value in 'mean', 'median' (if not == 'mean', default to 'median')
            data: digraph dataframe to average (default: self.digraph)
            drop_origin: boolean, drop the original latency columns and K1_K2
            rename_avg: boolean, with drop_origin, rename <latency>_avg to <latency>
            round_avg: boolean, round the averages to whole numbers
        Output:
            digraph dataframe with Average User data (averages per (K1, K2) pair)
        '''
        df = (self.digraph if data is None else data).copy()
        ## the pair column is part of the output when drop_origin is False
        df['K1_K2'] = pd.Series(list(zip(df['K1'], df['K2'])), index=df.index, dtype=object)
        df = self._replace_with_average(df, ['K1', 'K2'], self._digraph_latency_columns(),
                                        avg_mode, drop_origin, rename_avg, round_avg)
        if drop_origin:
            df = df.drop(columns=['K1_K2'])
        return df

    ## ------------------------------------------------------------------
    ## keyboard layout features
    ## ------------------------------------------------------------------
    def unigraph_keyboard(self, avg_mode=None):
        '''
        Returns a unigraph dataframe with added keyboard layout features
        Input:
            avg_mode: 'mean'/'median' to start from unigraph_avg, None to start
                      from the raw unigraph
        Output:
            unigraph dataframe with HD (home-key distance of KEYCODE) placed
            right before the latency columns
        '''
        if avg_mode:
            df = self.unigraph_avg(avg_mode)
        else:
            df = self.unigraph.copy()
        home = self.keyboard_dict['home']
        df['HD'] = [home([keycode]) for keycode in df['KEYCODE']]
        return self._layout_before_latencies(df, ['HD'], self.latencies)

    def digraph_keyboard(self, avg_mode=None):
        '''
        Returns a digraph dataframe with added keyboard layout features
        Input:
            avg_mode: 'mean'/'median' to start from digraph_avg, None to start
                      from the raw digraph
        Output:
            digraph dataframe with KD (distance between K1 and K2) and HD
            (average home-key distance of K1 and K2) placed right before the
            latency columns
        '''
        if avg_mode:
            df = self.digraph_avg(avg_mode)
        else:
            df = self.digraph.copy()
        keycode_distance = self.keyboard_dict['keycode']
        home = self.keyboard_dict['home']
        pairs = list(zip(df['K1'], df['K2']))
        df['KD'] = [keycode_distance(k1, k2) for k1, k2 in pairs]
        df['HD'] = [home([k1, k2]) for k1, k2 in pairs]
        ## the latency columns are looked up by name, so the order is right
        ## whether or not 'HL' (i.e. HL1 + HL2) was requested
        return self._layout_before_latencies(df, ['KD', 'HD'], self._digraph_latency_columns())

    ## ------------------------------------------------------------------
    ## outlier filtering
    ## ------------------------------------------------------------------
    def IQR_filter(self, data, fold):
        '''
        Returns (min, max) = (Q1 - fold*IQR, Q3 + fold*IQR) of a pandas Series
        '''
        upper_quartile = data.quantile(.75)
        lower_quartile = data.quantile(.25)
        spread = upper_quartile - lower_quartile
        return lower_quartile - fold * spread, upper_quartile + fold * spread

    def ABS_filter(self, data, bounds):
        '''
        Returns (min, max) of a pandas Series, where min is the value at position
        num_bottom of the ascending order and max the value at position num_top
        of the descending order (0-based), with bounds = [num_bottom, num_top]
        '''
        num_bottom, num_top = bounds
        lower = data.sort_values()[:num_bottom+1].values[-1]
        upper = data.sort_values(ascending=False)[:num_top+1].values[-1]
        return lower, upper

    def unigraph_filtered(self, avg_mode, encode_keyboard, filter, bounds_dict):
        '''
        Returns the unigraph dataframe without the per-user outliers
        Input:
            avg_mode: str, takes value in ['median', 'mean', None]
            encode_keyboard: boolean, add the keyboard layout features
            filter: str, 'IQR' selects IQR_filter, any other value ABS_filter
            bounds_dict: a python dictionary with keys=latencies, 
                                                  values=needed params
                      ==> for IQR: values = folds (i.e. scaling IQR by fold*IQR)
                      ==> for ABS: values = [num_bottoms, num_tops]
        '''
        if encode_keyboard:
            df = self.unigraph_keyboard(avg_mode)
        elif avg_mode:
            df = self.unigraph_avg(avg_mode)
        else:
            df = self.unigraph.copy()
        return self._filter_by_user(df, filter, bounds_dict)

    def digraph_filtered(self, avg_mode, encode_keyboard, filter, bounds_dict):
        '''
        Returns the digraph dataframe without the per-user outliers
        Input:
            avg_mode: str, takes value in ['median', 'mean', None]
            encode_keyboard: boolean, add the keyboard layout features
            filter: str, 'IQR' selects IQR_filter, any other value ABS_filter
            bounds_dict: a python dictionary with keys=latencies, 
                                                  values=needed params
                      ==> for IQR: values = folds (i.e. scaling IQR by fold*IQR)
                      ==> for ABS: values = [num_bottoms, num_tops]
        '''
        if encode_keyboard:
            df = self.digraph_keyboard(avg_mode)
        elif avg_mode:
            df = self.digraph_avg(avg_mode)
        else:
            df = self.digraph.copy()
        return self._filter_by_user(df, filter, bounds_dict)

In [25]:
## DEPRECATED: added docstring and reordered the functions into unigraph-digraph groups order

# class Extractor:
#     def __init__(self, sub_data, keyboard_dict=Keyboard().keyboard_dict(), latencies=['HL', 'PL', 'IL', 'RL']):
#         self.keyboard_dict = keyboard_dict
#         self.latencies = latencies

#         self.unigraph = self.unigraph_extractor(sub_data)
#         self.digraph = self.digraph_extractor(sub_data)
    
#     def unigraph_extractor(self, df, user_str=True, keycode_str=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_str:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64').astype(str)
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         if keycode_str:
#             df['KEYCODE'] = df['KEYCODE'].astype('int64').astype(str)
#         ## construct new features
#         if 'HL' in self.latencies:
#             df['HL'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#         if 'IL' in self.latencies:
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'RL' in self.latencies:
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'PL' in self.latencies:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         ## dropping rows where the NEXT row has INDEX==0 (indicating a transition to next sentence)
#         shift_txt = pd.concat([df['TEST_SECTION_ID'][1:], df['TEST_SECTION_ID'][-1:]], ignore_index=True) - df['TEST_SECTION_ID']
#         mask = shift_txt == 0
#         df = df.loc[mask]
#         ## cleaning irrelavant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'TEST_SECTION_ID'])
#         df = df.iloc[:-1, :]
#         return df
    
#     def digraph_extractor(self, df, user_str=True, keycode_str=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_str:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64').astype(str)
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         ## construct new features
#         df['K1'] = df['KEYCODE']
#         df['K2'] = pd.concat([df['KEYCODE'][1:], pd.Series([0])], ignore_index=True)
#         if keycode_str:
#             df['K1'] = df['K1'].astype('int64').astype(str)
#             df['K2'] = df['K2'].astype('int64').astype(str)
#         df['I1'] = df['INDEX']
#         df['I2'] = pd.concat([df['INDEX'][1:], pd.Series([0])], ignore_index=True)
#         if 'HL' in self.latencies:
#             df['HL1'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#             df['HL2'] = pd.concat([df['HL1'][1:], pd.Series([0])], ignore_index=True)
#         if 'IL' in self.latencies:
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'RL' in self.latencies:
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'PL' in self.latencies:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         ## dropping instances where I2 is zero (indicating a transition to next sentence)
#         shift_txt = pd.concat([df['TEST_SECTION_ID'][1:], df['TEST_SECTION_ID'][-1:]], ignore_index=True) - df['TEST_SECTION_ID']
#         mask = shift_txt == 0
#         df = df.loc[mask]
#         ## cleaning irrelavant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX', 'TEST_SECTION_ID'])
#         df = df.iloc[:-1, :]
#         return df
    
#     ## https://towardsdatascience.com/do-you-use-apply-in-pandas-there-is-a-600x-faster-way-d2497facfa66
#     def digraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
#         if data:
#             df = data.copy()
#         else:
#             df = self.digraph.copy()
#         df['K1_K2'] = df[['K1', 'K2']].apply(tuple, axis=1)
#         latencies = self.latencies.copy()
#         if 'HL' in latencies:
#             latencies.remove('HL')
#             latencies.insert(0, 'HL2')
#             latencies.insert(0, 'HL1')
#         for XL in latencies:
#             df[XL+'_avg'] = df[XL]
#         for pair in df['K1_K2'].unique():
#             mask = df['K1_K2'] == pair
#             if avg_mode == 'mean':
#                 avg_df = df.loc[mask, latencies].mean()
#             else:
#                 avg_df = df.loc[mask, latencies].median()
#             for XL in latencies:
#                 df.loc[mask, XL+'_avg'] = avg_df[XL]
#         if round_avg:
#             for XL in latencies:
#                 df[XL+'_avg'] = round(df[XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=latencies+['K1_K2'])
#         if drop_origin and rename_avg:
#             df = df.rename(columns=lambda name: re.search(r'(.{2,3})(_avg)', name).group(1) if '_avg' in name else name)
#         return df

#     def unigraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
#         if data:
#             df = data.copy()
#         else:
#             df = self.unigraph.copy()
#         for XL in self.latencies:
#             df[XL+'_avg'] = df[XL]
#         for keycode in df['KEYCODE'].unique():
#             mask = df['KEYCODE'] == keycode
#             if avg_mode == 'mean':
#                 avg_df = df.loc[mask, self.latencies].mean()
#             else:
#                 avg_df = df.loc[mask, self.latencies].median()
#             for XL in self.latencies:
#                 df.loc[mask, XL+'_avg'] = avg_df[XL]
#         if round_avg:
#             for XL in self.latencies:
#                 df[XL+'_avg'] = round(df[XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=self.latencies)
#         if drop_origin and rename_avg:
#             df = df.rename(columns=lambda name: name[:2] if '_avg' in name else name)
#         return df
    
#     def digraph_keyboard(self, avg_mode=None):
#         if avg_mode:
#             df = self.digraph_avg(avg_mode)
#         else:
#             df = self.digraph.copy()
#         keycode_dist = []
#         home_dist = []
#         for row in df.index:
#             keycode_dist.append(self.keyboard_dict['keycode'](df['K1'][row], df['K2'][row]))
#             home_dist.append(self.keyboard_dict['home']([df['K1'][row], df['K2'][row]]))
#         df['KD'] = keycode_dist
#         df['HD'] = home_dist
#         # cols = list(df.columns[:-5]) + list(df.columns[-2:]) + list(df.columns[-5:-2])
#         num_cols = len(df.columns)
#         cols = list(df.columns[:num_cols-2-(len(self.latencies)+1)]) + list(df.columns[-2:]) + list(df.columns[-2-(len(self.latencies)+1):-2])
#         df = df[cols]
#         return df
    
#     def unigraph_keyboard(self, avg_mode=None):
#         if avg_mode:
#             df = self.unigraph_avg(avg_mode)
#         else:
#             df = self.unigraph.copy()
#         home_dist = []
#         for row in df.index:
#             home_dist.append(self.keyboard_dict['home']([df['KEYCODE'][row]]))
#         df['HD'] = home_dist
#         # cols = list(df.columns[:-3]) + list(df.columns[-1:]) + list(df.columns[-3:-1])
#         num_cols = len(df.columns)
#         cols = list(df.columns[:num_cols-1-len(self.latencies)]) + list(df.columns[-1:]) + list(df.columns[-1-len(self.latencies):-1])
#         df = df[cols]
#         return df
    
#     def IQR_filter(self, data, fold):
#         Q3 = data.quantile(.75)
#         Q1 = data.quantile(.25)
#         IQR = Q3 - Q1
#         max = Q3 + fold * IQR
#         min = Q1 - fold * IQR
#         return min, max

#     def ABS_filter(self, data, bounds):
#         num_bottom, num_top = bounds
#         min = data.sort_values()[:num_bottom+1].values[-1]
#         max = data.sort_values(ascending=False)[:num_top+1].values[-1]
#         return min, max
    
#     def unigraph_filtered(self, avg_mode, encode_keyboard, filter, bounds_dict):
#         '''
#         Input:
#             avg_mode: str, takes value in ['median', 'mean', None(default)]
#             encode_keyboard: boolean
#             filter: str, takes value in ['ABS'(default), 'IQR']
#             bounds_dict: a python dictionary with keys=latencies, 
#                                                   values=needed params
#                       ==> for IQR: values = folds (i.e. scaling IQR by fold*IQR)
#                       ==> for ABS: values = [num_bottoms, num_tops]
#         '''
#         filter_latencies = list(bounds_dict.keys())
#         if encode_keyboard:
#             df = self.unigraph_keyboard(avg_mode)
#         elif avg_mode:
#             df = self.unigraph_avg(avg_mode)
#         else:
#             df = self.unigraph.copy()
#         for latency in filter_latencies:
#             for user in df['USER'].unique():
#                 mask_user = df['USER'] == user
#                 mask_nonuser = df['USER'] != user
#                 subdf = df.loc[mask_user, latency]
#                 if filter == 'IQR':
#                     min, max = self.IQR_filter(subdf, bounds_dict[latency])
#                 else:
#                     min, max = self.ABS_filter(subdf, bounds_dict[latency])
#                 mask_max = df[latency] <= max
#                 mask_min = df[latency] >= min
#                 df = df.loc[mask_user & mask_max & mask_min | mask_nonuser]
#         return df
    
#     def digraph_filtered(self, avg_mode, encode_keyboard, filter, bounds_dict):
#         '''
#         Input:
#             avg_mode: str, takes value in ['median', 'mean', None(default)]
#             encode_keyboard: boolean
#             filter: str, takes value in ['ABS'(default), 'IQR']
#             bounds_dict: a python dictionary with keys=latencies, 
#                                                   values=needed params
#                       ==> for IQR: values = folds (i.e. scaling IQR by fold*IQR)
#                       ==> for ABS: values = [num_bottoms, num_tops]
#         '''
#         filter_latencies = list(bounds_dict.keys())
#         if encode_keyboard:
#             df = self.digraph_keyboard(avg_mode)
#         elif avg_mode:
#             df = self.digraph_avg(avg_mode)
#         else:
#             df = self.digraph.copy()
#         for latency in filter_latencies:
#             for user in df['USER'].unique():
#                 mask_user = df['USER'] == user
#                 mask_nonuser = df['USER'] != user
#                 subdf = df.loc[mask_user, latency]
#                 if filter == 'IQR':
#                     min, max = self.IQR_filter(subdf, bounds_dict[latency])
#                 else:
#                     min, max = self.ABS_filter(subdf, bounds_dict[latency])
#                 mask_max = df[latency] <= max
#                 mask_min = df[latency] >= min
#                 df = df.loc[mask_user & mask_max & mask_min | mask_nonuser]
#         return df

In [26]:
## Deprecated: remove the is_testset variable and handle preprocessing more uniformly

# class Extractors:
#     def __init__(self, sub_data, keyboard_dict, latencies, is_testset):
#         self.keyboard_dict = keyboard_dict
#         self.latencies = latencies
#         self.is_testset = is_testset

#         self.unigraph = self.unigraph_extractor(sub_data)
#         self.digraph = self.digraph_extractor(sub_data)
    
#     def unigraph_extractor(self, df, user_int=True, keycode_str=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_int:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         if keycode_str:
#             df['KEYCODE'] = df['KEYCODE'].astype('int64').astype(str)
#         ## construct new features
#         if 'HL' in self.latencies:
#             df['HL'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#         if 'IL' in self.latencies:
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'RL' in self.latencies:
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'PL' in self.latencies:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         ## dropping rows where the NEXT row has INDEX==0 (indicating a transition to next sentence)
#         if self.is_testset:
#             shift_txt = pd.concat([df['TEST_SECTION_ID'][1:], pd.Series([0])], ignore_index=True) - df['TEST_SECTION_ID']
#             mask = list((shift_txt == 0)[:-1]) + [True]
#         else:
#             mask =  list((df['INDEX'] != 0)[1:]) + [False]
#         df = df.loc[mask]
#         ## cleaning irrelevant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'TEST_SECTION_ID'])
#         df = df.iloc[:-1, :]
#         return df
    
#     def digraph_extractor(self, df, user_int=True, keycode_str=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_int:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         ## construct new features
#         df['K1'] = df['KEYCODE']
#         df['K2'] = pd.concat([df['KEYCODE'][1:], pd.Series([0])], ignore_index=True)
#         if keycode_str:
#             df['K1'] = df['K1'].astype('int64').astype(str)
#             df['K2'] = df['K2'].astype('int64').astype(str)
#         df['I1'] = df['INDEX']
#         df['I2'] = pd.concat([df['INDEX'][1:], pd.Series([0])], ignore_index=True)
#         if 'HL' in self.latencies:
#             df['HL1'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#             df['HL2'] = pd.concat([df['HL1'][1:], pd.Series([0])], ignore_index=True)
#         if 'IL' in self.latencies:
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'RL' in self.latencies:
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'PL' in self.latencies:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         ## dropping instances where I2 is zero (indicating a transition to next sentence)
#         if self.is_testset:
#             shift_txt = pd.concat([df['TEST_SECTION_ID'][1:], pd.Series([0])], ignore_index=True) - df['TEST_SECTION_ID']
#             mask = list((shift_txt == 0)[:-1]) + [True]
#         else:
#             mask = df['I2'] != 0
#         df = df.loc[mask]
#         ## cleaning irrelevant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX', 'TEST_SECTION_ID'])
#         df = df.iloc[:-1, :]
#         return df
    
#     ## https://towardsdatascience.com/do-you-use-apply-in-pandas-there-is-a-600x-faster-way-d2497facfa66
#     def digraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
#         if data:
#             df = data.copy()
#         else:
#             df = self.digraph.copy()
#         df['K1_K2'] = df[['K1', 'K2']].apply(tuple, axis=1)
#         latencies = self.latencies.copy()
#         if 'HL' in latencies:
#             latencies.remove('HL')
#             latencies.insert(0, 'HL2')
#             latencies.insert(0, 'HL1')
#         for XL in latencies:
#             df[XL+'_avg'] = df[XL]
#         for pair in df['K1_K2'].unique():
#             mask = df['K1_K2'] == pair
#             if avg_mode == 'mean':
#                 avg_df = df.loc[mask, latencies].mean()
#             else:
#                 avg_df = df.loc[mask, latencies].median()
#             for XL in latencies:
#                 df.loc[mask, XL+'_avg'] = avg_df[XL]
#         if round_avg:
#             for XL in latencies:
#                 df[XL+'_avg'] = round(df[XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=latencies+['K1_K2'])
#         if drop_origin and rename_avg:
#             df = df.rename(columns=lambda name: re.search(r'(.{2,3})(_avg)', name).group(1) if '_avg' in name else name)
#         return df

#     def unigraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
#         if data:
#             df = data.copy()
#         else:
#             df = self.unigraph.copy()
#         for XL in self.latencies:
#             df[XL+'_avg'] = df[XL]
#         for keycode in df['KEYCODE'].unique():
#             mask = df['KEYCODE'] == keycode
#             if avg_mode == 'mean':
#                 avg_df = df.loc[mask, self.latencies].mean()
#             else:
#                 avg_df = df.loc[mask, self.latencies].median()
#             for XL in self.latencies:
#                 df.loc[mask, XL+'_avg'] = avg_df[XL]
#         if round_avg:
#             for XL in self.latencies:
#                 df[XL+'_avg'] = round(df[XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=self.latencies)
#         if drop_origin and rename_avg:
#             df = df.rename(columns=lambda name: name[:2] if '_avg' in name else name)
#         return df
    
#     def digraph_encode_keyboard(self):
#         df = self.digraph
#         keycode_dist = []
#         home_dist = []
#         for row in df.index:
#             keycode_dist.append(self.keyboard_dict['keycode'](df['K1'][row], df['K2'][row]))
#             home_dist.append(self.keyboard_dict['home']([df['K1'][row], df['K2'][row]]))
#         df['KD'] = keycode_dist
#         df['HD'] = home_dist
#         cols = list(df.columns[:-5]) + list(df.columns[-2:]) + list(df.columns[-5:-2])
#         df = df[cols]
#         return df
    
#     def unigraph_encode_keyboard(self):
#         df = self.unigraph
#         home_dist = []
#         for row in df.index:
#             home_dist.append(self.keyboard_dict['home']([df['KEYCODE'][row]]))
#         df['HD'] = home_dist
#         cols = list(df.columns[:-3]) + list(df.columns[-1:]) + list(df.columns[-3:-1])
#         df = df[cols]
#         return df
    
#     def filter_by_IQRs(self, data, folds, latencies):
#         df = data.copy()
#         for fold, latency in zip(folds, latencies):
#             for user in data['USER'].unique():
#                 mask_user = data['USER'] == user
#                 mask_non_user = data['USER'] != user
#                 Q3 = data.loc[mask_user, latency].quantile(.75)
#                 Q1 = data.loc[mask_user, latency].quantile(.25)
#                 IQR = Q3 - Q1
#                 max = Q3 + fold * IQR
#                 min = Q1 - fold * IQR
#                 mask_max = data[latency] <= max
#                 mask_min = data[latency] >= min
#                 df = df.loc[mask_user & mask_max & mask_min | mask_non_user]
#         return df

In [27]:
# # Deprecated: the test-set extractor has problems extracting timestamps at sentence transitions

# class Extractors:
#     def __init__(self, sub_data, keyboard_dict, latencies):
#         self.keyboard_dict = keyboard_dict
#         self.latencies = latencies
#         # self.is_testset = is_testset

#         self.unigraph = self.unigraph_extractor(sub_data)
#         self.digraph = self.digraph_extractor(sub_data)
    
#     def unigraph_extractor(self, df, user_int=True, keycode_str=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_int:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         if keycode_str:
#             df['KEYCODE'] = df['KEYCODE'].astype('int64').astype(str)
#         ## construct new features
#         if 'HL' in self.latencies:
#             df['HL'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#         if 'IL' in self.latencies:
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'RL' in self.latencies:
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'PL' in self.latencies:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         # ## dropping rows where the NEXT row has INDEX==0 (indicating a transition to next sentence)
#         # if self.is_testset:
#         #     shift_txt = pd.concat([df['TEST_SECTION_ID'][1:], pd.Series([0])], ignore_index=True) - df['TEST_SECTION_ID']
#         #     mask = list((shift_txt == 0)[:-1]) + [True]
#         # else:
#         #     mask =  list((df['INDEX'] != 0)[1:]) + [False]
#         # df = df.loc[mask]
#         ## cleaning irrelevant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'TEST_SECTION_ID'])
#         df = df.iloc[:-1, :]
#         return df
    
#     def digraph_extractor(self, df, user_int=True, keycode_str=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'TEST_SECTION_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_int:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         ## construct new features
#         df['K1'] = df['KEYCODE']
#         df['K2'] = pd.concat([df['KEYCODE'][1:], pd.Series([0])], ignore_index=True)
#         if keycode_str:
#             df['K1'] = df['K1'].astype('int64').astype(str)
#             df['K2'] = df['K2'].astype('int64').astype(str)
#         df['I1'] = df['INDEX']
#         df['I2'] = pd.concat([df['INDEX'][1:], pd.Series([0])], ignore_index=True)
#         if 'HL' in self.latencies:
#             df['HL1'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#             df['HL2'] = pd.concat([df['HL1'][1:], pd.Series([0])], ignore_index=True)
#         if 'IL' in self.latencies:
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'RL' in self.latencies:
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         if 'PL' in self.latencies:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         # ## dropping instances where I2 is zero (indicating a transition to next sentence)
#         # if self.is_testset:
#         #     shift_txt = pd.concat([df['TEST_SECTION_ID'][1:], pd.Series([0])], ignore_index=True) - df['TEST_SECTION_ID']
#         #     mask = list((shift_txt == 0)[:-1]) + [True]
#         # else:
#         #     mask = df['I2'] != 0
#         # df = df.loc[mask]
#         ## cleaning irrelevant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX', 'TEST_SECTION_ID'])
#         df = df.iloc[:-1, :]
#         return df
    
#     ## https://towardsdatascience.com/do-you-use-apply-in-pandas-there-is-a-600x-faster-way-d2497facfa66
#     def digraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
#         if data:
#             df = data.copy()
#         else:
#             df = self.digraph.copy()
#         df['K1_K2'] = df[['K1', 'K2']].apply(tuple, axis=1)
#         latencies = self.latencies.copy()
#         if 'HL' in latencies:
#             latencies.remove('HL')
#             latencies.insert(0, 'HL2')
#             latencies.insert(0, 'HL1')
#         for XL in latencies:
#             df[XL+'_avg'] = df[XL]
#         for pair in df['K1_K2'].unique():
#             mask = df['K1_K2'] == pair
#             if avg_mode == 'mean':
#                 avg_df = df.loc[mask, latencies].mean()
#             else:
#                 avg_df = df.loc[mask, latencies].median()
#             for XL in latencies:
#                 df.loc[mask, XL+'_avg'] = avg_df[XL]
#         if round_avg:
#             for XL in latencies:
#                 df[XL+'_avg'] = round(df[XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=latencies+['K1_K2'])
#         if drop_origin and rename_avg:
#             df = df.rename(columns=lambda name: re.search(r'(.{2,3})(_avg)', name).group(1) if '_avg' in name else name)
#         return df

#     def unigraph_avg(self, avg_mode, data=None, drop_origin=True, rename_avg=True, round_avg=True):
#         if data:
#             df = data.copy()
#         else:
#             df = self.unigraph.copy()
#         for XL in self.latencies:
#             df[XL+'_avg'] = df[XL]
#         for keycode in df['KEYCODE'].unique():
#             mask = df['KEYCODE'] == keycode
#             if avg_mode == 'mean':
#                 avg_df = df.loc[mask, self.latencies].mean()
#             else:
#                 avg_df = df.loc[mask, self.latencies].median()
#             for XL in self.latencies:
#                 df.loc[mask, XL+'_avg'] = avg_df[XL]
#         if round_avg:
#             for XL in self.latencies:
#                 df[XL+'_avg'] = round(df[XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=self.latencies)
#         if drop_origin and rename_avg:
#             df = df.rename(columns=lambda name: name[:2] if '_avg' in name else name)
#         return df
    
#     def digraph_encode_keyboard(self):
#         df = self.digraph
#         keycode_dist = []
#         home_dist = []
#         for row in df.index:
#             keycode_dist.append(self.keyboard_dict['keycode'](df['K1'][row], df['K2'][row]))
#             home_dist.append(self.keyboard_dict['home']([df['K1'][row], df['K2'][row]]))
#         df['KD'] = keycode_dist
#         df['HD'] = home_dist
#         cols = list(df.columns[:-5]) + list(df.columns[-2:]) + list(df.columns[-5:-2])
#         df = df[cols]
#         return df
    
#     def unigraph_encode_keyboard(self):
#         df = self.unigraph
#         home_dist = []
#         for row in df.index:
#             home_dist.append(self.keyboard_dict['home']([df['KEYCODE'][row]]))
#         df['HD'] = home_dist
#         cols = list(df.columns[:-3]) + list(df.columns[-1:]) + list(df.columns[-3:-1])
#         df = df[cols]
#         return df
    
#     def filter_by_IQRs(self, data, folds, latencies):
#         df = data.copy()
#         for fold, latency in zip(folds, latencies):
#             for user in data['USER'].unique():
#                 mask_user = data['USER'] == user
#                 mask_non_user = data['USER'] != user
#                 Q3 = data.loc[mask_user, latency].quantile(.75)
#                 Q1 = data.loc[mask_user, latency].quantile(.25)
#                 IQR = Q3 - Q1
#                 max = Q3 + fold * IQR
#                 min = Q1 - fold * IQR
#                 mask_max = data[latency] <= max
#                 mask_min = data[latency] >= min
#                 df = df.loc[mask_user & mask_max & mask_min | mask_non_user]
#         return df
    
#     # def unigraph_filter_outliers(self, quantile_perc):
#     #     data = self.unigraph.copy()
#     #     return data.loc[(data['HL'] <= data['HL'].quantile(quantile_perc))
#     #                   & (data['PL'] <= data['PL'].quantile(quantile_perc))]
    
#     # def digraph_filter_outliers(self, quantile_perc):
#     #     data = self.digraph.copy()
#     #     return data.loc[(data['HL1'] <= data['HL1'].quantile(quantile_perc))
#     #                   & (data['HL2'] <= data['HL2'].quantile(quantile_perc))
#     #                   & (data['PL'] <= data['PL'].quantile(quantile_perc))]

In [None]:
## DEPRECATED: remove key_only (which is just the unigraph with a single output time latency)

# class Extractors:
#     def __init__(self, sub_data, keyboard_dict, avg_mode, add_layout, remove_outliers, conn_latency):
#         self.keyboard_dict = keyboard_dict
#         self.add_layout = add_layout
#         self.XL = conn_latency
#         self.outliers_perc = remove_outliers
#         self.avg_mode = avg_mode        ## takes value in ['mean', 'median', None/False]

#         self.unigraph = self.unigraph_extractor(sub_data)
#         self.digraph = self.digraph_extractor(sub_data)

#         if self.outliers_perc:
#             self.unigraph = self.unigraph_filter_outliers(self.outliers_perc)
#             self.digraph = self.digraph_filter_outliers(self.outliers_perc)

#         if avg_mode:
#             self.unigraph = self.unigraph_avg()
#             self.digraph = self.digraph_avg()
        
#         if add_layout:
#             self.unigraph = self.unigraph_encode_keyboard()
#             self.digraph = self.digraph_encode_keyboard()
        
#         self.key_only = self.key_extractor()
    
#     def unigraph_extractor(self, df, user_int=True, keycode_int=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_int:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         if keycode_int:
#             df['KEYCODE'] = df['KEYCODE'].astype('int64')
#         ## construct new features
#         df['HL'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#         if self.XL == 'IL':
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         elif self.XL == 'RL':
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         else:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         ## cleaning irrelevant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME'])
#         df = df.iloc[:-1, :]
#         return df
    
#     def digraph_extractor(self, df, user_int=True, keycode_int=True, drop_user=False):
#         df = df[['PARTICIPANT_ID', 'PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX']]
#         df = df.astype('float64')
#         if user_int:
#             df['PARTICIPANT_ID'] = df['PARTICIPANT_ID'].astype('int64')
#         df = df.rename(columns={'PARTICIPANT_ID': 'USER'})
#         if drop_user:
#             df = df.drop(columns=['USER'])
#         ## construct new features
#         df['K1'] = df['KEYCODE']
#         df['K2'] = pd.concat([df['KEYCODE'][1:], pd.Series([0])], ignore_index=True)
#         if keycode_int:
#             df['K1'] = df['K1'].astype('int64')
#             df['K2'] = df['K2'].astype('int64')
#         df['I1'] = df['INDEX']
#         df['I2'] = pd.concat([df['INDEX'][1:], pd.Series([0])], ignore_index=True)
#         df['HL1'] = df['RELEASE_TIME'] - df['PRESS_TIME']
#         df['HL2'] = pd.concat([df['HL1'][1:], pd.Series([0])], ignore_index=True)
#         if self.XL == 'IL':
#             df['IL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         elif self.XL == 'RL':
#             df['RL'] = pd.concat([df['RELEASE_TIME'][1:], pd.Series([0])], ignore_index=True) - df['RELEASE_TIME']
#         else:
#             df['PL'] = pd.concat([df['PRESS_TIME'][1:], pd.Series([0])], ignore_index=True) - df['PRESS_TIME']
#         ## cleaning irrelevant info
#         df = df.drop(columns=['PRESS_TIME', 'RELEASE_TIME', 'KEYCODE', 'INDEX'])
#         df = df.iloc[:-1, :]
#         return df
    
#     def key_extractor(self):
#         return self.unigraph.drop(columns=[self.XL])
    
#     ## https://towardsdatascience.com/do-you-use-apply-in-pandas-there-is-a-600x-faster-way-d2497facfa66
#     def digraph_avg(self, drop_origin=True, rename_avg=True, round_avg=True):
#         df = self.digraph
#         df['K1_K2'] = df[['K1', 'K2']].apply(tuple, axis=1)
#         df['HL1_avg'] = df['HL1']
#         df['HL2_avg'] = df['HL2']
#         df[self.XL+'_avg'] = df[self.XL]
#         for pair in df['K1_K2'].unique():
#             if self.avg_mode == 'mean':
#                 avg_df = df[df['K1_K2'] == pair][['HL1', self.XL, 'HL2']].mean()
#             else:
#                 avg_df = df[df['K1_K2'] == pair][['HL1', self.XL, 'HL2']].median()
#             mask = df['K1_K2'] == pair
#             df.loc[mask, 'HL1_avg'] = avg_df['HL1']
#             df.loc[mask, 'HL2_avg'] = avg_df['HL2']
#             df.loc[mask, self.XL+'_avg'] = avg_df[self.XL]
#         if round_avg:
#             df['HL1_avg'] = round(df['HL1_avg'])
#             df['HL2_avg'] = round(df['HL2_avg'])
#             df[self.XL+'_avg'] = round(df[self.XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=['HL1', 'HL2', self.XL, 'K1_K2'])
#         if drop_origin and rename_avg:
#             df = df.rename(columns={'HL1_avg':'HL1', 'HL2_avg':'HL2', self.XL+'_avg':self.XL})
#         return df

#     def unigraph_avg(self, drop_origin=True, rename_avg=True, round_avg=True):
#         df = self.unigraph
#         df['HL_avg'] = df['HL']
#         df[self.XL+'_avg'] = df[self.XL]
#         for keycode in df['KEYCODE'].unique():
#             if self.avg_mode == 'mean':
#                 avg_df = df[df['KEYCODE'] == keycode][['HL', self.XL]].mean()
#             else:
#                 avg_df = df[df['KEYCODE'] == keycode][['HL', self.XL]].median()
#             mask = df['KEYCODE'] == keycode
#             df.loc[mask, 'HL_avg'] = avg_df['HL']
#             df.loc[mask, self.XL+'_avg'] = avg_df[self.XL]
#         if round_avg:
#             df['HL_avg'] = round(df['HL_avg'])
#             df[self.XL+'_avg'] = round(df[self.XL+'_avg'])
#         if drop_origin:
#             df = df.drop(columns=['HL', self.XL])
#         if drop_origin and rename_avg:
#             df = df.rename(columns={'HL_avg':'HL', self.XL+'_avg':self.XL})
#         return df
    
#     def digraph_encode_keyboard(self):
#         df = self.digraph
#         keycode_dist = []
#         home_dist = []
#         for row in df.index:
#             keycode_dist.append(self.keyboard_dict['keycode'](df['K1'][row], df['K2'][row]))
#             home_dist.append(self.keyboard_dict['home']([df['K1'][row], df['K2'][row]]))
#         df['KD'] = keycode_dist
#         df['HD'] = home_dist
#         cols = list(df.columns[:-5]) + list(df.columns[-2:]) + list(df.columns[-5:-2])
#         df = df[cols]
#         return df
    
#     def unigraph_encode_keyboard(self):
#         df = self.unigraph
#         home_dist = []
#         for row in df.index:
#             home_dist.append(self.keyboard_dict['home']([df['KEYCODE'][row]]))
#         df['HD'] = home_dist
#         cols = list(df.columns[:-3]) + list(df.columns[-1:]) + list(df.columns[-3:-1])
#         df = df[cols]
#         return df
    
#     def unigraph_filter_outliers(self, quantile_perc):
#         data = self.unigraph
#         return data.loc[(data['HL'] <= data['HL'].quantile(quantile_perc))
#                       & (data['PL'] <= data['PL'].quantile(quantile_perc))]
    
#     def digraph_filter_outliers(self, quantile_perc):
#         data = self.digraph
#         return data.loc[(data['HL1'] <= data['HL1'].quantile(quantile_perc))
#                       & (data['HL2'] <= data['HL2'].quantile(quantile_perc))
#                       & (data['PL'] <= data['PL'].quantile(quantile_perc))]

# KDS - Sequential Input Generator

In [28]:
class KDS:
    """Sequential (sliding-window) input generator built on ``tf.data``.

    For every user found in ``df['USER']`` the keycode column(s) are one-hot
    encoded, joined column-wise with the numeric feature columns, and cut into
    windows of ``n_steps + 1`` consecutive rows. Windows never span two users
    because each user's rows are windowed separately before the per-user
    datasets are concatenated.

    Args:
        df: DataFrame holding a 'USER' column, the keycode column(s) named by
            ``enc_names`` ('KEYCODE', or 'K1' and 'K2'), and every column listed
            in ``nonkeycodeB_features`` and ``output_features``.
        n_steps: number of leading rows of each window used as the sequence input.
        shift: stride, in rows, between the starts of consecutive windows.
        batch_size: number of windows per batch.
        nonkeycodeB_features: list of feature columns (keycodes excluded) that
            are fed to the model together with the one-hot keycodes.
        output_features: non-empty list of target columns; they are placed last
            so they can be sliced off by position.
        encoders: fitted encoders exposing ``transform(...).toarray()``; either
            a single encoder, or one encoder for 'K1' and one for 'K2'.
        enc_names: names matching ``encoders``; ``enc_names[0]`` selects the
            'KEYCODE' or 'K1_K2' single-encoder mode.
        do_onehot: accepted for backward compatibility; not used.

    Attributes:
        ds_in: dataset yielding ``(inputA, inputB)`` batches.
        ds_out: dataset yielding the target batches.
        ds: ``ds_in`` zipped with ``ds_out``.
        inputA, inputB, output: shapes of the first batch of each tensor, or
            ``None`` when the dataset yields no batch at all.

    Raises:
        ValueError: if ``output_features`` is empty or ``df`` contains no user.
    """

    def __init__(self, df, 
                 n_steps, shift, batch_size, 
                 nonkeycodeB_features, output_features,
                 encoders, enc_names, do_onehot=True):
        # With zero output columns the positional slices in get_dataset become
        # `:-0` / `-0:`, which silently swaps "inputs" and "targets".
        if len(output_features) == 0:
            raise ValueError("KDS requires at least one output feature")

        self.df = df
        self.window_length = n_steps + 1
        self.n_steps = n_steps
        self.shift = shift
        self.batch = batch_size

        self.inputB_features = nonkeycodeB_features        ## features fed next to the one-hot keycodes (keycodes themselves excluded)
        self.output_features = output_features

        feature_cols = self.inputB_features + self.output_features
        per_user_in, per_user_out = [], []
        for user in self.df['USER'].unique():
            user_df = self.df.loc[self.df['USER'] == user, :]
            keycode_np = self._encode_keycodes(user_df, encoders, enc_names)
            ## column layout: [one-hot keycodes | inputB features | output features]
            user_np = np.concatenate([keycode_np, user_df[feature_cols]], axis=1)
            curr_in, curr_out = self.get_dataset(user_np)
            per_user_in.append(curr_in)
            per_user_out.append(curr_out)

        if not per_user_in:
            raise ValueError("KDS received a DataFrame without any user rows; no dataset can be built")

        self.ds_in, self.ds_out = per_user_in[0], per_user_out[0]
        for next_in, next_out in zip(per_user_in[1:], per_user_out[1:]):
            self.ds_in = self.ds_in.concatenate(next_in)
            self.ds_out = self.ds_out.concatenate(next_out)
        ## zip the inputs and output for easy access at training
        self.ds = tf.data.Dataset.zip((self.ds_in, self.ds_out))

        ## record the shapes of the first batch; stay None when there is no batch
        self.inputA = self.inputB = self.output = None
        for inputA, inputB in self.ds_in.take(1):
            self.inputA = inputA.shape
            self.inputB = inputB.shape

        for output in self.ds_out.take(1):
            self.output = output.shape

    @staticmethod
    def _encode_keycodes(user_df, encoders, enc_names):
        """Return the dense one-hot keycode matrix for one user's rows."""
        if len(encoders) == 1 and enc_names[0] == 'KEYCODE':
            return encoders[0].transform(user_df[['KEYCODE']].astype(str)).toarray()
        if len(encoders) == 1 and enc_names[0] == 'K1_K2':
            return encoders[0].transform(user_df[['K1', 'K2']].astype(str)).toarray()
        ## two encoders: the K1 and K2 one-hot matrices are added element-wise
        k1_onehot = encoders[0].transform(user_df[['K1']].astype(str)).toarray()
        k2_onehot = encoders[1].transform(user_df[['K2']].astype(str)).toarray()
        return k1_onehot + k2_onehot

    def get_dataset(self, df):
        """Window one user's 2-D array into batched input and output datasets.

        Args:
            df: 2-D array-like with one row per keystroke whose last
                ``len(self.output_features)`` columns are the targets.

        Returns:
            ``(ds_in, ds_out)``: ``ds_in`` yields ``(inputA, inputB)`` where
            inputA is the first ``n_steps`` rows of each window (all columns) and
            inputB is the last row without the target columns; ``ds_out`` yields
            the target columns of the last row. Incomplete trailing windows are
            dropped (``drop_remainder=True``).
        """
        n_out = len(self.output_features)
        windows = tf.data.Dataset.from_tensor_slices(df).window(size=self.window_length, shift=self.shift, drop_remainder=True)
        dataset = windows.flat_map(lambda window: window.batch(self.window_length)).batch(self.batch)
        ds_in = dataset.map(lambda window: (window[:, :self.n_steps, :], window[:, -1, :-n_out]))
        ds_in = ds_in.prefetch(tf.data.AUTOTUNE)
        ds_out = dataset.map(lambda window: window[:, -1, -n_out:])
        ds_out = ds_out.prefetch(tf.data.AUTOTUNE)
        return ds_in, ds_out

In [29]:
# class KDS:
#     def __init__(self, df, output_dim, n_steps, shift, batch_size, encoders, enc_names, do_onehot=True):
#         self.df = df
#         self.window_length = n_steps + 1
#         self.n_steps = n_steps
#         self.shift = shift
#         self.batch = batch_size
#         self.output_dim = output_dim
        
#         for i, user in enumerate(self.df['USER'].unique()):
#             mask = self.df['USER'] == user
#             curr_df = self.df.loc[mask, :].drop(columns=['USER'])
#             ## One-hot on 'KEYCODE'
#             if len(encoders) == 1 and enc_names[0] == 'KEYCODE':
#                 curr_df = np.concatenate([encoders[0].transform(curr_df[['KEYCODE']].astype(str)).toarray(), curr_df.drop(columns=['KEYCODE'])], axis=1)
#             elif len(encoders) == 1 and enc_names[0] == 'K1_K2':
#                 curr_df = np.concatenate([encoders[0].transform(curr_df[['K1', 'K2']].astype(str)).toarray(), curr_df.drop(columns=['K1', 'K2'])], axis=1)
#             else:
#                 k1_onehot = encoders[0].transform(curr_df[['K1']].astype(str)).toarray()
#                 k2_onehot = encoders[1].transform(curr_df[['K2']].astype(str)).toarray()
#                 curr_df = np.concatenate([k1_onehot+k2_onehot, curr_df.drop(columns=['K1', 'K2'])], axis=1)
#             ## get the TFDS dataset of inputs (inputA, inputB) and output
#             curr_in, curr_out = self.get_dataset(curr_df)
#             if i == 0:
#                 self.ds_in = curr_in
#                 self.ds_out = curr_out
#             else:
#                 self.ds_in = self.ds_in.concatenate(curr_in)
#                 self.ds_out = self.ds_out.concatenate(curr_out)
#         ## zip the TFDS inputs and output for easy access at training
#         self.ds = tf.data.Dataset.zip((self.ds_in, self.ds_out))
        
#         for inputA, inputB in self.ds_in.take(1):
#             self.inputA = inputA.shape
#             self.inputB = inputB.shape
        
#         for output in self.ds_out.take(1):
#             self.output = output.shape

#     def get_dataset(self, df):
#         dataset = tf.data.Dataset.from_tensor_slices(df).window(size=self.window_length, shift=self.shift, drop_remainder=True)
#         dataset = dataset.flat_map(lambda window: window.batch(self.window_length)).batch(self.batch)
#         ds_in = dataset.map(lambda window: (window[:, :self.n_steps, :], window[:, -1, :-self.output_dim]))
#         ds_in = ds_in.prefetch(tf.data.AUTOTUNE)
#         ds_out = dataset.map(lambda window: window[:, -1, -self.output_dim:])
#         ds_out = ds_out.prefetch(tf.data.AUTOTUNE)
#         return ds_in, ds_out

In [30]:
## DEPRECATED: superseded by the KDS class above, which renamed the required arguments and updated the one-hot encoding

# class KDS:
#     def __init__(self, df, n_steps, shift, batch_size, encoder, mode='uni', do_onehot=True):
#         self.df = df
#         self.window_length = n_steps + 1
#         self.n_steps = n_steps
#         self.shift = shift
#         self.batch = batch_size
        
#         for i, user in enumerate(self.df['USER'].unique()):
#             mask = self.df['USER'] == user
#             curr_df = self.df.loc[mask, :].drop(columns=['USER'])
#             ## One-hot on 'KEYCODE' (mode=='uni') OR 'K1', 'K2' (mode!='uni')
#             if mode == 'uni':
#                 curr_df = np.concatenate([encoder.transform(curr_df[['KEYCODE']].astype(str)).toarray(), curr_df.drop(columns=['KEYCODE'])], axis=1)
#             else:
#                 curr_df = np.concatenate([encoder.transform(curr_df[['K1', 'K2']].astype(str)).toarray(), curr_df.drop(columns=['K1', 'K2'])], axis=1)
#             curr_in, curr_out = self.get_dataset(curr_df)
#             if i == 0:
#                 self.ds_in = curr_in
#                 self.ds_out = curr_out
#             else:
#                 self.ds_in = self.ds_in.concatenate(curr_in)
#                 self.ds_out = self.ds_out.concatenate(curr_out)
#         self.ds = tf.data.Dataset.zip((self.ds_in, self.ds_out))
        
#         for inputA, inputB in self.ds_in.take(1):
#             self.inputA = inputA.shape
#             self.inputB = inputB.shape
        
#         for output in self.ds_out.take(1):
#             self.output = output.shape

#     def get_dataset(self, df):
#         dataset = tf.data.Dataset.from_tensor_slices(df).window(size=self.window_length, shift=self.shift, drop_remainder=True)
#         dataset = dataset.flat_map(lambda window: window.batch(self.window_length)).batch(self.batch)
#         ds_in = dataset.map(lambda window: (window[:, :self.n_steps, :], window[:, -1, :-2]))
#         ds_in = ds_in.prefetch(tf.data.AUTOTUNE)
#         ds_out = dataset.map(lambda window: window[:, -1, -2:])
#         ds_out = ds_out.prefetch(tf.data.AUTOTUNE)
#         return ds_in, ds_out

# KDI - Image-like Input Generator

In [None]:
class KDI:
    """Image-like input generator for digraph keystroke data.

    Every sample comes from a window of consecutive rows of a single user.
    All rows but the last are rendered as a ``(mat_length, mat_length,
    len(inputA_features))`` array indexed by the (K1, K2) keycode positions
    (inputA); the last row supplies inputB and the values of
    ``output_features`` (the target).

    Args:
        train_data: DataFrame with a 'KEYCODE' column; used only to rank the
            most frequent keycodes.
        df: DataFrame with 'USER', 'K1', 'K2' and every referenced feature column.
        n_steps: number of rows rendered into inputA; a full window has
            ``n_steps + 1`` rows.
        shift: stride, in rows, between the starts of consecutive windows.
        batch_size: batch size of the resulting ``tf.data`` dataset.
        mat_length: side length of the keycode matrix.
        inputA_features: feature columns rendered as inputA channels.
        inputB_features: feature columns taken from the last row for inputB.
        output_features: target columns read from the last row.
        inputB_type: 'label', 'onehot', or anything else for 'image' (default).
        encoders: fitted encoders for the 'onehot' mode; one encoder for
            ['K1', 'K2'] together, or two encoders (K1 then K2).
        keep_smaller_window: also emit one trailing window shorter than
            ``n_steps + 1`` rows (at least 2 rows) per user.
        add_UNK: reserve the last matrix position for keycodes that are not
            among the most frequent ones.
    """

    def __init__(self, train_data, df, 
                 n_steps, shift, batch_size, mat_length, 
                 inputA_features, inputB_features, output_features, 
                 inputB_type='image', encoders=None, keep_smaller_window=False, add_UNK=True):
        self.train_data = train_data
        self.df = df
        self.n_steps = n_steps
        self.shift = shift
        self.batch_size = batch_size
        self.mat_length = mat_length

        self.inputA_features = inputA_features
        self.inputB_features = inputB_features
        self.inputB_type = inputB_type                ## 'label', 'onehot', or 'image'; anything else defaults to 'image'
        self.output_features = output_features        ## output_features ('HL', 'XL' in ['PL', 'RL', 'IL'])

        self.encoders = encoders
        self.keep_smaller_window = keep_smaller_window
        self.add_UNK = add_UNK

        ## mat_length - 1 real keycodes, leaving one position for the UNK entry
        self.keycode_dict = self.keycode_topfreq_dict(top=self.mat_length-1)

        self.inputA, self.inputB, self.output = self.kdi_training_data()
        self.ds = self.generate_kdi()


    def keycode_topfreq_dict(self, top):
        '''
        Map the `top` most frequent keycodes of the training data to matrix positions.

        Positions follow descending frequency. When `add_UNK` is set, key 0 is
        mapped to the next free position and serves as the "unknown keycode" slot.
        '''
        top_counts = self.train_data['KEYCODE'].astype('int32').value_counts().iloc[:top]
        keycode_dict = {keycode: pos for pos, keycode in enumerate(top_counts.to_dict())}
        if self.add_UNK:
            keycode_dict[0] = len(keycode_dict)
        return keycode_dict
  

    def single_input_image(self, curr_chunk, features, mat_length, keycode_dict):
        '''
        Helper function to generate a single image (with # of color channels == # of features).

        Each row adds its feature values to cell (position of K1, position of K2);
        cells hit more than once are averaged. Keycodes missing from
        `keycode_dict` fall back to `keycode_dict[0]` (the UNK slot), which
        raises KeyError if that entry does not exist.

        Returns an array of shape (mat_length, mat_length, len(features)).
        '''
        mat_dict = {'mat_'+feature: np.zeros((mat_length, mat_length)) for feature in features}
        count = np.zeros((mat_length, mat_length))

        for row in curr_chunk.index:
            i = int(curr_chunk.loc[row, 'K1'])
            j = int(curr_chunk.loc[row, 'K2'])
            pos_i = keycode_dict[i] if i in keycode_dict else keycode_dict[0]   ## UNK slot (the last key-value pair)
            pos_j = keycode_dict[j] if j in keycode_dict else keycode_dict[0]
            for feature in features:
                if feature != 'HL':
                    mat_dict['mat_'+feature][pos_i, pos_j] += curr_chunk.loc[row, feature]
                else:
                    ## NOTE(review): the 'HL' channel accumulates the mean of the two
                    ## keycodes, not the row's 'HL' value -- confirm this is intended.
                    mat_dict['mat_'+feature][pos_i, pos_j] += (i + j) / 2
            count[pos_i, pos_j] += 1

        ## turn the accumulated sums into means; untouched cells stay 0
        mask_nonzero = count != 0
        mat_ls = []
        for feature in features:
            mat = mat_dict['mat_'+feature]
            mat[mask_nonzero] = mat[mask_nonzero] / count[mask_nonzero]
            mat_ls.append(mat)
        return np.stack(mat_ls, axis=-1)
    

    def single_kdi_input(self, curr_chunk):
        '''
        Generates the group of inputA, inputB, and Output of the current chunk.

        inputA is the image of all rows but the last. inputB depends on
        `inputB_type`:
          - 'label':  1-D array of the last row's inputB features followed by K1, K2
          - 'onehot': 1-D array of the one-hot keycodes followed by the inputB features
          - otherwise an image of the last row alone
        Output is a 1-D array with the last row's output feature values.
        '''
        last_index = curr_chunk.index[-1]
        output_np = np.array([curr_chunk.loc[last_index, feature] for feature in self.output_features])
        ## inputA 
        inputA = self.single_input_image(curr_chunk.iloc[:-1], self.inputA_features, self.mat_length, self.keycode_dict)
        ## inputB
        if self.inputB_type == 'label':
            inputB = np.array(curr_chunk.loc[last_index, self.inputB_features + ['K1', 'K2']])
        elif self.inputB_type == 'onehot' and self.encoders:
            if len(self.encoders) == 1:
                inputB_keycode = self.encoders[0].transform(curr_chunk.loc[[last_index], ['K1', 'K2']].astype(str)).toarray()
            else:
                inputB_k1 = self.encoders[0].transform(curr_chunk.loc[[last_index], ['K1']].astype(str)).toarray()
                inputB_k2 = self.encoders[1].transform(curr_chunk.loc[[last_index], ['K2']].astype(str)).toarray()
                inputB_keycode = inputB_k1 + inputB_k2
            ## The encoder output is 2-D (1, n) while the feature row is 1-D, so
            ## concatenating on axis=1 always failed. Flatten the one-hot row and
            ## join both into one 1-D vector, the same rank as the 'label' branch.
            inputB_extra = np.asarray(curr_chunk.loc[last_index, self.inputB_features], dtype=float)
            inputB = np.concatenate([np.asarray(inputB_keycode).ravel(), inputB_extra])
        else:
            inputB = self.single_input_image(curr_chunk.iloc[-1:], self.inputB_features, self.mat_length, self.keycode_dict)
        return inputA, inputB, output_np
    

    def kdi_training_data(self):
        '''
        Generates numpy arrays of inputA=(total_images, mat_length, mat_length, # of features), inputB, output.

        Windows are taken per user, so no window mixes rows of two users.
        Raises ValueError when no user has enough rows for a single window.
        '''
        window_length = self.n_steps + 1
        inputA_arr, inputB_arr, output_arr = [], [], []

        def collect(curr_chunk):
            curr_inputA, curr_inputB, curr_output = self.single_kdi_input(curr_chunk)
            inputA_arr.append(curr_inputA)
            inputB_arr.append(curr_inputB)
            output_arr.append(curr_output)

        for user in self.df['USER'].unique():
            curr_df = self.df[self.df['USER'] == user]
            i = 0
            ## `<=` so the window ending exactly on the user's last row is kept
            while i+window_length <= len(curr_df):
                collect(curr_df.iloc[i:i+window_length])
                i += self.shift
            if self.keep_smaller_window and i < len(curr_df) - 1:    ## a 1-row remainder cannot be split into input and output data
                collect(curr_df.iloc[i:])

        if not inputA_arr:
            raise ValueError("no user has enough rows to build a window of n_steps + 1 rows")
        return np.stack(inputA_arr, axis=0), np.stack(inputB_arr, axis=0), np.stack(output_arr, axis=0)
    

    def generate_kdi(self):
        '''
        Prepared tf.data object for training (batched).

        Elements are ({'inputA': ..., 'inputB': ...}, output) tuples.
        '''
        dataset = tf.data.Dataset.from_tensor_slices(({'inputA': self.inputA, 'inputB': self.inputB}, 
                                                      self.output)).batch(self.batch_size)
        return dataset

In [31]:
## DEPRECATED: superseded by the KDI class above, which makes 'image' the default inputB_type (i.e. the `else` branch of the if-elif-else statement)
## and adds docstrings

# class KDI:
#     def __init__(self, train_data, df, 
#                  n_steps, shift, batch_size, mat_length, 
#                  inputA_features, inputB_features, output_features, 
#                  inputB_type='image', encoders=None, keep_smaller_window=False, add_UNK=True):
#         self.train_data = train_data
#         self.df = df
#         self.n_steps = n_steps
#         self.shift = shift
#         self.batch_size = batch_size
#         self.mat_length = mat_length

#         self.inputA_features = inputA_features
#         self.inputB_features = inputB_features
#         self.inputB_type = inputB_type                ## 'image', or 'onehot', else default to 'int'
#         self.output_features = output_features        ## output_features ('HL', 'XL' in ['PL', 'RL', 'IL'])

#         self.encoders = encoders
#         self.keep_smaller_window = keep_smaller_window
#         self.add_UNK = add_UNK

#         self.keycode_dict = self.keycode_topfreq_dict(top=self.mat_length-1)

#         self.inputA, self.inputB, self.output = self.kdi_training_data()
#         self.ds = self.generate_kdi()


#     def keycode_topfreq_dict(self, top):
#         '''
#         generate dictionary for the most popular `top` many keycodes using training data
#         '''
#         keycode_dict = {keycode: i for i, keycode in enumerate(self.train_data['KEYCODE'].astype('int32').value_counts()[:top].to_dict().keys())}
#         if self.add_UNK:
#             keycode_dict[0] = len(keycode_dict)
#         return keycode_dict
  

#     def single_input_image(self, curr_chunk, features, mat_length, keycode_dict):
#         mat_dict = {}
#         for feature in features:
#             mat_dict['mat_'+feature] = np.zeros((mat_length, mat_length))
#         mat_dict['count'] = np.zeros((mat_length, mat_length))

#         for row in curr_chunk.index:
#             i = int(curr_chunk.loc[row, 'K1'])
#             j = int(curr_chunk.loc[row, 'K2'])
#             if i in keycode_dict:
#                 pos_i = keycode_dict[i]
#             else:
#                 pos_i = keycode_dict[0]   ## pos_i = top (the last key-value pair)
#             if j in keycode_dict:
#                 pos_j = keycode_dict[j]
#             else:
#                 pos_j = keycode_dict[0]
#             for feature in features:
#                 if feature != 'HL':
#                     mat_dict['mat_'+feature][pos_i, pos_j] += curr_chunk.loc[row, feature]
#                 else:
#                     mat_dict['mat_'+feature][pos_i, pos_j] += (i + j) / 2
#             mat_dict['count'][pos_i, pos_j] += 1
#         mask_nonzero = mat_dict['count'] != 0
#         mat_ls = []
#         for feature in features:
#             mat_dict['mat_'+feature][mask_nonzero] = mat_dict['mat_'+feature][mask_nonzero] / mat_dict['count'][mask_nonzero]
#             mat_ls.append(mat_dict['mat_'+feature])
#         return np.stack(mat_ls, axis=-1)
    

#     def single_kdi_input(self, curr_chunk):
#         last_index = curr_chunk.index[-1]
#         output_ls = []
#         for feature in self.output_features:
#             output_ls.append(curr_chunk.loc[last_index, feature])
#         output_np = np.array(output_ls)
#         ## inputA 
#         inputA = self.single_input_image(curr_chunk.iloc[:-1], self.inputA_features, self.mat_length, self.keycode_dict)
#         ## inputB
#         if self.inputB_type == 'image':
#             inputB = self.single_input_image(curr_chunk.iloc[-1:], self.inputB_features, self.mat_length, self.keycode_dict)
#         elif self.inputB_type == 'onehot' and self.encoders:
#             if len(self.encoders) == 1:
#                 inputB_keycode = self.encoders[0].transform(curr_chunk.loc[[last_index], ['K1', 'K2']].astype(str)).toarray()
#             else:
#                 inputB_k1 = self.encoders[0].transform(curr_chunk.loc[[last_index], ['K1']].astype(str)).toarray()
#                 inputB_k2 = self.encoders[1].transform(curr_chunk.loc[[last_index], ['K2']].astype(str)).toarray()
#                 inputB_keycode = inputB_k1 + inputB_k2
#             inputB = np.concatenate([inputB_keycode, np.array(curr_chunk.loc[last_index, self.inputB_features])], axis=1)
#         else:
#             inputB = np.array(curr_chunk.loc[last_index, self.inputB_features + ['K1', 'K2']])
#         return inputA, inputB, output_np
    

#     def kdi_training_data(self):
#         window_length = self.n_steps + 1
#         inputA_arr, inputB_arr, output_arr = [], [], []
#         for user in self.df['USER'].unique():
#             curr_df = self.df[self.df['USER'] == user]
#             i = 0
#             while i+window_length < len(curr_df):
#                 curr_chunk = curr_df.iloc[i:i+window_length]
#                 curr_inputA, curr_inputB, curr_output = self.single_kdi_input(curr_chunk)
#                 inputA_arr.append(curr_inputA)
#                 inputB_arr.append(curr_inputB)
#                 output_arr.append(curr_output)
#                 i += self.shift
#             if self.keep_smaller_window and i < len(curr_df) - 1:    ## i cannot be curr_df[-1:] of length 1, since impossible to split into input and output data
#                 curr_chunk = curr_df.iloc[i:]
#                 curr_inputA, curr_inputB, curr_output = self.single_kdi_input(curr_chunk)
#                 inputA_arr.append(curr_inputA)
#                 inputB_arr.append(curr_inputB)
#                 output_arr.append(curr_output)
#         return np.stack(inputA_arr, axis=0), np.stack(inputB_arr, axis=0), np.stack(output_arr, axis=0)
    

#     def generate_kdi(self):
#         dataset = tf.data.Dataset.from_tensor_slices(({'inputA': self.inputA, 'inputB': self.inputB}, 
#                                                       self.output)).batch(self.batch_size)
#         return dataset

# Callbacks

In [32]:
## functionalize callbacks
def create_checkpoint_callback(experiment_name,
                               avg_mode,
                               save_weights_only=True,
                               monitor='val_loss',
                               mode='min',
                               save_best_only=True):
    """Build a ModelCheckpoint callback writing to a timestamped path.

    The path is ``<experiments>/checkpoints/<experiment_name>/<YYYYmmdd-HHMMSS>``
    with the timestamp taken in the America/Chicago zone; ``-avg`` is appended
    when ``avg_mode`` is truthy. The remaining arguments are forwarded to
    ``tf.keras.callbacks.ModelCheckpoint`` unchanged.

    Returns:
        The configured ModelCheckpoint callback.
    """
    root_dir = '/content/drive/MyDrive/COMP576/experiments'
    stamp = datetime.datetime.now(timezone('America/Chicago')).strftime("%Y%m%d-%H%M%S")
    checkpoint_filepath = "/".join([root_dir, "checkpoints", experiment_name, stamp])
    if avg_mode:
        checkpoint_filepath += '-avg'

    checkpoint_options = dict(filepath=checkpoint_filepath,
                              save_weights_only=save_weights_only,
                              monitor=monitor,
                              mode=mode,
                              save_best_only=save_best_only)
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)
    print(f"Saving ModelCheckpoint files to :{checkpoint_filepath}")
    return checkpoint_callback

def create_lr_scheduler(max_cap):
    """Build a LearningRateScheduler that sweeps the rate upwards per epoch.

    Within each group of three epochs the rate takes the multipliers 1, 4, 7 of
    a power of ten, and the power grows by one every three epochs (epoch 1 gives
    1e-4, epoch 4 gives 1e-3). The value is rounded to 7 decimals.

    Args:
        max_cap: upper bound for the returned rate; a falsy value disables the cap.
    """
    def lr_finder(epoch):
        group, step = divmod(epoch - 1, 3)
        lr = round(0.1 ** (4 - group) * (1 + step * 3), 7)
        return max_cap if (max_cap and lr > max_cap) else lr

    return tf.keras.callbacks.LearningRateScheduler(lr_finder)

def create_tensorboard_callback(experiment_name, avg_mode):
    """Build a TensorBoard callback logging to a timestamped directory.

    The directory is ``<experiments>/tensorboard/<experiment_name>/<YYYYmmdd-HHMMSS>``
    with the timestamp taken in the America/Chicago zone; ``-avg`` is appended
    when ``avg_mode`` is truthy.
    """
    root_dir = '/content/drive/MyDrive/COMP576/experiments'
    stamp = datetime.datetime.now(timezone('America/Chicago')).strftime("%Y%m%d-%H%M%S")
    log_dir = "/".join([root_dir, "tensorboard", experiment_name, stamp])
    if avg_mode:
        log_dir += '-avg'

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    print(f"Saving TensorBoard log files to :{log_dir}")
    return tensorboard_callback

def create_earlystopping_callback(patience, monitor='val_loss'):
    """Return an EarlyStopping callback watching `monitor` with the given `patience`."""
    stopper = tf.keras.callbacks.EarlyStopping(patience=patience, monitor=monitor)
    return stopper

def get_callbacks(experiment_name, patience, avg_mode):
    """Assemble the training callbacks: early stopping, model checkpoint, TensorBoard.

    The list order (and therefore the order of the printed path messages) is
    early stopping first, then checkpoint, then TensorBoard.
    """
    return [
        create_earlystopping_callback(patience),
        create_checkpoint_callback(experiment_name=experiment_name, avg_mode=avg_mode),
        create_tensorboard_callback(experiment_name=experiment_name, avg_mode=avg_mode),
    ]

# Baseline Model

In [33]:
def typenet_base(inputA):
    """Stack BatchNorm -> LSTM(128, sequences) -> Dropout(0.5) -> BatchNorm -> LSTM(128).

    Returns:
        ``(name, tensor)``: the block name 'TypeNet-base' and the output of the
        final LSTM layer.
    """
    x = keras.layers.BatchNormalization()(inputA)
    x = keras.layers.LSTM(128, return_sequences=True)(x)
    x = keras.layers.Dropout(0.5)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.LSTM(128)(x)
    return 'TypeNet-base', x

def typenet_gru_base(inputA, gru_units):
    """GRU variant of the TypeNet block with `gru_units` units per recurrent layer.

    Layers: BatchNorm -> GRU(sequences) -> Dropout(0.5) -> BatchNorm -> GRU.

    Returns:
        ``(name, tensor)``: the block name and the output of the final GRU layer.
    """
    # NOTE(review): the name is identical to the LSTM variant's ('TypeNet-base');
    # confirm whether the GRU block was meant to carry a distinct name.
    x = keras.layers.BatchNormalization()(inputA)
    x = keras.layers.GRU(gru_units, return_sequences=True)(x)
    x = keras.layers.Dropout(0.5)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.GRU(gru_units)(x)
    return 'TypeNet-base', x

def concate_RNN_base(concat, feature_dim, output_dim, gru_units, gru_units_2):
    """Run a GRU over the concatenated embedding treated as a length-N sequence.

    The input is reshaped to ``(gru_units + feature_dim - output_dim, 1)`` and
    fed to a GRU with `gru_units_2` units.

    Returns:
        ``(name, tensor)``: the block name 'ConcatRNN-base' and the GRU output.
    """
    seq_len = gru_units+feature_dim-output_dim
    as_sequence = keras.layers.Reshape((seq_len, 1))(concat)
    encoded = keras.layers.GRU(gru_units_2)(as_sequence)
    return 'ConcatRNN-base', encoded

def create_model(feature_dim, output_dim, user_embedding=None, keycode_embedding=None, concat_model=None):
    """Assemble the two-input Keras model from optional sub-blocks.

    Args:
        feature_dim: number of features per time step of InputA.
        output_dim: number of predicted values; InputB has
            ``feature_dim - output_dim`` features.
        user_embedding: optional callable ``f(inputA) -> (name, tensor)``.
        keycode_embedding: optional callable ``f(inputB) -> (name, tensor)``.
        concat_model: optional callable
            ``f(concat, feature_dim, output_dim) -> (name, tensor)``.

    Each omitted block is skipped and its input is passed through unchanged.

    Returns:
        A ``keras.Model`` named ``<user>_<keycode>_<concat>`` after the block names.
    """
    inputA = keras.layers.Input(shape=[None, feature_dim], name='InputA')
    inputB = keras.layers.Input(shape=[feature_dim-output_dim], name='InputB')

    user_name, user_embeded = user_embedding(inputA) if user_embedding else ('no-user', inputA)
    keycode_name, keycode_embeded = keycode_embedding(inputB) if keycode_embedding else ('no-keycode', inputB)
    concat = keras.layers.concatenate([user_embeded, keycode_embeded])

    concat_name, concat_output = (concat_model(concat, feature_dim, output_dim)
                                  if concat_model else ('no-concat', concat))
    output = keras.layers.Dense(output_dim)(concat_output)

    model_name = '_'.join([user_name, keycode_name, concat_name])
    return keras.Model(inputs=[inputA, inputB], outputs=[output], name=model_name)

# Miscellaneous Code

In [34]:
def lr_vs_loss(history):
    """Plot validation loss against learning rate on a logarithmic x-axis.

    Args:
        history: object whose ``history`` attribute is a dict of per-epoch logs
            (as returned by ``model.fit``) containing 'val_loss' and the
            learning rate recorded by a LearningRateScheduler.

    Raises:
        KeyError: if the logs hold neither a learning-rate entry nor 'val_loss'.
    """
    logs = history.history
    # tf.keras 2.x records the rate as 'lr'; Keras 3 logs it as 'learning_rate'.
    lr_key = 'lr' if 'lr' in logs else 'learning_rate'
    lrs = logs[lr_key]
    loss = logs['val_loss']
    plt.semilogx(lrs, loss)

def preprocess_message(df, df_name):
    """Print how many distinct participants and keystroke rows `df` contains.

    Args:
        df: DataFrame with a 'PARTICIPANT_ID' column, one row per keystroke sample.
        df_name: label of the dataset used in the message.
    """
    n_users = len(df['PARTICIPANT_ID'].unique())
    n_samples = len(df)
    message = f"In total {n_users} users in the {df_name} dataset, with {n_samples} keystroke samples"
    print(message)

# IMPORT CHECK

In [35]:
# Sentinel message: seeing it in the output shows that every cell above ran to completion
# (presumably checked when this notebook is executed from another notebook -- TODO confirm).
print("\n\nHello from KDR_Preprocessing!")



Hello from KDR_Preprocessing!
