# Hyperparameters Tunning

In [1]:
import pandas as pd
import numpy as np
import sklearn
import altair as alt
import matplotlib.pyplot as plt
import os
import tabulate
import seaborn as sns

sns.set()

In [2]:
alt.data_transformers.enable('default', max_rows=None) 
# alt.renderers.enable('default')
# alt.renderers.enable('notebook')

DataTransformerRegistry.enable('default')

In [3]:
# Save temp altair json files in separate folder
os.makedirs('tmp/altdata', exist_ok=True)

def custom(data):
    return alt.pipe(data, alt.to_json(filename='tmp/altdata/{prefix}-{hash}.{extension}') )

alt.data_transformers.register('custom', custom)
alt.data_transformers.enable('custom')

DataTransformerRegistry.enable('custom')

In [68]:
def concat_datasets(datasets):
    return pd.concat(
        (dataset[[
            'UserName', 
            'AccessKey', 
            'Topic', 
            'Task',
            'ReviewText', 
            'ReviewMeta'
        ]] for dataset in datasets),
        ignore_index=True)

def read_dataset(files = ('data/ReviewAMT_500_t.csv', 'data/GayMarriage_400.csv', 'data/GunControl_400.csv')):
    df_atm = pd.read_csv(files[0], sep='\t')
    df_gay = pd.read_csv(files[1], sep='\t')
    df_gun = pd.read_csv(files[2], sep='\t')

    df_atm.rename(columns = {'ReviewTopic': 'Topic'}, inplace=True)
    df_atm['Task'] = df_atm['Task'].map(
        {
            "Fake Review": 'fake', 
            "True Review": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        })
    df_gay['Task'] = df_gay['Task'].map(
        {
            "Fake Essay": 'fake', 
            "True Essay": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        }) 
    df_gun['Task'] = df_gun['Task'].map(
        {
            "Fake Essay": 'fake', 
            "True Essay": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        }) 
    return concat_datasets((df_atm, df_gay, df_gun))

df = read_dataset()

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserName    5200 non-null   object
 1   AccessKey   5200 non-null   object
 2   Topic       5200 non-null   object
 3   Task        5200 non-null   object
 4   ReviewText  5200 non-null   object
 5   ReviewMeta  5200 non-null   object
dtypes: object(6)
memory usage: 243.9+ KB


In [70]:
df.head(3)

Unnamed: 0,UserName,AccessKey,Topic,Task,ReviewText,ReviewMeta
0,A002160837SWJFPIAI7L7,392aa3e372054948a5cabd637b2e239b,AMT,copy_1,Famous Daves is a good place to go for some go...,0 MouseUp 0 0;535 KeyDown 16;776 KeyDown 70;79...
1,A002160837SWJFPIAI7L7,392aa3e372054948a5cabd637b2e239b,AMT,copy_2,The Original Shrimp Place is a good place to c...,0 MouseUp 0 0;491 KeyDown 16;778 KeyDown 84;82...
2,A002160837SWJFPIAI7L7,392aa3e372054948a5cabd637b2e239b,AMT,fake,The Original Shrimp Place is a good place to c...,0 MouseUp 0 0;849 KeyDown 16;966 KeyDown 84;10...


In [71]:
def read_group_sessions(name='data/keystroke_sessions-2020-03-20.csv'):
    _a = pd.read_csv(name)
    # _a.rename(columns={"AccessKey": "UserName", }
    _a['UserName'] = _a['AccessKey']
    _a.drop(columns=['attempt_id', 'ReviewDate'], inplace=True)
    _a['Task'] = ''
    _a['Topic'] = 'LA'
    attempts = {}
    for i, row in _a.iterrows():
        key = row['UserName']
        if key not in attempts: attempts[key] = 0
        attempts[key] += 1
        _a.loc[i, 'Task'] = f"copy_{attempts[key]}"

    return _a

def add_text_length_column(df):
    df['TextLenght'] = df.apply(lambda x: len(x['ReviewText'].split()), axis = 1)

df_sessions = read_group_sessions()
df_sessions.head(3)

Unnamed: 0,ReviewText,ReviewMeta,AccessKey,UserName,Task,Topic
0,Linear algebra is central to almost all areas ...,1582990688239 KeyDown 16;1582990688600 KeyDown...,9d891be4-e43e-49f9-88bb-25314e670850,9d891be4-e43e-49f9-88bb-25314e670850,copy_1,LA
1,Linear algebra is central to almost all areas ...,1582996141506 KeyDown 16;1582996141595 KeyDown...,9d891be4-e43e-49f9-88bb-25314e670850,9d891be4-e43e-49f9-88bb-25314e670850,copy_2,LA
2,Linear algebra is central to almost all areas ...,1582996489340 KeyDown 16;1582996489541 KeyDown...,97fc22cf-aa61-4dcf-91a8-97d8fbf4a983,97fc22cf-aa61-4dcf-91a8-97d8fbf4a983,copy_1,LA


In [48]:
# df = concat_datasets((df, df_sessions))
# add_text_length_column(df)
# df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5231 entries, 0 to 5230
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserName    5231 non-null   object
 1   AccessKey   5231 non-null   object
 2   Topic       5231 non-null   object
 3   Task        5231 non-null   object
 4   ReviewText  5231 non-null   object
 5   ReviewMeta  5231 non-null   object
 6   TextLenght  5231 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 286.2+ KB


In [49]:
def process_keystroke(line):
    res = line.split(' ')
    if len(res) > 3:
        return f'UNKNOWN: {line}'
    
    time, command, key = res
    return (time, command, key, chr(int(key)))

def code_to_str(keycode):
    keycode = int(keycode)
    mappings = {
        16: 'shift',
        8: 'backspace',
        32: 'space',
        188: 'comma',
        190: 'dot'
    }
    return mappings.get(keycode, chr(keycode))

In [50]:
from collections import deque 

def process_keystrokes(inputs, humanize=False, distinguish_shift=True):      
    MOUSE_UP = 'MouseUp'
    KEY_DOWN = 'KeyDown'
    KEY_UP = 'KeyUp'
    TRANSITION_2 = 'transition_2'
    dwells = {}
    transitions_1 = {}
    transitions_2 = {}
    key_downs = {}
    keys_queue = deque([])
    last_key_up = None
    
    def record_key(code1, code2, value, collection):
        key = ""
        if collection is transitions_2:
            key = key + "-"
        if (16 in key_downs or last_key_up == 16) and code1 != 16 and distinguish_shift:
            key = key + "[shift]"
        if code2:
            key = key + f"{code1}_{code2}"
        else:
            key = key + f"{code1}"

        if key not in collection: 
            collection[key] = []
        collection[key].append(value)

    for keystroke in inputs.split(';'):
        res = keystroke.split(' ')
        
        if len(res) < 3: continue  
        if res[1] == 'MouseUp': continue

        time, command, code = res
        time = int(time)
        code = int(code)
        
        if command == KEY_DOWN:
            if keys_queue:
                prev_code, prev_time_down, prev_time_up = keys_queue[0]

                if prev_time_up: 
                    record_key(prev_code, code, time - prev_time_up, transitions_2)

                record_key(prev_code, code, time - prev_time_down, transitions_1)
            
            key_downs[code] = time
            keys_queue.appendleft([code, time, None])
            
        if command == KEY_UP:
            following_key = None
            for i_key in keys_queue:
                if i_key[0] == code:
                    i_key[2] = time
                    record_key(code, None, time - i_key[1], dwells)
                    
                    if following_key and following_key[1] < i_key[2]:
                        record_key(i_key[0], following_key[0], following_key[1] - i_key[2], transitions_2)
                    break

                following_key = i_key

            if code in key_downs: del(key_downs[code])
            last_key_up = code

    if humanize:
        new_dwells = {}
        new_transitions_1 = {}
        new_transitions_2 = {}
        shift_h = "[shift]"

        for key, inputs in dwells.items():
            prefix = ""
            if shift_h in key:
                key = key[7:]
                prefix = shift_h
            new_key = prefix + f"{code_to_str(int(key))}"
            new_dwells[new_key] = sorted(inputs)

        for key, inputs in transitions_1.items():
            try:
                k1, k2 = key.split("_")
            except:
                continue

            prefix = ""
            if shift_h in k1:
                k1 = k1[7:]
                prefix = shift_h

            key = f"{prefix}{code_to_str(int(k1))}_{code_to_str(int(k2))}"
            new_transitions_1[key] = sorted(inputs)
        for key, inputs in transitions_2.items(): 
            try:
                k1, k2 = key.split("_")
            except:
                continue
            k1 = k1[1:] # here is a "-" sign

            prefix = ""
            if shift_h in k1:
                k1 = k1[7:]
                prefix = shift_h

            key = f"-{prefix}{code_to_str(int(k1))}_{code_to_str(int(k2))}"
            new_transitions_2[key] = inputs #sorted(inputs)

        return new_dwells, new_transitions_1, new_transitions_2

    return dwells, transitions_1, transitions_2


In [51]:
ss = "1583128026026 KeyDown 65;1583128026177 KeyUp 65;1583128026810 KeyDown 65;1583128026937 KeyUp 65;1583128027529 KeyDown 16;1583128028034 KeyDown 72;1583128028169 KeyUp 72;1583128028801 KeyDown 69;1583128028953 KeyUp 69;1583128029298 KeyDown 76;1583128029408 KeyUp 76;1583128029586 KeyDown 76;1583128029713 KeyUp 76;1583128030929 KeyUp 16;1583128031681 KeyDown 16;1583128032402 KeyDown 79;1583128032561 KeyUp 79;1583128034281 KeyUp 16"

dwells, transitions_1, transitions_2 = process_keystrokes(ss, humanize=True)
dwells

{'A': [127, 151],
 '[shift]H': [135],
 '[shift]E': [152],
 '[shift]L': [110, 127],
 'shift': [2600, 3400],
 '[shift]O': [159]}

In [108]:
from sklearn.model_selection import train_test_split

# def split_datasets(df):
#     df1 = df.loc[df['Task'] == 'copy_1']
#     df1.drop_duplicates(subset ="UserName", keep = False, inplace = True) 

#     df2 = df.loc[df['Task'] == 'copy_2']
#     df2.drop_duplicates(subset ="UserName", keep = False, inplace = True) 

#     train_X, val_X = train_test_split(df1, test_size=0.2, random_state=42)
#     train_y = df2[df2['UserName'].isin(train_X['UserName'].tolist())]
#     val_y = df2[df2['UserName'].isin(val_X['UserName'].tolist())]
#     return train_X, val_X, train_y, val_y

def split_datasets(df):
    df_new = df.copy()
    # df_new = df_new.drop_duplicates(subset =["UserName", "Task"], keep = False, inplace = False) 
    # df1 = df_new.loc[df['Task'].isin(['copy_1', 'false', 'true', 'copy_3'])]
    test_column = 'copy_2'
    df1 = df_new.loc[df['Task'] != test_column]
    df2 = df_new.loc[df['Task'] == test_column]
    return df1, None, df2, None

df_train, df_val, train_y, val_y = split_datasets(concat_datasets((df, df_sessions)))
# df_train, df_val, train_y, val_y = split_datasets(df)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3920 entries, 0 to 5229
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserName    3920 non-null   object
 1   AccessKey   3920 non-null   object
 2   Topic       3920 non-null   object
 3   Task        3920 non-null   object
 4   ReviewText  3920 non-null   object
 5   ReviewMeta  3920 non-null   object
dtypes: object(6)
memory usage: 214.4+ KB


In [109]:
def timestamp_values(array, coef=1.5):
    array = np.array(array)
    q1 = np.quantile(array, 0.25)
    q3 = np.quantile(array, 0.75)
    iqr = q3 - q1
    array = array[(array > (q1 - coef * iqr)) & (array < (q3 + coef * iqr))]
    return array.mean(), array.std(), array.var()

timestamp_values([1, 11, 23, 14, 23, 1111])

(14.4, 8.2365041127896, 67.84)

In [110]:
FEATURES = ['T_H', 'H_E', 'A_N', 'T_space', 'R_E', 'I_N', 'E', 'A', 'R', 'I', 'O', 'T', 'N']
FEATURES = [
    'S_space', 'space_A', 'D_space', 'E_space', 'space_T', 'backspace_backspace', 
    'T_H', 'H_E', 'A_N', 'T_space', 'R_E', 'I_N', 
    'E', 'A', 'R', 'I', 'O', 'T', 'N', 'S', 'space'
    ]

FEATURES = ['backspace_backspace', "E_space", "space_T", "T_H", "space_A", "S_space", "T_space", "H_E", "D_space", "A_N", "I_N", "R_E", "N_space", "E_R", "Y_space", "space_I", "space_O", "O_space", "space_W", "space_S", "O_N", "N_D", "E_N", "R_space", "O_U", "space", "E", "backspace", "T", "A", "O", "I", "N", "S", "R"]#, "H", "L", "D", "U", "C", "M", "G", "Y", "F", "W", "P", "B", "comma", "V", "dot", "K"]

USE_TRANSITION_2 = True
N_NEIGHBORS=1

def process_and_unite_keystrokes(
    inputs, 
    humanize=True, 
    distinguish_shift=True,
    normalizator = timestamp_values,
    normalizator_coef = 1.5,
    transition_2=False): 
    a,b,c = process_keystrokes(inputs, humanize, distinguish_shift)
    raw_data = {**a , **b, **c}
    res = {}

    features = FEATURES
    if transition_2:
        features = list(map(lambda x: x if "_" not in x else f"-{x}", features))

    for feature in features:
        if feature in raw_data: 
            mean, std, var = normalizator(raw_data[feature], normalizator_coef)
        else: 
            mean, std, var = None, None, None

        res[f"{feature}_mean"] = mean 
        res[f"{feature}_std"] = std 
        # res[f"{feature}_var"] = var

    return res

def mutate_dataset(df, 
    user_id_column = "UserName", 
    keystrokes_column="ReviewMeta", 
    transition_2=False):
    res = []
    for index, inputs in df.iterrows():
        data = process_and_unite_keystrokes(inputs[keystrokes_column], transition_2=transition_2)
        res.append({'user_id': inputs[user_id_column], **data})
    return pd.DataFrame(res)

def standartize_dataset(res):
    modifiers = {}
    for column in res.columns.difference(['user_id']):
        mean_v = res[column].mean()

        # Fill n/a values with mean
        res[column].fillna(mean_v, inplace=True)

        min_v = res[column].min()
        max_v = res[column].max()
        res[column] = res[column].apply(lambda x: (x - min_v)/(max_v - min_v))

        # Save modifiers so that they could be applied to 
        modifiers[column] = { 'mean': mean_v, 'min': min_v, 'max': max_v }

    return res, modifiers

def standartize_by_modifiers(res, modifiers, features = FEATURES): 
    for column in res.columns.difference(['user_id']):
        # Fill n/a values with mean
        res[column].fillna(modifiers[column]['mean'], inplace=True)

        min_v = modifiers[column]['min']
        max_v = modifiers[column]['max']
        res[column] = res[column].apply(lambda x: (x - min_v)/(max_v - min_v))
    return res

In [111]:
df_train_new, modifiers = standartize_dataset(mutate_dataset(df_train, transition_2=USE_TRANSITION_2))
train_y_new = standartize_by_modifiers(mutate_dataset(train_y, transition_2=USE_TRANSITION_2), modifiers)
train_y_new.head(3)

Unnamed: 0,user_id,-backspace_backspace_mean,-backspace_backspace_std,-E_space_mean,-E_space_std,-space_T_mean,-space_T_std,-T_H_mean,-T_H_std,-space_A_mean,...,B_mean,B_std,comma_mean,comma_std,V_mean,V_std,dot_mean,dot_std,K_mean,K_std
0,A002160837SWJFPIAI7L7,0.014901,0.00594,0.091995,0.028029,0.082801,0.012082,0.248217,0.217384,0.072532,...,0.62326,0.197888,0.35,0.182182,0.467476,0.121124,0.49589,0.027346,0.393548,0.172294
1,A0436270XG2E3RS5T61O,0.033394,0.030202,0.120144,0.055283,0.120019,0.020126,0.57288,0.073518,0.069613,...,0.268605,0.176101,0.711455,0.018961,0.460538,0.016005,0.479628,0.191362,0.38956,0.057543
2,A109TOWKB3DY3P,0.015475,0.002943,0.088783,0.021113,0.071003,0.017613,0.52748,0.091219,0.060966,...,0.818414,0.050663,0.482,0.086754,0.586188,0.393103,0.457205,0.137864,0.453959,0.040014


In [142]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

def knn_model(df, df_new, y_column): 
    knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS, p=1)
    knn.fit(df[df.columns.difference([y_column])], df[y_column])
    score = knn.score(df_new[df_new.columns.difference([y_column])], df_new[y_column])
    print(f"KNN Score: {score}")
    return knn

def knn_pca_model(df, df_new, y_column):
    train_data = df[df.columns.difference([y_column])]
    test_data = df_new[df_new.columns.difference([y_column])]
    pca = PCA(n_components = 0.9).fit(train_data)
    print(pca.transform(train_data).shape)

    knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS, p=1)
    knn.fit(pca.transform(train_data), df[y_column])

    print(pca.transform(test_data).shape)
    score = knn.score(pca.transform(test_data), df_new[y_column])
    print(f"KNN+PCA Score: {score}")
    return knn, pca

knn = knn_model(df_train_new, train_y_new, y_column='user_id')
knn_pca, pca = knn_pca_model(df_train_new, train_y_new, y_column='user_id')
knn

KNN Score: 0.9145690312738368
(3920, 40)
(1311, 40)
KNN+PCA Score: 0.6773455377574371


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=1,
                     weights='uniform')

In [144]:
res = knn.predict(train_y_new[train_y_new.columns.difference(['user_id'])])
# _rev = train_y_new.copy() 
# _rev['predicted_user_id'] = res

In [145]:
pd.set_option('display.max_rows', 200)
_rev[_rev.user_id != _rev.predicted_user_id]['user_id']

5                             A10T740JW5MQDA
13                            A12E72AAMBWN9O
39                            A19M5ECA9D9H80
41                            A1AK0C9AYPTABN
42                             A1B7O88JBR3NU
44                            A1BDBKDS7F7OMW
46                            A1BSALLPX6MSI4
53                            A1CY5KVC753NQ9
60                            A1FSSSGYX9OZS2
89                            A1NM7ZPZ3NH412
91                            A1NW8TTOYMEJEP
92                            A1OB960PNAUAAS
93                            A1OEHMFL5A0G29
104                           A1SGSI91JYSV9Q
115                           A1TU5JYP6K71GZ
120                           A1UPWRVJKFV8EH
126                           A1W6X3VNHHVIND
130                           A1XD70FK0LGCYZ
148                           A21EPR8KW0GWX2
154                           A231ZXYHPSNUJQ
159                           A246AT3K8XQZ0N
167                           A261I77RAWTJM4
176       

In [120]:
len(train_y_new.user_id.unique())

1071