# Hyperparameters Tunning

In [28]:
import pandas as pd
import numpy as np
import sklearn
import altair as alt
import matplotlib.pyplot as plt
import os
import tabulate
import seaborn as sns

sns.set()

In [29]:
alt.data_transformers.enable('default', max_rows=None) 
# alt.renderers.enable('default')
# alt.renderers.enable('notebook')

DataTransformerRegistry.enable('default')

In [30]:
# Save temp altair json files in separate folder
os.makedirs('tmp/altdata', exist_ok=True)

def custom(data):
    return alt.pipe(data, alt.to_json(filename='tmp/altdata/{prefix}-{hash}.{extension}') )

alt.data_transformers.register('custom', custom)
alt.data_transformers.enable('custom')

DataTransformerRegistry.enable('custom')

In [31]:
def read_dataset(files = ('data/ReviewAMT_500_t.csv', 'data/GayMarriage_400.csv', 'data/GunControl_400.csv')):
    df_atm = pd.read_csv(files[0], sep='\t')
    df_gay = pd.read_csv(files[1], sep='\t')
    df_gun = pd.read_csv(files[2], sep='\t')

    df_atm.rename(columns = {'ReviewTopic': 'Topic'}, inplace=True)
    df_atm['Task'] = df_atm['Task'].map(
        {
            "Fake Review": 'fake', 
            "True Review": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        })
    df_gay['Task'] = df_gay['Task'].map(
        {
            "Fake Essay": 'fake', 
            "True Essay": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        }) 
    df_gun['Task'] = df_gun['Task'].map(
        {
            "Fake Essay": 'fake', 
            "True Essay": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        }) 
    datasets = (df_atm, df_gay, df_gun)
    df = pd.concat(
        (dataset[[
            'UserName', 
            'AccessKey', 
            'Topic', 
            'Task',
            'ReviewText', 
            'ReviewMeta'
        ]] for dataset in datasets),
        ignore_index=True)
    return df

df = read_dataset()

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserName    5200 non-null   object
 1   AccessKey   5200 non-null   object
 2   Topic       5200 non-null   object
 3   Task        5200 non-null   object
 4   ReviewText  5200 non-null   object
 5   ReviewMeta  5200 non-null   object
dtypes: object(6)
memory usage: 243.9+ KB


In [33]:
def process_keystroke(line):
    res = line.split(' ')
    if len(res) > 3:
        return f'UNKNOWN: {line}'
    
    time, command, key = res
    return (time, command, key, chr(int(key)))

def code_to_str(keycode):
    keycode = int(keycode)
    mappings = {
        16: 'shift',
        8: 'backspace',
        32: 'space',
        188: 'comma',
        190: 'dot'
    }
    return mappings.get(keycode, chr(keycode))

In [34]:
from collections import deque 
def process_keystrokes(inputs, humanize=False, distinguish_shift=True):      
    MOUSE_UP = 'MouseUp'
    KEY_DOWN = 'KeyDown'
    KEY_UP = 'KeyUp'
    TRANSITION_2 = 'transition_2'
    dwells = {}
    transitions_1 = {}
    transitions_2 = {}
    key_downs = {}
    keys_queue = deque([])
    last_key_up = None
    
    def record_key(code1, code2, value, collection):
        key = ""
        if collection is transitions_2:
            key = key + "-"
        if (16 in key_downs or last_key_up == 16) and code1 != 16 and distinguish_shift:
            key = key + "[shift]"
        if code2:
            key = key + f"{code1}_{code2}"
        else:
            key = key + f"{code1}"

        if key not in collection: 
            collection[key] = []
        collection[key].append(value)

    for keystroke in inputs.split(';'):
        res = keystroke.split(' ')
        
        if len(res) < 3: continue  
        if res[1] == 'MouseUp': continue

        time, command, code = res
        time = int(time)
        code = int(code)
        
        if command == KEY_DOWN:
            if keys_queue:
                prev_code, prev_time_down, prev_time_up = keys_queue[0]

                if prev_time_up: 
                    record_key(prev_code, code, time - prev_time_up, transitions_2)

                record_key(prev_code, code, time - prev_time_down, transitions_1)
            
            key_downs[code] = time
            keys_queue.appendleft([code, time, None])
            
        if command == KEY_UP:
            following_key = None
            for i_key in keys_queue:
                if i_key[0] == code:
                    i_key[2] = time
                    record_key(code, None, time - i_key[1], dwells)
                    
                    if following_key and following_key[1] < i_key[2]:
                        record_key(i_key[0], following_key[0], following_key[1] - i_key[2], transitions_2)
                    break

                following_key = i_key

            if code in key_downs: del(key_downs[code])
            last_key_up = code

    if humanize:
        new_dwells = {}
        new_transitions_1 = {}
        new_transitions_2 = {}
        shift_h = "[shift]"

        for key, inputs in dwells.items():
            prefix = ""
            if shift_h in key:
                key = key[7:]
                prefix = shift_h
            new_key = prefix + f"{code_to_str(int(key))}"
            new_dwells[new_key] = sorted(inputs)

        for key, inputs in transitions_1.items():
            try:
                k1, k2 = key.split("_")
            except:
                continue

            prefix = ""
            if shift_h in k1:
                k1 = k1[7:]
                prefix = shift_h

            key = f"{prefix}{code_to_str(int(k1))}_{code_to_str(int(k2))}"
            new_transitions_1[key] = sorted(inputs)
        for key, inputs in transitions_2.items(): 
            try:
                k1, k2 = key.split("_")
            except:
                continue
            k1 = k1[1:] # here is a "-" sign

            prefix = ""
            if shift_h in k1:
                k1 = k1[7:]
                prefix = shift_h

            key = f"-{prefix}{code_to_str(int(k1))}_{code_to_str(int(k2))}"
            new_transitions_2[key] = inputs #sorted(inputs)

        return new_dwells, new_transitions_1, new_transitions_2

    return dwells, transitions_1, transitions_2


In [35]:
ss = "1583128026026 KeyDown 65;1583128026177 KeyUp 65;1583128026810 KeyDown 65;1583128026937 KeyUp 65;1583128027529 KeyDown 16;1583128028034 KeyDown 72;1583128028169 KeyUp 72;1583128028801 KeyDown 69;1583128028953 KeyUp 69;1583128029298 KeyDown 76;1583128029408 KeyUp 76;1583128029586 KeyDown 76;1583128029713 KeyUp 76;1583128030929 KeyUp 16;1583128031681 KeyDown 16;1583128032402 KeyDown 79;1583128032561 KeyUp 79;1583128034281 KeyUp 16"

dwells, transitions_1, transitions_2 = process_keystrokes(ss, humanize=True)
dwells

{'A': [127, 151],
 '[shift]H': [135],
 '[shift]E': [152],
 '[shift]L': [110, 127],
 'shift': [2600, 3400],
 '[shift]O': [159]}

In [36]:
from sklearn.model_selection import train_test_split

# def split_datasets(df):
#     df1 = df.loc[df['Task'] == 'copy_1']
#     df1.drop_duplicates(subset ="UserName", keep = False, inplace = True) 

#     df2 = df.loc[df['Task'] == 'copy_2']
#     df2.drop_duplicates(subset ="UserName", keep = False, inplace = True) 

#     train_X, val_X = train_test_split(df1, test_size=0.2, random_state=42)
#     train_y = df2[df2['UserName'].isin(train_X['UserName'].tolist())]
#     val_y = df2[df2['UserName'].isin(val_X['UserName'].tolist())]
#     return train_X, val_X, train_y, val_y

def split_datasets(df):
    df_new = df.drop_duplicates(subset =["UserName", "Task"], keep = False, inplace = False) 
    df1 = df_new.loc[df['Task'].isin(['copy_1', 'copy_2', 'true'])]
    df2 = df_new.loc[df['Task'] == 'fake']
    return df1, None, df2, None

df_train, df_val, train_y, val_y = split_datasets(df)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2496 entries, 0 to 5199
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserName    2496 non-null   object
 1   AccessKey   2496 non-null   object
 2   Topic       2496 non-null   object
 3   Task        2496 non-null   object
 4   ReviewText  2496 non-null   object
 5   ReviewMeta  2496 non-null   object
dtypes: object(6)
memory usage: 136.5+ KB


In [37]:
def timestamp_values(array, coef=1.5):
    array = np.array(array)
    q1 = np.quantile(array, 0.25)
    q3 = np.quantile(array, 0.75)
    iqr = q3 - q1
    array = array[(array > (q1 - coef * iqr)) & (array < (q3 + coef * iqr))]
    return array.mean(), array.std(), array.var()

timestamp_values([1, 11, 23, 14, 23, 1111])

(14.4, 8.2365041127896, 67.84)

In [38]:
FEATURES = ['T_H', 'H_E', 'A_N', 'T_space', 'R_E', 'I_N', 'E', 'A', 'R', 'I', 'O', 'T', 'N']
FEATURES = [
    'S_space', 'space_A', 'D_space', 'E_space', 'space_T', 'backspace_backspace', 
    'T_H', 'H_E', 'A_N', 'T_space', 'R_E', 'I_N', 
    'E', 'A', 'R', 'I', 'O', 'T', 'N', 'S', 'space'
    ]

FEATURES = ['backspace_backspace', "E_space", "space_T", "T_H", "space_A", "S_space", "T_space", "H_E", "D_space", "A_N", "I_N", "R_E", "N_space", "E_R", "Y_space", "space_I", "space_O", "O_space", "space_W", "space_S", "O_N", "N_D", "E_N", "R_space", "O_U", "space", "E", "backspace", "T", "A", "O", "I", "N", "S", "R", "H", "L", "D", "U", "C", "M", "G", "Y", "F", "W", "P", "B", "comma", "V", "dot", "K"]

USE_TRANSITION_2 = True
N_NEIGHBORS=3

def process_and_unite_keystrokes(
    inputs, 
    humanize=True, 
    distinguish_shift=True,
    normalizator = timestamp_values,
    normalizator_coef = 1.5,
    transition_2=False): 
    a,b,c = process_keystrokes(inputs, humanize, distinguish_shift)
    raw_data = {**a , **b, **c}
    res = {}

    features = FEATURES
    if transition_2:
        features = list(map(lambda x: x if "_" not in x else f"-{x}", features))

    for feature in features:
        if feature in raw_data: 
            mean, std, var = normalizator(raw_data[feature], normalizator_coef)
        else: 
            mean, std, var = None, None, None

        res[f"{feature}_mean"] = mean 
        res[f"{feature}_std"] = std 
        # res[f"{feature}_var"] = var

    return res

def mutate_dataset(df, 
    user_id_column = "UserName", 
    keystrokes_column="ReviewMeta", 
    transition_2=False):
    res = []
    for index, inputs in df.iterrows():
        data = process_and_unite_keystrokes(inputs[keystrokes_column], transition_2=transition_2)
        res.append({'user_id': inputs[user_id_column], **data})
    return pd.DataFrame(res)

def standartize_dataset(res):
    modifiers = {}
    for column in res.columns.difference(['user_id']):
        mean_v = res[column].mean()

        # Fill n/a values with mean
        res[column].fillna(mean_v, inplace=True)

        min_v = res[column].min()
        max_v = res[column].max()
        res[column] = res[column].apply(lambda x: (x - min_v)/(max_v - min_v))

        # Save modifiers so that they could be applied to 
        modifiers[column] = { 'mean': mean_v, 'min': min_v, 'max': max_v }

    return res, modifiers

def standartize_by_modifiers(res, modifiers, features = FEATURES): 
    for column in res.columns.difference(['user_id']):
        # Fill n/a values with mean
        res[column].fillna(modifiers[column]['mean'], inplace=True)

        min_v = modifiers[column]['min']
        max_v = modifiers[column]['max']
        res[column] = res[column].apply(lambda x: (x - min_v)/(max_v - min_v))
    return res

In [39]:
df_train_new, modifiers = standartize_dataset(mutate_dataset(df_train, transition_2=USE_TRANSITION_2))
train_y_new = standartize_by_modifiers(mutate_dataset(train_y, transition_2=USE_TRANSITION_2), modifiers)
train_y_new.head(3)

Unnamed: 0,user_id,-backspace_backspace_mean,-backspace_backspace_std,-E_space_mean,-E_space_std,-space_T_mean,-space_T_std,-T_H_mean,-T_H_std,-space_A_mean,...,B_mean,B_std,comma_mean,comma_std,V_mean,V_std,dot_mean,dot_std,K_mean,K_std
0,A002160837SWJFPIAI7L7,0.014156,0.005808,0.061186,0.020494,0.123876,0.077483,0.135681,0.166298,0.083225,...,0.637563,0.168824,0.3213,0.216331,0.401061,0.349531,0.402466,0.5284,0.437353,0.127536
1,A0436270XG2E3RS5T61O,0.014554,0.005186,0.113482,0.023985,0.130576,0.020893,0.60848,0.074421,0.074955,...,0.440343,0.137301,0.776655,0.013702,0.473086,0.011665,0.515642,0.375286,0.369412,0.008588
2,A109TOWKB3DY3P,0.013894,0.002557,0.052018,0.014546,0.113401,0.0498,0.531242,0.053522,0.038106,...,0.683885,0.080783,0.429603,0.12825,0.577331,0.333951,0.489142,0.310829,0.469412,0.011569


In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

def knn_model(df, df_new, y_column): 
    knn = KNeighborsClassifier(n_neighbors=1, p=1)
    knn.fit(df[df.columns.difference([y_column])], df[y_column])
    score = knn.score(df_new[df_new.columns.difference([y_column])], df_new[y_column])
    print(f"KNN Score: {score}")
    return knn

def knn_pca_model(df, df_new, y_column):
    pca = PCA(n_components = 0.99)
    knn = KNeighborsClassifier(n_neighbors=3, p=1)
    train_data = df[df.columns.difference([y_column])]
    train_targets = df[y_column]
    print(train_data.shape)

    pca.fit(train_data, train_targets)
    knn.fit(pca.transform(train_data), train_targets)

    test_data = df_new[df_new.columns.difference([y_column])]
    print(test_data.shape)
    score = knn.score(pca.transform(test_data), df_new[y_column])
    print(f"KNN Score: {score}")
    return knn, pca

knn = knn_model(df_train_new, train_y_new, y_column='user_id')
# knn, pca = knn_pca_model(df_train_new, train_y_new, y_column='user_id')
knn

KNN Score: 0.9350961538461539


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=1,
                     weights='uniform')