# Hyperparameters Tunning

In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
import altair as alt
import matplotlib.pyplot as plt
import os
import tabulate
import seaborn as sns

sns.set()

In [4]:
alt.data_transformers.enable('default', max_rows=None) 
# alt.renderers.enable('default')
# alt.renderers.enable('notebook')

DataTransformerRegistry.enable('default')

In [5]:
# Save temp altair json files in separate folder
os.makedirs('tmp/altdata', exist_ok=True)

def custom(data):
    return alt.pipe(data, alt.to_json(filename='tmp/altdata/{prefix}-{hash}.{extension}') )

alt.data_transformers.register('custom', custom)
alt.data_transformers.enable('custom')

DataTransformerRegistry.enable('custom')

In [6]:
def concat_datasets(datasets):
    return pd.concat(
        (dataset[[
            'UserName', 
            'AccessKey', 
            'Topic', 
            'Task',
            'ReviewText', 
            'ReviewMeta'
        ]] for dataset in datasets),
        ignore_index=True)

def read_dataset(files = ('data/ReviewAMT_500_t.csv', 'data/GayMarriage_400.csv', 'data/GunControl_400.csv')):
    df_atm = pd.read_csv(files[0], sep='\t')
    df_gay = pd.read_csv(files[1], sep='\t')
    df_gun = pd.read_csv(files[2], sep='\t')

    df_atm.rename(columns = {'ReviewTopic': 'Topic'}, inplace=True)
    df_atm['Task'] = df_atm['Task'].map(
        {
            "Fake Review": 'fake', 
            "True Review": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        })
    df_gay['Task'] = df_gay['Task'].map(
        {
            "Fake Essay": 'fake', 
            "True Essay": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        }) 
    df_gun['Task'] = df_gun['Task'].map(
        {
            "Fake Essay": 'fake', 
            "True Essay": 'true',
            'Copy_1': 'copy_1',
            'Copy_2': 'copy_2'
        }) 
    return concat_datasets((df_atm, df_gay, df_gun))

df = read_dataset()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserName    5200 non-null   object
 1   AccessKey   5200 non-null   object
 2   Topic       5200 non-null   object
 3   Task        5200 non-null   object
 4   ReviewText  5200 non-null   object
 5   ReviewMeta  5200 non-null   object
dtypes: object(6)
memory usage: 243.9+ KB


In [121]:
def read_group_sessions(name='data/keystroke_sessions-2020-03-20.csv'):
    _a = pd.read_csv(name)
    # _a.rename(columns={"AccessKey": "UserName", }
    _a['UserName'] = _a['AccessKey']
    _a.drop(columns=['attempt_id', 'ReviewDate'], inplace=True)
    _a['Task'] = ''
    _a['Topic'] = 'LA'
    attempts = {}
    for i, row in _a.iterrows():
        key = row['UserName']
        if key not in attempts: attempts[key] = 0
        attempts[key] += 1
        _a.loc[i, 'Task'] = f"copy_{attempts[key]}"

    return _a

def add_text_length_column(df):
    df['TextLenght'] = df.apply(lambda x: len(x['ReviewText'].split()), axis = 1)

df_sessions = read_group_sessions()
df_sessions.head(3)

Unnamed: 0,ReviewText,ReviewMeta,AccessKey,UserName,Task,Topic
0,Linear algebra is central to almost all areas ...,1582990688239 KeyDown 16;1582990688600 KeyDown...,9d891be4-e43e-49f9-88bb-25314e670850,9d891be4-e43e-49f9-88bb-25314e670850,copy_1,LA
1,Linear algebra is central to almost all areas ...,1582996141506 KeyDown 16;1582996141595 KeyDown...,9d891be4-e43e-49f9-88bb-25314e670850,9d891be4-e43e-49f9-88bb-25314e670850,copy_2,LA
2,Linear algebra is central to almost all areas ...,1582996489340 KeyDown 16;1582996489541 KeyDown...,97fc22cf-aa61-4dcf-91a8-97d8fbf4a983,97fc22cf-aa61-4dcf-91a8-97d8fbf4a983,copy_1,LA


In [10]:
def process_keystroke(line):
    res = line.split(' ')
    if len(res) > 3:
        return f'UNKNOWN: {line}'
    
    time, command, key = res
    return (time, command, key, chr(int(key)))

def code_to_str(keycode):
    keycode = int(keycode)
    mappings = {
        16: 'shift',
        8: 'backspace',
        32: 'space',
        188: 'comma',
        190: 'dot'
    }
    return mappings.get(keycode, chr(keycode))

In [11]:
from collections import deque 

def process_keystrokes(inputs, humanize=False, distinguish_shift=True, small_transitions=True):      
    MOUSE_UP = 'MouseUp'
    KEY_DOWN = 'KeyDown'
    KEY_UP = 'KeyUp'
    TRANSITION_2 = 'transition_2'
    dwells = {}
    transitions_1 = {}
    transitions_2 = {}
    key_downs = {}
    keys_queue = deque([])
    last_key_up = None

    def correct_transitions():
        if small_transitions:
            return transitions_2
        else:
            return transitions_1
    
    def record_key(code1, code2, value, collection):
        key = ""
        if (16 in key_downs or last_key_up == 16) and code1 != 16 and distinguish_shift:
            key = key + "[shift]"
        if code2:
            key = key + f"{code1}_{code2}"
        else:
            key = key + f"{code1}"

        if key not in collection: 
            collection[key] = []
        collection[key].append(value)

    for keystroke in inputs.split(';'):
        res = keystroke.split(' ')
        
        if len(res) < 3: continue  
        if res[1] == 'MouseUp': continue

        time, command, code = res
        time = int(time)
        code = int(code)
        
        if command == KEY_DOWN:
            if keys_queue:
                prev_code, prev_time_down, prev_time_up = keys_queue[0]

                if prev_time_up: 
                    record_key(prev_code, code, time - prev_time_up, transitions_2)

                record_key(prev_code, code, time - prev_time_down, transitions_1)
            
            key_downs[code] = time
            keys_queue.appendleft([code, time, None])
            
        if command == KEY_UP:
            following_key = None
            for i_key in keys_queue:
                if i_key[0] == code:
                    i_key[2] = time
                    record_key(code, None, time - i_key[1], dwells)
                    
                    if following_key and following_key[1] < i_key[2]:
                        record_key(i_key[0], following_key[0], following_key[1] - i_key[2], transitions_2)
                    break

                following_key = i_key

            if code in key_downs: del(key_downs[code])
            last_key_up = code

    if humanize:
        new_dwells = {}
        new_transitions = {}
        shift_h = "[shift]"

        for key, inputs in dwells.items():
            prefix = ""
            if shift_h in key:
                key = key[7:]
                prefix = shift_h
            new_key = prefix + f"{code_to_str(int(key))}"
            new_dwells[new_key] = sorted(inputs)

        for key, inputs in correct_transitions().items():
            try:
                k1, k2 = key.split("_")
            except:
                continue

            prefix = ""
            if shift_h in k1:
                k1 = k1[7:]
                prefix = shift_h

            key = f"{prefix}{code_to_str(int(k1))}_{code_to_str(int(k2))}"
            new_transitions[key] = sorted(inputs)

        return new_dwells, new_transitions

    return dwells, correct_transitions()


In [12]:
ss = "1583128026026 KeyDown 65;1583128026177 KeyUp 65;1583128026810 KeyDown 65;1583128026937 KeyUp 65;1583128027529 KeyDown 16;1583128028034 KeyDown 72;1583128028169 KeyUp 72;1583128028801 KeyDown 69;1583128028953 KeyUp 69;1583128029298 KeyDown 76;1583128029408 KeyUp 76;1583128029586 KeyDown 76;1583128029713 KeyUp 76;1583128030929 KeyUp 16;1583128031681 KeyDown 16;1583128032402 KeyDown 79;1583128032561 KeyUp 79;1583128034281 KeyUp 16"

dwells, transitions = process_keystrokes(ss, humanize=True, small_transitions=True)
print(dwells)
print(transitions)

{'A': [127, 151], '[shift]H': [135], '[shift]E': [152], '[shift]L': [110, 127], 'shift': [2600, 3400], '[shift]O': [159]}
{'A_A': [633], 'A_shift': [592], '[shift]H_E': [632], '[shift]E_L': [345], '[shift]L_L': [178], 'shift_H': [-2895], '[shift]L_shift': [1968], 'shift_O': [-1879]}


In [300]:
from sklearn.model_selection import train_test_split

# def split_datasets(df):
#     df1 = df.loc[df['Task'] == 'copy_1']
#     df1.drop_duplicates(subset ="UserName", keep = False, inplace = True) 

#     df2 = df.loc[df['Task'] == 'copy_2']
#     df2.drop_duplicates(subset ="UserName", keep = False, inplace = True) 

#     train_X, val_X = train_test_split(df1, test_size=0.2, random_state=42)
#     train_y = df2[df2['UserName'].isin(train_X['UserName'].tolist())]
#     val_y = df2[df2['UserName'].isin(val_X['UserName'].tolist())]
#     return train_X, val_X, train_y, val_y

def split_datasets(df):
    df_new = df.copy()
    df_new = df_new.drop_duplicates(subset =["UserName", "Task"], keep = 'first', inplace = False) 
    # df_new = df_new.drop_duplicates(subset =["UserName", "Task"], keep = False, inplace = False)
    test_column = 'copy_2'
    # test_column = 'fake'
    df1 = df_new.loc[df['Task'] != test_column]
    # df1 = df_new.loc[df['Task'] == 'copy_1']
    df2 = df_new.loc[df['Task'] == test_column]
    return df1, None, df2, None

df_train, df_val, train_y, val_y = split_datasets(concat_datasets((df, df_sessions)))
# df_train, df_val, train_y, val_y = split_datasets(df)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3200 entries, 0 to 5229
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserName    3200 non-null   object
 1   AccessKey   3200 non-null   object
 2   Topic       3200 non-null   object
 3   Task        3200 non-null   object
 4   ReviewText  3200 non-null   object
 5   ReviewMeta  3200 non-null   object
dtypes: object(6)
memory usage: 175.0+ KB


# Hyperparameters

In [287]:
FEATURES = ['space', 'E', 'T', 'A', 'O', 'backspace', 'N', 'I', 'S', 'R', 'H',
       'backspace_backspace', 'L', 'E_space', 'D', 'space_T', 'U', 'C',
       'space_A', 'T_H', 'S_space', 'M', 'shift', 'G', 'Y', 'T_space',
       'F', 'H_E', 'W', 'P', 'D_space', 'A_N', 'I_N', 'R_E', 'N_space',
       'Y_space', 'E_R', 'B', 'space_shift', 'space_I', 'space_O',
       'space_W', 'V', 'dot', 'space_S', 'O_space', 'O_N', 'N_D', 'E_N',
       'R_space', 'dot_space', 'A_T', 'O_U', 'T_O', 'space_C', 'I_T',
       'space_B', 'A_R', 'E_S', 'H_A', 'I_S', 'V_E', 'comma', 'A_L',
       'space_M', 'space_F', 'comma_space', 'R_I', 'O_R',
       'space_backspace', 'L_E', 'S_T', 'N_T', 'N_G', 'space_P',
       'space_H', 'S_E', 'T_I', 'A_space', 'M_E', 'space_G', 'T_E', 'E_D',
       'F_space', 'L_space', 'E_A', 'K', 'A_S', 'B_E', 'M_A', 'L_L',
       'G_space', '[shift]space', '[shift]I', 'space_R', 'space_D', 'U_N',
       'O_F', 'H_O', 'H_I', 'C_O', 'shift_I', 'N_E', 'space_L', 'R_O',
       'space_N', 'O_M', 'O_T', 'L_I', 'space_E', 'D_E', '[shift]T',
       'N_S', 'I_C', 'C_E', 'L_Y', 'T_A', 'G_E', 'I_O', 'U_R', 'E_L',
       'F_O', 'U_S', 'N_O', 'P_E', 'H_space', 'S_O', 'I_L', 'W_E',
       'shift_T', 'C_H', 'C_A', 'P_L', 'U_L', 'E_T', 'W_A', 'A_Y', 'T_R',
       'L_D', 'U_T', 'L_A', 'R_A', 'E_C', 'E_E', 'I_E', 'W_I',
       '[shift]space_shift', 'backspace_space', 'G_U', 'W_H', 'S_H',
       'A_V', 'A_C', 'E_backspace', 'O_O', 'space_space', 'O_P', 'R_Y',
       'O_W', 'I_A', 'R_R', 'Þ', 'L_O', 'S_I', 'O_L', 'A_G', 'D_I', 'I_G',
       'E_V', 'A_M', 'G_H', 'W_O', 'Y_O', 'M_O', 'backspace_shift', 'O_S',
       'G_A', 'P_O', 'K_E', 'R_S', 'backspace_E', 'P_R', 'S_dot', 'E_dot',
       'I_M', 'I_R', 'M_space', 'S_A', 'backspace_T', 'backspace_A', 'X',
       'C_I', 'space_Y', 'E_Y', 'F_E', 'O_D', 'D_O', 'space_U', 'T_S',
       'A_D']

DISTINGUISH_SHIFT=True
USE_TRANSITION_2 = True 
N_NEIGHBORS=1
FEATURES_COUNTER = {}
TRACK_FEATURES_COUNT = False
NORMALIZATOR_COEF=1.5
REMOVE_OUTLIERS = True
USE_MEAN = False 
USE_MEDIAN = True
USE_STD = False
USE_VARIANCE = False 
DISTANCE_MEASURE = 1 # 1 - manhattan, 2 - euql.
from sklearn import impute, preprocessing
DATA_IMPUTER = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
# DATA_TRANSFORMER = preprocessing.MaxAbsScaler(), preprocessing.RobustScaler(), preprocessing.Normalizer(), preprocessing.MinMaxScaler(), preprocessing.StandardScaler()
DATA_TRANSFORMER = preprocessing.StandardScaler()

In [290]:
cached_raw_data_dict = {}

In [289]:
def timestamp_values(array, coef=1.5, original_median=True):
    array = np.array(array)

    if REMOVE_OUTLIERS:
        q1 = np.quantile(array, 0.25)
        q3 = np.quantile(array, 0.75)
        iqr = q3 - q1
        array = array[(array > (q1 - coef * iqr)) & (array < (q3 + coef * iqr))]

    return array.mean(), np.median(array), array.std(), array.var()

def process_and_unite_keystrokes(inputs, index, features=FEATURES): 
    humanize = True
    if index not in cached_raw_data_dict:
        a,b = process_keystrokes(inputs, humanize, DISTINGUISH_SHIFT, small_transitions=USE_TRANSITION_2)
        raw_data = {**a , **b}
        cached_raw_data_dict[index]=raw_data 
    else: 
        raw_data = cached_raw_data_dict[index]
    res = {}

    for feature in features:
        if feature in raw_data: 
            mean, median, std, var = timestamp_values(raw_data[feature], NORMALIZATOR_COEF)
        else: 
            mean, median, std, var = None, None, None, None

        if USE_MEAN: res[f"{feature}_mean"] = mean 
        if USE_MEDIAN: res[f"{feature}_median"] = median 
        if USE_STD: res[f"{feature}_std"] = std 
        if USE_VARIANCE: res[f"{feature}_var"] = var

    if TRACK_FEATURES_COUNT:
        # Save statistics of appearance of each key
        for key, timestamps in raw_data.items():
            if key not in FEATURES_COUNTER: FEATURES_COUNTER[key] = 0
            FEATURES_COUNTER[key] += len(timestamps)

    return res

def mutate_dataset(df, 
    user_id_column = "UserName", 
    keystrokes_column="ReviewMeta", 
    transition_2=False,
    features=FEATURES):
    res = []
    for index, inputs in df.iterrows():
        data = process_and_unite_keystrokes(inputs[keystrokes_column], index, features)
        res.append({'user_id': inputs[user_id_column], **data})
    return pd.DataFrame(res)

def to_pandas_dataframe(res_copy, res, y_column='user_id'):
    res_copy = pd.DataFrame(res_copy, columns = res.columns.difference([y_column]))
    res_copy['user_id'] = res['user_id']
    return res_copy

def standartize_dataset(res, imputer=DATA_IMPUTER, transformer=DATA_TRANSFORMER):
    from sklearn import impute, preprocessing

    res_copy = res[res.columns.difference(['user_id'])]
    res_copy = imputer.fit_transform(res_copy)
    res_copy = transformer.fit_transform(res_copy)

    return to_pandas_dataframe(res_copy, res), imputer, transformer

def standartize_by_modifiers(res, imputer, transformer): 
    res_copy = res[res.columns.difference(['user_id'])]
    res_copy = imputer.transform(res_copy)
    res_copy = transformer.transform(res_copy)
    return to_pandas_dataframe(res_copy, res)

In [301]:
n_features = 40
df_train_mutated = mutate_dataset(df_train, transition_2=USE_TRANSITION_2, features=FEATURES[:n_features])
train_y_mutated = mutate_dataset(train_y, transition_2=USE_TRANSITION_2,features=FEATURES[:n_features])
print('Done!')

Done!


In [310]:
df_train_new, imputer, transformer = standartize_dataset(df_train_mutated)
train_y_new = standartize_by_modifiers(train_y_mutated, imputer, transformer)

train_y_new.head(1)

Unnamed: 0,A_N_median,A_median,B_median,C_median,D_median,D_space_median,E_R_median,E_median,E_space_median,F_median,...,Y_space_median,backspace_backspace_median,backspace_median,shift_median,space_A_median,space_I_median,space_T_median,space_median,space_shift_median,user_id
0,0.634472,0.767679,0.639647,0.847244,0.313917,-0.4833,-0.631284,0.4811,-0.564155,0.180844,...,-0.325655,-0.142888,-0.242862,-0.294315,-0.337723,-0.225074,-0.089442,-0.265797,-0.278977,A002160837SWJFPIAI7L7


In [311]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

def knn_model(df, df_new, y_column): 
    knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS, p=DISTANCE_MEASURE)
    knn.fit(df[df.columns.difference([y_column])], df[y_column])
    score = knn.score(df_new[df_new.columns.difference([y_column])], df_new[y_column])
    print(df.shape)
    print(f"KNN Score: {score}")
    return knn

def knn_pca_model(df, df_new, y_column):
    train_data = df[df.columns.difference([y_column])]
    test_data = df_new[df_new.columns.difference([y_column])]
    pca = PCA(n_components = 0.9).fit(train_data)
    print(pca.transform(train_data).shape)

    knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS, p=DISTANCE_MEASURE)
    knn.fit(pca.transform(train_data), df[y_column])

    print(pca.transform(test_data).shape)
    score = knn.score(pca.transform(test_data), df_new[y_column])
    print(f"KNN+PCA Score: {score}")
    return knn, pca

knn = knn_model(df_train_new, train_y_new, y_column='user_id')
knn_pca, pca = knn_pca_model(df_train_new, train_y_new, y_column='user_id')
knn

(3200, 41)
KNN Score: 0.9607843137254902
(3200, 18)
(1071, 18)
KNN+PCA Score: 0.7973856209150327


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=1,
                     weights='uniform')

In [285]:
dd = pd.DataFrame(
    (
        (4, 'Keystokes Mean + STD', 0.5096153846153846),
        (5, 'Keystrokes Mean + Variance', 0.5528846153846154),
        (0, 'Keystrokes Mean', 0.5721153846153846),
        (1, '... + Remove Outliers (baseline)', 0.703125),
        (2, '... + Distinguish [Shift]', 0.7127403846153846),
        (3, '... + Use Transition 2', 0.7271634615384616),
        (7, 'Keystrokes Median', 0.7548076923076923), 
        (8, '... + Manhattan Distance', 0.8858173076923077),
        (8, '... + PCA', 0.6716981132075471)
    ),
    columns=['Id', 'Label', 'Accuracy']
)
alt.layer(
alt.Chart(dd).mark_bar(size=2, color='#9400D3').encode(
    x=alt.X('Accuracy', scale=alt.Scale(zero=False)), 
    y=alt.Y('Label', scale=alt.Scale(zero=True), title=None, sort = list(dd['Label'].values))
    )+ \
alt.Chart(dd).mark_circle(size=50, color='#9400D3').encode(
        x='Accuracy', 
        y=alt.Y('Label', scale=alt.Scale(zero=True), sort = list(dd['Label'].values))
) + \
alt.Chart(dd.loc[dd['Label']=='... + Remove Outliers (baseline)']).mark_bar(size=2, color='#00FF00').encode(
    x=alt.X('Accuracy', scale=alt.Scale(zero=False)), 
    y=alt.Y('Label', scale=alt.Scale(zero=True), sort = list(dd['Label'].values))
    ) + \
alt.Chart(dd.loc[dd['Label']=='... + Remove Outliers (baseline)']).mark_circle(size=50, color='#00FF00').encode(
    x=alt.X('Accuracy', scale=alt.Scale(zero=False)), 
    y=alt.Y('Label', scale=alt.Scale(zero=True), sort = list(dd['Label'].values))
    )
).configure_mark(
        # color=alt.Color('red')
    ).configure_view(
     width = 650,
     height = 300).configure_axis(
    # domain = False, 
    grid=False,
    ticks = False,
    labelPadding = 10,
    labelFont = 'Ubuntu Mono',
    labelFontSize = 11,
    labelColor = '#3A3F4A',
    titleFont = 'Ubuntu Mono',
    titleColor = '#3A3F4A',
    # titleAnchor = "end",
    titlePadding = 10,
    gridWidth = 0.5,
    gridDash = [1, 0, 1],
)

In [206]:
dd = pd.DataFrame(
    (
        (10, 0.62019),
        (20, 0.664663),
        (30, 0.67668269),
        (40, 0.703125),
        (50, 0.6899038461538461),
        (60, 0.64663),
        (70, 0.6045),
        (80, 0.5528846153846154),
        (90, 0.5060096153846154)
    ),
    columns=['Number of Features', 'Accuracy']
)
alt.layer(alt.Chart(dd).mark_bar(size=2, color='#9400D3').encode(
    x=alt.X('Number of Features', scale=alt.Scale(bins=[10, 20, 30, 40, 50, 60, 70, 80, 90])), 
    y=alt.Y('Accuracy', scale=alt.Scale(zero=True)))+ \
alt.Chart(dd).mark_circle(size=50, color='#9400D3').encode(x='Number of Features', y='Accuracy')).configure_view(
     width = 750).configure_axis(
    # domain = False, 
    grid=False,
    ticks = False,
    labelPadding = 10,
    labelFont = 'Ubuntu Mono',
    labelFontSize = 11,
    labelColor = '#3A3F4A',
    titleFont = 'Ubuntu Mono',
    titleColor = '#3A3F4A',
    # titleAnchor = "end",
    titlePadding = 10,
    gridWidth = 0.5,
    gridDash = [1, 0, 1],
)

In [312]:
res = knn.predict(train_y_new[train_y_new.columns.difference(['user_id'])])
_rev = train_y_new.copy() 
_rev['predicted_user_id'] = res
# pd.set_option('display.max_rows', 100)
_rev[_rev.user_id != _rev.predicted_user_id]['user_id']

34                            A187X82UVX9973
60                            A1FSSSGYX9OZS2
93                            A1OEHMFL5A0G29
139                           A1YX045UYGCNLA
176                           A28M2RVUTQO891
206                           A2EQOFJNKS7RCN
225                           A2JRENREH72502
254                           A2S16E0HN8A3N7
259                           A2T5T842FSABA3
268                           A2W1E7UEQ7ZUQE
289                           A32SJS0TTSRIM5
298                           A34V3N0B4C3BMF
299                           A35DSHKI68VP6V
333                           A3D5C3AR576B1T
342                           A3GMI3KMQ4QOFQ
417                            ABVBWRI4D5O36
418                            ABW7WZONWDBVZ
427                            ADT8XTB3NGLTF
428                            ADW0Y55EQN5SP
434                            AFAWE4JWR9G84
452                            AL34WN7X302EF
457                            AMAOA3GTMDS94
488       

In [295]:
len(_rev[_rev.user_id != _rev.predicted_user_id]['user_id'])

52

In [296]:
len(train_y_new.user_id.unique())

1071

In [1]:
_a=pd.DataFrame(list(FEATURES_COUNTER.items()), columns=['key', 'counter'])#.sort(reverse=True)
# pd.set_option('display.max_rows', 200)
# _a.sort_values(by=['counter'], ascending=False).head(200)

NameError: name 'pd' is not defined

In [53]:
sorted(list(FEATURES_COUNTER.items()))

AttributeError: 'dict_items' object has no attribute '__reversed__'

In [74]:
alt.Chart(_a.sort_values(by=['counter'], ascending=False).head(20)).mark_bar().encode(
 x = 'counter:Q',
 y = alt.Y('key:O', sort = alt.Sort(field = 'counter', order='descending')),
)#.transform_filter('datum.counter > 2000')

In [70]:
_a.sort_values(by=['counter'], ascending=False).head(200).key.values

array(['space', 'E', 'T', 'A', 'O', 'backspace', 'N', 'I', 'S', 'R', 'H',
       'backspace_backspace', 'L', 'E_space', 'D', 'space_T', 'U', 'C',
       'space_A', 'T_H', 'S_space', 'M', 'shift', 'G', 'Y', 'T_space',
       'F', 'H_E', 'W', 'P', 'D_space', 'A_N', 'I_N', 'R_E', 'N_space',
       'Y_space', 'E_R', 'B', 'space_shift', 'space_I', 'space_O',
       'space_W', 'V', 'dot', 'space_S', 'O_space', 'O_N', 'N_D', 'E_N',
       'R_space', 'dot_space', 'A_T', 'O_U', 'T_O', 'space_C', 'I_T',
       'space_B', 'A_R', 'E_S', 'H_A', 'I_S', 'V_E', 'comma', 'A_L',
       'space_M', 'space_F', 'comma_space', 'R_I', 'O_R',
       'space_backspace', 'L_E', 'S_T', 'N_T', 'N_G', 'space_P',
       'space_H', 'S_E', 'T_I', 'A_space', 'M_E', 'space_G', 'T_E', 'E_D',
       'F_space', 'L_space', 'E_A', 'K', 'A_S', 'B_E', 'M_A', 'L_L',
       'G_space', '[shift]space', '[shift]I', 'space_R', 'space_D', 'U_N',
       'O_F', 'H_O', 'H_I', 'C_O', 'shift_I', 'N_E', 'space_L', 'R_O',
       'space_N', '

In [308]:
se_sions['UserName'].unique())

SyntaxError: invalid syntax (<ipython-input-308-4ac4dfeedcd5>, line 1)

In [309]:
len(df_sessions['UserName'].unique())

16