### Classification, Authentication, and Identification of users based on free-text analysis.

In [None]:
# imports
import os
import pickle
import numpy as np
import pandas as pd 

# Minimum number of shared n-graphs needed before calculate_r_distance
# computes a distance; below it the maximum distance (1) is returned.
minimum_profile_length_r = 10
# Same minimum, used by calculate_a_distance and calculate_a_distance_alt_.
minimum_profile_length_a = 10     


In [None]:
# Per-index counter (31 slots): incremented at position user_i__ every time
# calculate_r_distance / calculate_a_distance returns early because too few
# n-graphs were shared.
user__  = [0] * 31
# NOTE: `global` has no effect at module level; the name is global already.
global user_i__
# Index into user__ that the distance functions currently count against;
# reassigned by classify_user_experiment and authentication_test_legal_connection.
user_i__ = 0

#### Convert raw data to n-graph profiles

In [None]:
def keystrokes_to_digraphs(keystroke_array):
    '''
    Builds the digraphs of a keystroke sequence.

    Every entry of keystroke_array is read as (key, timestamp). Each pair of
    consecutive keystrokes yields one tuple ("<key1>-<key2>", duration), where
    duration is the timestamp of the second keystroke minus the timestamp of
    the first one, rounded to 5 decimals. Sequences with fewer than two
    keystrokes yield an empty list.
    '''
    return [
        (
            "-".join((str(first[0]), str(second[0]))),
            np.round(second[1] - first[1], 5),
        )
        for first, second in zip(keystroke_array, keystroke_array[1:])
    ]

def keystrokes_to_trigraphs(keystroke_array):
    '''
    Builds the trigraphs of a keystroke sequence.

    Every entry of keystroke_array is read as (key, timestamp). Each run of
    three consecutive keystrokes yields one tuple
    ("<key1>-<key2>-<key3>", duration), where duration is the timestamp of the
    third keystroke minus the timestamp of the first one, rounded to 5
    decimals. Sequences with fewer than three keystrokes yield an empty list.
    '''
    windows = zip(keystroke_array, keystroke_array[1:], keystroke_array[2:])
    return [
        (
            "-".join((str(first[0]), str(second[0]), str(third[0]))),
            np.round(third[1] - first[1], 5),
        )
        for first, second, third in windows
    ]

def keystrokes_to_fourgraphs(keystroke_array):
    '''
    Builds the fourgraphs of a keystroke sequence.

    Every entry of keystroke_array is read as (key, timestamp). Each run of
    four consecutive keystrokes yields one tuple
    ("<key1>-<key2>-<key3>-<key4>", duration), where duration is the timestamp
    of the fourth keystroke minus the timestamp of the first one, rounded to 5
    decimals. Sequences with fewer than four keystrokes yield an empty list.
    '''
    windows = zip(
        keystroke_array,
        keystroke_array[1:],
        keystroke_array[2:],
        keystroke_array[3:],
    )
    return [
        (
            "-".join(str(stroke[0]) for stroke in window),
            np.round(window[3][1] - window[0][1], 5),
        )
        for window in windows
    ]


def calculate_mean_for_duplicates(ngraphs):
    '''
    Collapses repeated n-graphs into a single entry with their mean duration.

    ngraphs is a sequence of (key, duration) tuples. The result contains one
    (key, duration) tuple per distinct key, in order of first occurrence:
    a key that occurs once keeps its duration unchanged, a key that occurs
    several times gets the mean of all its durations rounded to 5 decimals.

    The durations are grouped in one pass over the input instead of
    rescanning the whole list for every key.
    '''
    durations_by_key = {}
    for key, duration in ngraphs:
        durations_by_key.setdefault(key, []).append(duration)

    cleaned_ngraphs = []
    for key, durations in durations_by_key.items():
        if len(durations) > 1:
            cleaned_ngraphs.append((key, np.round(np.mean(durations), 5)))
        else:
            # a unique n-graph is passed through without rounding
            cleaned_ngraphs.append((key, durations[0]))
    return cleaned_ngraphs

def create_user_profile(keystroke_sequence):
    '''
    Builds the n-graph profile of one keystroke sample.

    The sample is converted to digraphs, trigraphs and fourgraphs; in each
    list repeated n-graphs are merged by calculate_mean_for_duplicates.
    Returns the tuple (digraphs, trigraphs, fourgraphs).
    '''
    extractors = (
        keystrokes_to_digraphs,
        keystrokes_to_trigraphs,
        keystrokes_to_fourgraphs,
    )
    return tuple(
        calculate_mean_for_duplicates(extract(keystroke_sequence))
        for extract in extractors
    )


def read_file(complete: pd.DataFrame, user: int, set: int) -> list[tuple[str, int]]:
    '''
    Returns the keystrokes recorded for one user and one sample set.

    complete must provide the columns 'user', 'set', 'key' and 'timestamp'.
    Rows whose 'user' equals user and whose 'set' equals set are selected.
    The parameter name `set` shadows the builtin; it is kept because it is
    part of the signature.

    Returns one (str(key), timestamp) tuple per selected row, in row order;
    the timestamp is passed through unchanged. The list is empty when no row
    matches.
    '''
    # evaluate the row filter once and reuse it for both columns
    selected = complete.loc[(complete['user'] == user) & (complete['set'] == set)]

    key_codes = selected['key'].to_list()
    timestamps = selected['timestamp'].to_list()

    return [(str(key), timestamp) for key, timestamp in zip(key_codes, timestamps)]

def read_user_data(complete):
    '''
    Collects the keystrokes of all users, grouped by sample set.

    Users 1..31 and sets 1..15 are read from complete with read_file.
    Returns a list with one entry per user; every entry is a list holding one
    keystroke list per set.
    '''
    return [
        [read_file(complete, user_id, set_id) for set_id in range(1, 16)]
        for user_id in range(1, 32)
    ]

def get_user_profiles(user_data):
    '''
    Converts the raw keystroke samples of all users into n-graph profiles.

    user_data holds one list of keystroke samples per user. Every sample is
    converted with create_user_profile and each resulting n-graph list is
    turned into a dict {n-graph: duration}.

    Returns one dict per user with the keys "digraphs", "trigraphs" and
    "fourgraphs"; each value is a list with one n-graph dict per sample.
    '''
    user_profiles = []
    for samples in user_data:
        profile = {"digraphs": [], "trigraphs": [], "fourgraphs": []}
        for sample in samples:
            digraphs, trigraphs, fourgraphs = create_user_profile(sample)
            profile["digraphs"].append(dict(digraphs))
            profile["trigraphs"].append(dict(trigraphs))
            profile["fourgraphs"].append(dict(fourgraphs))
        user_profiles.append(profile)
    return user_profiles

def create_user_profiles(path_to_userdata, filename):
    '''
    Builds the n-graph profiles of all users and pickles them.

    path_to_userdata is read with pandas.read_csv, converted with
    read_user_data and get_user_profiles, and the resulting list is written
    to filename with pickle.
    '''
    profiles = get_user_profiles(read_user_data(pd.read_csv(path_to_userdata)))
    with open(filename, "wb") as profile_file:
        pickle.dump(profiles, profile_file)
    

#### Methods for calculating R- and A-distances

In [None]:
def calculate_r_distance(reference_ngraphs, evaluation_ngraphs):
    '''
    Returns the r(elative)-distance between two samples.

    Both arguments map n-graph -> duration and must contain exactly the same
    n-graphs (see get_matching_subsets). The n-graphs of each sample are
    ranked by duration; the distance is the sum of the rank differences
    divided by the maximum possible disorder for that number of n-graphs.
    The higher the distance, the bigger the difference between profile and sample.
    The distance lies between 0 and 1 ([0, 1]).

    Returns 1 (maximum distance) when fewer than minimum_profile_length_r
    n-graphs are shared; in that case user__[user_i__] is incremented too.
    Also returns 1, after printing a warning, when the maximum disorder is 0
    (zero or one shared n-graph), because no ratio can be formed then.

    Raises ValueError when the two mappings do not have the same keys.
    '''
    # number of shared n-graphs
    number_of_shared_ngraphs = len(evaluation_ngraphs)

    # check that a minimal number of n-graphs are shared
    # else maximum distance
    if number_of_shared_ngraphs < minimum_profile_length_r:
        user__[user_i__] += 1
        return 1

    # explicit check instead of an assert, which is stripped under -O
    if reference_ngraphs.keys() != evaluation_ngraphs.keys():
        raise ValueError("reference and evaluation must contain the same n-graphs")

    # order reference (user profile) n-graphs by duration; sorted() is stable,
    # so equal durations keep the insertion order of the mapping
    reference_ordered = sorted(reference_ngraphs, key=reference_ngraphs.get)

    # rank of every sample n-graph when ordered by duration; a dict lookup
    # replaces the linear list.index() search per n-graph
    evaluation_rank = {
        ngraph: rank
        for rank, ngraph in enumerate(sorted(evaluation_ngraphs, key=evaluation_ngraphs.get))
    }

    # sum of the distances between n-graph positions in both orderings
    total_disorder = sum(
        abs(evaluation_rank[ngraph] - rank)
        for rank, ngraph in enumerate(reference_ordered)
    )

    # maximum degree of disorder:
    # |V| even => |V|^2 / 2, |V| odd => (|V|^2 - 1) / 2
    if number_of_shared_ngraphs % 2 == 0:
        maximum_disorder = (number_of_shared_ngraphs * number_of_shared_ngraphs) / 2
    else:
        maximum_disorder = ((number_of_shared_ngraphs * number_of_shared_ngraphs) - 1) / 2

    # zero or one shared n-graph (only reachable with a lowered minimum):
    # avoid dividing by zero
    if maximum_disorder == 0:
        print("WARN: No R distance")
        return 1

    return np.round(total_disorder / maximum_disorder, 6)

def calculate_a_distance(reference_ngraphs, evaluation_ngraphs, threshold):
    '''
    Returns the a(bsolute)-distance between two samples.

    Both arguments map n-graph -> duration; every n-graph of
    evaluation_ngraphs must also be a key of reference_ngraphs.
    An n-graph counts as similar when the absolute difference between its two
    durations is at most threshold. The distance is
    1 - (similar n-graphs / shared n-graphs), rounded to 6 decimals.
    The higher the distance, the bigger the difference between profile and sample.
    The distance lies between 0 and 1 ([0, 1]).

    Returns 1 (maximum distance) when fewer than minimum_profile_length_a
    n-graphs are shared; in that case user__[user_i__] is incremented too.

    NOTE(review): the formula quoted by the original comments,
    1 < max(d1, d2) / min(d1, d2) <= t, is a ratio test, and the callers in
    this file pass 1.05, which reads like a ratio. This function compares the
    difference instead; calculate_a_distance_alt_ implements the ratio form.
    Confirm which variant is intended before relying on the results.
    '''
    shared_count = len(evaluation_ngraphs)

    # too few shared n-graphs => maximum distance
    if shared_count < minimum_profile_length_a:
        user__[user_i__] += 1
        return 1

    similar_count = 0
    for ngraph, evaluation_duration in evaluation_ngraphs.items():
        reference_duration = reference_ngraphs[ngraph]
        if abs(evaluation_duration - reference_duration) <= threshold:
            similar_count += 1

    # 1 - (number of similar n-graphs) / (number of shared n-graphs)
    return np.round(1 - (similar_count / shared_count), 6)


def calculate_a_distance_alt_(reference_ngraphs, evaluation_ngraphs, threshold):
    '''
    Returns the a-distance between two samples using the ratio criterion.

    Both arguments map n-graph -> duration; every n-graph of
    evaluation_ngraphs must also be a key of reference_ngraphs.
    An n-graph counts as similar when its two durations differ, the smaller
    one is not 0 and larger / smaller <= threshold
    (1 < max(d1, d2) / min(d1, d2) <= t). Equal durations are not counted.
    The distance is 1 - (similar n-graphs / shared n-graphs), rounded to 6
    decimals.

    Returns 1 when fewer than minimum_profile_length_a n-graphs are shared.
    '''
    shared_count = len(evaluation_ngraphs)

    # too few shared n-graphs => maximum distance
    if shared_count < minimum_profile_length_a:
        return 1

    similar_count = 0
    for ngraph, d2 in evaluation_ngraphs.items():
        d1 = reference_ngraphs[ngraph]

        if d2 > d1:
            smaller, larger = d1, d2
        elif d1 > d2:
            smaller, larger = d2, d1
        else:
            # equal durations fail the strict "1 <" part of the criterion
            continue

        # a zero duration cannot serve as divisor and is never similar
        if smaller != 0 and (larger / smaller) <= threshold:
            similar_count += 1

    return np.round(1 - (similar_count / shared_count), 6)



def get_matching_subsets(ngraphs_a, ngraphs_b):
    '''
        Returns data for each set for n-graphs that are present in both sets.

        The result is (set_a, set_b): two dicts with identical keys, holding
        the durations from ngraphs_a and ngraphs_b respectively.

        Both dicts list the shared n-graphs in the order in which they appear
        in ngraphs_a. Iterating a set intersection instead would make the
        order depend on string hashing, which differs between interpreter
        runs; calculate_r_distance ranks n-graphs with a stable sort, so that
        order decides how equal durations are ranked.
    '''
    # only keep n-graphs that are present in both sets
    set_a = {key: duration for key, duration in ngraphs_a.items() if key in ngraphs_b}
    set_b = {key: ngraphs_b[key] for key in set_a}
    return set_a, set_b

def get_a_distance(ngraph_profile, ngraph_sample):
    '''
    Returns the average a-distance between a set of profiles and a sample.

    For every n-graph dict in ngraph_profile the n-graphs shared with
    ngraph_sample are selected and their a-distance is computed with a
    threshold of 1.05; the mean of those distances is returned.
    '''
    distances = [
        calculate_a_distance(*get_matching_subsets(profile, ngraph_sample), 1.05)
        for profile in ngraph_profile
    ]
    return np.mean(distances)

def get_r_distance(ngraph_profile, ngraph_sample):
    '''
    Returns the average r-distance between a set of profiles and a sample.

    For every n-graph dict in ngraph_profile the n-graphs shared with
    ngraph_sample are selected and their r-distance is computed. The mean of
    the distances, rounded to 5 decimals, is returned; 1 is returned when no
    distance was collected.
    '''
    distances = []
    for profile in ngraph_profile:
        distance = calculate_r_distance(*get_matching_subsets(profile, ngraph_sample))

        # 99 is treated as a "no distance" marker and skipped.
        # NOTE(review): no code path of calculate_r_distance visible in this
        # file returns 99, so this filter looks like a leftover.
        if distance == 99:
            continue
        distances.append(distance)

    if not distances:
        return 1

    return np.round(np.mean(distances), 5)

def get_mean_r_distance(samples_digraphs, samples_trigraps, samples_fourgraphs):
    '''
        Returns the mean r-distance of a profile.

        For each of the three sample lists the r-distance is computed for
        every ordered pair of distinct samples (restricted to the n-graphs
        the two samples share) and averaged. The result is the tuple
        (mean for digraphs, mean for trigraphs, mean for fourgraphs).
    '''

    def mean_pairwise_distance(samples):
        # every combination of two samples, except a sample with itself
        return np.mean([
            calculate_r_distance(*get_matching_subsets(first, second))
            for first_index, first in enumerate(samples)
            for second_index, second in enumerate(samples)
            if first_index != second_index
        ])

    return (
        mean_pairwise_distance(samples_digraphs),
        mean_pairwise_distance(samples_trigraps),
        mean_pairwise_distance(samples_fourgraphs),
    )

def get_mean_a_distance(samples_digraphs, samples_trigraps, samples_fourgraphs):
    '''
        Returns the mean a-distance of a profile.

        For each of the three sample lists the a-distance (threshold 1.05) is
        computed for every ordered pair of distinct samples (restricted to
        the n-graphs the two samples share) and averaged. The result is the
        tuple (mean for digraphs, mean for trigraphs, mean for fourgraphs).
    '''

    def mean_pairwise_distance(samples):
        # every combination of two samples, except a sample with itself
        return np.mean([
            calculate_a_distance(*get_matching_subsets(first, second), 1.05)
            for first_index, first in enumerate(samples)
            for second_index, second in enumerate(samples)
            if first_index != second_index
        ])

    return (
        mean_pairwise_distance(samples_digraphs),
        mean_pairwise_distance(samples_trigraps),
        mean_pairwise_distance(samples_fourgraphs),
    )


def get_a_distances(user_profile_digraph, user_profile_trigraph, user_profile_fourgraph, sample_digraph, sample_trigraph, sample_fourgraph):
    '''
        Returns the a2, a3, a4 distance from sample to user profile.

        Each value is get_a_distance of the profile's n-graph dicts and the
        sample's n-graph dict for digraphs, trigraphs and fourgraphs, in that
        order.
    '''
    profile_sample_pairs = (
        (user_profile_digraph, sample_digraph),
        (user_profile_trigraph, sample_trigraph),
        (user_profile_fourgraph, sample_fourgraph),
    )
    return tuple(get_a_distance(profile, sample) for profile, sample in profile_sample_pairs)

def get_r_distances(user_profile_digraph, user_profile_trigraph, user_profile_fourgraph, sample_digraph, sample_trigraph, sample_fourgraph):
    '''
        Returns the r2, r3, r4 distance from sample to user profile.

        Each value is get_r_distance of the profile's n-graph dicts and the
        sample's n-graph dict for digraphs, trigraphs and fourgraphs, in that
        order.
    '''
    profile_sample_pairs = (
        (user_profile_digraph, sample_digraph),
        (user_profile_trigraph, sample_trigraph),
        (user_profile_fourgraph, sample_fourgraph),
    )
    return tuple(get_r_distance(profile, sample) for profile, sample in profile_sample_pairs)

def calculate_distances(user_profile_digraph, user_profile_trigraph, user_profile_fourgraph, sample_digraph, sample_trigraph, sample_fourgraph):
    '''
        Returns the combination of all distances from sample to user profile.

        The result is a dict keyed by combination name ("a2", "a23", "r234",
        "r2_a2", ..., "r234_a234"); each value is the sum of the named
        a- and r-distances.
    '''
    # (a2, a3, a4): a-distances between user profile and sample
    a2, a3, a4 = get_a_distances(user_profile_digraph, user_profile_trigraph, user_profile_fourgraph, sample_digraph, sample_trigraph, sample_fourgraph)

    # (r2, r3, r4): r-distances between user profile and sample
    r2, r3, r4 = get_r_distances(user_profile_digraph, user_profile_trigraph, user_profile_fourgraph, sample_digraph, sample_trigraph, sample_fourgraph)

    # r-prefix sums; the a-terms are added one by one afterwards so every
    # total is accumulated strictly left to right
    r23 = r2 + r3
    r234 = r2 + r3 + r4

    return {
        "a2": a2,
        "a3": a3,
        "a4": a4,
        "a23": a2 + a3,
        "a24": a2 + a4,
        "a34": a3 + a4,
        "a234": a2 + a3 + a4,
        "r2": r2,
        "r3": r3,
        "r4": r4,
        "r23": r23,
        "r24": r2 + r4,
        "r34": r3 + r4,
        "r234": r234,
        "r2_a2": r2 + a2,
        "r2_a23": r2 + a2 + a3,
        "r2_a24": r2 + a2 + a4,
        "r2_a234": r2 + a2 + a3 + a4,
        "r23_a2": r23 + a2,
        "r23_a23": r23 + a2 + a3,
        "r23_a24": r23 + a2 + a4,
        "r23_a234": r23 + a2 + a3 + a4,
        "r234_a2": r234 + a2,
        "r234_a23": r234 + a2 + a3,
        "r234_a24": r234 + a2 + a4,
        "r234_a234": r234 + a2 + a3 + a4,
    }


In [None]:
def classify_user_experiment(user_profiles_training, user_profile_evaluation, user):
    '''
    Leave-one-out classification experiment for one user.

    For each of the 15 sample sets j of user_profile_evaluation[user] the
    mean a- and r-distances from that sample to every profile in
    user_profiles_training are computed; for the profile with index `user`
    set j itself is left out. For each of 26 distance combinations the
    sample counts as correctly classified when the profile with the smallest
    combined distance has index `user`.

    Side effect: the module-level user_i__ is set to the index of the
    training profile currently compared.

    Returns a 26-tuple with the number of correctly classified samples per
    combination, in this order:
    a2, a3, a4, a23, a24, a34, a234,
    r2, r3, r4, r23, r24, r34, r234,
    r2_a2, r2_a23, r2_a24, r2_a234,
    r23_a2, r23_a23, r23_a24, r23_a234,
    r234_a2, r234_a23, r234_a24, r234_a234.
    '''
    global user_i__

    # userA = {setA1, setA2, setA3, .., setA15}
    # md(userA, sampleX) = [d(setA1, sampleX) + d(setA2, sampleX) +···+ d(setAn, sampleX)]/n
    # classify sampleU => min(md(userA, sampleU), md(userB, sampleU), ...)

    number_of_combinations = 26
    true_positives = [0] * number_of_combinations

    # for each user data set
    for j in range(0, 15):
        # one list per combination, holding the distance to every profile
        distances_per_combination = [[] for _ in range(number_of_combinations)]

        # for each user profile
        for u in range(0, len(user_profiles_training)):
            user_i__ = u

            # n-graph dicts of every set of profile u; the set under test is
            # dropped from the tested user's own profile
            is_tested_user = u == user
            profile_digraphs = [
                ngraphs for i, ngraphs in enumerate(user_profiles_training[u]["digraphs"])
                if not (is_tested_user and i == j)
            ]
            profile_trigraphs = [
                ngraphs for i, ngraphs in enumerate(user_profiles_training[u]["trigraphs"])
                if not (is_tested_user and i == j)
            ]
            profile_fourgraphs = [
                ngraphs for i, ngraphs in enumerate(user_profiles_training[u]["fourgraphs"])
                if not (is_tested_user and i == j)
            ]

            evaluation = user_profile_evaluation[user]

            # average a-distances from set j to profile u
            a_distances = get_a_distances(
                profile_digraphs,
                profile_trigraphs,
                profile_fourgraphs,
                evaluation["digraphs"][j],
                evaluation["trigraphs"][j],
                evaluation["fourgraphs"][j],
            )

            assert len(a_distances) == 3
            a2, a3, a4 = a_distances[0], a_distances[1], a_distances[2]

            # average r-distances from set j to profile u
            r_distances = get_r_distances(
                profile_digraphs,
                profile_trigraphs,
                profile_fourgraphs,
                evaluation["digraphs"][j],
                evaluation["trigraphs"][j],
                evaluation["fourgraphs"][j],
            )
            r2, r3, r4 = r_distances[0], r_distances[1], r_distances[2]

            # all combinations, in the order of the returned tuple; every sum
            # is written out left to right
            combined = (
                a2, a3, a4,
                a2 + a3, a2 + a4, a3 + a4,
                a2 + a3 + a4,
                r2, r3, r4,
                r2 + r3, r2 + r4, r3 + r4,
                r2 + r3 + r4,
                r2 + a2,
                r2 + a2 + a3,
                r2 + a2 + a4,
                r2 + a2 + a3 + a4,
                r2 + r3 + a2,
                r2 + r3 + a2 + a3,
                r2 + r3 + a2 + a4,
                r2 + r3 + a2 + a3 + a4,
                r2 + r3 + r4 + a2,
                r2 + r3 + r4 + a2 + a3,
                r2 + r3 + r4 + a2 + a4,
                r2 + r3 + r4 + a2 + a3 + a4,
            )
            for per_profile, value in zip(distances_per_combination, combined):
                per_profile.append(value)

        # a combination classified set j correctly when the closest profile
        # is the tested user's
        for index, per_profile in enumerate(distances_per_combination):
            if user == np.argmin(np.array(per_profile)):
                true_positives[index] += 1

    return tuple(true_positives)


In [None]:
def classify_user_for_authentication(training_user_profiles, testing_user_sample, user_index, profile_index):
    '''
    Returns the index of the training profile closest to a test sample.

    For every profile in training_user_profiles the r234_a23 distance
    (r2 + r3 + r4 + a2 + a3) to testing_user_sample is computed; the index of
    the smallest one is returned.

    testing_user_sample is a dict with the keys "digraphs", "trigraphs" and
    "fourgraphs". The set profile_index of the profile user_index is left out
    of the training data; indices that match no profile/set (e.g. -1, -1)
    leave nothing out.
    '''
    r234_a23 = []

    for u, training_profile in enumerate(training_user_profiles):
        # n-graph dicts of all sets of profile u, except the set under test
        is_tested_user = user_index == u
        profile_digraphs, profile_trigraphs, profile_fourgraphs = (
            [
                ngraphs for i, ngraphs in enumerate(training_profile[kind])
                if not (is_tested_user and profile_index == i)
            ]
            for kind in ("digraphs", "trigraphs", "fourgraphs")
        )

        # (a2, a3, a4)
        a_distances = get_a_distances(profile_digraphs, profile_trigraphs, profile_fourgraphs, testing_user_sample["digraphs"], testing_user_sample["trigraphs"], testing_user_sample["fourgraphs"])
        # (r2, r3, r4)
        r_distances = get_r_distances(profile_digraphs, profile_trigraphs, profile_fourgraphs, testing_user_sample["digraphs"], testing_user_sample["trigraphs"], testing_user_sample["fourgraphs"])

        #               r2               r3               r4               a2               a3
        r234_a23.append(r_distances[0] + r_distances[1] + r_distances[2] + a_distances[0] + a_distances[1])

    # index (user) with the minimal distance
    return np.argmin(np.array(r234_a23))

In [None]:
def authentication_test_legal_connection(user_profiles_training, user_profiles_evaluation):
    '''
    Runs the legitimate-user authentication test and counts false rejections.

    Each of the 15 samples of every user in user_profiles_evaluation is
    treated as one login attempt of that user (A). An attempt is rejected when

    * classify_user_for_authentication does not return the user's own index
      (false reject by classification), or
    * the sample X is classified correctly but for some other user B
          md(A, X) >= m(A) + 0.5 * (md(B, X) - m(A))
      holds (false reject by distance), where md is the r234_a23 distance of
      X to a profile and m(A) is the mean r234_a23 distance among A's
      remaining samples.

    m(A) is built from r2 + r3 + r4 + a2 + a3. get_mean_a_distance returns
    (digraphs, trigraphs, fourgraphs), so a2 and a3 are its elements 0 and 1;
    elements 1 and 2 (a3 + a4) would not match the r234_a23 value it is
    compared against.

    Side effects: sets the module-level user_i__ to the current user index
    and prints progress plus a summary line.

    Returns (number of attempts, number of false rejections).
    '''
    global user_i__

    # keep track of results
    false_reject_classification = 0
    false_reject_distance = 0
    attempt = 0

    # for each user / userprofile := (digraphs, trigraphs, fourgraphs)
    for index_test_user, test_user in enumerate(user_profiles_evaluation):
        user_i__ = index_test_user

        print("Start legal attempt user " + str(index_test_user))

        # for each user data set
        for sample_test_user in range(0, 15):
            attempt += 1

            sample_digraphs = test_user["digraphs"][sample_test_user]
            sample_trigraphs = test_user["trigraphs"][sample_test_user]
            sample_fourgraphs = test_user["fourgraphs"][sample_test_user]

            # find the training profile the sample is closest to
            classified_user = classify_user_for_authentication(user_profiles_training, {
                "digraphs": sample_digraphs,
                "trigraphs": sample_trigraphs,
                "fourgraphs": sample_fourgraphs
                }, index_test_user, sample_test_user)

            # the user was wrongly classified
            if classified_user != index_test_user:
                false_reject_classification += 1
                continue

            # the user's own sets without the one under test (built once)
            own_digraphs = [s for i, s in enumerate(test_user["digraphs"]) if i != sample_test_user]
            own_trigraphs = [s for i, s in enumerate(test_user["trigraphs"]) if i != sample_test_user]
            own_fourgraphs = [s for i, s in enumerate(test_user["fourgraphs"]) if i != sample_test_user]

            # mean pairwise distances among the remaining sets: (2, 3, 4)-graphs
            m_d_a = get_mean_a_distance(own_digraphs, own_trigraphs, own_fourgraphs)
            m_d_r = get_mean_r_distance(own_digraphs, own_trigraphs, own_fourgraphs)

            # mean r234_a23          r2         r3         r4         a2         a3
            mean_distances_user_A = m_d_r[0] + m_d_r[1] + m_d_r[2] + m_d_a[0] + m_d_a[1]

            # distance of the sample to the user's own remaining sets
            mean_distance_profile_test_user = calculate_distances(
                own_digraphs,
                own_trigraphs,
                own_fourgraphs,
                sample_digraphs,
                sample_trigraphs,
                sample_fourgraphs
            )

            for index_user_B, user_B in enumerate(user_profiles_training):
                # only compare against other users
                if index_user_B == index_test_user:
                    continue

                # distance of the sample to the complete profile of user B
                mean_distance_sample_user_B = calculate_distances(
                    list(user_B["digraphs"]),
                    list(user_B["trigraphs"]),
                    list(user_B["fourgraphs"]),
                    sample_digraphs,
                    sample_trigraphs,
                    sample_fourgraphs
                )

                # reject as soon as the sample is not clearly closer to A than to B
                if mean_distance_profile_test_user["r234_a23"] >= mean_distances_user_A + (0.5 * (mean_distance_sample_user_B["r234_a23"] - mean_distances_user_A)):
                    false_reject_distance += 1
                    break

    print("Attempts: "+ str(attempt) + " FR_Classification: "+ str(false_reject_classification) + " FR_Distance: " + str(false_reject_distance))
    return (attempt, false_reject_classification+false_reject_distance)

In [None]:
def authentication_test_attack(user_profiles_training, user_profile_evaluation):
    """Count false accepts when one user's samples are used to impersonate others.

    Every sample of the attacking user is presented as a log-in attempt for
    each of the remaining users.  An attempt is a false accept when the sample
    is classified as the attacked user AND its "r234_a23" distance to the
    attacked profile stays below the acceptance threshold derived from EVERY
    other competing profile.  This mirrors the rule of the legitimate
    connection test, which rejects as soon as a single competing profile
    violates the threshold; the previous version accepted as soon as a single
    competing profile satisfied it.

    Parameters
    ----------
    user_profiles_training : sequence
        User profiles; each is indexed with the keys "digraphs", "trigraphs"
        and "fourgraphs", each holding one entry per typing sample.  Only the
        first profile is used as attacker.
    user_profile_evaluation : sequence
        User profiles with the same keys; all except the attacker's own index
        are attacked.

    Returns
    -------
    tuple
        ``(attempts, false_accept)``.
    """
    def _profile_copies(user):
        # fresh shallow copies of the stored samples, one list per n-graph size
        return (
            list(user["digraphs"]),
            list(user["trigraphs"]),
            list(user["fourgraphs"]),
        )

    false_accept = 0
    attempts = 0

    # NOTE: only the first training user acts as attacker, and 15 samples per
    # attacker are hard-coded below.
    for index_attack_user, attack_user in enumerate(user_profiles_training[0:1]):
        print("Start attack on user " + str(index_attack_user))
        attacked_users = [user for i, user in enumerate(user_profile_evaluation) if i != index_attack_user]

        for sample_attack in range(0, 15):
            attack_sample = (
                attack_user["digraphs"][sample_attack],
                attack_user["trigraphs"][sample_attack],
                attack_user["fourgraphs"][sample_attack],
            )

            for index_attacked_user, attacked_user in enumerate(attacked_users):
                attempts += 1

                # NOTE(review): none of the arguments depends on the attacked
                # user, so this call could be hoisted out of this loop if
                # classify_user_for_authentication has no side effects -- confirm.
                classified_user = classify_user_for_authentication(attacked_users, {
                    "digraphs": attack_sample[0],
                    "trigraphs": attack_sample[1],
                    "fourgraphs": attack_sample[2]
                }, -1, -1)

                # an attack can only succeed if the sample is attributed to the victim
                if classified_user != index_attacked_user:
                    continue

                m_d_a = get_mean_a_distance(*_profile_copies(attacked_user))
                m_d_r = get_mean_r_distance(*_profile_copies(attacked_user))
                # NOTE(review): combines m_d_r[0..2] with m_d_a[1] and m_d_a[2];
                # confirm these indices match the "r234_a23" measure used below.
                mean_distances_user_A = m_d_r[0] + m_d_r[1] + m_d_r[2] + m_d_a[1] + m_d_a[2]

                mean_distance_profile_test_user = calculate_distances(
                    *_profile_copies(attacked_user), *attack_sample
                )

                for index_user_B, user_B in enumerate(attacked_users):
                    if index_user_B == index_attacked_user:
                        continue

                    mean_distance_sample_user_B = calculate_distances(
                        *_profile_copies(user_B), *attack_sample
                    )

                    # same rejection condition as in the legitimate connection test
                    if mean_distance_profile_test_user["r234_a23"] >= mean_distances_user_A + (0.5 * (mean_distance_sample_user_B["r234_a23"] - mean_distances_user_A)):
                        # one competing profile is enough to reject the attack
                        break
                else:
                    # no competing profile caused a rejection: the attacker is accepted
                    false_accept += 1

    return (attempts, false_accept)

In [None]:
def classify_dataset(path_to_dataset_training, path_to_dataset_evaluation, filename):
    """Run the classification experiment for every user and store the results.

    After each user the accumulated results are written to
    ``./__DATA/<filename>_classification.csv`` and the per-measure sums,
    misclassification counts and error rates (in percent) to
    ``./__DATA/<filename>_classification_performance.csv``.

    Parameters
    ----------
    path_to_dataset_training : str
        Path of the pickled training user profiles.
    path_to_dataset_evaluation : str
        Path of the pickled evaluation user profiles.
    filename : str
        Prefix of the two CSV files that are written.
    """
    # Without this declaration the assignment to user_i__ in the loop below
    # only creates an unused local and the module-level value never changes.
    global user_i__

    # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    # open training data set
    with open(path_to_dataset_training, "rb") as fp:
        user_profiles_training = pickle.load(fp)

    # open eval data set
    with open(path_to_dataset_evaluation, "rb") as fp:
        user_profiles_evaluation = pickle.load(fp)

    # row indices to drop from both sets; currently empty (the previous
    # comment named rows 13, 18 and 26)
    excluded_rows = []
    user_profiles_training = [j for i, j in enumerate(user_profiles_training) if i not in excluded_rows]
    user_profiles_evaluation = [j for i, j in enumerate(user_profiles_evaluation) if i not in excluded_rows]

    # one column per distance measure returned by classify_user_experiment
    measure_columns = [
        'a2','a3','a4','a23','a24','a34', 'a234','r2','r3','r4','r23','r24',
        'r34','r234', 'r2_a2', 'r2_a23', 'r2_a24', 'r2_a234', 'r23_a2',
        'r23_a23', 'r23_a24', 'r23_a234', 'r234_a2', 'r234_a23', 'r234_a24', 'r234_a234'
    ]

    classifications = []
    print("start classfication:")

    # for each user
    for user_index in range(len(user_profiles_training)):
        print("start classification for user " + str(user_index))
        # publish the user currently being processed via the module-level variable
        user_i__ = user_index

        # classify user
        classification = classify_user_experiment(user_profiles_training, user_profiles_evaluation, user_index)
        classifications.append(classification)

        # create dataframe from all results collected so far
        df = pd.DataFrame(classifications, columns=measure_columns)

        # get sum of correct classifications
        sums = df.sum().to_frame()

        # total number of tried classifications (15 per processed user, hard-coded)
        total = (15 * (user_index + 1))

        # calculate misclassifications as (total - successful classifications)
        sums["Missclassifications"] = total - sums.iloc[:,0]

        # calculate error rate in percent
        sums["Error"] = (sums["Missclassifications"] / total) * 100

        # write intermediate results to file after every user
        sums.to_csv('./__DATA/'+ filename + '_classification_performance.csv')
        df.to_csv('./__DATA/' + filename + '_classification.csv')


In [None]:
def authenticate_dataset(path_to_dataset_training, path_to_dataset_evaluation, filename):
    """Run both authentication tests on a dataset and save the outcome as CSV.

    Loads the pickled training and evaluation profiles, runs the legitimate
    connection test and the attack test, and writes the number of attempts
    and the error count of each test to
    ``./__DATA/<filename>_authentication.csv``.
    """
    def _load_profiles(path):
        # read one pickled list of user profiles
        with open(path, "rb") as handle:
            return pickle.load(handle)

    training_profiles = _load_profiles(path_to_dataset_training)
    evaluation_profiles = _load_profiles(path_to_dataset_evaluation)

    # row indices to leave out of both sets (none at the moment)
    skipped_rows = []
    training_profiles = [
        profile for row, profile in enumerate(training_profiles)
        if row not in skipped_rows
    ]
    evaluation_profiles = [
        profile for row, profile in enumerate(evaluation_profiles)
        if row not in skipped_rows
    ]
    # the experiment expects exactly 31 user profiles
    assert len(training_profiles) == 31

    # legitimate attempts first, then the fraudulent ones
    legal_total, legal_failures = authentication_test_legal_connection(
        training_profiles, evaluation_profiles
    )
    attack_total, attack_successes = authentication_test_attack(
        training_profiles, evaluation_profiles
    )

    # one row per error type
    summary = pd.DataFrame(data={
        'Type': ['False Reject', 'False Accept'],
        'Attempts': [legal_total, attack_total],
        'Result': [legal_failures, attack_successes],
    })
    summary.to_csv('./__DATA/' + filename + "_authentication.csv", index=False)

## Execute experiments

In [None]:
# Paths used by the experiments on the original data:
# the pickled user profiles and the raw data set (CSV).
original_data_profiles = './__DATA/original_data_profiles'
original_set = './../freetext/FreeText-Dataset-31-USERS.csv'

In [None]:
# original data
# Create the profile file only when it does not exist yet.
if not os.path.isfile(original_data_profiles):
    create_user_profiles(original_set, original_data_profiles)

# The same profile file serves as training and as evaluation set.
authenticate_dataset(
    path_to_dataset_training=original_data_profiles,
    path_to_dataset_evaluation=original_data_profiles,
    filename="original",
)
classify_dataset(
    path_to_dataset_training=original_data_profiles,
    path_to_dataset_evaluation=original_data_profiles,
    filename="original",
)

In [None]:
# Show the counters collected in user__ (presumably one per user -- confirm)
# and dump them to a text file.
print(user__)

with open('counters_3.txt', 'w+') as f:
    # written as "[v0,v1,...,]" -- every value is followed by a comma
    f.write("[" + "".join(f'{x},' for x in user__) + "]")

# counters at indices 13, 18 and 26; the trailing numbers are the values
# noted by the original author
print(user__[13])  # 40032
print(user__[18])  # 36520
print(user__[26])  # 36884
