In [None]:
from pprint import pprint
from IPython.display import JSON
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from tqdm import tqdm
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from functools import reduce

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow import keras
import pydot

from matplotlib import pyplot as plt
from skimage.draw import line_aa

from IPython.display import Image 

import pickle
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2
 

In [None]:
# Experiment-wide constants.
# NOTE(review): of these, only WINDOW_SIZE is referenced by the cells visible in this
# notebook; the others are presumably leftovers from a data-preparation step — confirm
# before removing them.
NUMBER_OF_USERS = 10000
MAX_SESSIONS_PER_USER = 10
MIN_GESTURES_PER_SESSION = 100
SOME_MAX_USERS = 2000
MIN_GESTURES = 400
NUM_PATHS = 10
GESTURES = ['swipe']
# Number of consecutive gestures per sliding window. Overwritten further down with the
# second dimension of the loaded model's input shape.
WINDOW_SIZE = 20


In [None]:
# Feature configuration; both values are (re)assigned by configure_cases() below.
INPUT_SHAPE = (20, 92, 1)
USER_DERIVED_FEATURES = True
# Dataset identifiers compared against DATASET by the filtering / loading code.
TOUCHALYTICS = 'touchalytics'
BRAINRUN = 'brainrun'
# Active dataset. NOTE: a later cell reassigns this to TOUCHALYTICS.
DATASET = BRAINRUN
# Default for extract_features(use_area_pressure=...); that function currently ignores it.
USE_PRESSURE_AREA = False

def configure_cases(case_number):
    """Select one of the predefined feature configurations.

    Case 1 sets INPUT_SHAPE to (20, 92, 1) and enables USER_DERIVED_FEATURES;
    case 2 sets INPUT_SHAPE to (20, 81, 1) and disables USER_DERIVED_FEATURES.
    Any other value leaves both module-level globals untouched.
    """
    global INPUT_SHAPE, USER_DERIVED_FEATURES

    # (case number, input shape, user-derived features enabled)
    presets = (
        (1, (20, 92, 1), True),
        (2, (20, 81, 1), False),
    )
    for number, shape, derived in presets:
        if case_number == number:
            INPUT_SHAPE = shape
            USER_DERIVED_FEATURES = derived
            break


configure_cases(1)

In [None]:
def pj(json):
    """Pretty-print a JSON-serialisable object with a 4-space indent.

    :param json: object to serialise (the parameter name shadows the ``json`` module,
        which is why ``dumps`` is imported by name below).
    """
    # `dumps` was never imported anywhere in the notebook, so the original raised NameError.
    from json import dumps

    print(dumps(json, indent = 4))

def data_to_array(data):
    """Return the values of ``data`` as a NumPy array, ordered by sorted key."""
    ordered_keys = sorted(data.keys())
    values = [data[key] for key in ordered_keys]
    return np.array(values)

# NOTE(review): this flag is not read by any cell visible in this notebook — possibly dead.
has_input = False

def extract_features(data, delta_time, use_area_pressure = USE_PRESSURE_AREA):
    """Build the positional feature vector of a single gesture.

    The trajectory is the gesture's starting point (``x0``, ``y0`` of the first sample)
    followed by the (``moveX``, ``moveY``) of every sample. The result is the flattened
    first five and last five trajectory points, i.e. 20 values.

    :param data: non-empty sequence of sample dicts with ``x0``/``y0``/``moveX``/``moveY`` keys
    :param delta_time: currently unused
    :param use_area_pressure: currently unused
    :return: 1-D array of 20 values
    """
    trajectory = np.array(
        [(data[0]['x0'], data[0]['y0'])] + [(sample['moveX'], sample['moveY']) for sample in data]
    )

    # No scalar features are computed at the moment, so this is an empty array.
    scalar_features = np.nan_to_num([])

    leading = trajectory[:5].flatten()
    trailing = trajectory[-5:].flatten()
    # In-place resize zero-pads gestures that have fewer than five trajectory points.
    leading.resize((10,))
    trailing.resize((10,))

    return np.concatenate([scalar_features, leading, trailing])
    
def gesture_to_data(c, use_extra_features = True):
    """Flatten one gesture dict into a 1-D feature row.

    The row is ``[t_start, t_stop, (t_stop - t_start) / 1000]`` followed, when
    ``use_extra_features`` is true, by the output of ``extract_features``.
    Timestamps are presumably in milliseconds (hence the division) — TODO confirm.
    """
    start, stop = c['t_start'], c['t_stop']
    duration = (stop - start) / 1000

    if use_extra_features:
        extras = extract_features(c['data'], duration)
    else:
        extras = []

    return np.concatenate([[start, stop, duration], extras])

def window_to_datapoint(window):
    """Convert one window of gesture rows into a model input of shape (rows, cols, 1).

    The first two columns (start and stop time) are dropped and two columns are appended,
    so the column count is unchanged, which is what the final reshape relies on.

    NOTE(review): both appended columns are computed as ``window[1:, 0] - window[1:, 0]``,
    i.e. they are always zero. The inline comment suggests a time difference between
    gestures was intended and the two lines look like a copy-paste of each other. The
    formula is left untouched here because the model loaded elsewhere in this notebook
    was presumably trained on these exact features — confirm before changing it.
    """
    # TODO CHECK AXIS
    return np.concatenate([window[:, 2:],  # drop the start/stop time columns
    np.concatenate([[0], (window[1:, 0] - window[1:, 0]).flatten() / 1000]).reshape(window.shape[0],1), # meant as "window start - initial point stop"; evaluates to zeros
# Second appended column: identical expression to the one above (also all zeros).
np.concatenate([[0], (window[1:, 0] - window[1:, 0]).flatten() / 1000]).reshape(window.shape[0],1)], axis = 1).reshape(window.shape[0], window.shape[1], 1)

def session_to_datapoints(s):
    """Featurise a session and run every sliding window through ``deep_model``.

    Windows of ``WINDOW_SIZE`` consecutive gestures start at offsets
    ``0 .. len(gestures) - WINDOW_SIZE - 1``. Each window is converted with
    ``window_to_datapoint`` and the stacked batch is passed to ``deep_model.predict``.

    :param s: session dict with a ``'gestures'`` list
    :return: whatever ``deep_model.predict`` returns for the batch
    """
    gesture_rows = np.array([gesture_to_data(gesture) for gesture in s['gestures']])

    n_windows = len(gesture_rows) - WINDOW_SIZE
    within_window = np.arange(WINDOW_SIZE)[np.newaxis, :]
    window_starts = np.arange(n_windows)[:, np.newaxis]
    window_indices = within_window + window_starts

    batch = np.array([window_to_datapoint(rows) for rows in gesture_rows[window_indices]])
    return deep_model.predict(batch)

def session_to_dp_with_intruders(session, test_indices, n_intruders, intruders):
    """Embed a session's test windows after overwriting their tail with intruder gestures.

    The gestures selected by ``test_indices`` are cut into sliding windows of
    ``WINDOW_SIZE``. The featurised ``intruders`` are resized (``np.resize``, which
    repeats or truncates the data as needed) to one block of ``n_intruders`` rows per
    window, and the last ``n_intruders`` rows of every window are replaced by that block.
    The windows are then converted with ``window_to_datapoint`` and passed to
    ``deep_model.predict``.

    :param session: session dict with a ``'gestures'`` list
    :param test_indices: indices of the gestures that form the test portion
    :param n_intruders: rows to overwrite at the end of each window (0 = none)
    :param intruders: iterable of raw intruder gesture dicts
    """
    genuine_rows = np.array([gesture_to_data(gesture) for gesture in session['gestures']])[test_indices]

    n_windows = len(genuine_rows) - WINDOW_SIZE
    window_indices = np.arange(WINDOW_SIZE)[np.newaxis, :] + np.arange(n_windows)[:, np.newaxis]
    # Fancy indexing copies, so the overwrite below never touches `genuine_rows`.
    windows = genuine_rows[window_indices]

    intruder_rows = np.array([gesture_to_data(gesture) for gesture in intruders])
    intruder_block = np.resize(intruder_rows, (windows.shape[0], n_intruders, windows.shape[2]))

    # A slice of `-0:` would select the whole window, hence the explicit guard.
    if n_intruders != 0:
        windows[:, -n_intruders:, :] = intruder_block

    batch = np.array([window_to_datapoint(rows) for rows in windows])
    return deep_model.predict(batch)
    

In [None]:
# Toy check of the sliding-window indexing and intruder injection used by
# session_to_dp_with_intruders. Note that it rebinds the module-level names
# `test_data`, `sliding_windows`, `windows` and `intruders`.
test_data = np.arange(1, 11).reshape(5, 2)

window_length = 3
sliding_windows = (
    np.arange(window_length)[np.newaxis, :]
    + np.arange(len(test_data) - window_length)[:, np.newaxis]
)

windows = test_data[sliding_windows]
print(windows)

intr = -np.arange(1, 11).reshape(5, 2)
n_intr = 1
intruders = np.resize(intr, (windows.shape[0], n_intr, windows.shape[2]))
print(intruders)

# Replace the last `n_intr` rows of every window with the intruder rows.
windows[:, -n_intr:, :] = intruders
print(windows)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# NOTE(review): LIMIT_GESTURES_PER_USER is not referenced by any visible cell.
LIMIT_GESTURES_PER_USER = 4000
# Sessions with fewer gestures than this (after prefiltering) are deleted by parse_user*.
SESSION_GESTURES_MORE_THAN = 140
# Screen-name prefixes accepted by prefilter_session when DATASET == BRAINRUN.
good_screens = ['MathisisGame', 'FocusGame']

def prefilter_session(s):
    """Sort a session's gestures by start time and drop the unusable ones, in place.

    A gesture is kept when its duration ``t_stop - t_start`` is strictly between 70 and
    the dataset limit (1000 for BRAINRUN, 2000 otherwise). For BRAINRUN it must also be
    a ``'swipe'`` whose screen name starts with one of ``good_screens``.

    :param s: session dict; ``s['gestures']`` is sorted and then replaced
    """
    s['gestures'].sort(key = lambda x: x['t_start'])

    def keep(gesture):
        duration = gesture['t_stop'] - gesture['t_start']
        if not duration > 70:
            return False
        upper_limit = 1000 if DATASET == BRAINRUN else 2000
        if not duration < upper_limit:
            return False
        if not DATASET == BRAINRUN:
            return True
        return gesture['screen'].split(' ')[0] in good_screens and gesture['type'] == 'swipe'

    s['gestures'] = [gesture for gesture in s['gestures'] if keep(gesture)]

def parse_user(user_id):
    """Prefilter and embed every session of ``users[user_id]`` (first device), in place.

    Each session is prefiltered; sessions left with fewer than
    ``SESSION_GESTURES_MORE_THAN`` gestures are deleted, the others are replaced by the
    output of ``session_to_datapoints``. Because sessions are replaced by arrays, the
    function cannot be run twice on the same data.
    """
    sessions = users[user_id]['devices'][0]['sessions']

    position = 0
    while position < len(sessions):
        current = sessions[position]
        prefilter_session(current)

        if len(current['gestures']) < SESSION_GESTURES_MORE_THAN:
            # Deleting shifts the next session into `position`, so do not advance.
            del sessions[position]
            continue

        sessions[position] = session_to_datapoints(current)
        position += 1

def parse_user_dont_convert(user_id):
    """Prefilter the sessions of ``temp_users[user_id]`` (first device) without embedding.

    Applies the same prefiltering and minimum-length rule as ``parse_user`` but keeps
    the surviving sessions as raw gesture dicts.
    """
    sessions = temp_users[user_id]['devices'][0]['sessions']

    position = 0
    while position < len(sessions):
        current = sessions[position]
        prefilter_session(current)

        if len(current['gestures']) < SESSION_GESTURES_MORE_THAN:
            # Deleting shifts the next session into `position`, so do not advance.
            del sessions[position]
        else:
            position += 1

def get_users_over_gestures(number_of_gestures = 600):
    """Return the indices of users whose sessions hold more than ``number_of_gestures`` rows.

    The count for a user is the sum of ``session.shape[0]`` over the sessions of their
    first device, so ``users`` must already hold arrays (i.e. after ``parse_user``).
    """
    rows_per_user = np.array(
        [
            sum(session.shape[0] for session in users[idx]['devices'][0]['sessions'])
            for idx in range(len(users))
        ],
        dtype=float,
    )
    return np.where(rows_per_user > number_of_gestures)[0]

# Module-level buffers overwritten by every calculate_eer() call with the
# false-rejection / false-acceptance rates per threshold.
frrs = []
fars = []
from scipy.stats import mode

# Length of the majority-vote window used by calculate_eer().
ws = 1
def calculate_eer(res, y_test):
    """Estimate the equal error rate from decision scores with windowed majority voting.

    ``res`` must hold all genuine-user scores first, followed by the intruder scores;
    ``y_test`` marks genuine samples with 1. Each group is cut into sliding windows of
    ``ws`` scores. For every 10th score, taken in descending order as a threshold:

    * FRR is the share of genuine windows whose majority vote falls below the threshold;
    * FAR is the share of intruder windows whose majority vote reaches the threshold.

    The per-threshold rates are stored in the module-level ``frrs`` / ``fars`` arrays.

    :param res: 1-D array of decision scores (genuine block first)
    :param y_test: 1-D array of labels aligned with ``res``; 1 marks genuine samples
    :return: FAR at the first sign change of ``FAR - FRR``, or 1 when there is none
    :raises ValueError: if ``y_test`` contains no genuine (== 1) sample
    """
    global frrs, fars
    frrs = []
    fars = []

    # One past the last genuine sample. The original sliced at the index of the last
    # genuine sample itself, which dropped it from the genuine scores and counted it
    # as an intruder.
    split = max(np.where(y_test == 1)[0]) + 1
    user_results = res[:split]
    intruder_results = res[split:]

    user_windows = user_results[(
        np.expand_dims(np.arange(ws), 0) +
        np.expand_dims(np.arange(len(user_results) - ws), 0).T
    )]

    intruder_windows = intruder_results[(
        np.expand_dims(np.arange(ws), 0) +
        np.expand_dims(np.arange(len(intruder_results) - ws), 0).T
    )]

    desc_scores = np.sort(res)[::-1]

    for threshold in desc_scores[::10]:
        # Cast the boolean votes to int: newer SciPy releases only accept numeric
        # input in `mode`. The resulting means are unchanged.
        accepted_users = (user_windows >= threshold).astype(int)
        rejected_intruders = (intruder_windows < threshold).astype(int)

        FRR = 1 - np.mean(mode(accepted_users, axis=1)[0])
        FAR = 1 - np.mean(mode(rejected_intruders, axis=1)[0])

        frrs.append(FRR)
        fars.append(FAR)

    fars = np.array(fars)
    frrs = np.array(frrs)

    # Indices where FAR - FRR changes sign, i.e. where the two curves cross.
    eer = fars[np.argwhere(np.diff(np.sign(fars - frrs))).flatten()]
    return eer[0] if eer.size > 0 else 1

In [None]:
from copy import deepcopy

# Load a trained feature extractor and the raw user data, then convert every session
# into model outputs (`users`) while keeping a prefiltered raw copy (`temp_users`).
DATASET = TOUCHALYTICS
DATA_DIRECTORY = 'simple_cnn'

# NOTE(review): never appended to in the visible cells.
resulting_data = []

for iteration in range(10):
    # SECURITY: pickle.load executes arbitrary code — only open files from trusted sources.
    with open(f'{DATA_DIRECTORY}/results/iteration_{iteration}.pkl', 'rb') as f:
        u_training, u_testing, *_ = pickle.load(f)
    USERS_USED_FOR_TRAINING_FEAT_EXTRACTOR = u_training
    USERS_USED_FOR_TESTING_GENERALIZATION = u_testing

    # Re-wrap the saved network so that its output is the third-from-last layer
    # (presumably the embedding layer — TODO confirm against the training notebook).
    deep_model = models.load_model(f'{DATA_DIRECTORY}/models/simple_cnn_128_embedding_{iteration}.h5')
    deep_model = models.Model(inputs = deep_model.input, outputs = deep_model.layers[-3].output)

    # Window length now follows the model input. Functions defined earlier with
    # `gap = WINDOW_SIZE` as a default keep the value that was bound at definition time.
    WINDOW_SIZE = deep_model.input.shape[1]

    if DATASET == BRAINRUN:
        with open('brainrun_full_not_parsed.pkl', 'rb') as f:
            users = pickle.load(f)
            # Keep only the users held out for the generalisation test.
            valid_users = []
            for i, user in enumerate(users):
                if i in USERS_USED_FOR_TESTING_GENERALIZATION:
                    valid_users.append(user)

            users = valid_users
        # NOTE(review): `temp_users` is not loaded in this branch, so the
        # parse_user_dont_convert() call below raises NameError for BRAINRUN on a fresh kernel.
    if DATASET == TOUCHALYTICS:
        # The file is read twice to obtain two independent copies: `users` is converted
        # in place below, `temp_users` keeps the raw gestures.
        with open('touchalytics_full_not_parsed.pkl', 'rb') as f:
            users = pickle.load(f)
        with open('touchalytics_full_not_parsed.pkl', 'rb') as f:
            temp_users = pickle.load(f)

    for user in tqdm(range(len(users))):
        parse_user(user)
        parse_user_dont_convert(user)

    # if DATASET == TOUCHALYTICS:
    # From here on `valid_users` is an index array (it was a list of user dicts above).
    valid_users = get_users_over_gestures(140)
    # NOTE(review): this break stops after iteration 0, so the remaining nine
    # models are never loaded and the call below is unreachable even if re-enabled.
    break

    # ceva('simple_cnn_results', iteration, valid_users)
        

In [None]:
import sklearn.metrics

"""
Python compute equal error rate (eer)
ONLY tested on binary classification

:param label: ground-truth label, should be a 1-d list or np.array, each element represents the ground-truth label of one sample
:param pred: model prediction, should be a 1-d list or np.array, each element represents the model prediction of one sample
:param positive_label: the class that is viewed as positive class when computing EER
:return: tuple (eer, threshold): the equal error rate (EER) and the score threshold at which it is reached
"""
def compute_eer(label, pred, positive_label=1):
    """Compute the equal error rate (EER) of a binary scorer from its ROC curve.

    ONLY tested on binary classification.

    :param label: ground-truth labels, 1-D list or np.array, one entry per sample
    :param pred: model scores, 1-D list or np.array, one entry per sample
    :param positive_label: the class treated as positive when building the ROC curve
    :return: tuple ``(eer, threshold)``: the mean of FPR and FNR at the ROC point where
        they are closest, and the score threshold of that point
    """
    # fpr and fnr stay module-level globals, as in the original implementation.
    global fpr, fnr
    # `positive_label` used to be accepted but ignored; it is now forwarded to roc_curve.
    fpr, tpr, threshold = sklearn.metrics.roc_curve(label, pred, pos_label=positive_label)
    fnr = 1 - tpr

    # ROC point where FNR and FPR are closest (NaNs ignored); computed once and reused.
    eer_index = np.nanargmin(np.absolute(fnr - fpr))

    # In theory FPR and FNR coincide at the EER; on a sampled curve they can differ
    # slightly, so their mean is returned.
    eer_1 = fpr[eer_index]
    eer_2 = fnr[eer_index]
    eer = (eer_1 + eer_2) / 2

    return (eer, threshold[eer_index])

In [None]:
# Single-user walkthrough of the experiment that ceva() runs for every user.
# NOTE(review): this cell uses get_train_indices / get_test_indices /
# get_test_indices_custom / get_intruder_size, OneClassSVM and StandardScaler, all of
# which are defined or imported in a LATER cell — it fails under Restart & Run All
# unless that cell is moved above this one.
user_id = 10

max_train_per_session = int(200000 / len(users[int(user_id)]['devices'][0]['sessions']))
max_test_per_session = int(67777777 / len(users[int(user_id)]['devices'][0]['sessions']))
X_train = np.concatenate([session[get_train_indices(session.shape[0], max_size=max_train_per_session)] for session in users[int(user_id)]['devices'][0]['sessions']])
X_test = np.concatenate([session[get_test_indices(session.shape[0], max_size=max_test_per_session)] for session in users[int(user_id)]['devices'][0]['sessions']])

# Cross validation is skipped here: a single fixed hyper-parameter set is used.
hyperparameters = [(100, {'kernel':'rbf', 'gamma':0.8, 'nu': 0.3})]

best_hyperparameters = sorted(hyperparameters, key=lambda x: x[0])[0][1]
# results[user_id]['hyper'] = sorted(hyperparameters, key=lambda x: x[0])
svm = OneClassSVM(
            kernel = best_hyperparameters['kernel'],
            nu=best_hyperparameters['nu'],
            degree = best_hyperparameters['degree'] if 'degree' in best_hyperparameters else 0,
            gamma = best_hyperparameters['gamma'] if 'gamma' in best_hyperparameters else 0,)
scaler = StandardScaler()

svm.fit(scaler.fit_transform(X_train))

# Scores of the genuine test windows followed by the scores of the second
# get_intruder_size() split of every session of every other valid user.
res = np.concatenate([
    svm.decision_function(scaler.transform(X_test)), *[
        svm.decision_function(scaler.transform(session[get_intruder_size(session.shape[0])[1]])) for other_uid in valid_users[valid_users != user_id] for session in users[other_uid]['devices'][0]['sessions']]
])

# +1 for the genuine block, -1 for everything after it.
y_test = np.concatenate([np.zeros((X_test.shape[0],)) + 1, np.zeros((res.shape[0] - X_test.shape[0],)) - 1])

intr_res = {}
# Compute EER and the threshold at which it is reached.
eer, threshold = compute_eer(y_test, res)
print(eer)

# Pool the raw gestures of all other valid users (from temp_users) into one list.
possible_intruders = []
for other_uid in valid_users[valid_users != user_id]:
    for session in temp_users[other_uid]['devices'][0]['sessions']:
        possible_intruders.extend(session['gestures'])

# For 0..10 intruder gestures injected at the end of every test window.
for n_intruders in range(0, 11):
    # Draw n_intruders * len(X_test) intruder gestures at random, without replacement.
    intruders = np.random.choice(possible_intruders, size=n_intruders * X_test.shape[0], replace=False)
    X_test_intruders = np.concatenate([session_to_dp_with_intruders(session, get_test_indices_custom(len(session['gestures']), max_size=max_test_per_session), n_intruders, intruders) for session in temp_users[int(user_id)]['devices'][0]['sessions']])
    r_intr = svm.decision_function(scaler.transform(X_test_intruders))
    # Share of windows rejected at the EER threshold.
    print(np.mean(r_intr < threshold))
    # `acc` was never defined in this cell (NameError on a fresh kernel); it is computed
    # the same way as in ceva(): the share of windows scored above the threshold.
    acc = np.mean(r_intr > threshold)
    intr_res[n_intruders] = [r_intr, acc]
# print(intr_res)


In [None]:
# Inspect the first row of the first converted session of user 0.
users[0]['devices'][0]['sessions'][0][0]

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import OneClassSVM
import itertools
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os
import multiprocessing as mp
import time

# NOTE(review): ceva() divides MAX_TRAIN_SIZE by the session count to get its per-session
# *test* cap, and MAX_TEST_SIZE is not referenced by any visible cell — confirm the intent.
MAX_TRAIN_SIZE = 40000
MAX_TEST_SIZE = 10000

def train_test_data(data, test_size = 0.2, gap = WINDOW_SIZE, max_size = 200):
    """Return ``(train_indices, test_indices)`` for a chronological split of ``data``.

    The split point is ``int(len(data) * (1 - test_size))``. Training indices stop
    ``gap`` positions before it and test indices run from it to the end.
    ``max_size`` is accepted but unused.

    NOTE(review): the original author tagged this function with "ERROR IS HERE"
    without further detail.
    """
    n_samples = len(data)
    split = int(n_samples * (1 - test_size))
    train_indices = np.arange(split - gap)
    test_indices = np.arange(split, n_samples)
    return train_indices, test_indices

def get_train_indices(size, test_size = 0.2, gap = WINDOW_SIZE, max_size = 500):
    """Return the training indices ``0 .. split - gap - 1`` for a session of ``size`` rows.

    ``split`` is ``int(size * (1 - test_size))`` capped at ``max_size``.

    NOTE(review): the original author tagged this function with "ERROR IS HERE". The
    cap is applied to the split point here, whereas get_test_indices caps the session
    size before splitting. When ``size`` exceeds ``max_size`` the two index ranges can
    therefore overlap — confirm whether that is the error meant.
    """
    split = min(int(size * (1 - test_size)), max_size)
    return np.arange(split - gap)

def get_test_indices(size, test_size = 0.2, gap = WINDOW_SIZE, max_size = 500):
    """Return the test indices for a session of ``size`` rows.

    The session length is first capped at ``max_size``; the last ``test_size`` share of
    the capped range is returned. ``gap`` is accepted but unused.
    """
    capped = min(size, max_size)
    first_test = int(capped * (1 - test_size))
    return np.arange(first_test, capped)

def get_test_indices_custom(size, test_size = 0.2, gap = WINDOW_SIZE, max_size = 500):
    """Like get_test_indices, but starting ``gap`` positions earlier.

    The session length is capped at ``max_size`` and the returned range is
    ``[split - gap, capped)`` with ``split = int(capped * (1 - test_size))``.
    """
    capped = min(size, max_size)
    split = int(capped * (1 - test_size))
    first_index = split - gap
    return np.arange(first_index, capped)

def get_intruder_size(size, test_size = 0.2, gap = WINDOW_SIZE, max_size = 500):
    """Split the test portion of a session into two index ranges.

    With the session length capped at ``max_size``, the test portion (last ``test_size``
    share) is halved at ``int(capped * (1 - test_size / 2))``.

    :return: tuple ``(first_half, second_half)``; the first half stops ``gap`` positions
        before the halving point. In the visible code find_hyper() uses element 0 and
        the final evaluation uses element 1.
    """
    capped = min(size, max_size)
    test_start = int(capped * (1 - test_size))
    halfway = int(capped * (1 - test_size / 2))
    first_half = np.arange(test_start, halfway - gap)
    second_half = np.arange(halfway, capped)
    return first_half, second_half

# Parameter space
parameters = [{
    'kernel': ['rbf'], 'gamma': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001], 'nu': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],},
    ]

NUMBER_OF_G = 1000000

def _run_process_pool(target, arg_tuples, n_workers = 32):
    """Run ``target(*args)`` in its own process for every tuple in ``arg_tuples``.

    At most ``n_workers`` processes are alive at once and the pool is polled once per
    second. The progress bar advances when a job is started, not when it finishes.
    """
    running = []
    next_job = 0
    pbar = tqdm(total=len(arg_tuples))

    # Continue until every job has been started AND reaped. The original loops exited
    # as soon as the running list was empty, even if jobs were still waiting to start.
    while running or next_job < len(arg_tuples):
        while len(running) < n_workers and next_job < len(arg_tuples):
            proc = mp.Process(target=target, args=arg_tuples[next_job])
            proc.start()
            running.append(proc)
            pbar.update(1)
            next_job += 1

        # Build a new list instead of removing from the one being iterated,
        # which skips the element after every removal.
        still_running = []
        for proc in running:
            if proc.is_alive():
                still_running.append(proc)
            else:
                proc.join()
        running = still_running

        time.sleep(1)
    pbar.close()


def ceva(fp, iteration, valid_users):
    """Run the One-Class-SVM authentication experiment for one feature-extractor iteration.

    Results are pickled to ``{fp}/iteration_{iteration}/user_{user_id}.pkl``.

    * Users in ``valid_users`` that are 0 or 1: grid search over the module-level
      ``parameters`` (one process per combination, scored by mean EER over a
      TimeSeriesSplit), then a final fit with the best combination. The pickle holds
      ``(y_test, res, hyperparameters, USERS_USED_FOR_TESTING_GENERALIZATION)``.
    * All other valid users: one process per user with fixed hyper-parameters plus the
      intruder-injection evaluation for 1..7 injected gestures. The pickle holds
      ``(y_test, res, hyperparameters, intr_res, USERS_USED_FOR_TESTING_GENERALIZATION)``.

    Reads the module-level ``users``, ``temp_users``, ``parameters``, ``NUMBER_OF_G``,
    ``MAX_TRAIN_SIZE``, ``WINDOW_SIZE`` and ``USERS_USED_FOR_TESTING_GENERALIZATION``.
    The workers are nested functions, so this presumably only works with the ``fork``
    multiprocessing start method — TODO confirm on non-Linux platforms.

    :param fp: output root directory
    :param iteration: iteration number, used in the output path
    :param valid_users: NumPy array of indices into ``users`` (boolean-masked below)
    """
    FILE_PATH = f'{fp}/iteration_{iteration}'
    N_THREADS = 32

    def split_user_data(user_id, users):
        """Return (X_train, X_test, max_test_per_session) for one user's sessions."""
        sessions = users[int(user_id)]['devices'][0]['sessions']
        # CHANGE HERE IF ERROR (original author's marker)
        max_train_per_session = int(NUMBER_OF_G / len(sessions))
        # NOTE(review): the test cap is derived from MAX_TRAIN_SIZE — confirm this is intended.
        max_test_per_session = int(MAX_TRAIN_SIZE / len(sessions))
        X_train = np.concatenate([session[get_train_indices(session.shape[0], max_size=max_train_per_session)] for session in sessions])
        X_test = np.concatenate([session[get_test_indices(session.shape[0], max_size=max_test_per_session)] for session in sessions])
        return X_train, X_test, max_test_per_session

    def make_svm(params):
        """Build a OneClassSVM from a hyper-parameter dict (missing degree/gamma become 0)."""
        return OneClassSVM(
            kernel = params['kernel'],
            nu = params['nu'],
            degree = params['degree'] if 'degree' in params else 0,
            gamma = params['gamma'] if 'gamma' in params else 0,)

    def score_against_others(clf, scaler, genuine, user_id, users, valid_users, split):
        """Scores for `genuine`, then for the chosen get_intruder_size() split of all other valid users."""
        return np.concatenate([
            clf.decision_function(scaler.transform(genuine)), *[
                clf.decision_function(scaler.transform(session[get_intruder_size(session.shape[0])[split]]))
                for other_uid in valid_users[valid_users != user_id]
                for session in users[other_uid]['devices'][0]['sessions']]
        ])

    def make_labels(n_genuine, n_total):
        """Labels aligned with the score vector: +1 for the genuine block, -1 for the rest."""
        return np.concatenate([np.zeros((n_genuine,)) + 1, np.zeros((n_total - n_genuine,)) - 1])

    def fit_and_evaluate(user_id, users, valid_users, X_train, X_test, params):
        """Fit scaler + SVM on X_train and score X_test plus the other users' second split."""
        svm = make_svm(params)
        scaler = StandardScaler()
        svm.fit(scaler.fit_transform(X_train))
        res = score_against_others(svm, scaler, X_test, user_id, users, valid_users, 1)
        y_test = make_labels(X_test.shape[0], res.shape[0])
        return svm, scaler, res, y_test

    def run_experiment_for_user(user_id, users, valid_users, parameters, temp_users):
        """Worker: fixed hyper-parameters, EER, then the intruder-injection sweep."""
        X_train, X_test, max_test_per_session = split_user_data(user_id, users)

        # No cross validation here: one fixed hyper-parameter set.
        hyperparameters = [(100, {'kernel':'rbf', 'gamma':0.8, 'nu': 0.3})]
        best_hyperparameters = sorted(hyperparameters, key=lambda x: x[0])[0][1]

        svm, scaler, res, y_test = fit_and_evaluate(user_id, users, valid_users, X_train, X_test, best_hyperparameters)

        intr_res = {}
        eer, threshold = compute_eer(y_test, res)
        print(eer)

        # Pool the raw gestures of all other valid users.
        possible_intruders = []
        for other_uid in valid_users[valid_users != user_id]:
            for session in temp_users[other_uid]['devices'][0]['sessions']:
                possible_intruders.extend(session['gestures'])

        for n_intruders in range(1, 8):
            intruders = np.random.choice(possible_intruders, size=n_intruders * X_test.shape[0], replace=False)
            # temp_users sessions are raw dicts, so their length is len(session['gestures']);
            # the original called session.shape[0] here, which dicts do not have.
            # The result goes into its own variable so that X_test (used for the sample
            # size above and matching y_test) is not overwritten between iterations.
            X_test_intruders = np.concatenate([
                session_to_dp_with_intruders(
                    session,
                    get_test_indices_custom(len(session['gestures']), max_size=max_test_per_session),
                    n_intruders,
                    intruders)
                for session in temp_users[int(user_id)]['devices'][0]['sessions']])
            r_intr = svm.decision_function(scaler.transform(X_test_intruders))
            acc = np.mean(r_intr > threshold)
            intr_res[n_intruders] = [r_intr, acc]

        with open(f'{FILE_PATH}/user_{user_id}.pkl', 'wb') as f:
            pickle.dump((y_test, res, hyperparameters, intr_res, USERS_USED_FOR_TESTING_GENERALIZATION), f)

    def find_hyper(user_id, hyper_pair, users, valid_users, X_train, hyperparameters):
        """Worker: append (mean CV EER, hyper_pair) for one combination to the shared list."""
        cv = TimeSeriesSplit(n_splits=4, gap = WINDOW_SIZE)
        fold_eers = []
        for train, test in cv.split(X_train):
            clf = make_svm(hyper_pair)
            scaler = StandardScaler()
            clf.fit(scaler.fit_transform(X_train[train]))
            res = score_against_others(clf, scaler, X_train[test], user_id, users, valid_users, 0)
            y_test = make_labels(test.shape[0], res.shape[0])

            # Clamp infinite scores before building the ROC curve.
            res[np.isneginf(res)] = -1000
            res[np.isposinf(res)] = 1000
            # compute_eer returns (eer, threshold); only the EER belongs in the average.
            # The original appended the whole tuple and averaged thresholds into the score.
            fold_eers.append(compute_eer(y_test, res)[0])
        hyperparameters.append((np.mean(fold_eers), hyper_pair))

    os.makedirs(FILE_PATH, exist_ok=True)

    # Phase 1 — grid search, restricted to users 0 and 1.
    # NOTE(review): the original author tagged this restriction with "ERROR HERE".
    for user_id in list(set(valid_users).intersection([0,1])):
        X_train, X_test, _ = split_user_data(user_id, users)

        hyper_grid = list(ParameterGrid(parameters))
        manager = mp.Manager()
        shared_results = manager.list()

        _run_process_pool(
            find_hyper,
            [(user_id, hyper_pair, users, valid_users, X_train, shared_results) for hyper_pair in hyper_grid],
            n_workers = N_THREADS,
        )
        hyperparameters = list(shared_results)

        # Lowest mean EER wins.
        best_hyperparameters = sorted(hyperparameters, key=lambda x: x[0])[0][1]
        _, _, res, y_test = fit_and_evaluate(user_id, users, valid_users, X_train, X_test, best_hyperparameters)

        with open(f'{FILE_PATH}/user_{user_id}.pkl', 'wb') as f:
            pickle.dump((y_test, res, hyperparameters, USERS_USED_FOR_TESTING_GENERALIZATION), f)

        print(compute_eer(y_test, res))

    # Phase 2 — every other valid user, one process each.
    # NOTE(review): the original author tagged this user selection with "ERROR HERE".
    vu = list(set(valid_users).difference([0,1]))
    # temp_users is now passed as well: the worker takes five arguments, the original
    # call supplied four, so every child process died with a TypeError.
    _run_process_pool(
        run_experiment_for_user,
        [(uid, users, valid_users, parameters, temp_users) for uid in vu],
        n_workers = N_THREADS,
    )
    