# Testing Accuracy with Batches

For each user that we test, group the samples into batches containing one sample per direction.

Test each sample individually, then average the thresholded per-sample predictions across the batch and round the result to obtain a single prediction per batch.

## Code

In [1]:
import os
from random import randint
import numpy as np
import pandas as pd
from keras.layers import Dense, Activation, Input, CuDNNLSTM, Bidirectional, Dropout
from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import OneHotEncoder, StandardScaler

Using TensorFlow backend.


### Data Preparation

In [2]:
# Derive the usernames from the feature-file names.
# Only entries ending in '.csv' are considered; the username is the part of the
# file name that precedes the first '-'. Order follows `train_files`.
def get_usernames(train_files):
    return [name[:name.index('-')] for name in train_files if name.endswith('.csv')]


# Load every '.csv' entry of `files` (resolved against `path`) into a pandas DataFrame.
# Entries with any other extension are skipped; frames come back in the order of `files`.
def getdfs(path, files):
    csv_names = (name for name in files if name.endswith('.csv'))
    return [pd.read_csv(os.path.join(path, name)) for name in csv_names]


# Create sequences out of a dataframe by splitting the sample number into a 2D array.
# For each sample in a given direction: create a list of that row's features, and append that list to the sample
# Remove the sample number from the features as this is not important to the classifier.
def create_sequences(df):
    '''
    Split `df` into one sequence per (direction, sample) pair.

    Goal: [Sample 1: [[features], [features], [features]], Sample 2: [[features], [features], [features]]]

    Directions are visited in order of first appearance in `df`; within each
    direction, samples are visited in order of first appearance in `df`.
    Pairs with no rows are skipped.

    Args:
        df: DataFrame with at least the columns 'Direction' and 'Sample'.

    Returns:
        A list of sequences. Each sequence is a list of rows, and each row is
        the list of column values with the 'Sample' column removed (the
        'Direction' column is kept).
    '''
    sample_ids = df['Sample'].unique()
    directions = df['Direction'].unique()
    sequences = []
    for direction in directions:
        direction_rows = df.loc[df['Direction'] == direction]
        for sample_id in sample_ids:
            sample_rows = direction_rows.loc[direction_rows['Sample'] == sample_id]
            # Remove the sample number before adding to the list
            rows = sample_rows.drop(columns=['Sample']).values.tolist()
            if rows:
                sequences.append(rows)
    return sequences


# One-hot encode the direction variable of every feature vector in every sequence of x.
# The nested [sequence][feature vector] layout of the input is preserved.
def encode(x):
    return [[one_hot_encode(step) for step in seq] for seq in x]


# One hot encode the direction variable only.
# The direction is the first variable in the feature vec, so base the one hot encoding on this.
# If the direction value is 0 (used for padding only) then use all zeros not an encoding
def one_hot_encode(feature_vec, n_labels=8):
    """Replace the leading direction value with an n_labels-wide one-hot code.

    Args:
        feature_vec: Sequence whose first element is the direction (1-based);
            the remaining elements are passed through unchanged. The input is
            not modified.
        n_labels: Number of direction classes, i.e. the width of the encoding.

    Returns:
        A new list: the encoding followed by feature_vec[1:]. A direction
        below 1 yields an all-zero encoding.

    Raises:
        IndexError: if feature_vec is empty or the direction exceeds n_labels.
    """
    features = list(feature_vec)
    direction = features[0]

    if direction < 1:
        # Padding rows carry direction 0 and get no active class.
        encoding = [0] * n_labels
    else:
        encoding = [0.0] * n_labels
        encoding[int(direction) - 1] = 1.0

    return encoding + features[1:]


# Create a test set similar to the create_test_set method
# This version uses ALL test data for every user, not just a subset.
def create_full_test(username, usernames, dfs):
    """Build the full test set for `username`.

    Args:
        username: The target user; their batches are labelled 1.
        usernames: List of all usernames, positionally aligned with `dfs`.
        dfs: List of per-user DataFrames, one per entry of `usernames`.

    Returns:
        (X, Y) as numpy arrays. X holds one entry per batch, each batch padded
        to 50 steps and one-hot encoded; Y holds the 0/1 label per batch.

    Raises:
        ValueError: if `username` is not in `usernames`.
    """
    # Create the positive examples. Set y to 1
    pos_batches = batchify(dfs[usernames.index(username)], is_pos=True)

    # Create the negative examples: every batch of every other user.
    # enumerate() keeps each name paired with its own frame, avoiding a
    # repeated linear usernames.index() lookup per user.
    neg_batches = []
    for idx, user in enumerate(usernames):
        if user != username:
            neg_batches += batchify(dfs[idx], is_pos=True)

    batches = pos_batches + neg_batches
    ys = [1] * len(pos_batches) + [0] * len(neg_batches)

    # Padding rows are all zeros; 10 presumably matches the per-step feature
    # count of the csv files -- TODO confirm against the data.
    pad_value = [0] * 10
    x = [encode(pad_sequences(batch, maxlen=50, dtype='float32', value=pad_value))
         for batch in batches]
    return np.asarray(x), np.asarray(ys)
    


def create_test_set(username, usernames, dfs):
    """Build the (sub-sampled) test set for `username`.

    Positives are all batches of the target user. Negatives are the batches
    that batchify(..., is_pos=False) draws at random from every other user.

    Args:
        username: The target user; their batches are labelled 1.
        usernames: List of all usernames, positionally aligned with `dfs`.
        dfs: List of per-user DataFrames, one per entry of `usernames`.

    Returns:
        (X, Y) as numpy arrays. X holds one entry per batch, each batch padded
        to 50 steps and one-hot encoded; Y holds the 0/1 label per batch.

    Raises:
        ValueError: if `username` is not in `usernames`.
    """
    # Create the positive examples. Set y to 1
    pos_batches = batchify(dfs[usernames.index(username)], is_pos=True)

    # Create the negative examples.
    # enumerate() keeps each name paired with its own frame, avoiding a
    # repeated linear usernames.index() lookup per user.
    neg_batches = []
    for idx, user in enumerate(usernames):
        if user != username:
            neg_batches += batchify(dfs[idx], is_pos=False)

    batches = pos_batches + neg_batches
    ys = [1] * len(pos_batches) + [0] * len(neg_batches)

    # Padding rows are all zeros; 10 presumably matches the per-step feature
    # count of the csv files -- TODO confirm against the data.
    pad_value = [0] * 10
    x = [encode(pad_sequences(batch, maxlen=50, dtype='float32', value=pad_value))
         for batch in batches]
    return np.asarray(x), np.asarray(ys)
 


def batchify(dataframe, is_pos=True, num_neg_batches=4):
    """Group a user's samples into batches holding one sample per direction.

    Args:
        dataframe: DataFrame with a 'Direction' column taking values 1-8.
        is_pos: When True, use every sample; the number of batches is the
            smallest per-direction sample count. When False, draw
            `num_neg_batches` samples per direction at random (with
            replacement) and build exactly that many batches.
        num_neg_batches: Number of batches to draw when `is_pos` is False.

    Returns:
        A list of batches; each batch is a list with one sequence per
        direction, ordered by direction 1..8.

    Raises:
        ValueError: if `is_pos` is False and some direction has no samples.
    """
    direction_seqs = []
    for direction in range(1, 9):
        seqs = create_sequences(dataframe.loc[dataframe['Direction'] == direction])

        if not is_pos:
            if not seqs:
                # randint(0, -1) would fail with an opaque message; say what is missing.
                raise ValueError(
                    f'no samples for direction {direction}; cannot draw negative batches')
            # Randomly choose samples for each direction for a user.
            seqs = [seqs[randint(0, len(seqs) - 1)] for _ in range(num_neg_batches)]
        direction_seqs.append(seqs)

    # Batch 1 sample per direction into a batch. zip() stops at the shortest
    # direction list, so only complete batches are produced.
    return [list(batch) for batch in zip(*direction_seqs)]

        
# Find the number of batches in a series.
def find_num_batches(seqs):
    """Return the number of complete batches that `seqs` can supply.

    Args:
        seqs: Iterable of sized collections, one per direction.

    Returns:
        The length of the shortest collection, or 0 when `seqs` is empty
        (no directions means no batches).
    """
    return min((len(s) for s in seqs), default=0)




### Model Testing

In [3]:
# Turn a probability into a hard prediction: 1 when p reaches the threshold, else 0.
def convert(p, threshold):
    if p >= threshold:
        return 1
    return 0


# Given a set of 0/1 predictions and true Y-Values,
# count the confusion-matrix cells,
# determine accuracy and return dictionary with tracked metrics.
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or the sentinel -1 when the denominator is zero."""
    try:
        return numerator / denominator
    except ZeroDivisionError:
        return -1


def calc_accuracy_metrics(y_preds, y_actual):
    """Compute FAR, FRR, precision, recall, F1 and accuracy.

    Args:
        y_preds: Iterable of predicted labels (1 = accept, anything else = reject).
        y_actual: Iterable of true labels, paired positionally with `y_preds`.

    Returns:
        Dict with keys 'far', 'frr', 'precision', 'recall', 'f1', 'accuracy'.
        Any metric whose denominator is zero is reported as -1, including
        accuracy when there are no samples at all.
    """
    tp = tn = fp = fn = 0
    for pred, actual in zip(y_preds, y_actual):
        if pred == 1:
            if actual == 1:
                tp += 1
            else:
                fp += 1
        elif actual == 0:
            tn += 1
        else:
            fn += 1

    far = _safe_ratio(fp, fp + tn)
    frr = _safe_ratio(fn, fn + tp)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    # F1 is derived from precision/recall as-is (sentinels included), so it
    # is -1 only when their sum is zero or both are undefined.
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)

    return {'far': far, 'frr': frr, 'precision': precision, 'recall': recall,
            'f1': f1_score, 'accuracy': accuracy}


# Get the predictions from the classifier.
# Returns whatever clf.predict yields for test_x (batch size 10, no progress output).
def get_predictions(clf, test_x):
    return clf.predict(test_x, batch_size=10, verbose=False)


def test_batches(clf, test_x, thresh):
    """Predict one label per batch.

    Every sample in a batch is scored by the classifier and thresholded to
    0/1; the batch label is the rounded mean of those votes. Python's round()
    resolves an exact 0.5 share to 0.
    """
    batch_labels = []
    for samples in test_x:
        votes = [convert(prob, thresh) for prob in get_predictions(clf, samples)]
        batch_labels.append(round(sum(votes) / len(votes)))
    return batch_labels


def test_overall_acc(accuracy_dicts):
    """Print the mean of each metric over the per-user result dicts.

    Args:
        accuracy_dicts: Iterable of dicts holding the keys 'far', 'frr',
            'precision', 'recall', 'f1' and 'accuracy'.

    The averages are taken over the dicts actually passed in rather than over
    the global username list, so a subset of users is averaged correctly.
    An empty input prints zeros.

    NOTE(review): -1 sentinel values in the dicts are averaged like any other
    value, which drags the reported means down -- confirm whether they should
    be excluded.
    """
    accuracy_dicts = list(accuracy_dicts)
    count = len(accuracy_dicts) or 1  # avoid dividing by zero on empty input

    labelled_keys = [('FAR', 'far'), ('FRR', 'frr'), ('Precision', 'precision'),
                     ('Recall', 'recall'), ('F1-Score', 'f1'), ('Accuracy', 'accuracy')]
    # Compute everything before printing so a malformed dict fails cleanly.
    averages = [(label, sum(d.get(key) for d in accuracy_dicts) / count)
                for label, key in labelled_keys]

    print('Overall Accuracy - Average of Individual Models:\n')
    for label, value in averages:
        print('{} : {}'.format(label, value))

### Create Test Sets for each user

In [4]:
# Locations of the test feature files and the saved per-user models,
# both resolved relative to the current working directory.
test_path = os.path.join(os.getcwd(), 'RNN-Test-Aligned')
model_save_path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
test_files = os.listdir(test_path)

# usernames and test_dfs are built from the same file list, so entry i of one
# corresponds to entry i of the other (create_test_set relies on this).
usernames = get_usernames(test_files)
test_dfs = getdfs(test_path, test_files)

# Per-user decision thresholds. NOTE(review): not referenced in the visible
# part of this notebook -- presumably carried over from an earlier run.
user_thresholds = {
    'Amanda11': 0.06, 
    'Benjamin1': 0.13, 
    'Blake1': 0.01, 
    'Chloe1': 0.3, 
    'Conner1': 0.16, 
    'Cormac1': 0.2, 
    'David1': 0.004,
    'Ian1': 0.3, 
    'Ian2': 0.4, 
    'Jacob1': 0.5, 
    'Jamison1': 0.35, 
    'John1': 0.2, 
    'Jonah1': 0.018, 
    'SamP1': 0.2, 
    'Theo1': 0.05    
}

# Column labels -- presumably the schema of the feature csv files; not
# referenced in the visible part of this notebook.
col_labels = ['Direction', 'Sample', 'Sep', 'Mouse Speed', 'Eye Speed', 'Mouse Angle', 'Eye Angle', 'Mouse I-Angle', 'Eye I-Angle',
              'Mouse Region', 'Eye Region', 'Y-Value']

# Build one (X, Y) test set per user, stored in the same order as usernames.
testing_xs = []
testing_ys = []
for user in usernames:
    xtest, ytest = create_test_set(user, usernames, test_dfs)
    testing_xs.append(xtest)
    testing_ys.append(ytest)

#### Looking at the total number of batches per user. 

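For the development set, each non-target user contributes 4 randomly drawn batches as negatives, while the target user contributes all of their batches as positives.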

In [5]:
# Count the batches (positives plus negatives) in each user's test set.
batch_lens = []
for user in testing_xs:
    batch_lens.append(len(user))
    
# Show the per-user counts and their mean.
print(batch_lens)
print(f'Avg Batch Length: {sum(batch_lens)/len(batch_lens)}')

[64, 63, 65, 65, 65, 63, 66, 64, 64, 60, 61, 64, 68, 63, 67]
Avg Batch Length: 64.13333333333334


## Test The Models

### Looking at each user individually to update thresholds

#### Amanda11
Best Threshold 
0.05 for both

In [6]:
# Threshold sweep for the user at index 0: select that user's test set.
username = usernames[0]
test_x = testing_xs[0]
test_y = testing_ys[0]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.01 .. 0.09 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(1, 10):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR AMANDA11
Threshold 0.01
far: 0.16071428571428573
frr: 0.0
precision: 0.47058823529411764
recall: 1.0
f1: 0.6399999999999999
accuracy: 0.859375

Threshold 0.02
far: 0.10714285714285714
frr: 0.0
precision: 0.5714285714285714
recall: 1.0
f1: 0.7272727272727273
accuracy: 0.90625

Threshold 0.03
far: 0.05357142857142857
frr: 0.0
precision: 0.7272727272727273
recall: 1.0
f1: 0.8421052631578948
accuracy: 0.953125

Threshold 0.04
far: 0.05357142857142857
frr: 0.0
precision: 0.7272727272727273
recall: 1.0
f1: 0.8421052631578948
accuracy: 0.953125

Threshold 0.05
far: 0.017857142857142856
frr: 0.0
precision: 0.8888888888888888
recall: 1.0
f1: 0.9411764705882353
accuracy: 0.984375

Threshold 0.06
far: 0.017857142857142856
frr: 0.125
precision: 0.875
recall: 0.875
f1: 0.875
accuracy: 0.96875

Threshold 0.07
far: 0.017857142857142856
frr: 0.125
precision: 0.875
recall: 0.875
f1: 0.875
accuracy: 0.96875

Threshold 0.08
far: 0.017857142857142856
frr: 0.125
precision: 0.875
recall: 0.875


#### Benjamin1
Best Threshold EER 0.5 F 0.8

In [7]:
# Threshold sweep for the user at index 1: select that user's test set.
username = usernames[1]
test_x = testing_xs[1]
test_y = testing_ys[1]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.1 .. 0.9 (step 0.1).
print(f'ACCURACY FOR {username.upper()}')
for i in range(1, 10):
    thresh = 0.1 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR BENJAMIN1
Threshold 0.1
far: 0.5357142857142857
frr: 0.0
precision: 0.1891891891891892
recall: 1.0
f1: 0.3181818181818182
accuracy: 0.5238095238095238

Threshold 0.2
far: 0.375
frr: 0.14285714285714285
precision: 0.2222222222222222
recall: 0.8571428571428571
f1: 0.35294117647058826
accuracy: 0.6507936507936508

Threshold 0.30000000000000004
far: 0.21428571428571427
frr: 0.14285714285714285
precision: 0.3333333333333333
recall: 0.8571428571428571
f1: 0.48
accuracy: 0.7936507936507936

Threshold 0.4
far: 0.16071428571428573
frr: 0.14285714285714285
precision: 0.4
recall: 0.8571428571428571
f1: 0.5454545454545455
accuracy: 0.8412698412698413

Threshold 0.5
far: 0.14285714285714285
frr: 0.14285714285714285
precision: 0.42857142857142855
recall: 0.8571428571428571
f1: 0.5714285714285714
accuracy: 0.8571428571428571

Threshold 0.6000000000000001
far: 0.10714285714285714
frr: 0.14285714285714285
precision: 0.5
recall: 0.8571428571428571
f1: 0.631578947368421
accuracy: 0.888888888

#### Blake
Best Threshold 0.5 Both

In [22]:
# Threshold sweep for the user at index 2: select that user's test set.
username = usernames[2]
test_x = testing_xs[2]
test_y = testing_ys[2]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.50 .. 0.59 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(50, 60):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR BLAKE1
Threshold 0.5
far: 0.125
frr: 0.0
precision: 0.5625
recall: 1.0
f1: 0.72
accuracy: 0.8923076923076924

Threshold 0.51
far: 0.10714285714285714
frr: 0.0
precision: 0.6
recall: 1.0
f1: 0.7499999999999999
accuracy: 0.9076923076923077

Threshold 0.52
far: 0.10714285714285714
frr: 0.1111111111111111
precision: 0.5714285714285714
recall: 0.8888888888888888
f1: 0.6956521739130435
accuracy: 0.8923076923076924

Threshold 0.53
far: 0.07142857142857142
frr: 0.1111111111111111
precision: 0.6666666666666666
recall: 0.8888888888888888
f1: 0.761904761904762
accuracy: 0.9230769230769231

Threshold 0.54
far: 0.03571428571428571
frr: 0.3333333333333333
precision: 0.75
recall: 0.6666666666666666
f1: 0.7058823529411765
accuracy: 0.9230769230769231

Threshold 0.55
far: 0.017857142857142856
frr: 0.5555555555555556
precision: 0.8
recall: 0.4444444444444444
f1: 0.5714285714285714
accuracy: 0.9076923076923077

Threshold 0.56
far: 0.017857142857142856
frr: 0.6666666666666666
precision: 0.75


#### Chloe
Best Threshold 0.15 Both

In [24]:
# Threshold sweep for the user at index 3: select that user's test set.
username = usernames[3]
test_x = testing_xs[3]
test_y = testing_ys[3]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.10 .. 0.19 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(10, 20):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR CHLOE1
Threshold 0.1
far: 0.14285714285714285
frr: 0.0
precision: 0.5294117647058824
recall: 1.0
f1: 0.6923076923076924
accuracy: 0.8769230769230769

Threshold 0.11
far: 0.14285714285714285
frr: 0.1111111111111111
precision: 0.5
recall: 0.8888888888888888
f1: 0.64
accuracy: 0.8615384615384616

Threshold 0.12
far: 0.125
frr: 0.1111111111111111
precision: 0.5333333333333333
recall: 0.8888888888888888
f1: 0.6666666666666667
accuracy: 0.8769230769230769

Threshold 0.13
far: 0.10714285714285714
frr: 0.1111111111111111
precision: 0.5714285714285714
recall: 0.8888888888888888
f1: 0.6956521739130435
accuracy: 0.8923076923076924

Threshold 0.14
far: 0.10714285714285714
frr: 0.1111111111111111
precision: 0.5714285714285714
recall: 0.8888888888888888
f1: 0.6956521739130435
accuracy: 0.8923076923076924

Threshold 0.15
far: 0.10714285714285714
frr: 0.1111111111111111
precision: 0.5714285714285714
recall: 0.8888888888888888
f1: 0.6956521739130435
accuracy: 0.8923076923076924

Threshold 

#### Connor
Best Threshold 0.1 Both

In [10]:
# Threshold sweep for the user at index 4: select that user's test set.
username = usernames[4]
test_x = testing_xs[4]
test_y = testing_ys[4]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.05 .. 0.20 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(5, 21):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR CONNER1
Threshold 0.05
far: 0.14285714285714285
frr: 0.0
precision: 0.5294117647058824
recall: 1.0
f1: 0.6923076923076924
accuracy: 0.8769230769230769

Threshold 0.06
far: 0.08928571428571429
frr: 0.0
precision: 0.6428571428571429
recall: 1.0
f1: 0.782608695652174
accuracy: 0.9230769230769231

Threshold 0.07
far: 0.05357142857142857
frr: 0.0
precision: 0.75
recall: 1.0
f1: 0.8571428571428571
accuracy: 0.9538461538461539

Threshold 0.08
far: 0.03571428571428571
frr: 0.0
precision: 0.8181818181818182
recall: 1.0
f1: 0.9
accuracy: 0.9692307692307692

Threshold 0.09
far: 0.03571428571428571
frr: 0.0
precision: 0.8181818181818182
recall: 1.0
f1: 0.9
accuracy: 0.9692307692307692

Threshold 0.1
far: 0.03571428571428571
frr: 0.0
precision: 0.8181818181818182
recall: 1.0
f1: 0.9
accuracy: 0.9692307692307692

Threshold 0.11
far: 0.03571428571428571
frr: 0.0
precision: 0.8181818181818182
recall: 1.0
f1: 0.9
accuracy: 0.9692307692307692

Threshold 0.12
far: 0.03571428571428571
frr: 0.

#### Cormac
Best Threshold 0.4 for both

In [11]:
# Threshold sweep for the user at index 5: select that user's test set.
username = usernames[5]
test_x = testing_xs[5]
test_y = testing_ys[5]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.1 .. 0.9 (step 0.1).
print(f'ACCURACY FOR {username.upper()}')
for i in range(1, 10):
    thresh = 0.1 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR CORMAC1
Threshold 0.1
far: 0.07142857142857142
frr: 0.0
precision: 0.6363636363636364
recall: 1.0
f1: 0.7777777777777778
accuracy: 0.9365079365079365

Threshold 0.2
far: 0.017857142857142856
frr: 0.0
precision: 0.875
recall: 1.0
f1: 0.9333333333333333
accuracy: 0.9841269841269841

Threshold 0.30000000000000004
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.4
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.5
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.6000000000000001
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.7000000000000001
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.8
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.9
far: 0.0
frr: 0.2857142857142857
precision: 1.0
recall: 0.7142857142857143
f1: 0.8333333333333333
accuracy: 0.9682539682539683



#### David
Best Threshold 0.2 for both

In [12]:
# Threshold sweep for the user at index 6: select that user's test set.
username = usernames[6]
test_x = testing_xs[6]
test_y = testing_ys[6]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.1 .. 0.9 (step 0.1).
print(f'ACCURACY FOR {username.upper()}')
for i in range(1, 10):
    thresh = 0.1 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR DAVID1
Threshold 0.1
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.2
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.30000000000000004
far: 0.0
frr: 0.3
precision: 1.0
recall: 0.7
f1: 0.8235294117647058
accuracy: 0.9545454545454546

Threshold 0.4
far: 0.0
frr: 0.5
precision: 1.0
recall: 0.5
f1: 0.6666666666666666
accuracy: 0.9242424242424242

Threshold 0.5
far: 0.0
frr: 0.5
precision: 1.0
recall: 0.5
f1: 0.6666666666666666
accuracy: 0.9242424242424242

Threshold 0.6000000000000001
far: 0.0
frr: 0.8
precision: 1.0
recall: 0.2
f1: 0.33333333333333337
accuracy: 0.8787878787878788

Threshold 0.7000000000000001
far: 0.0
frr: 0.8
precision: 1.0
recall: 0.2
f1: 0.33333333333333337
accuracy: 0.8787878787878788

Threshold 0.8
far: 0.0
frr: 0.9
precision: 1.0
recall: 0.1
f1: 0.18181818181818182
accuracy: 0.8636363636363636

Threshold 0.9
far: 0.0
frr: 1.0
precision: -1
recall: 0.0
f1: 0.0
accuracy: 0.8484848484848485


#### Ian1
Best Threshold EER: 0.45 F: 0.5

In [27]:
# Threshold sweep for the user at index 7: select that user's test set.
username = usernames[7]
test_x = testing_xs[7]
test_y = testing_ys[7]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.49 .. 0.59 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(49, 60):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR IAN1
Threshold 0.49
far: 0.05357142857142857
frr: 0.125
precision: 0.7
recall: 0.875
f1: 0.7777777777777777
accuracy: 0.9375

Threshold 0.5
far: 0.05357142857142857
frr: 0.125
precision: 0.7
recall: 0.875
f1: 0.7777777777777777
accuracy: 0.9375

Threshold 0.51
far: 0.05357142857142857
frr: 0.25
precision: 0.6666666666666666
recall: 0.75
f1: 0.7058823529411765
accuracy: 0.921875

Threshold 0.52
far: 0.05357142857142857
frr: 0.25
precision: 0.6666666666666666
recall: 0.75
f1: 0.7058823529411765
accuracy: 0.921875

Threshold 0.53
far: 0.05357142857142857
frr: 0.25
precision: 0.6666666666666666
recall: 0.75
f1: 0.7058823529411765
accuracy: 0.921875

Threshold 0.54
far: 0.03571428571428571
frr: 0.25
precision: 0.75
recall: 0.75
f1: 0.75
accuracy: 0.9375

Threshold 0.55
far: 0.017857142857142856
frr: 0.25
precision: 0.8571428571428571
recall: 0.75
f1: 0.7999999999999999
accuracy: 0.953125

Threshold 0.56
far: 0.0
frr: 0.25
precision: 1.0
recall: 0.75
f1: 0.8571428571428571
accur

#### Ian2
Best Threshold 0.05 for both

In [28]:
# Threshold sweep for the user at index 8: select that user's test set.
username = usernames[8]
test_x = testing_xs[8]
test_y = testing_ys[8]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.01 .. 0.09 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(1, 10):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR IAN2
Threshold 0.01
far: 0.03571428571428571
frr: 0.0
precision: 0.8
recall: 1.0
f1: 0.888888888888889
accuracy: 0.96875

Threshold 0.02
far: 0.017857142857142856
frr: 0.0
precision: 0.8888888888888888
recall: 1.0
f1: 0.9411764705882353
accuracy: 0.984375

Threshold 0.03
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.04
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.05
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.06
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.07
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.08
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.09
far: 0.0
frr: 0.125
precision: 1.0
recall: 0.875
f1: 0.9333333333333333
accuracy: 0.984375



#### Jacob1
Best Threshold 0.6 for both

In [30]:
# Threshold sweep for the user at index 9: select that user's test set.
username = usernames[9]
test_x = testing_xs[9]
test_y = testing_ys[9]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.60 .. 0.69 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(60, 70):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR JACOB1
Threshold 0.6
far: 0.05357142857142857
frr: 0.0
precision: 0.5714285714285714
recall: 1.0
f1: 0.7272727272727273
accuracy: 0.95

Threshold 0.61
far: 0.05357142857142857
frr: 0.0
precision: 0.5714285714285714
recall: 1.0
f1: 0.7272727272727273
accuracy: 0.95

Threshold 0.62
far: 0.05357142857142857
frr: 0.0
precision: 0.5714285714285714
recall: 1.0
f1: 0.7272727272727273
accuracy: 0.95

Threshold 0.63
far: 0.03571428571428571
frr: 0.0
precision: 0.6666666666666666
recall: 1.0
f1: 0.8
accuracy: 0.9666666666666667

Threshold 0.64
far: 0.017857142857142856
frr: 0.0
precision: 0.8
recall: 1.0
f1: 0.888888888888889
accuracy: 0.9833333333333333

Threshold 0.65
far: 0.017857142857142856
frr: 0.25
precision: 0.75
recall: 0.75
f1: 0.75
accuracy: 0.9666666666666667

Threshold 0.66
far: 0.0
frr: 0.5
precision: 1.0
recall: 0.5
f1: 0.6666666666666666
accuracy: 0.9666666666666667

Threshold 0.67
far: 0.0
frr: 0.5
precision: 1.0
recall: 0.5
f1: 0.6666666666666666
accuracy: 0.966666

#### Jamison1
Best Threshold 0.28 for both

In [31]:
# Threshold sweep for the user at index 10: select that user's test set.
username = usernames[10]
test_x = testing_xs[10]
test_y = testing_ys[10]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.25 .. 0.39 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(25, 40):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR JAMISON1
Threshold 0.25
far: 0.017857142857142856
frr: 0.0
precision: 0.8333333333333334
recall: 1.0
f1: 0.9090909090909091
accuracy: 0.9836065573770492

Threshold 0.26
far: 0.017857142857142856
frr: 0.0
precision: 0.8333333333333334
recall: 1.0
f1: 0.9090909090909091
accuracy: 0.9836065573770492

Threshold 0.27
far: 0.017857142857142856
frr: 0.0
precision: 0.8333333333333334
recall: 1.0
f1: 0.9090909090909091
accuracy: 0.9836065573770492

Threshold 0.28
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.29
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.3
far: 0.0
frr: 0.0
precision: 1.0
recall: 1.0
f1: 1.0
accuracy: 1.0

Threshold 0.31
far: 0.0
frr: 0.2
precision: 1.0
recall: 0.8
f1: 0.888888888888889
accuracy: 0.9836065573770492

Threshold 0.32
far: 0.0
frr: 0.2
precision: 1.0
recall: 0.8
f1: 0.888888888888889
accuracy: 0.9836065573770492

Threshold 0.33
far: 0.0
frr: 0.2
precision: 1.0
recall: 0.8
f1: 0.88888

#### John1
Best Threshold 0.25 for both

In [32]:
# Threshold sweep for the user at index 11: select that user's test set.
username = usernames[11]
test_x = testing_xs[11]
test_y = testing_ys[11]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.20 .. 0.29 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(20, 30):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR JOHN1
Threshold 0.2
far: 0.14285714285714285
frr: 0.0
precision: 0.5
recall: 1.0
f1: 0.6666666666666666
accuracy: 0.875

Threshold 0.21
far: 0.14285714285714285
frr: 0.0
precision: 0.5
recall: 1.0
f1: 0.6666666666666666
accuracy: 0.875

Threshold 0.22
far: 0.08928571428571429
frr: 0.0
precision: 0.6153846153846154
recall: 1.0
f1: 0.761904761904762
accuracy: 0.921875

Threshold 0.23
far: 0.05357142857142857
frr: 0.0
precision: 0.7272727272727273
recall: 1.0
f1: 0.8421052631578948
accuracy: 0.953125

Threshold 0.24
far: 0.05357142857142857
frr: 0.0
precision: 0.7272727272727273
recall: 1.0
f1: 0.8421052631578948
accuracy: 0.953125

Threshold 0.25
far: 0.05357142857142857
frr: 0.0
precision: 0.7272727272727273
recall: 1.0
f1: 0.8421052631578948
accuracy: 0.953125

Threshold 0.26
far: 0.05357142857142857
frr: 0.0
precision: 0.7272727272727273
recall: 1.0
f1: 0.8421052631578948
accuracy: 0.953125

Threshold 0.27
far: 0.05357142857142857
frr: 0.0
precision: 0.7272727272727273
re

#### Jonah
Best Threshold EER: 0.2 F: 0.3

In [34]:
# Threshold sweep for the user at index 12: select that user's test set.
username = usernames[12]
test_x = testing_xs[12]
test_y = testing_ys[12]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.15 .. 0.29 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(15, 30):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR JONAH1
Threshold 0.15
far: 0.19642857142857142
frr: 0.08333333333333333
precision: 0.5
recall: 0.9166666666666666
f1: 0.6470588235294118
accuracy: 0.8235294117647058

Threshold 0.16
far: 0.16071428571428573
frr: 0.08333333333333333
precision: 0.55
recall: 0.9166666666666666
f1: 0.6874999999999999
accuracy: 0.8529411764705882

Threshold 0.17
far: 0.16071428571428573
frr: 0.08333333333333333
precision: 0.55
recall: 0.9166666666666666
f1: 0.6874999999999999
accuracy: 0.8529411764705882

Threshold 0.18
far: 0.16071428571428573
frr: 0.08333333333333333
precision: 0.55
recall: 0.9166666666666666
f1: 0.6874999999999999
accuracy: 0.8529411764705882

Threshold 0.19
far: 0.16071428571428573
frr: 0.08333333333333333
precision: 0.55
recall: 0.9166666666666666
f1: 0.6874999999999999
accuracy: 0.8529411764705882

Threshold 0.2
far: 0.14285714285714285
frr: 0.16666666666666666
precision: 0.5555555555555556
recall: 0.8333333333333334
f1: 0.6666666666666667
accuracy: 0.8529411764705882

Th

#### SamP1
Best Threshold 0.18 for both

In [36]:
# Threshold sweep for the user at index 13: select that user's test set.
username = usernames[13]
test_x = testing_xs[13]
test_y = testing_ys[13]

# Load the user's saved model from the RNNv4-saved_models directory.
filename = username + '_model.h5'
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
userfile = os.path.join(path, filename)
clf = load_model(userfile)
    
# Report every metric at thresholds 0.15 .. 0.29 (step 0.01).
print(f'ACCURACY FOR {username.upper()}')
for i in range(15, 30):
    thresh = 0.01 * i
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for k, v in performance.items():
        print(f'{k}: {v}')
    print()

ACCURACY FOR SAMP1
Threshold 0.15
far: 0.08928571428571429
frr: 0.0
precision: 0.5833333333333334
recall: 1.0
f1: 0.7368421052631579
accuracy: 0.9206349206349206

Threshold 0.16
far: 0.05357142857142857
frr: 0.0
precision: 0.7
recall: 1.0
f1: 0.8235294117647058
accuracy: 0.9523809523809523

Threshold 0.17
far: 0.05357142857142857
frr: 0.0
precision: 0.7
recall: 1.0
f1: 0.8235294117647058
accuracy: 0.9523809523809523

Threshold 0.18
far: 0.05357142857142857
frr: 0.0
precision: 0.7
recall: 1.0
f1: 0.8235294117647058
accuracy: 0.9523809523809523

Threshold 0.19
far: 0.05357142857142857
frr: 0.0
precision: 0.7
recall: 1.0
f1: 0.8235294117647058
accuracy: 0.9523809523809523

Threshold 0.2
far: 0.05357142857142857
frr: 0.0
precision: 0.7
recall: 1.0
f1: 0.8235294117647058
accuracy: 0.9523809523809523

Threshold 0.21
far: 0.05357142857142857
frr: 0.0
precision: 0.7
recall: 1.0
f1: 0.8235294117647058
accuracy: 0.9523809523809523

Threshold 0.22
far: 0.05357142857142857
frr: 0.14285714285714285

#### Theo1
Best Threshold: EER 0.5, F-Score 0.6

In [39]:
# Threshold sweep for the user at position 14: load that user's saved model
# and print every entry of the metrics dict at thresholds 0.01*50 .. 0.01*64.
username, test_x, test_y = usernames[14], testing_xs[14], testing_ys[14]

# Saved models are stored in ./RNNv4-saved_models as '<username>_model.h5'.
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
filename = username + '_model.h5'
userfile = os.path.join(path, filename)
clf = load_model(userfile)

print(f'ACCURACY FOR {username.upper()}')
for thresh in (0.01 * step for step in range(50, 65)):
    # Predict with the current threshold, then score against the labels.
    y_preds = test_batches(clf, test_x, thresh)
    performance = calc_accuracy_metrics(y_preds, test_y)
    print(f'Threshold {thresh}')
    for metric, value in performance.items():
        print(f'{metric}: {value}')
    print()

ACCURACY FOR THEO1
Threshold 0.5
far: 0.07142857142857142
frr: 0.09090909090909091
precision: 0.7142857142857143
recall: 0.9090909090909091
f1: 0.8
accuracy: 0.9253731343283582

Threshold 0.51
far: 0.07142857142857142
frr: 0.09090909090909091
precision: 0.7142857142857143
recall: 0.9090909090909091
f1: 0.8
accuracy: 0.9253731343283582

Threshold 0.52
far: 0.03571428571428571
frr: 0.09090909090909091
precision: 0.8333333333333334
recall: 0.9090909090909091
f1: 0.8695652173913043
accuracy: 0.9552238805970149

Threshold 0.53
far: 0.03571428571428571
frr: 0.09090909090909091
precision: 0.8333333333333334
recall: 0.9090909090909091
f1: 0.8695652173913043
accuracy: 0.9552238805970149

Threshold 0.54
far: 0.03571428571428571
frr: 0.09090909090909091
precision: 0.8333333333333334
recall: 0.9090909090909091
f1: 0.8695652173913043
accuracy: 0.9552238805970149

Threshold 0.55
far: 0.03571428571428571
frr: 0.09090909090909091
precision: 0.8333333333333334
recall: 0.9090909090909091
f1: 0.869565217

## Testing Overall Accuracy

In [40]:
# Per-user decision thresholds picked on the development set to minimise EER.
eer_thresholds = {
    'Amanda11': 0.05,
    'Benjamin1': 0.5,
    'Blake1': 0.5,
    'Chloe1': 0.15,
    'Conner1': 0.1,
    'Cormac1': 0.4,
    'David1': 0.2,
    'Ian1': 0.45,
    'Ian2': 0.05,
    'Jacob1': 0.6,
    'Jamison1': 0.28,
    'John1': 0.25,
    'Jonah1': 0.2,
    'SamP1': 0.18,
    'Theo1': 0.5,
}

# Per-user decision thresholds picked on the development set to maximise F-score.
f_thresholds = {
    'Amanda11': 0.05,
    'Benjamin1': 0.8,
    'Blake1': 0.5,
    'Chloe1': 0.15,
    'Conner1': 0.1,
    'Cormac1': 0.4,
    'David1': 0.2,
    'Ian1': 0.5,
    'Ian2': 0.05,
    'Jacob1': 0.6,
    'Jamison1': 0.28,
    'John1': 0.25,
    'Jonah1': 0.3,
    'SamP1': 0.18,
    'Theo1': 0.6,
}

# One metrics dict per user, in the same order as `usernames`.
eer_dicts = []
f_dicts = []

# The model directory is the same for every user, so resolve it once.
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')

# usernames, testing_xs and testing_ys are parallel lists (one entry per user).
for username, test_x, test_y in zip(usernames, testing_xs, testing_ys):
    # Look both thresholds up before loading the model so that a user missing
    # from either table raises KeyError before any model file is read.
    eer_thresh = eer_thresholds[username]
    f_thresh = f_thresholds[username]

    filename = username + '_model.h5'
    userfile = os.path.join(path, filename)
    clf = load_model(userfile)

    # Score the same model and test data once per threshold, EER first.
    for thresh, results in ((eer_thresh, eer_dicts), (f_thresh, f_dicts)):
        y_preds = test_batches(clf, test_x, thresh)
        performance = calc_accuracy_metrics(y_preds, test_y)
        results.append(performance)

print('EER Performance')
test_overall_acc(eer_dicts)
print()
print('F-Score Performance')
test_overall_acc(f_dicts)

EER Performance
Overall Accuracy - Average of Individual Models:

FAR : 0.06190476190476191
FRR : 0.04243626743626743
Precision : 0.7358742183742183
Recall : 0.9575637325637326
F1-Score : 0.8216129947436989
Accuracy : 0.9402789516112606

F-Score Performance
Overall Accuracy - Average of Individual Models:

FAR : 0.03809523809523809
FRR : 0.04243626743626743
Precision : 0.814534817034817
Recall : 0.9575637325637326
F1-Score : 0.8706454437761478
Accuracy : 0.9607982621547151


## Test on all testing data, not just a subset

In [41]:
# Get the Full Batches
#
# Rebuild usernames / testing_xs / testing_ys from every CSV in
# ./RNN-Test-Aligned, one (x, y) test set per user.

test_path = os.path.join(os.getcwd(), 'RNN-Test-Aligned')
model_save_path = os.path.join(os.getcwd(), 'RNNv4-saved_models')
# os.listdir returns entries in arbitrary order. Sort the listing so that
# `usernames` (and the hard-coded positions such as usernames[13] used by the
# per-user cells of this notebook) come out the same on every machine and run.
test_files = sorted(os.listdir(test_path))

# Both helpers walk the same list and keep only '.csv' entries, so
# usernames[i] and test_dfs[i] refer to the same file.
usernames = get_usernames(test_files)
test_dfs = getdfs(test_path, test_files)

testing_xs = []
testing_ys = []
for user in usernames:
    xtest, ytest = create_full_test(user, usernames, test_dfs)
    testing_xs.append(xtest)
    testing_ys.append(ytest)

#### Looking at the total number of batches per user. 

We are using 4 batches per user for the development set. 

In [44]:
# Number of test batches held for each user, followed by the mean across users.
batch_lens = [len(user_batches) for user_batches in testing_xs]

print(batch_lens)
print(f'Avg Batch Length: {sum(batch_lens)/len(batch_lens)}')

[122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122]
Avg Batch Length: 122.0


In [43]:
# Per-user decision thresholds picked on the development set to minimise EER.
eer_thresholds = {
    'Amanda11': 0.05,
    'Benjamin1': 0.5,
    'Blake1': 0.5,
    'Chloe1': 0.15,
    'Conner1': 0.1,
    'Cormac1': 0.4,
    'David1': 0.2,
    'Ian1': 0.45,
    'Ian2': 0.05,
    'Jacob1': 0.6,
    'Jamison1': 0.28,
    'John1': 0.25,
    'Jonah1': 0.2,
    'SamP1': 0.18,
    'Theo1': 0.5,
}

# Per-user decision thresholds picked on the development set to maximise F-score.
f_thresholds = {
    'Amanda11': 0.05,
    'Benjamin1': 0.8,
    'Blake1': 0.5,
    'Chloe1': 0.15,
    'Conner1': 0.1,
    'Cormac1': 0.4,
    'David1': 0.2,
    'Ian1': 0.5,
    'Ian2': 0.05,
    'Jacob1': 0.6,
    'Jamison1': 0.28,
    'John1': 0.25,
    'Jonah1': 0.3,
    'SamP1': 0.18,
    'Theo1': 0.6,
}

# One metrics dict per user, in the same order as `usernames`.
eer_dicts = []
f_dicts = []

# The model directory is the same for every user, so resolve it once.
path = os.path.join(os.getcwd(), 'RNNv4-saved_models')

# usernames, testing_xs and testing_ys are parallel lists (one entry per user).
for username, test_x, test_y in zip(usernames, testing_xs, testing_ys):
    # Look both thresholds up before loading the model so that a user missing
    # from either table raises KeyError before any model file is read.
    eer_thresh = eer_thresholds[username]
    f_thresh = f_thresholds[username]

    filename = username + '_model.h5'
    userfile = os.path.join(path, filename)
    clf = load_model(userfile)

    # Score the same model and test data once per threshold, EER first.
    for thresh, results in ((eer_thresh, eer_dicts), (f_thresh, f_dicts)):
        y_preds = test_batches(clf, test_x, thresh)
        performance = calc_accuracy_metrics(y_preds, test_y)
        results.append(performance)

print('EER Performance')
test_overall_acc(eer_dicts)
print()
print('F-Score Performance')
test_overall_acc(f_dicts)

EER Performance
Overall Accuracy - Average of Individual Models:

FAR : 0.060595128557204135
FRR : 0.04243626743626743
Precision : 0.6214064657201912
Recall : 0.9575637325637326
F1-Score : 0.7306845905466596
Accuracy : 0.9404371584699454

F-Score Performance
Overall Accuracy - Average of Individual Models:

FAR : 0.04577749836199956
FRR : 0.04243626743626743
Precision : 0.6852253265978756
Recall : 0.9575637325637326
F1-Score : 0.7761497952322541
Accuracy : 0.9540983606557377


## Findings

This model produced very good results compared to the original work with a basic neural network. 

The 2-layer LSTM model used here generalized best of all models tested. Below are the accuracy metrics found for these tests. 

#### For 15 Users: 

While we looked at different thresholds in the development set for minimizing EER and maximizing F-Score, after running the full tests, it turned out that using the F-Score tuned thresholds actually provided the best EER as well. 

Therefore, we will use only one threshold per user.


When maximizing F-Score and minimizing EER (Equal Error Rate):

    FAR : 0.0457
    FRR : 0.0424
    Precision : 0.6852
    Recall : 0.9576
    F1-Score : 0.7761
    Accuracy : 0.9541
