Use this file to test new data

### import libraries

In [3]:
import os
import time
import sys
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import keras

In [4]:
#sys.path.append('path/to/model.py')

from model import *
from utils import *

### import tokenizer

In [5]:
import pickle

# loading
with open('./tokenizer.pickle', 'rb') as handle:
    seq_tokenizer = pickle.load(handle)

### Define model

In [6]:
BATCH_SIZE = 256
embedding_dim = 64
units = 128
vocab_inp_size = len(seq_tokenizer.word_index)+1
vocab_tar_size = len(seq_tokenizer.word_index)+1

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

optimizer = tf.keras.optimizers.Adam(lr=0.05, beta_1=0.9, beta_2=0.999, decay=0.01)
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

### Load in new test data

* **change to directory where the new test data is**

In [7]:
## read in csv file as dataframe
df=pd.read_csv('../dataset/AnomalyDetectionChallenge/hexacopter-hil-clean-01.kev.csv')

### Process data

In [8]:
X=[]
Y=[]
for i in range(1):
    data=df[['class','event']]
    data=list(data['class']+data['event'])
    inp,tar=build_data(data,5,5)
    X=X+inp
    Y=Y+tar

number of training examples: 136011


In [9]:
input_tensor_train, target_tensor_train = load_dataset(X, Y, seq_tokenizer)

### load in trained weights

In [15]:
checkpoint_dir = './sumitmodel_checkpoint'
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.InitializationOnlyStatus at 0x14dea7ac8>

In [11]:
def evaluate(input):

    inputs = tf.convert_to_tensor(input)

      #result = ''
    result=[]

    hidden = [tf.zeros((len(input),units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([seq_tokenizer.word_index['<start>']]*len(input), 1)

    for t in range(input.shape[1]-1):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions,axis=1).numpy()

        #result += targ_seq.index_word[predicted_id] + ' '

        result.append(predicted_id)

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims(predicted_id, 1)

    return result

### predict output

With 5 events as one sequence, and evaluate the misclassfication rate every 200 sequences or every 1000 events.

In [37]:
s=200

predicts=evaluate(input_tensor_train[:s,:])

predicts=np.array(predicts).swapaxes(0,1)

targets=target_tensor_train[:s,1:]

In [42]:
misclassification_rate=np.sum(np.sum(np.not_equal(targets, predicts)))/s/5

## see 'anomaly_detection_NN_train' notebook for how I set bar value.
bar=0.365

if misclassification_rate>bar:
    print('This sequence series is an anomaly')
else:
    print('This is NOT an anomaly sequence series')

This is NOT an anomaly sequence series
