In [1]:
import sys
import operator
import os
import numpy as np
import pandas as pd
import time
import json
from operator import itemgetter
import csv
import scipy.stats as stats
from itertools import groupby
from operator import itemgetter
from datetime import datetime as dt

from telemanom._globals import Config
import telemanom.errors as err
import telemanom.helpers as helpers
import telemanom.modeling as models

  dictionary = yaml.load(f.read())

calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.

compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5



In [2]:
# init config class
config = Config("config.yaml")


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [3]:
# Current time 
_id = dt.now().strftime("%Y-%m-%d_%H.%M.%S")

In [4]:
'''
HELPER CLASS
'''

'\nHELPER CLASS\n'

In [5]:
import numpy as np
import os
import logging
from datetime import datetime
import sys
import csv
import pandas as pd
import plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode
import cufflinks as cf
import glob

In [6]:
def make_dirs(_id):
    '''Create directories for storing data in repo (using datetime ID) if they don't already exist'''

    if not config.train or not config.predict:
        if not os.path.isdir('data/%s' %config.use_id):
            raise ValueError("Run ID %s is not valid. If loading prior models or predictions, must provide valid ID.")

    paths = ['data', 'data/%s' %_id, 'data/%s/models' %_id, 'data/%s/smoothed_errors' %_id, 'data/%s/y_hat' %_id]

    for p in paths:
        if not os.path.isdir(p):
            os.mkdir(p)



def setup_logging(config, _id):
    '''Configure logging object to track parameter settings, training, and evaluation.
    
    Args:
        config(obj): Global object specifying system runtime params.

    Returns:
        logger (obj): Logging object
        _id (str): Unique identifier generated from datetime for storing data/models/results
    '''

    logger =  logging.getLogger('telemanom')
    hdlr = logging.FileHandler('data/%s/params.log' %_id)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.INFO)

    stdout = logging.StreamHandler(sys.stdout)
    stdout.setLevel(logging.INFO)
    logger.addHandler(stdout)

    logger.info("Runtime params:")
    logger.info("----------------")
    for attr in dir(config):    
        if not "__" in attr and not attr in ['header', 'date_format', 'path_to_config', 'build_group_lookup']:
            logger.info('%s: %s' %(attr, getattr(config, attr)))
    logger.info("----------------\n")

    return logger



def load_data(anom):
    '''Load train and test data from repo. If not in repo need to download from source.

    Args:
        anom (dict): contains anomaly information for a given input stream

    Returns:
        X_train (np array): array of train inputs with dimensions [timesteps, l_s, input dimensions]
        y_train (np array): array of train outputs corresponding to true values following each sequence
        X_test (np array): array of test inputs with dimensions [timesteps, l_s, input dimensions)
        y_test (np array): array of test outputs corresponding to true values following each sequence
    '''
    try:
        train = np.load(os.path.join("data", "train", anom['chan_id'] + ".npy"))
        test = np.load(os.path.join("data", "test", anom['chan_id'] + ".npy"))

    except:
        raise ValueError("Source data not found, may need to add data to repo: <link>")

    # shape, split data
    X_train, y_train = shape_data(train)
    X_test, y_test = shape_data(test, train=False)

    return X_train, y_train, X_test, y_test


def shape_data(arr, train=True):
    '''Shape raw input streams for ingestion into LSTM. config.l_s specifies the sequence length of 
    prior timesteps fed into the model at each timestep t. 

    Args:
        arr (np array): array of input streams with dimensions [timesteps, 1, input dimensions]
        train (bool): If shaping training data, this indicates data can be shuffled

    Returns:
        X (np array): array of inputs with dimensions [timesteps, l_s, input dimensions)
        y (np array): array of outputs corresponding to true values following each sequence. 
            shape = [timesteps, n_predictions, 1)
        l_s (int): sequence length to be passed to test shaping (if shaping train) so they are consistent
    '''
    
    # print("LEN ARR: %s" %len(arr))

    data = [] 
    for i in range(len(arr) - config.l_s - config.n_predictions):
        data.append(arr[i:i + config.l_s + config.n_predictions])
    data = np.array(data) 

    assert len(data.shape) == 3

    if train == True:
        np.random.shuffle(data)

    X = data[:,:-config.n_predictions,:]
    y = data[:,-config.n_predictions:,0] #telemetry value is at position 0

    return X, y

In [7]:
def final_stats(stats, logger):
    '''Log final stats at end of experiment.

    Args:
        stats (dict): Count of true positives, false positives, and false negatives from experiment
        logger (obj): logging object
    '''

    logger.info("Final Totals:")
    logger.info("-----------------")
    logger.info("True Positives: %s " %stats["true_positives"])
    logger.info("False Positives: %s " %stats["false_positives"])
    logger.info("False Negatives: %s\n" %stats["false_negatives"])
    try:
        logger.info("Precision: %s" %(float(stats["true_positives"])/float(stats["true_positives"]+stats["false_positives"])))
        logger.info("Recall: %s" %(float(stats["true_positives"])/float(stats["true_positives"]+stats["false_negatives"])))
    except:
        logger.info("Precision: NaN")
        logger.info("Recall: NaN")


def anom_stats(stats, anom, logger):
    '''Log stats after processing of each stream.

    Args:
        stats (dict): Count of true positives, false positives, and false negatives from experiment
        anom (dict): contains all anomaly information for a given input stream
        logger (obj): logging object
    '''

    logger.info("TP: %s  FP: %s  FN: %s" %(anom["true_positives"], anom["false_positives"], anom["false_negatives"]))
    logger.info('Total true positives: %s' %stats["true_positives"])
    logger.info('Total false positives: %s' %stats["false_positives"])
    logger.info('Total false negatives: %s\n' %stats["false_negatives"])



def view_results(results_fn, plot_errors=True, plot_train=False, rows=None):
    ''' Reads data from data dir and generates interactive plots for display in `results-viewer.ipynb` using 
    plotly offline mode. A chart showing y_hat and y_test values for each stream is generated by default. 

    Args:
        results_fn (str): name of results csv to plot results for
        plot_errors (bool): If True, a chart displaying the smoothed errors for each stream will be generated
        plot_train (bool): If True, a chart displaying the telemetry from training data is 
            be generated (command data not plotted)
        rows (tuple): Start and end row indicating rows to plot results for in results csv file

    Returns:
        None
    '''

    def create_shapes(ranges, range_type, _min, _max):
        ''' Create shapes for regions to highlight in plotly vizzes (true and predicted anomaly sequences)'''

        if range_type == 'true':
            color = 'red'
        elif range_type == 'predicted':
            color = 'blue'
        
        shapes = []
        if len(ranges) > 0:
        
            for r in ranges:

                shape = {
                    'type': 'rect',
                    'x0': r[0],
                    'y0': _min,
                    'x1': r[1],
                    'y1': _max,
                    'fillcolor': color,
                    'opacity': 0.2,
                    'line': {
                        'width': 0,
                    },
                }
            
                shapes.append(shape)
            
        return shapes



    vals = {}

    with open(results_fn, "r") as f:
        reader = csv.DictReader(f)
        for anom in reader:

            chan = anom["chan_id"]
            vals[chan] = {}
            dirs = ["y_hat", "smoothed_errors"]
            raw_dirs = ["test", "train"]

            for d in dirs:
                vals[chan][d] = list(np.load(os.path.join("../data", config.use_id, d, anom["chan_id"]) + ".npy"))
            for d in raw_dirs:
                vals[chan][d] = list(np.load(os.path.join("../data", d, anom["chan_id"]) + ".npy"))

            row_start = 0
            row_end = 100000
            if not rows == None:
                try:
                    row_start = rows[0]
                    row_end = rows[1]
                except:
                    raise ValueError("Rows not in correct format, please use (<first row>, <last row>)")

            # Info
            # ================================================================================================
            if reader.line_num - 1 >= row_start and reader.line_num -1 <= row_end:
                print("Spacecraft: %s" %anom['spacecraft'])
                print("Channel: %s" %anom["chan_id"])
                print('Normalized prediction error: %.3f' %float(anom['normalized_error']))
                print('Anomaly class(es): %s' %anom['class'])
                print("------------------")
                print('True Positives: %s' %anom['true_positives'])
                print("False Positives: %s" %anom["false_positives"])
                print("False Negatives: %s" %anom["false_negatives"])
                print("------------------")
                print('Predicted anomaly scores: %s' %anom['scores'])
                print("Number of values: %s"%len(vals[chan]["test"]))

                # Extract telemetry values from test data
                # ================================================================================================

                y_test = np.array(vals[chan]['test'])[:,0] 

                # Create highlighted regions (red = true anoms / blue = predicted anoms)
                # ================================================================================================
                y_shapes = create_shapes(eval(anom['anomaly_sequences']), "true", -1, 1)
                y_shapes += create_shapes(eval(anom['tp_sequences']) + eval(anom['fp_sequences']), "predicted", -1, 1)

                e_shapes = create_shapes(eval(anom['anomaly_sequences']), "true", 0, max(vals[chan]['smoothed_errors']))
                e_shapes += create_shapes(eval(anom['tp_sequences']) + eval(anom['fp_sequences']), "predicted", 
                                          0, max(vals[chan]['smoothed_errors']))

                # Move data into dataframes and plot with Plotly
                # ================================================================================================
                train_df = pd.DataFrame({
                    'train': [x[0] for x in vals[chan]['train']]
                })

                y = y_test[config.l_s:-config.n_predictions]
                if not len(y) == len(vals[chan]['y_hat']):
                    modified_l_s = len(y_test) - len(vals[chan]['y_hat']) - 1
                    y = y_test[modified_l_s:-1]
                y_df = pd.DataFrame({
                    'y_hat': vals[chan]['y_hat'],
                    'y': y
                })

                e_df = pd.DataFrame({
                    'e_s': vals[chan]['smoothed_errors']
                })

                y_layout = {
                    'title': "y / y_hat comparison",
                    'shapes': y_shapes,
                } 

                e_layout = {
                    'title': "Smoothed Errors (e_s)",
                    'shapes': e_shapes,
                } 

                if plot_train:
                    train_df.iplot(kind='scatter', color='green')
                
                y_df.iplot(kind='scatter', layout=y_layout)
                
                if plot_errors:
                    e_df.iplot(kind='scatter', layout=e_layout, color='red')

In [8]:
# train model with one file
from keras.models import Sequential, load_model
from keras.callbacks import History, EarlyStopping, Callback
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Dropout
import numpy as np
import os

def get_model(anom, X_train, y_train, logger):
    cbs = [History(), EarlyStopping(monitor='val_loss', patience=config.patience, 
            min_delta=config.min_delta, verbose=0)]
        
    model = Sequential()

    model.add(LSTM(
        config.layers[0],
        input_shape=(None, X_train.shape[2]),
        return_sequences=True))
    
    model.add(Dropout(config.dropout))

    model.add(LSTM(
        config.layers[1],
        return_sequences=False))
    model.add(Dropout(config.dropout))

    model.add(Dense(
        config.n_predictions))
    model.add(Activation("linear"))

    model.compile(loss=config.loss_metric, optimizer=config.optimizer) 

    model.fit(X_train, y_train, batch_size=config.lstm_batch_size, epochs=config.epochs, 
        validation_split=config.validation_split, callbacks=cbs, verbose=True)
    model.save(os.path.join("data", anom['run_id'], "models", anom["chan_id"] + ".h5"))

    return model 

In [9]:
def predict_in_batches(y_test, X_test, model, anom):
    '''Used trained LSTM model to predict test data arriving in batches (designed to 
    mimic a spacecraft downlinking schedule).

    Args:
        y_test (np array): numpy array of test outputs corresponding to true values to be predicted at end of each sequence
        X_test (np array): numpy array of test inputs with dimensions [timesteps, l_s, input dimensions)
        model (obj): trained Keras model 
        anom (dict): contains all anomaly information for a given input stream

    Returns:
        y_hat (np array): predicted test values for each timestep in y_test  
    '''

    y_hat = np.array([])

    num_batches = int((y_test.shape[0] - config.l_s) / config.batch_size)
    if num_batches < 0:
        raise ValueError("l_s (%s) too large for stream with length %s." %(config.l_s, y_test.shape[0]))

    # simulate data arriving in batches
    for i in range(1, num_batches+2):
        prior_idx = (i-1) * config.batch_size
        idx = i * config.batch_size
        if i == num_batches+1:
            idx = y_test.shape[0] #remaining values won't necessarily equal batch size
        
        X_test_period = X_test[prior_idx:idx]

        y_hat_period = model.predict(X_test_period)

        # map predictions n steps ahead to their corresponding timestep
        # TODO: vectorize
        final_y_hat = []
        for t in range(len(y_hat_period)+config.n_predictions):
            y_hat_t = []
            for j in range(config.n_predictions):
                if t - j >= 0 and t-j < len(y_hat_period):
                    y_hat_t.append(y_hat_period[t-j][j])
            if t < len(y_hat_period):
                if y_hat_t.count(0) == len(y_hat_t):
                    final_y_hat.append(0)
                else:
                    final_y_hat.append(y_hat_t[0]) # first prediction


        y_hat_period = np.array(final_y_hat).reshape(len(final_y_hat),1)
        y_hat = np.append(y_hat, y_hat_period)

    y_hat = np.reshape(y_hat, (y_hat.size,))

    np.save(os.path.join("data", anom['run_id'], "y_hat", anom["chan_id"] + ".npy"), np.array(y_hat))

In [10]:
stats = {
        "true_positives": 0,
        "false_positives": 0,
        "false_negatives": 0
    }

with open("labeled_anomalies.csv", "rU") as f:
    reader = csv.DictReader(f)
    for i, anom in enumerate(reader):
        p1 = anom
        break


'U' mode is deprecated



In [11]:
p1

{'anomaly_sequences': '[[1899, 2099], [4286, 4594], [3289, 3529]]',
 'chan_id': 'P-1',
 'class': '[contextual, contextual, contextual]',
 'num_values': '8505',
 'spacecraft': 'SMAP'}

In [12]:
X_train, y_train, X_test, y_test = load_data(p1)

In [15]:
print("X_train_shape: {0}, y_train_shape: {1}".format(X_train.shape,y_train.shape))

X_train_shape: (2612, 250, 25), y_train_shape: (2612, 10)


In [18]:
print("X_test_shape: {0}, y_test_shape: {1}".format(X_test.shape,y_test.shape))

X_test_shape: (8245, 250, 25), y_test_shape: (8245, 10)


In [19]:
make_dirs(_id)  
logger = setup_logging(config,_id)

Runtime params:
----------------
batch_size: 70
dropout: 0.3
epochs: 35
error_buffer: 100
l_s: 250
layers: [80, 80]
loss_metric: mse
lstm_batch_size: 64
min_delta: 0.0003
n_predictions: 10
optimizer: adam
p: 0.13
patience: 10
predict: True
smoothing_perc: 0.05
train: True
use_id: 2018-05-19_15.00.10
validation_split: 0.2
window_size: 30
----------------



In [20]:
y_hat = []

In [52]:
model = get_model(anom, X_train, y_train, logger)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 2089 samples, validate on 523 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [22]:
# P1 model
model = load_model(os.path.join("data", "2018-05-19_15.00.10", "models", p1["chan_id"] + ".h5"))

In [23]:
y_hat = np.array([])

In [24]:
i = 1
prior_idx = (i-1) * config.batch_size
idx = i * config.batch_size
X_test_period = X_test[prior_idx:idx]

In [25]:
X_test_period.shape

(70, 250, 25)

In [27]:
y_hat_period = model.predict(X_test_period)

In [28]:
y_hat_period.shape

(70, 10)

In [44]:
y_hat_period

array([[-0.4545483 , -0.34161842, -0.1977568 , -0.0593268 ,  0.03814217,
         0.08485641,  0.03919324, -0.05668224, -0.17526832, -0.2848638 ],
       [-0.3486319 , -0.22699055, -0.10134159,  0.00083086,  0.04826282,
         0.0280267 , -0.07065499, -0.20823088, -0.32681525, -0.42321256],
       [-0.08270735,  0.23091799,  0.45635355,  0.5964923 ,  0.53187954,
         0.43835044,  0.32073107,  0.14887512,  0.06470305, -0.0251391 ],
       [ 0.25156265,  0.50326735,  0.6108692 ,  0.62927806,  0.52183366,
         0.3770867 ,  0.25079176,  0.09547678,  0.00287756, -0.08897908],
       [ 0.2647493 ,  0.4107279 ,  0.4573353 ,  0.44847676,  0.38199106,
         0.25106314,  0.13804716,  0.00709714, -0.14563   , -0.2735321 ],
       [ 0.2888273 ,  0.3861268 ,  0.40258384,  0.38249806,  0.31393045,
         0.17083535,  0.02658679, -0.12449594, -0.2889688 , -0.41495708],
       [ 0.33300036,  0.39064655,  0.36238438,  0.30668855,  0.21106198,
         0.04761024, -0.11960689, -0.2710353 

In [29]:
len(y_hat_period)

70

In [31]:
a = []
a.append(y_hat_period[0][0])

In [34]:
a.append(y_hat_period[0][1])

In [30]:
 # map predictions n steps ahead to their corresponding timestep
# TODO: vectorize
final_y_hat = []
for t in range(len(y_hat_period)+config.n_predictions):
    y_hat_t = []
    for j in range(config.n_predictions):
        if t - j >= 0 and t-j < len(y_hat_period):
            print("y_hat_t appending:{0} when t: {1}, j: {2}".format(y_hat_period[t-j][j],t,j))
            y_hat_t.append(y_hat_period[t-j][j])
    if t < len(y_hat_period):
        if y_hat_t.count(0) == len(y_hat_t):
            print("final_y_hat appending:{0} when t: {1}, j: {2}".format(0,t,j))
            final_y_hat.append(0)
        else:
            print("final_y_hat appending:{0} when t: {1}, j: {2}".format(y_hat_t[0],t,j))
            final_y_hat.append(y_hat_t[0]) # first prediction

y_hat_t appending:-0.45454829931259155 when t: 0, j: 0
final_y_hat appending:-0.45454829931259155 when t: 0, j: 9
y_hat_t appending:-0.348631888628006 when t: 1, j: 0
y_hat_t appending:-0.3416184186935425 when t: 1, j: 1
final_y_hat appending:-0.348631888628006 when t: 1, j: 9
y_hat_t appending:-0.08270734548568726 when t: 2, j: 0
y_hat_t appending:-0.22699055075645447 when t: 2, j: 1
y_hat_t appending:-0.1977567970752716 when t: 2, j: 2
final_y_hat appending:-0.08270734548568726 when t: 2, j: 9
y_hat_t appending:0.2515626549720764 when t: 3, j: 0
y_hat_t appending:0.23091799020767212 when t: 3, j: 1
y_hat_t appending:-0.10134159028530121 when t: 3, j: 2
y_hat_t appending:-0.05932679772377014 when t: 3, j: 3
final_y_hat appending:0.2515626549720764 when t: 3, j: 9
y_hat_t appending:0.26474928855895996 when t: 4, j: 0
y_hat_t appending:0.5032673478126526 when t: 4, j: 1
y_hat_t appending:0.4563535451889038 when t: 4, j: 2
y_hat_t appending:0.0008308589458465576 when t: 4, j: 3
y_hat_t a

In [38]:
final_y_hat

[-0.4545483,
 -0.3486319,
 -0.082707345,
 0.25156265,
 0.2647493,
 0.2888273,
 0.33300036,
 0.4205653,
 0.44639736,
 0.44966102,
 0.37839478,
 0.25356072,
 0.13630722,
 -0.010917008,
 -0.19853283,
 -0.28950083,
 -0.39667094,
 -0.52728564,
 -0.61162245,
 -0.6153571,
 -0.5598759,
 -0.45408702,
 -0.37541375,
 -0.2751494,
 -0.24779238,
 -0.2845064,
 -0.35136914,
 -0.45190245,
 -0.57182837,
 -0.65094876,
 -0.69778156,
 -0.723036,
 -0.73543817,
 -0.73410517,
 -0.7224115,
 -0.7150469,
 -0.68901426,
 -0.6450873,
 -0.6065439,
 -0.5048354,
 -0.4403649,
 -0.33377615,
 -0.29262373,
 -0.32391977,
 -0.47862166,
 -0.6015562,
 -0.6815544,
 -0.7271993,
 -0.74007136,
 -0.7296256,
 -0.7054482,
 -0.6677703,
 -0.61952126,
 -0.5924357,
 -0.53929716,
 -0.48067427,
 -0.42383116,
 -0.3452798,
 -0.304403,
 -0.15019737,
 0.0011809915,
 0.033898517,
 0.013620421,
 0.04685454,
 0.06362878,
 0.06910564,
 0.08494048,
 0.11237237,
 0.1527978,
 0.15473683]

In [33]:
print(y_hat_period.shape)
print(y_hat_period)

(70, 10)
[[-0.6770421  -0.5551895  -0.4599952  -0.294711   -0.16158542 -0.0400058
   0.03732682  0.01172672 -0.0188783  -0.10645869]
 [-0.5974261  -0.45257142 -0.3475623  -0.18623129 -0.08983693 -0.02891666
  -0.00086468 -0.081485   -0.15216562 -0.24866883]
 [-0.39045352 -0.05292547  0.26081556  0.4546463   0.45914608  0.4315275
   0.37627184  0.3036738   0.24189723  0.14200293]
 [-0.02735439  0.27608597  0.47576302  0.5400898   0.5250947   0.47342008
   0.3676131   0.30703518  0.2059778   0.05133386]
 [ 0.25842124  0.4235867   0.508266    0.5085054   0.503571    0.4365129
   0.3248022   0.24925911  0.07909243 -0.13165544]
 [ 0.39303803  0.5023932   0.5449575   0.5030302   0.4746124   0.36876482
   0.19987321  0.06849283 -0.13108887 -0.33641827]
 [ 0.46716172  0.54288167  0.55855423  0.46528637  0.37945643  0.2286007
   0.01404979 -0.1486843  -0.3285302  -0.48577702]
 [ 0.52979684  0.5691159   0.54356605  0.38662058  0.2303429   0.05151409
  -0.18043366 -0.3421725  -0.47540966 -0.57106

In [34]:
print(len(final_y_hat))
print(final_y_hat)

70
[-0.6770421, -0.5974261, -0.39045352, -0.02735439, 0.25842124, 0.39303803, 0.46716172, 0.52979684, 0.54919237, 0.53861594, 0.47006792, 0.34165406, 0.21991129, 0.04706092, -0.11691395, -0.21143019, -0.34096992, -0.536691, -0.6701772, -0.6922969, -0.6164344, -0.45837212, -0.32611302, -0.19159502, -0.18923463, -0.26894957, -0.3579138, -0.46402556, -0.57980746, -0.64089406, -0.6646563, -0.66954684, -0.67042553, -0.6690006, -0.6711086, -0.69331896, -0.7049776, -0.7003339, -0.69291806, -0.59087855, -0.4963504, -0.3300259, -0.24017477, -0.25901964, -0.4384228, -0.5795539, -0.66974473, -0.7234544, -0.743745, -0.7434748, -0.73426676, -0.71519244, -0.68880033, -0.68457127, -0.64814293, -0.59873545, -0.5381755, -0.4386655, -0.36213535, -0.21792522, -0.011626489, 0.07156119, 0.16968031, 0.26661474, 0.29140842, 0.26931757, 0.23747368, 0.18900554, 0.20550825, 0.20900099]


In [42]:
final_y_hat[0]

-0.4545483

In [45]:
y_hat_period.shape

(70, 10)

In [46]:
y_hat_period = np.array(final_y_hat).reshape(len(final_y_hat),1)

In [47]:
print(y_hat_period.shape)
print(y_hat_period)

(70, 1)
[[-0.4545483 ]
 [-0.3486319 ]
 [-0.08270735]
 [ 0.25156265]
 [ 0.2647493 ]
 [ 0.2888273 ]
 [ 0.33300036]
 [ 0.4205653 ]
 [ 0.44639736]
 [ 0.44966102]
 [ 0.37839478]
 [ 0.25356072]
 [ 0.13630722]
 [-0.01091701]
 [-0.19853283]
 [-0.28950083]
 [-0.39667094]
 [-0.52728564]
 [-0.61162245]
 [-0.6153571 ]
 [-0.5598759 ]
 [-0.45408702]
 [-0.37541375]
 [-0.2751494 ]
 [-0.24779238]
 [-0.2845064 ]
 [-0.35136914]
 [-0.45190245]
 [-0.57182837]
 [-0.65094876]
 [-0.69778156]
 [-0.723036  ]
 [-0.73543817]
 [-0.73410517]
 [-0.7224115 ]
 [-0.7150469 ]
 [-0.68901426]
 [-0.6450873 ]
 [-0.6065439 ]
 [-0.5048354 ]
 [-0.4403649 ]
 [-0.33377615]
 [-0.29262373]
 [-0.32391977]
 [-0.47862166]
 [-0.6015562 ]
 [-0.6815544 ]
 [-0.7271993 ]
 [-0.74007136]
 [-0.7296256 ]
 [-0.7054482 ]
 [-0.6677703 ]
 [-0.61952126]
 [-0.5924357 ]
 [-0.53929716]
 [-0.48067427]
 [-0.42383116]
 [-0.3452798 ]
 [-0.304403  ]
 [-0.15019737]
 [ 0.00118099]
 [ 0.03389852]
 [ 0.01362042]
 [ 0.04685454]
 [ 0.06362878]
 [ 0.06910564]
 [

In [54]:
y_hat = np.array([])

num_batches = int((y_test.shape[0] - config.l_s) / config.batch_size)
if num_batches < 0:
    raise ValueError("l_s (%s) too large for stream with length %s." %(config.l_s, y_test.shape[0]))

# simulate data arriving in batches
for i in range(1, num_batches+2):
    prior_idx = (i-1) * config.batch_size
    idx = i * config.batch_size
    if i == num_batches+1:
        idx = y_test.shape[0] #remaining values won't necessarily equal batch size

    X_test_period = X_test[prior_idx:idx]

    y_hat_period = model.predict(X_test_period)

    # map predictions n steps ahead to their corresponding timestep
    # TODO: vectorize

    final_y_hat = []
    for t in range(len(y_hat_period)+config.n_predictions):
        y_hat_t = []
        for j in range(config.n_predictions):
            if t - j >= 0 and t-j < len(y_hat_period):
                y_hat_t.append(y_hat_period[t-j][j])
        if t < len(y_hat_period):
            if y_hat_t.count(0) == len(y_hat_t):
                final_y_hat.append(0)
            else:
                final_y_hat.append(y_hat_t[0]) # first prediction

    y_hat_period = np.array(final_y_hat).reshape(len(final_y_hat),1)
    y_hat = np.append(y_hat, y_hat_period)

y_hat = np.reshape(y_hat, (y_hat.size,))

y_test.shape

In [55]:
y_hat.shape

(8245,)

In [56]:
y_test.shape

(8245, 10)

In [None]:
'''
Get error
'''

In [57]:
a = []
for y_h,y_t in zip(y_hat, y_test):
    a.append([y_h,y_t])

In [61]:
e = [abs(y_h-y_t[0]) for y_h,y_t in zip(y_hat, y_test)]

In [62]:
config.smoothing_perc

0.05

In [63]:
int(config.batch_size * config.window_size * config.smoothing_perc)

105

In [66]:
config.window_size

30

In [65]:
len(e)

8245

In [80]:
smoothing_window = int(config.batch_size * config.window_size * config.smoothing_perc)
if not len(y_hat) == len(y_test):
    raise ValueError("len(y_hat) != len(y_test), can't calculate error: %s (y_hat) , %s (y_test)" %(len(y_hat), len(y_test)))

e_s = list(pd.DataFrame(e).ewm(span=smoothing_window).mean().values.flatten())

In [84]:
len(e_s)

8245

In [79]:
for e_s_,e_ in zip(e_s,e):
    print("e_s_: {0}, e_: {1}, diff: {2}".format(e_s_,e_,e_s_-e_))

e_s_: 0.08564810394701361, e_: 0.08564810394701361, diff: 0.0
e_s_: 0.14755700812089723, e_: 0.20829781976319817, diff: -0.06074081164230094
e_s_: 0.1465688302117481, e_: 0.14464805720215246, diff: 0.0019207730095956443
e_s_: 0.23036097534373678, e_: 0.4723702642265968, diff: -0.24200928888286
e_s_: 0.2141885589445341, e_: 0.15249326019353626, diff: 0.06169529875099783
e_s_: 0.19539062289542372, e_: 0.10658915218446596, diff: 0.08880147071095776
e_s_: 0.18133983048821603, e_: 0.10243056506444637, diff: 0.07890926542376966
e_s_: 0.19437143118614295, e_: 0.27896177812022094, diff: -0.08459034693407799
e_s_: 0.20489376536811407, e_: 0.2822311120849501, diff: -0.07733734671683604
e_s_: 0.21002359012010352, e_: 0.2520485718992451, diff: -0.0420249817791416
e_s_: 0.20448428822077228, e_: 0.15452618418650177, diff: 0.049958104034270506
e_s_: 0.21408531980254225, e_: 0.3084616358193828, diff: -0.09437631601684054
e_s_: 0.23278996099902913, e_: 0.43153563855477683, diff: -0.1987456775557477
e_s

e_s_: 0.1677825112043203, e_: 0.3154900976471522, diff: -0.14770758644283188
e_s_: 0.17041078510048563, e_: 0.30708102770011836, diff: -0.13667024259963273
e_s_: 0.17446145443730135, e_: 0.3850962599502612, diff: -0.21063480551295985
e_s_: 0.17309609983827848, e_: 0.10209766068957027, diff: 0.07099843914870821
e_s_: 0.1746985475454928, e_: 0.25802582832008136, diff: -0.08332728077458856
e_s_: 0.17197803883744783, e_: 0.030511586020034898, diff: 0.14146645281741294
e_s_: 0.169177251956732, e_: 0.023536334160442873, diff: 0.14564091779628913
e_s_: 0.1668205404938233, e_: 0.04427154442334125, diff: 0.12254899607048206
e_s_: 0.16487937504177697, e_: 0.06393877153599048, diff: 0.10094060350578649
e_s_: 0.16307146232278452, e_: 0.06906000093574827, diff: 0.09401146138703625
e_s_: 0.16161991350934468, e_: 0.08613937521092074, diff: 0.07548053829842394
e_s_: 0.15962885413778394, e_: 0.05609376681722944, diff: 0.1035350873205545
e_s_: 0.15727945339055868, e_: 0.03511061453554554, diff: 0.122168

e_s_: 0.17710005768296785, e_: 0.20172029730770147, diff: -0.024620239624733614
e_s_: 0.17407838002052725, e_: 0.01695114157361477, diff: 0.15712723844691248
e_s_: 0.1709131564087711, e_: 0.006321528597450943, diff: 0.16459162781132017
e_s_: 0.16781605413830875, e_: 0.006766736074268032, diff: 0.16104931806404071
e_s_: 0.16481611170014834, e_: 0.008819104915806175, diff: 0.15599700678434217
e_s_: 0.16191009218111727, e_: 0.010797077191501359, diff: 0.1511130149896159
e_s_: 0.1593792174542389, e_: 0.027773731656562717, diff: 0.13160548579767617
e_s_: 0.1574121737253804, e_: 0.0551258998247417, diff: 0.10228627390063871
e_s_: 0.1564354374824592, e_: 0.10564515285055709, diff: 0.050790284631902105
e_s_: 0.15719622996575858, e_: 0.19675743909732768, diff: -0.0395612091315691
e_s_: 0.1594610284917694, e_: 0.2772305518443323, diff: -0.11776952335256291
e_s_: 0.16394693386413664, e_: 0.3972140132272317, diff: -0.23326707936309507
e_s_: 0.16257537753039478, e_: 0.0912544481758184, diff: 0.0713

e_s_: 0.164548518389952, e_: 0.003647814558321638, diff: 0.16090070383163035
e_s_: 0.1620843952415807, e_: 0.03394999152627198, diff: 0.1281344037153087
e_s_: 0.16235425780027535, e_: 0.17638711085239756, diff: -0.014032853052122207
e_s_: 0.1593019760735099, e_: 0.0005833262817076523, diff: 0.15871864979180225
e_s_: 0.1573912532059759, e_: 0.05803366409420807, diff: 0.09935758911176784
e_s_: 0.1604698305756456, e_: 0.3205558537984685, diff: -0.16008602322282292
e_s_: 0.16346226833038863, e_: 0.31906903157702615, diff: -0.15560676324663753
e_s_: 0.16614010076519473, e_: 0.30538738737511206, diff: -0.13924728660991734
e_s_: 0.16775839274560597, e_: 0.25190957572698913, diff: -0.08415118298138316
e_s_: 0.16873721418433846, e_: 0.21963592899842954, diff: -0.050898714814091084
e_s_: 0.16734910467394568, e_: 0.09516741013352137, diff: 0.07218169454042431
e_s_: 0.1656523692421419, e_: 0.0774221267883477, diff: 0.08823024245379421
e_s_: 0.16352408137282373, e_: 0.05285311216827804, diff: 0.110

e_s_: 0.16670947174977505, e_: 0.01753934443402949, diff: 0.14917012731574555
e_s_: 0.16767838122324125, e_: 0.21806167384348507, diff: -0.050383292620243825
e_s_: 0.16787303188885883, e_: 0.17799486650097185, diff: -0.010121834612113023
e_s_: 0.16495617008022095, e_: 0.013279356031052458, diff: 0.1516768140491685
e_s_: 0.16306246429919197, e_: 0.06458976368568559, diff: 0.09847270061350638
e_s_: 0.16047474323171232, e_: 0.02591324772277126, diff: 0.13456149550894106
e_s_: 0.16009998025751, e_: 0.14061230559898918, diff: 0.019487674658520804
e_s_: 0.16372959139530202, e_: 0.3524693705604869, diff: -0.18873977916518486
e_s_: 0.16340957530612268, e_: 0.14676873866879658, diff: 0.0166408366373261
e_s_: 0.16049060755216255, e_: 0.008704284346235713, diff: 0.15178632320592683
e_s_: 0.15842276968199087, e_: 0.0508952004330645, diff: 0.10752756924892637
e_s_: 0.1578345722240193, e_: 0.12724830440949786, diff: 0.03058626781452145
e_s_: 0.16089791081401753, e_: 0.3201915174939245, diff: -0.1592

e_s_: 0.1553942018459896, e_: 0.2897876515255402, diff: -0.1343934496795506
e_s_: 0.15412632925771474, e_: 0.0881969546674215, diff: 0.06592937459029324
e_s_: 0.15206399642969234, e_: 0.04482268937252787, diff: 0.10724130705716448
e_s_: 0.14955089427109017, e_: 0.018869582023777642, diff: 0.13068131224731253
e_s_: 0.1470090059607912, e_: 0.014830813825244427, diff: 0.13217819213554677
e_s_: 0.14550613206318805, e_: 0.0673566893878248, diff: 0.07814944267536325
e_s_: 0.149395026984425, e_: 0.35161756288874635, diff: -0.20222253590432135
e_s_: 0.1514026209670843, e_: 0.2557975080653674, diff: -0.1043948870982831
e_s_: 0.1487450504142256, e_: 0.010551381665574677, diff: 0.1381936687486509
e_s_: 0.1478000670072063, e_: 0.09866092984220387, diff: 0.04913913716500243
e_s_: 0.1464564211740499, e_: 0.07658683784991727, diff: 0.06986958332413262
e_s_: 0.147419167498166, e_: 0.1974819763522042, diff: -0.050062808854038215
e_s_: 0.1469305210273256, e_: 0.12152090454362385, diff: 0.025409616483701

e_s_: 0.1900624722032568, e_: 0.12759680513057825, diff: 0.06246566707267856
e_s_: 0.18876227566818185, e_: 0.12115205584428379, diff: 0.06761021982389806
e_s_: 0.1891558211324424, e_: 0.2096201852739905, diff: -0.02046436414154812
e_s_: 0.1861245519556904, e_: 0.028498554764587514, diff: 0.15762599719110287
e_s_: 0.18317012670401508, e_: 0.029540013616899063, diff: 0.15363011308711602
e_s_: 0.18191943706915004, e_: 0.11688357605616817, diff: 0.06503586101298187
e_s_: 0.18298375799025207, e_: 0.23832844588755764, diff: -0.05534468789730557
e_s_: 0.18655341160173403, e_: 0.37217539939879574, diff: -0.1856219877970617
e_s_: 0.18459633584158708, e_: 0.08282839631394667, diff: 0.1017679395276404
e_s_: 0.18552079545424663, e_: 0.233592695312542, diff: -0.04807189985829538
e_s_: 0.18888106115202105, e_: 0.3636148774362882, diff: -0.17473381628426712
e_s_: 0.18817578454250675, e_: 0.1515014008477631, diff: 0.036674383694743656
e_s_: 0.18468284523839615, e_: 0.003050001424645288, diff: 0.18163

In [85]:
config.l_s

250

In [88]:
# for values at beginning < sequence length, just use avg
if not anom['chan_id'] == 'C-2': #anom occurs early in window (limited data available for channel)
    e_s[:config.l_s] = [np.mean(e_s[:config.l_s*2])]*config.l_s 

In [90]:
np.ptp(y_test) #Range of values (maximum - minimum) along an axis.

2.0000000000000004

In [91]:
np.mean(e) 

0.17008536800478288

In [92]:
anom["normalized_error"] = np.mean(e) / np.ptp(y_test)

In [94]:
i_anom = [] # anomaly indices
window_size = config.window_size
num_windows = int((y_test.shape[0] - (config.batch_size*window_size)) / config.batch_size)

In [107]:
num_windows

87

In [97]:
i = 1
prior_idx = (i-1) * (config.batch_size)
idx = (config.window_size*config.batch_size) + ((i-1) * config.batch_size)

In [98]:
window_e_s = e_s[prior_idx:idx]
window_y_test = y_test[prior_idx:idx]

In [99]:
sd = np.std(e_s)

In [100]:
len(window_e_s)

2100

In [101]:
len(window_y_test)

2100

In [102]:
logger =  logging.getLogger('telemanom')

In [103]:
E_seq, E_seq_scores = err.process_errors(y_test, y_hat, e_s, anom, logger)
anom['scores'] = E_seq_scores

In [104]:
anom = err.evaluate_sequences(E_seq, anom)
anom["num_values"] = y_test.shape[0] + config.l_s + config.n_predictions

In [105]:
for key, value in stats.items():
    stats[key] += anom[key]

In [106]:
anom

{'anomaly_sequences': '[[1899, 2099], [4286, 4594], [3289, 3529]]',
 'chan_id': 'P-1',
 'class': '[contextual, contextual, contextual]',
 'false_negatives': 0,
 'false_positives': 1,
 'fp_sequences': [[2940, 3079]],
 'normalized_error': 0.08504268400239143,
 'num_anoms': 42,
 'num_values': 8505,
 'scores': [12.020106408800693,
  11.719987209162486,
  12.005050252112591,
  12.409526767566376],
 'spacecraft': 'SMAP',
 'tp_sequences': [(1881, 2099), (3290, 3429), (4270, 4339)],
 'true_positives': 3}