In [1]:
import sys
import operator
import os
import numpy as np
import pandas as pd
import time
import json
from operator import itemgetter
import csv
import scipy.stats as stats
from itertools import groupby
from operator import itemgetter
from datetime import datetime as dt

from telemanom._globals import Config
import telemanom.errors as err
import telemanom.helpers as helpers
import telemanom.modeling as models

Using TensorFlow backend.


In [2]:
# init config class
config = Config("config.yaml")

In [5]:
# Current time 
_id = dt.now().strftime("%Y-%m-%d_%H.%M.%S")

In [6]:
'''
HELPER CLASS
'''

'\nHELPER CLASS\n'

In [7]:
import numpy as np
import os
import logging
from datetime import datetime
import sys
import csv
import pandas as pd
import plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode
import cufflinks as cf
import glob

In [8]:
def make_dirs(_id):
    '''Create the directory tree used to store a run's artifacts, if it doesn't already exist.

    Creates `data/`, `data/<_id>/` and its `models`, `smoothed_errors` and `y_hat` subfolders.

    Args:
        _id (str): identifier of the current run, used as the folder name under `data/`

    Raises:
        ValueError: if `config.train` or `config.predict` is falsy and `data/<config.use_id>`
            is not an existing directory.
    '''

    if not config.train or not config.predict:
        if not os.path.isdir('data/%s' %config.use_id):
            # Interpolate the offending ID; the message used to show a literal "%s".
            raise ValueError("Run ID %s is not valid. If loading prior models or predictions, must provide valid ID." %config.use_id)

    paths = ['data', 'data/%s' %_id, 'data/%s/models' %_id, 'data/%s/smoothed_errors' %_id, 'data/%s/y_hat' %_id]

    for p in paths:
        # exist_ok=True makes this idempotent without a separate isdir() check
        os.makedirs(p, exist_ok=True)



def setup_logging(config, _id):
    '''Configure the 'telemanom' logger and log the runtime parameters.

    Attaches a file handler writing to data/<_id>/params.log and a handler writing to
    sys.stdout, then logs every attribute of `config` whose name contains no "__" and is
    not in a small exclusion list. Calling this more than once (e.g. re-running a notebook
    cell) does not attach equivalent handlers a second time, so messages are not duplicated.

    Args:
        config (obj): Global object specifying system runtime params.
        _id (str): Run identifier; data/<_id>/ must already exist so the log file can be opened.

    Returns:
        logger (obj): Logging object
    '''

    logger =  logging.getLogger('telemanom')
    logger.setLevel(logging.INFO)

    log_path = 'data/%s/params.log' %_id
    # FileHandler stores the absolute path in baseFilename, so compare against that
    abs_log_path = os.path.abspath(log_path)
    has_file_handler = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == abs_log_path
        for h in logger.handlers)
    if not has_file_handler:
        hdlr = logging.FileHandler(log_path)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)

    # FileHandler subclasses StreamHandler, hence the exact type check
    has_stdout_handler = any(
        type(h) is logging.StreamHandler and getattr(h, 'stream', None) is sys.stdout
        for h in logger.handlers)
    if not has_stdout_handler:
        stdout = logging.StreamHandler(sys.stdout)
        stdout.setLevel(logging.INFO)
        logger.addHandler(stdout)

    logger.info("Runtime params:")
    logger.info("----------------")
    for attr in dir(config):    
        if not "__" in attr and not attr in ['header', 'date_format', 'path_to_config', 'build_group_lookup']:
            logger.info('%s: %s' %(attr, getattr(config, attr)))
    logger.info("----------------\n")

    return logger



def load_data(anom):
    '''Load train and test data from repo. If not in repo need to download from source.

    Reads data/train/<chan_id>.npy and data/test/<chan_id>.npy and shapes both with
    shape_data() (the training windows are shuffled, the test windows are not).

    Args:
        anom (dict): contains anomaly information for a given input stream; only
            anom['chan_id'] is read here

    Returns:
        X_train (np array): array of train inputs with dimensions [timesteps, l_s, input dimensions]
        y_train (np array): array of train outputs corresponding to true values following each sequence
        X_test (np array): array of test inputs with dimensions [timesteps, l_s, input dimensions]
        y_test (np array): array of test outputs corresponding to true values following each sequence

    Raises:
        ValueError: if either source array cannot be loaded; the underlying error is
            attached as __cause__.
    '''
    try:
        train = np.load(os.path.join("data", "train", anom['chan_id'] + ".npy"))
        test = np.load(os.path.join("data", "test", anom['chan_id'] + ".npy"))

    except Exception as load_err:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate;
        # chaining keeps the real reason (missing file, bad key, corrupt array) visible.
        raise ValueError("Source data not found, may need to add data to repo: <link>") from load_err

    # shape, split data
    X_train, y_train = shape_data(train)
    X_test, y_test = shape_data(test, train=False)

    return X_train, y_train, X_test, y_test


def shape_data(arr, train=True):
    '''Cut a raw input stream into overlapping windows for the LSTM.

    Each window spans config.l_s + config.n_predictions consecutive timesteps of `arr`: the
    first config.l_s timesteps become the model input and channel 0 of the remaining
    config.n_predictions timesteps becomes the target.

    Args:
        arr (np array): raw stream; the stacked windows must form a 3-D array, so `arr` is
            expected to be 2-D [timesteps, input dimensions]
        train (bool): if True the windows are shuffled in place with np.random.shuffle

    Returns:
        X (np array): inputs with dimensions [windows, l_s, input dimensions]
        y (np array): targets with dimensions [windows, n_predictions]
    '''

    span = config.l_s + config.n_predictions
    n_windows = len(arr) - span

    data = np.array([arr[start:start + span] for start in range(n_windows)])

    assert data.ndim == 3

    if train == True:
        np.random.shuffle(data)

    cut = -config.n_predictions
    X = data[:, :cut, :]
    y = data[:, cut:, 0] # telemetry value is at position 0

    return X, y

In [9]:
def final_stats(stats, logger):
    '''Log final stats at end of experiment.

    Logs the three totals, then precision (TP / (TP + FP)) and recall (TP / (TP + FN)).
    Each ratio is handled on its own: a zero denominator yields "NaN" for that ratio
    only, so a valid precision is never followed by a contradictory "Precision: NaN".

    Args:
        stats (dict): Count of true positives, false positives, and false negatives from experiment
        logger (obj): logging object
    '''

    tp = stats["true_positives"]
    fp = stats["false_positives"]
    fn = stats["false_negatives"]

    logger.info("Final Totals:")
    logger.info("-----------------")
    logger.info("True Positives: %s " %tp)
    logger.info("False Positives: %s " %fp)
    logger.info("False Negatives: %s\n" %fn)

    for label, denominator in (("Precision", tp + fp), ("Recall", tp + fn)):
        try:
            logger.info("%s: %s" %(label, float(tp)/float(denominator)))
        except ZeroDivisionError:
            logger.info("%s: NaN" %label)


def anom_stats(stats, anom, logger):
    '''Report the counts for the stream just processed, then the running totals.

    Args:
        stats (dict): running totals of true positives, false positives, and false negatives
        anom (dict): anomaly information for the current stream, including its own
            true/false positive and false negative counts
        logger (obj): logging object
    '''

    stream_counts = (anom["true_positives"], anom["false_positives"], anom["false_negatives"])
    logger.info("TP: %s  FP: %s  FN: %s" %stream_counts)

    total_lines = (
        ('Total true positives: %s', "true_positives"),
        ('Total false positives: %s', "false_positives"),
        ('Total false negatives: %s\n', "false_negatives"),
    )
    for template, key in total_lines:
        logger.info(template %stats[key])



def view_results(results_fn, plot_errors=True, plot_train=False, rows=None):
    ''' Reads data from data dir and generates interactive plots for display in `results-viewer.ipynb` using 
    plotly offline mode. A chart showing y_hat and y_test values for each stream is generated by default. 

    For every row of the results csv, the stream's `y_hat` and `smoothed_errors` arrays are loaded from
    ../data/<config.use_id>/ and its raw `test` and `train` arrays from ../data/. Rows whose
    (reader.line_num - 1) falls inside `rows` (inclusive on both ends) are printed and plotted.

    Args:
        results_fn (str): name of results csv to plot results for
        plot_errors (bool): If True, a chart displaying the smoothed errors for each stream will be generated
        plot_train (bool): If True, a chart displaying the telemetry from training data is
            generated (command data not plotted)
        rows (tuple): Start and end row indicating rows to plot results for in results csv file.
            If None, rows 0 through 100000 are used.

    Returns:
        None

    Raises:
        ValueError: if `rows` is given but cannot be indexed with [0] and [1]
    '''

    def create_shapes(ranges, range_type, _min, _max):
        ''' Create shapes for regions to highlight in plotly vizzes (true and predicted anomaly sequences)

        Returns one plotly rectangle dict per [start, end] pair in `ranges`, spanning _min.._max
        on the y axis: red for range_type 'true', blue for 'predicted'.
        '''

        # NOTE(review): any other range_type leaves `color` unbound, so a non-empty `ranges`
        # would raise a NameError below.
        if range_type == 'true':
            color = 'red'
        elif range_type == 'predicted':
            color = 'blue'
        
        shapes = []
        if len(ranges) > 0:
        
            for r in ranges:

                shape = {
                    'type': 'rect',
                    'x0': r[0],
                    'y0': _min,
                    'x1': r[1],
                    'y1': _max,
                    'fillcolor': color,
                    'opacity': 0.2,
                    'line': {
                        'width': 0,
                    },
                }
            
                shapes.append(shape)
            
        return shapes



    # per-channel arrays, keyed by chan_id and then by "y_hat"/"smoothed_errors"/"test"/"train"
    vals = {}

    with open(results_fn, "r") as f:
        reader = csv.DictReader(f)
        for anom in reader:

            chan = anom["chan_id"]
            vals[chan] = {}
            dirs = ["y_hat", "smoothed_errors"]
            raw_dirs = ["test", "train"]

            # NOTE(review): arrays are loaded for every csv row, including rows outside `rows`;
            # a missing file for an unselected row therefore still raises.
            for d in dirs:
                vals[chan][d] = list(np.load(os.path.join("../data", config.use_id, d, anom["chan_id"]) + ".npy"))
            for d in raw_dirs:
                vals[chan][d] = list(np.load(os.path.join("../data", d, anom["chan_id"]) + ".npy"))

            # default range; re-derived on every iteration although it does not depend on the row
            row_start = 0
            row_end = 100000
            if not rows == None:
                try:
                    row_start = rows[0]
                    row_end = rows[1]
                except:
                    raise ValueError("Rows not in correct format, please use (<first row>, <last row>)")

            # Info
            # ================================================================================================
            # reader.line_num counts lines read so far; subtracting 1 presumably discounts the
            # header line, making the first data row number 1 -- TODO confirm for multi-line records
            if reader.line_num - 1 >= row_start and reader.line_num -1 <= row_end:
                print("Spacecraft: %s" %anom['spacecraft'])
                print("Channel: %s" %anom["chan_id"])
                print('Normalized prediction error: %.3f' %float(anom['normalized_error']))
                print('Anomaly class(es): %s' %anom['class'])
                print("------------------")
                print('True Positives: %s' %anom['true_positives'])
                print("False Positives: %s" %anom["false_positives"])
                print("False Negatives: %s" %anom["false_negatives"])
                print("------------------")
                print('Predicted anomaly scores: %s' %anom['scores'])
                print("Number of values: %s"%len(vals[chan]["test"]))

                # Extract telemetry values from test data
                # ================================================================================================

                # column 0 of the raw test array is taken as the telemetry value
                y_test = np.array(vals[chan]['test'])[:,0] 

                # Create highlighted regions (red = true anoms / blue = predicted anoms)
                # ================================================================================================
                # SECURITY NOTE(review): eval() executes whatever text is in these csv fields. If the
                # results file can come from an untrusted source, consider ast.literal_eval instead
                # (confirm first that the stored values are plain list literals).
                # The fixed -1..1 band presumably matches the scaling of the telemetry -- TODO confirm.
                y_shapes = create_shapes(eval(anom['anomaly_sequences']), "true", -1, 1)
                y_shapes += create_shapes(eval(anom['tp_sequences']) + eval(anom['fp_sequences']), "predicted", -1, 1)

                e_shapes = create_shapes(eval(anom['anomaly_sequences']), "true", 0, max(vals[chan]['smoothed_errors']))
                e_shapes += create_shapes(eval(anom['tp_sequences']) + eval(anom['fp_sequences']), "predicted", 
                                          0, max(vals[chan]['smoothed_errors']))

                # Move data into dataframes and plot with Plotly
                # ================================================================================================
                train_df = pd.DataFrame({
                    'train': [x[0] for x in vals[chan]['train']]
                })

                # Trim the first l_s and last n_predictions telemetry values so `y` lines up with
                # y_hat; if the lengths still differ, derive the leading offset from the length
                # of y_hat instead and drop only the final value.
                y = y_test[config.l_s:-config.n_predictions]
                if not len(y) == len(vals[chan]['y_hat']):
                    modified_l_s = len(y_test) - len(vals[chan]['y_hat']) - 1
                    y = y_test[modified_l_s:-1]
                y_df = pd.DataFrame({
                    'y_hat': vals[chan]['y_hat'],
                    'y': y
                })

                e_df = pd.DataFrame({
                    'e_s': vals[chan]['smoothed_errors']
                })

                y_layout = {
                    'title': "y / y_hat comparison",
                    'shapes': y_shapes,
                } 

                e_layout = {
                    'title': "Smoothed Errors (e_s)",
                    'shapes': e_shapes,
                } 

                # DataFrame.iplot is not a pandas method; presumably provided by the cufflinks
                # import -- TODO confirm
                if plot_train:
                    train_df.iplot(kind='scatter', color='green')
                
                y_df.iplot(kind='scatter', layout=y_layout)
                
                if plot_errors:
                    e_df.iplot(kind='scatter', layout=e_layout, color='red')

In [10]:
# train model with one file
from keras.models import Sequential, load_model
from keras.callbacks import History, EarlyStopping, Callback
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation, Dropout
import numpy as np
import os

def get_model(anom, X_train, y_train, logger):
    '''Build, train, and save a two-layer LSTM for one input stream.

    The network is LSTM -> Dropout -> LSTM -> Dropout -> Dense(config.n_predictions) -> linear
    activation, with layer sizes, dropout, loss, optimizer, and training settings read from
    the global `config`. Training uses early stopping on `val_loss`. The fitted model is
    written to data/<anom['run_id']>/models/<anom['chan_id']>.h5.

    Args:
        anom (dict): stream information; 'run_id' and 'chan_id' determine the save path
        X_train (np array): training inputs; X_train.shape[2] is used as the input feature count
        y_train (np array): training targets
        logger (obj): logging object (not used in this function)

    Returns:
        model (obj): the trained Keras model
    '''
    history = History()
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=config.patience,
        min_delta=config.min_delta,
        verbose=0)
    cbs = [history, early_stopping]

    model = Sequential()

    n_features = X_train.shape[2]
    layer_stack = [
        LSTM(config.layers[0], input_shape=(None, n_features), return_sequences=True),
        Dropout(config.dropout),
        LSTM(config.layers[1], return_sequences=False),
        Dropout(config.dropout),
        Dense(config.n_predictions),
        Activation("linear"),
    ]
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss=config.loss_metric, optimizer=config.optimizer)

    model.fit(
        X_train,
        y_train,
        batch_size=config.lstm_batch_size,
        epochs=config.epochs,
        validation_split=config.validation_split,
        callbacks=cbs,
        verbose=True)

    save_path = os.path.join("data", anom['run_id'], "models", anom["chan_id"] + ".h5")
    model.save(save_path)

    return model 

In [11]:
def predict_in_batches(y_test, X_test, model, anom):
    '''Used trained LSTM model to predict test data arriving in batches (designed to 
    mimic a spacecraft downlinking schedule).

    Each batch of config.batch_size windows is passed to model.predict and the first-step
    prediction of every window is kept. The concatenated predictions are saved to
    data/<anom['run_id']>/y_hat/<anom['chan_id']>.npy and returned.

    Args:
        y_test (np array): numpy array of test outputs corresponding to true values to be predicted at end of each sequence
        X_test (np array): numpy array of test inputs with dimensions [timesteps, l_s, input dimensions]
        model (obj): trained Keras model 
        anom (dict): contains all anomaly information for a given input stream

    Returns:
        y_hat (np array): predicted test values for each timestep in y_test  

    Raises:
        ValueError: if the computed number of batches is negative, i.e. config.l_s is too
            large for the stream
    '''

    n_test = y_test.shape[0]

    num_batches = int((n_test - config.l_s) / config.batch_size)
    if num_batches < 0:
        raise ValueError("l_s (%s) too large for stream with length %s." %(config.l_s, y_test.shape[0]))

    # simulate data arriving in batches
    y_hat_batches = []
    for i in range(1, num_batches+2):
        prior_idx = (i-1) * config.batch_size
        idx = i * config.batch_size
        if i == num_batches+1:
            idx = n_test #remaining values won't necessarily equal batch size
        
        X_test_period = X_test[prior_idx:idx]

        y_hat_period = model.predict(X_test_period)

        # The previous nested loop gathered every n-step-ahead prediction for timestep t
        # and then kept only the first one (or 0 when all were 0, which is the same value),
        # i.e. y_hat_period[t][0]. Taking column 0 yields the same values directly.
        y_hat_batches.append(np.asarray(y_hat_period)[:, 0])

    # One concatenation instead of repeated np.append (which copies the whole array each
    # time); the leading empty float array keeps the result dtype of the old append chain.
    y_hat = np.concatenate([np.array([])] + y_hat_batches)

    np.save(os.path.join("data", anom['run_id'], "y_hat", anom["chan_id"] + ".npy"), np.array(y_hat))

    # previously missing: the docstring promised y_hat but the function returned None
    return y_hat

In [13]:
# Running totals across all processed streams.
# NOTE: this rebinds the name `stats`, shadowing the `scipy.stats as stats` import above.
stats = {
        "true_positives": 0,
        "false_positives": 0,
        "false_negatives": 0
    }

# "rU" is deprecated and rejected by newer Python versions; text mode already handles
# universal newlines, and newline="" is what the csv module recommends for its input files.
with open("labeled_anomalies.csv", "r", newline="") as f:
    reader = csv.DictReader(f)
    # only the first labeled stream is used in this walkthrough
    p1 = next(reader, None)

if p1 is None:
    raise ValueError("labeled_anomalies.csv contains no anomaly rows")

# Later cells refer to this same row as `anom` (previously a leaked loop variable),
# so bind the name explicitly.
anom = p1


'U' mode is deprecated



In [14]:
p1

OrderedDict([('chan_id', 'P-1'),
             ('spacecraft', 'SMAP'),
             ('anomaly_sequences',
              '[[1899, 2099], [4286, 4594], [3289, 3529]]'),
             ('class', '[contextual, contextual, contextual]'),
             ('num_values', '8505')])

In [15]:
X_train, y_train, X_test, y_test = load_data(p1)

In [44]:
# `train` only exists as a local inside load_data(), so reload the raw training array
# here instead of relying on a variable left over from an earlier kernel session.
train = np.load(os.path.join("data", "train", p1["chan_id"] + ".npy"))
print("train_shape: {0}, X_train_shape: {1}, y_train_shape: {2}".format(train.shape,X_train.shape,y_train.shape))

train_shape: (2872, 25), X_train_shape: (2612, 250, 25), y_train_shape: (2612, 10)


In [46]:
# Only the test shapes are printed, so don't pass (and depend on) the unrelated `train` array.
print("X_test_shape: {0}, y_test_shape: {1}".format(X_test.shape,y_test.shape))

X_test_shape: (8245, 250, 25), y_test_shape: (8245, 10)


In [50]:
make_dirs(_id)  
logger = setup_logging(config,_id)

Runtime params:
----------------
batch_size: 70
dropout: 0.3
epochs: 35
error_buffer: 100
l_s: 250
layers: [80, 80]
loss_metric: mse
lstm_batch_size: 64
min_delta: 0.0003
n_predictions: 10
optimizer: adam
p: 0.13
patience: 10
predict: True
smoothing_perc: 0.05
train: True
use_id: 2018-05-19_15.00.10
validation_split: 0.2
window_size: 30
----------------



In [47]:
y_hat = []

In [52]:
# get_model() saves to data/<anom['run_id']>/models/, but rows read from
# labeled_anomalies.csv carry no 'run_id'; point it at the current run's folder
# (created by make_dirs(_id)) unless one has already been set.
anom.setdefault("run_id", _id)
model = get_model(anom, X_train, y_train, logger)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 2089 samples, validate on 523 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [20]:
# P1 model
model = load_model(os.path.join("data", "2019-10-19_12.05.37", "models", p1["chan_id"] + ".h5"))





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [40]:
y_hat = np.array([])

In [21]:
i = 1
prior_idx = (i-1) * config.batch_size
idx = i * config.batch_size
X_test_period = X_test[prior_idx:idx]

In [22]:
X_test_period.shape

(70, 250, 25)

In [24]:
y_hat_period = model.predict(X_test_period)

In [25]:
y_hat_period.shape

(70, 10)

In [None]:
for t in range(len(y_hat_period)+config.n_predictions):
     for j in range(config.n_predictions):
        if t - j >= 0 and t-j < len(y_hat_period):
           

In [26]:
 # map predictions n steps ahead to their corresponding timestep
# TODO: vectorize
final_y_hat = []
for t in range(len(y_hat_period)+config.n_predictions):
    y_hat_t = []
    for j in range(config.n_predictions):
        if t - j >= 0 and t-j < len(y_hat_period):
            print("y_hat_t appending:{0} when t: {1}, j: {2}".format(y_hat_period[t-j][j],t,j))
            y_hat_t.append(y_hat_period[t-j][j])
    if t < len(y_hat_period):
        if y_hat_t.count(0) == len(y_hat_t):
            print("final_y_hat appending:{0} when t: {1}, j: {2}".format(0,t,j))
            final_y_hat.append(0)
        else:
            print("final_y_hat appending:{0} when t: {1}, j: {2}".format(y_hat_t[0],t,j))
            final_y_hat.append(y_hat_t[0]) # first prediction

y_hat_t appending:-0.6770421266555786 when t: 0, j: 0
final_y_hat appending:-0.6770421266555786 when t: 0, j: 9
y_hat_t appending:-0.5974261164665222 when t: 1, j: 0
y_hat_t appending:-0.5551894903182983 when t: 1, j: 1
final_y_hat appending:-0.5974261164665222 when t: 1, j: 9
y_hat_t appending:-0.3904535174369812 when t: 2, j: 0
y_hat_t appending:-0.45257142186164856 when t: 2, j: 1
y_hat_t appending:-0.45999521017074585 when t: 2, j: 2
final_y_hat appending:-0.3904535174369812 when t: 2, j: 9
y_hat_t appending:-0.027354389429092407 when t: 3, j: 0
y_hat_t appending:-0.0529254749417305 when t: 3, j: 1
y_hat_t appending:-0.347562313079834 when t: 3, j: 2
y_hat_t appending:-0.29471099376678467 when t: 3, j: 3
final_y_hat appending:-0.027354389429092407 when t: 3, j: 9
y_hat_t appending:0.25842124223709106 when t: 4, j: 0
y_hat_t appending:0.2760859727859497 when t: 4, j: 1
y_hat_t appending:0.2608155608177185 when t: 4, j: 2
y_hat_t appending:-0.18623128533363342 when t: 4, j: 3
y_hat_t

In [33]:
print(y_hat_period.shape)
print(y_hat_period)

(70, 10)
[[-0.6770421  -0.5551895  -0.4599952  -0.294711   -0.16158542 -0.0400058
   0.03732682  0.01172672 -0.0188783  -0.10645869]
 [-0.5974261  -0.45257142 -0.3475623  -0.18623129 -0.08983693 -0.02891666
  -0.00086468 -0.081485   -0.15216562 -0.24866883]
 [-0.39045352 -0.05292547  0.26081556  0.4546463   0.45914608  0.4315275
   0.37627184  0.3036738   0.24189723  0.14200293]
 [-0.02735439  0.27608597  0.47576302  0.5400898   0.5250947   0.47342008
   0.3676131   0.30703518  0.2059778   0.05133386]
 [ 0.25842124  0.4235867   0.508266    0.5085054   0.503571    0.4365129
   0.3248022   0.24925911  0.07909243 -0.13165544]
 [ 0.39303803  0.5023932   0.5449575   0.5030302   0.4746124   0.36876482
   0.19987321  0.06849283 -0.13108887 -0.33641827]
 [ 0.46716172  0.54288167  0.55855423  0.46528637  0.37945643  0.2286007
   0.01404979 -0.1486843  -0.3285302  -0.48577702]
 [ 0.52979684  0.5691159   0.54356605  0.38662058  0.2303429   0.05151409
  -0.18043366 -0.3421725  -0.47540966 -0.57106

In [34]:
print(len(final_y_hat))
print(final_y_hat)

70
[-0.6770421, -0.5974261, -0.39045352, -0.02735439, 0.25842124, 0.39303803, 0.46716172, 0.52979684, 0.54919237, 0.53861594, 0.47006792, 0.34165406, 0.21991129, 0.04706092, -0.11691395, -0.21143019, -0.34096992, -0.536691, -0.6701772, -0.6922969, -0.6164344, -0.45837212, -0.32611302, -0.19159502, -0.18923463, -0.26894957, -0.3579138, -0.46402556, -0.57980746, -0.64089406, -0.6646563, -0.66954684, -0.67042553, -0.6690006, -0.6711086, -0.69331896, -0.7049776, -0.7003339, -0.69291806, -0.59087855, -0.4963504, -0.3300259, -0.24017477, -0.25901964, -0.4384228, -0.5795539, -0.66974473, -0.7234544, -0.743745, -0.7434748, -0.73426676, -0.71519244, -0.68880033, -0.68457127, -0.64814293, -0.59873545, -0.5381755, -0.4386655, -0.36213535, -0.21792522, -0.011626489, 0.07156119, 0.16968031, 0.26661474, 0.29140842, 0.26931757, 0.23747368, 0.18900554, 0.20550825, 0.20900099]


In [35]:
y_hat_period = np.array(final_y_hat).reshape(len(final_y_hat),1)

In [38]:
print(y_hat_period.shape)
print(y_hat_period)

(70, 1)
[[-0.6770421 ]
 [-0.5974261 ]
 [-0.39045352]
 [-0.02735439]
 [ 0.25842124]
 [ 0.39303803]
 [ 0.46716172]
 [ 0.52979684]
 [ 0.54919237]
 [ 0.53861594]
 [ 0.47006792]
 [ 0.34165406]
 [ 0.21991129]
 [ 0.04706092]
 [-0.11691395]
 [-0.21143019]
 [-0.34096992]
 [-0.536691  ]
 [-0.6701772 ]
 [-0.6922969 ]
 [-0.6164344 ]
 [-0.45837212]
 [-0.32611302]
 [-0.19159502]
 [-0.18923463]
 [-0.26894957]
 [-0.3579138 ]
 [-0.46402556]
 [-0.57980746]
 [-0.64089406]
 [-0.6646563 ]
 [-0.66954684]
 [-0.67042553]
 [-0.6690006 ]
 [-0.6711086 ]
 [-0.69331896]
 [-0.7049776 ]
 [-0.7003339 ]
 [-0.69291806]
 [-0.59087855]
 [-0.4963504 ]
 [-0.3300259 ]
 [-0.24017477]
 [-0.25901964]
 [-0.4384228 ]
 [-0.5795539 ]
 [-0.66974473]
 [-0.7234544 ]
 [-0.743745  ]
 [-0.7434748 ]
 [-0.73426676]
 [-0.71519244]
 [-0.68880033]
 [-0.68457127]
 [-0.64814293]
 [-0.59873545]
 [-0.5381755 ]
 [-0.4386655 ]
 [-0.36213535]
 [-0.21792522]
 [-0.01162649]
 [ 0.07156119]
 [ 0.16968031]
 [ 0.26661474]
 [ 0.29140842]
 [ 0.26931757]
 [

In [64]:
num_batches = int((y_test.shape[0] - config.l_s) / config.batch_size)
if num_batches < 0:
    raise ValueError("l_s (%s) too large for stream with length %s." %(config.l_s, y_test.shape[0]))

# simulate data arriving in batches
y_hat_batches = []
for i in range(1, num_batches+2):
    prior_idx = (i-1) * config.batch_size
    idx = i * config.batch_size
    if i == num_batches+1:
        idx = y_test.shape[0] #remaining values won't necessarily equal batch size

    X_test_period = X_test[prior_idx:idx]

    y_hat_period = model.predict(X_test_period)

    # The step-by-step loop explored above always ends up keeping the first-step
    # prediction of each window (y_hat_period[t][0]), so take that column directly.
    y_hat_batches.append(np.asarray(y_hat_period)[:, 0])

# Single concatenation instead of repeated np.append; the leading empty float array
# keeps the same result dtype as the original append chain. The result is already 1-D.
y_hat = np.concatenate([np.array([])] + y_hat_batches)

y_test.shape

In [65]:
y_hat.shape

(8245,)

In [None]:
'''
Get error
'''

In [76]:
e = [abs(y_h-y_t[0]) for y_h,y_t in zip(y_hat, y_test)]

In [79]:
config.smoothing_perc

0.05

In [81]:
int(config.batch_size * config.window_size * config.smoothing_perc)

105

In [82]:
e

[0.13684572339597345,
 0.04049640807531807,
 0.1630981147491415,
 0.19345321982542796,
 0.15882130651540516,
 0.0023784178170953574,
 0.031730788046820724,
 0.1697302465001953,
 0.17943610724028458,
 0.16309364636213575,
 0.06285304842905548,
 0.2203682954225017,
 0.3479315709907662,
 0.4924077673436389,
 0.6229159597708898,
 0.10395723977598381,
 0.3963891367269454,
 0.09153752888986544,
 0.26894033458071975,
 0.2881498892898722,
 0.24720911679035895,
 0.053450326209954824,
 0.19770272557103352,
 0.00425416098098097,
 0.0017438038582695548,
 0.10343480783892822,
 0.06878683831429155,
 0.3475422634472056,
 0.0971658853595807,
 0.0032076833922256576,
 0.0276974105312644,
 0.0034864386088502286,
 0.03346662779278908,
 0.028404036405403232,
 0.09389687781875944,
 0.009070145337691127,
 0.022183791835292066,
 0.10686877892115776,
 0.20728700673798706,
 0.08536729839540769,
 0.2682675038782607,
 0.07138647251419528,
 0.032287827393847834,
 0.6253018324077733,
 0.317852243587299,
 0.15926015

In [84]:
smoothing_window = int(config.batch_size * config.window_size * config.smoothing_perc)
if not len(y_hat) == len(y_test):
    raise ValueError("len(y_hat) != len(y_test), can't calculate error: %s (y_hat) , %s (y_test)" %(len(y_hat), len(y_test)))

e_s = list(pd.DataFrame(e).ewm(span=smoothing_window).mean().values.flatten())

In [87]:
pd.DataFrame(e).ewm(span=smoothing_window).mean()

Unnamed: 0,0
0,0.136846
1,0.088212
2,0.113651
3,0.134175
4,0.139294
...,...
8240,0.156434
8241,0.159841
8242,0.157998
8243,0.158852


In [91]:
for e_s_,e_ in zip(e_s,e):
    print("e_s_: {0}, e_: {1}, diff: {2}".format(e_s_,e_,e_s_-e_))

e_s_: 0.13684572339597345, e_: 0.13684572339597345, diff: 0.0
e_s_: 0.08821225947221406, e_: 0.04049640807531807, diff: 0.04771585139689599
e_s_: 0.11365117197347044, e_: 0.1630981147491415, diff: -0.04944694277567105
e_s_: 0.1341752829113504, e_: 0.19345321982542796, diff: -0.05927793691407757
e_s_: 0.13929403791193712, e_: 0.15882130651540516, diff: -0.01952726860346804
e_s_: 0.11537447229196869, e_: 0.0023784178170953574, diff: 0.11299605447487333
e_s_: 0.10273183769249915, e_: 0.031730788046820724, diff: 0.07100104964567842
e_s_: 0.11167548747778705, e_: 0.1697302465001953, diff: -0.05805475902240825
e_s_: 0.11979069889472445, e_: 0.17943610724028458, diff: -0.05964540834556013
e_s_: 0.12450149187837868, e_: 0.16309364636213575, diff: -0.038592154483757066
e_s_: 0.11834824319684183, e_: 0.06285304842905548, diff: 0.055495194767786346
e_s_: 0.12776854257948803, e_: 0.2203682954225017, diff: -0.09259975284301367
e_s_: 0.14670652563141023, e_: 0.3479315709907662, diff: -0.201225045359

e_s_: 0.18633774344031392, e_: 0.4643012347662179, diff: -0.2779634913259039
e_s_: 0.18717410655092243, e_: 0.2306649883025642, diff: -0.043490881751641786
e_s_: 0.18812849843137758, e_: 0.23775687621504638, diff: -0.0496283777836688
e_s_: 0.18956852926232207, e_: 0.26445013247143745, diff: -0.07488160320911538
e_s_: 0.19097289631917977, e_: 0.26399998327578, diff: -0.07302708695660023
e_s_: 0.19197430386137748, e_: 0.24404749605565978, diff: -0.0520731921942823
e_s_: 0.1951865994501863, e_: 0.3622259700682422, diff: -0.1670393706180559
e_s_: 0.19449559738124966, e_: 0.15856348979654555, diff: 0.03593210758470411
e_s_: 0.19302931364204906, e_: 0.11678255920361913, diff: 0.07624675443842993
e_s_: 0.19154450427616107, e_: 0.11433441724998605, diff: 0.07721008702617502
e_s_: 0.18897377064037582, e_: 0.05529562157954304, diff: 0.13367814906083278
e_s_: 0.18541676598659168, e_: 0.0004525239898161004, diff: 0.18496424199677558
e_s_: 0.18223352591918762, e_: 0.01670504241417703, diff: 0.16552

e_s_: 0.1356398590392238, e_: 0.29486664165052234, diff: -0.15922678261129855
e_s_: 0.13338111029868074, e_: 0.015926175790442976, diff: 0.11745493450823777
e_s_: 0.13551198274612428, e_: 0.24631735001318766, diff: -0.11080536726706339
e_s_: 0.13562288693670815, e_: 0.14138990484706948, diff: -0.005767017910361327
e_s_: 0.1359084004649422, e_: 0.1507551039331121, diff: -0.01484670346816988
e_s_: 0.13795765558387255, e_: 0.24451892176824996, diff: -0.10656126618437742
e_s_: 0.13605136587160432, e_: 0.036924300833656165, diff: 0.09912706503794816
e_s_: 0.13463618898502636, e_: 0.06104699088297272, diff: 0.07358919810205364
e_s_: 0.13940714387217276, e_: 0.3874967980037849, diff: -0.24808965413161213
e_s_: 0.1408259604104392, e_: 0.21460442040029326, diff: -0.07377845998985405
e_s_: 0.1415709242707172, e_: 0.18030904500517453, diff: -0.038738120734457315
e_s_: 0.140128730156419, e_: 0.06513463621291082, diff: 0.07499409394350817
e_s_: 0.13877362774365243, e_: 0.0683083022797919, diff: 0.0

In [94]:
# for values at beginning < sequence length, just use avg
if not anom['chan_id'] == 'C-2': #anom occurs early in window (limited data available for channel)
    e_s[:config.l_s] = [np.mean(e_s[:config.l_s*2])]*config.l_s 

In [95]:
np.ptp(y_test) #Range of values (maximum - minimum) along an axis.

2.0000000000000004

In [104]:
np.mean(e) 

0.16041766925218764

In [103]:
anom["normalized_error"] = np.mean(e) / np.ptp(y_test)

In [110]:
len(e)

8245

In [106]:
i_anom = [] # anomaly indices
window_size = config.window_size
num_windows = int((y_test.shape[0] - (config.batch_size*window_size)) / config.batch_size)

In [107]:
num_windows

87

In [120]:
i = 1
prior_idx = (i-1) * (config.batch_size)
idx = (config.window_size*config.batch_size) + ((i-1) * config.batch_size)

In [121]:
window_e_s = e_s[prior_idx:idx]
window_y_test = y_test[prior_idx:idx]

In [125]:
sd = np.std(e_s)

In [123]:
len(window_e_s)

2100

In [124]:
len(window_y_test)

2100

In [128]:
logger =  logging.getLogger('telemanom')

In [129]:
E_seq, E_seq_scores = err.process_errors(y_test, y_hat, e_s, anom, logger)
anom['scores'] = E_seq_scores

In [132]:
anom = err.evaluate_sequences(E_seq, anom)
anom["num_values"] = y_test.shape[0] + config.l_s + config.n_predictions

In [134]:
# Fold this stream's TP/FP/FN counts into the running totals.
# NOTE: not idempotent -- re-running this cell adds the same counts again.
for key in stats:
    stats[key] += anom[key]