In [None]:
# Step 1. Load configuration from a YAML file

In [1]:
import yaml
import json
import sys
import os
class Config:
    '''Loads parameters from a YAML config file and exposes them as attributes.

    Every top-level key of the YAML mapping becomes an attribute on the
    instance, so a key ``l_s`` in the file is read back as ``config.l_s``.
    The path the settings were read from is kept in ``path_to_config``.
    '''

    def __init__(self, path_to_config):
        '''Read the YAML file and copy its top-level keys onto this object.

        Args:
            path_to_config (str): path to the YAML configuration file

        Raises:
            FileNotFoundError: if path_to_config does not point to an existing file
            ValueError: if the file does not contain a top-level mapping
                (for example when it is empty)
        '''
        # Fail immediately with a clear message instead of printing a warning
        # and then crashing inside open() anyway.
        if not os.path.isfile(path_to_config):
            raise FileNotFoundError(
                "No configuration file found at: %s" % path_to_config)

        self.path_to_config = path_to_config

        with open(path_to_config, "r") as f:
            # yaml.load() without an explicit Loader is deprecated since
            # PyYAML 5.1 and raises TypeError in PyYAML 6. safe_load() parses
            # plain YAML only and never constructs arbitrary Python objects.
            # NOTE(review): if the config ever relies on python-specific YAML
            # tags, switch to yaml.load(f, Loader=yaml.FullLoader) deliberately.
            loaded = yaml.safe_load(f)

        # An empty file parses to None and a top-level list/scalar has no keys;
        # neither can be mapped onto attributes.
        if not isinstance(loaded, dict):
            raise ValueError(
                "Configuration file must contain a top-level mapping: %s"
                % path_to_config)

        # A YAML key named 'path_to_config' overrides the attribute set above.
        for key, value in loaded.items():
            setattr(self, key, value)
                
# Build the notebook-wide settings object from config.yaml (path is relative to the working directory)
config = Config(path_to_config="config.yaml")



In [2]:
# Step 2. Load and preprocess data

In [3]:
import numpy as np
import sys
import os

def load_preprocess_data(train_path, test_path):
    '''Read the train and test arrays from disk and window them with shape_data.

    Args:
        train_path (str): path of the .npy file holding the training stream
        test_path (str): path of the .npy file holding the test stream

    Returns:
        X_train (np array): train inputs produced by shape_data (windows may be shuffled)
        y_train (np array): train targets matching each row of X_train
        X_test (np array): test inputs produced by shape_data with train=False (order kept)
        y_test (np array): test targets matching each row of X_test
    '''
    # Train file is read first, then the test file.
    raw_train, raw_test = np.load(train_path), np.load(test_path)

    # Only the training windows are allowed to be shuffled.
    train_inputs, train_targets = shape_data(raw_train)
    test_inputs, test_targets = shape_data(raw_test, train=False)

    return train_inputs, train_targets, test_inputs, test_targets

def shape_data(arr, train=True, l_s=None, n_predictions=None):
    '''Slice a raw input stream into overlapping windows for ingestion into the LSTM.

    Each window holds l_s consecutive timesteps of model input followed by
    n_predictions timesteps whose first feature (column 0) is the target.

    Args:
        arr (np array): input stream with dimensions [timesteps, input dimensions]
        train (bool): if truthy, the windows are shuffled (training data only)
        l_s (int, optional): number of prior timesteps fed to the model per
            window. Defaults to config.l_s.
        n_predictions (int, optional): number of timesteps to predict after
            each window. Defaults to config.n_predictions.

    Returns:
        X (np array): inputs with dimensions [n_windows, l_s, input dimensions]
        y (np array): targets with dimensions [n_windows, n_predictions]
        where n_windows = timesteps - l_s - n_predictions

    Raises:
        AssertionError: if the windows do not stack into a 3-D array, e.g.
            arr is not 2-D or has too few timesteps for a single window
    '''
    # NOTE(review): an identical shape_data is defined again in a later cell of
    # this notebook; after "Run All" the later definition is the one in effect.

    # Fall back to the notebook-wide settings only when not given explicitly.
    if l_s is None:
        l_s = config.l_s
    if n_predictions is None:
        n_predictions = config.n_predictions

    window = l_s + n_predictions
    data = np.array([arr[i:i + window] for i in range(len(arr) - window)])

    assert data.ndim == 3, (
        "Expected a 2-D input with more than l_s + n_predictions (%d) timesteps; "
        "stacked windows have shape %s" % (window, data.shape))

    # Shuffle whole windows before splitting so X and y rows stay aligned.
    if train:
        np.random.shuffle(data)

    # Split on l_s rather than -n_predictions so n_predictions == 0 is handled too.
    X = data[:, :l_s, :]
    y = data[:, l_s:, 0]  # telemetry value is at position 0

    return X, y

In [4]:
import numpy as np

def load_data(anom, data_dir="data"):
    '''Load and window the train and test data for one input stream.

    The arrays are read from <data_dir>/train/<chan_id>.npy and
    <data_dir>/test/<chan_id>.npy. If they are not in the repo they need to be
    downloaded from source first.

    Args:
        anom (dict): anomaly information for a given input stream; its
            'chan_id' entry names the .npy files to load
        data_dir (str): directory containing the 'train' and 'test' folders.
            Defaults to "data".

    Returns:
        X_train (np array): train inputs produced by shape_data (windows may be shuffled)
        y_train (np array): train targets matching each row of X_train
        X_test (np array): test inputs produced by shape_data with train=False (order kept)
        y_test (np array): test targets matching each row of X_test

    Raises:
        ValueError: if the channel id cannot be read from anom or either
            file cannot be loaded
    '''
    try:
        chan_file = anom['chan_id'] + ".npy"
        train = np.load(os.path.join(data_dir, "train", chan_file))
        test = np.load(os.path.join(data_dir, "test", chan_file))
    except Exception:
        # Catch Exception rather than using a bare except so KeyboardInterrupt
        # and SystemExit are not disguised as a missing-data error.
        raise ValueError("Source data not found, may need to add data to repo: <link>")

    # shape, split data (only the training windows may be shuffled)
    X_train, y_train = shape_data(train)
    X_test, y_test = shape_data(test, train=False)

    return X_train, y_train, X_test, y_test


def shape_data(arr, train=True, l_s=None, n_predictions=None):
    '''Slice a raw input stream into overlapping windows for ingestion into the LSTM.

    Each window holds l_s consecutive timesteps of model input followed by
    n_predictions timesteps whose first feature (column 0) is the target.

    Args:
        arr (np array): input stream with dimensions [timesteps, input dimensions]
        train (bool): if truthy, the windows are shuffled (training data only)
        l_s (int, optional): number of prior timesteps fed to the model per
            window. Defaults to config.l_s.
        n_predictions (int, optional): number of timesteps to predict after
            each window. Defaults to config.n_predictions.

    Returns:
        X (np array): inputs with dimensions [n_windows, l_s, input dimensions]
        y (np array): targets with dimensions [n_windows, n_predictions]
        where n_windows = timesteps - l_s - n_predictions

    Raises:
        AssertionError: if the windows do not stack into a 3-D array, e.g.
            arr is not 2-D or has too few timesteps for a single window
    '''
    # NOTE(review): this redefines the shape_data from the previous cell and
    # silently shadows it; keep a single definition once the cells are merged.

    # Fall back to the notebook-wide settings only when not given explicitly.
    if l_s is None:
        l_s = config.l_s
    if n_predictions is None:
        n_predictions = config.n_predictions

    window = l_s + n_predictions
    data = np.array([arr[i:i + window] for i in range(len(arr) - window)])

    assert data.ndim == 3, (
        "Expected a 2-D input with more than l_s + n_predictions (%d) timesteps; "
        "stacked windows have shape %s" % (window, data.shape))

    # Shuffle whole windows before splitting so X and y rows stay aligned.
    if train:
        np.random.shuffle(data)

    # Split on l_s rather than -n_predictions so n_predictions == 0 is handled too.
    X = data[:, :l_s, :]
    y = data[:, l_s:, 0]  # telemetry value is at position 0

    return X, y
    