In [10]:
# coding: utf-8
import argparse
import sys

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM , SimpleRNN
from keras.layers import Dropout
from keras.layers import TimeDistributed
from keras.layers import Flatten, Reshape
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.utils.training_utils import multi_gpu_model
import tensorflow as tf
import numpy as np
from utilities.config_handler import get_config
from utilities.learning import  get_clipped_loss, split_train_validation, \
    train_model, predict_error_vectors, reshape_errors, train_gmm
from utilities.preprocessing import series_to_supervised, trim_by_seq_length, reshape_to_seq, \
    get_X_and_Y_columns,persist_object,load_object, load_test_data, load_train_data, scale_train_vectors, \
    scale_test_vectors
from utilities.detection import compute_emd_split_samples,compute_emd_distributions, detect_anomalies_median

# Argument parsing

In [11]:
# Emulate a command line inside the notebook. The list has no program-name
# element; the argument-parsing cell hands the whole of sys.argv to the parser.
_notebook_cli = '-m train -n iq_data/gps_new/norm --weights-save-path model/rnn_model.hdf5'
sys.argv = _notebook_cli.split()

In [12]:
# Command-line interface: selects train/test mode and the data / weight
# locations that each mode needs.
parser = argparse.ArgumentParser(
    prog='Spectrum Anomaly Detection',
    description='Use this command parser for training or testing the anomaly detector',
)
# The mode is mandatory: every per-mode check below compares against it, so a
# missing -m would skip all validation and fall through to the test path with
# no paths set.
parser.add_argument('-m', '--mode', help='train or test mode', choices=['train', 'test'], required=True)
parser.add_argument('-n', '--normal-data-dir', help='normal I/Q recording directory (for train mode)')
parser.add_argument('-a', '--anomaly-data-dir', help='anomaly I/Q recording directory (for test mode)')
parser.add_argument('-s', '--weights-save-path', help='path for trained weights (for train mode)')
parser.add_argument('-l', '--weights-load-path', help='path for loading weights (for test mode)')

# The complete sys.argv list is parsed on purpose: the preceding cell replaces
# sys.argv with a list that has no program-name element to strip.
namespace = parser.parse_args(sys.argv)

# Options that must accompany each mode, checked in order; parser.error()
# prints the message and exits on the first missing one.
_mode_requirements = (
    ('train', namespace.normal_data_dir, 'the -n arg must be present when mode is train'),
    ('train', namespace.weights_save_path, 'the -s arg must be present when mode is train'),
    ('test', namespace.anomaly_data_dir, 'the -a arg must be present when mode is test'),
    ('test', namespace.weights_load_path, 'the -l arg must be present when mode is test'),
)
for _mode, _value, _message in _mode_requirements:
    if namespace.mode == _mode and not _value:
        parser.error(_message)

# Hyper parameters

In [14]:
# Settings read from the project configuration.
conf = get_config()
rnn_conf = conf['learning']['rnn']

gpus = conf['gpus']
seq_input_length = rnn_conf['seq_input_length']
seq_output_length = rnn_conf['seq_output_length']
output_padding = rnn_conf['output_padding']
input_padding = rnn_conf['input_padding']
lr = conf['learning']['lr']

# Values taken from the parsed command line.
normal_data_dir = namespace.normal_data_dir
anomaly_data_dir = namespace.anomaly_data_dir
train = namespace.mode == 'train'

# Derived values: the padded length is input + output length, and padding is
# only switched on when the two lengths differ.
seq_pad_length = seq_input_length + seq_output_length
use_padding = seq_input_length != seq_output_length

opt = Adam(lr=lr)

In [15]:
# Sanity-check whichever data directories were supplied.
# Note: for a str value the truthiness guard already implies a non-zero
# length, so these assertions cannot fail as written.
for _data_dir in (normal_data_dir, anomaly_data_dir):
    if _data_dir:
        assert len(_data_dir) != 0

# Loading, whitening, scaling

In [16]:
# Load the recordings needed for the selected mode (the section heading
# describes this step as loading, whitening and scaling).
if train:
    train_data = load_train_data(normal_data_dir)
    # The following cells process test_data in train mode whenever an anomaly
    # directory was given, so it must be loaded here too; otherwise they fail
    # with a NameError.
    if anomaly_data_dir:
        test_data = load_test_data(anomaly_data_dir)
else:
    test_data = load_test_data(anomaly_data_dir)

saving to:model/lstm/train_scaler.pkl


In [18]:
# Inspect the dimensions of the loaded training data (train_data is only
# assigned in train mode).
train_data.shape

(2095104, 2)

In [21]:
# Peek at the column with index 1 of the loaded training data.
train_data[:,1]

array([ 0.16666669, -0.14285713,  0.4761905 , ...,  0.3214286 ,
       -0.09523809,  0.26190478], dtype=float32)

# Create the output sequences

In [None]:
# Apply series_to_supervised (n_in=seq_input_length, n_out=seq_output_length)
# to every dataset in use.
if train:
    train_data = series_to_supervised(train_data, n_in=seq_input_length, n_out=seq_output_length)
# Test data is handled in test mode, and in train mode when an anomaly
# directory was supplied.
if anomaly_data_dir or not train:
    test_data = series_to_supervised(test_data, n_in=seq_input_length, n_out=seq_output_length)

# Trim the data to fit the input sequence length (`seq_input_length`)

In [None]:
# Trim every dataset in use with trim_by_seq_length, using seq_input_length.
if train:
    train_data = trim_by_seq_length(train_data, seq_input_length)
# Test data is handled in test mode, and in train mode when an anomaly
# directory was supplied.
if anomaly_data_dir or not train:
    test_data = trim_by_seq_length(test_data, seq_input_length)

In [None]:
# Split every dataset in use into its X and Y parts via get_X_and_Y_columns.
if train:
    X_train, Y_train = get_X_and_Y_columns(train_data)
# Test data is handled in test mode, and in train mode when an anomaly
# directory was supplied.
if anomaly_data_dir or not train:
    X_test, Y_test = get_X_and_Y_columns(test_data)

In [None]:
# Reshape inputs with the input sequence length and targets with the output
# sequence length, for every dataset in use.
if train:
    X_train = reshape_to_seq(X_train, seq_input_length)
    Y_train = reshape_to_seq(Y_train, seq_output_length)
# Test data is handled in test mode, and in train mode when an anomaly
# directory was supplied.
if anomaly_data_dir or not train:
    X_test = reshape_to_seq(X_test, seq_input_length)
    Y_test = reshape_to_seq(Y_test, seq_output_length)

# Pad input/output sequences

In [None]:
# When padding is enabled, pad inputs and targets to seq_pad_length; the
# padding side of each comes from the configuration.
if use_padding:
    pad_kwargs = dict(maxlen=seq_pad_length, dtype='float64')
    if train:
        X_train = pad_sequences(X_train, padding=input_padding, **pad_kwargs)
        Y_train = pad_sequences(Y_train, padding=output_padding, **pad_kwargs)
    # Test data is handled in test mode, and in train mode when an anomaly
    # directory was supplied.
    if anomaly_data_dir or not train:
        X_test = pad_sequences(X_test, padding=input_padding, **pad_kwargs)
        Y_test = pad_sequences(Y_test, padding=output_padding, **pad_kwargs)

In [None]:
# Shape of the model input array: taken from the training inputs in train
# mode and from the test inputs otherwise. The model builders read indices 1 and 2.
inp_shape = X_train.shape if train else X_test.shape

# Model and loss definition

In [None]:
def get_vannila_rnn_model(loss_fn):
    """Build the uncompiled two-layer SimpleRNN sequence model.

    Reads the notebook globals ``seq_pad_length``, ``seq_output_length``,
    ``use_padding`` and ``inp_shape``.

    Args:
        loss_fn: not used by this function.

    Returns:
        A Keras ``Sequential`` model whose output is reshaped to ``(steps, 2)``,
        where ``steps`` is ``seq_pad_length`` when padding is enabled and
        ``seq_output_length`` otherwise.
    """
    # The dense head is sized by the padded length when padding is on,
    # otherwise by the plain output length.
    steps = seq_pad_length if use_padding else seq_output_length

    model = Sequential()
    model.add(SimpleRNN(seq_pad_length,
                        input_shape=(inp_shape[1], inp_shape[2]),
                        return_sequences=True,
                        name='rnn1'))
    model.add(SimpleRNN(seq_pad_length, return_sequences=True, name='rnn2'))
    # No Dropout layers are active in this variant.
    model.add(TimeDistributed(Dense(units=2 * steps, activation='relu'), name='dense1'))
    model.add(Flatten())
    model.add(Dense(units=2 * steps, activation='linear', name='dense2'))
    model.add(Reshape((steps, 2,)))
    return model


def get_lstm_model(loss_fn):
    """Build the uncompiled two-layer LSTM sequence model.

    Reads the notebook globals ``seq_pad_length``, ``seq_output_length``,
    ``use_padding`` and ``inp_shape``.

    Args:
        loss_fn: not used by this function.

    Returns:
        A Keras ``Sequential`` model whose output is reshaped to ``(steps, 2)``,
        where ``steps`` is ``seq_pad_length`` when padding is enabled and
        ``seq_output_length`` otherwise.
    """
    if use_padding:
        steps = seq_pad_length
        # NOTE(review): the padded variant uses a 3x-wide 'dense1' while every
        # other variant uses 2x -- confirm this is intentional.
        dense1_units = 3 * steps
    else:
        steps = seq_output_length
        dense1_units = 2 * steps

    model = Sequential()
    model.add(LSTM(seq_pad_length,
                   input_shape=(inp_shape[1], inp_shape[2]),
                   return_sequences=True,
                   name='lstm1'))
    model.add(LSTM(seq_pad_length, return_sequences=True, name='lstm2'))
    # No Dropout layers are active in this variant.
    model.add(TimeDistributed(Dense(units=dense1_units, activation='relu'), name='dense1'))
    model.add(Flatten())
    model.add(Dense(units=2 * steps, activation='linear', name='dense2'))
    model.add(Reshape((steps, 2,)))
    return model

def get_lstm_dense_model(loss_fn):
    """Build the uncompiled LSTM + dense model with Dropout(0.5) between stages.

    Reads the notebook globals ``seq_input_length``, ``seq_output_length``,
    ``seq_pad_length`` and ``inp_shape``.

    Args:
        loss_fn: not used by this function.

    Returns:
        A Keras ``Sequential`` model whose output is reshaped to
        ``(seq_output_length, 2)``.
    """
    # Layers are listed (and therefore created) in the order they are stacked.
    stack = (
        LSTM(seq_input_length, input_shape=(inp_shape[1], inp_shape[2]),
             return_sequences=True, name='lstm1'),
        Dropout(0.5),
        LSTM(seq_input_length, return_sequences=True, name='lstm2'),
        Dropout(0.5),
        TimeDistributed(Dense(units=seq_pad_length, activation='relu'), name='dense1'),
        Dropout(0.5),
        TimeDistributed(Dense(units=12, activation='tanh'), name='dense2'),
        Dropout(0.5),
        Flatten(),
        Dense(units=seq_output_length * 2, name='dense3', activation='sigmoid'),
        Reshape((seq_output_length, 2,)),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

# Model training

In [None]:
# With padding the clipped loss is used; otherwise plain mean squared error.
loss_fn = get_clipped_loss() if use_padding else 'mse'

if gpus > 1:
    # Multi-GPU: build the template model under the CPU device scope, then
    # wrap it with multi_gpu_model. Only the wrapper is compiled.
    with tf.device("/cpu:0"):
        model = get_vannila_rnn_model(loss_fn)
        model.summary()
    model_multi = multi_gpu_model(model, gpus=gpus)
    model_multi.compile(loss=loss_fn, optimizer=opt)
else:
    # Single device: build, summarise and compile the model directly.
    model = get_vannila_rnn_model(loss_fn)
    model.summary()
    model.compile(optimizer=opt, loss=loss_fn)


In [None]:
if not train:
    # Test mode: restore previously trained weights into the model.
    weights_load_path = namespace.weights_load_path
    model.load_weights(weights_load_path)
else:
    # Train mode: split off a validation set, fit, then save.
    weights_save_path = namespace.weights_save_path
    X_train, Y_train, X_val, Y_val = split_train_validation(X_train, Y_train)
    # The multi-GPU wrapper is the one that gets trained when several GPUs are
    # configured; saving always goes through `model`.
    train_model(model if gpus <= 1 else model_multi, X_train, Y_train, X_val, Y_val)
    model.save(weights_save_path)

In [None]:
# Compute prediction-error vectors for every dataset in use.
# Predictions go through the model object that was compiled for the configured
# number of GPUs.
predictor = model if gpus <= 1 else model_multi

if train:
    train_errors = predict_error_vectors(X_train, Y_train, predictor)
    val_errors = predict_error_vectors(X_val, Y_val, predictor)

    train_errors = reshape_errors(train_errors)
    val_errors = reshape_errors(val_errors)

# Test errors are needed in test mode and also in train mode when an anomaly
# directory was given: the error-scaling cell transforms test_errors in that
# case, and previously hit a NameError because they were never computed.
if anomaly_data_dir or not train:
    test_errors = predict_error_vectors(X_test, Y_test, predictor)
    test_errors = reshape_errors(test_errors)


# Scale errors

In [None]:
# Locations of the persisted error scaler and scaled error vectors.
error_scaler_path = 'model/lstm/error_scaler.pkl'
train_errors_path = 'model/lstm/train_errors.pkl'
val_errors_path = 'model/lstm/val_errors.pkl'

if not train:
    # Test mode: reuse the scaled training errors saved by a previous training
    # run and scale the test errors via the scaler path.
    scaled_train_errors = load_object(train_errors_path)
    scaled_test_errors = scale_test_vectors(test_errors, error_scaler_path)
else:
    # Train mode: scale the training errors (which also yields the scaler),
    # then apply that scaler to the validation errors; persist both results.
    scaled_train_errors, error_scaler = scale_train_vectors(train_errors, error_scaler_path)
    persist_object(scaled_train_errors, train_errors_path)

    scaled_val_errors = error_scaler.transform(val_errors)
    persist_object(scaled_val_errors, val_errors_path)

    if anomaly_data_dir:
        scaled_test_errors = error_scaler.transform(test_errors)


# Error density estimation - GMM

In [None]:
# Train the GMM on the scaled training errors in train mode; otherwise load
# the object stored at the same path.
gmm_save_path = 'model/lstm/gmm.pkl'
gmm = train_gmm(gmm_save_path, scaled_train_errors) if train else load_object(gmm_save_path)

In [None]:
# Score the scaled error vectors with the GMM for every dataset in use.
train_scores_path = 'model/lstm/train_scores.pkl'
val_scores_path = 'model/lstm/val_scores.pkl'
if train:
    # Train mode: score and persist the train and validation sets.
    train_scores = gmm.score_samples(scaled_train_errors)
    persist_object(train_scores, train_scores_path)

    val_scores = gmm.score_samples(scaled_val_errors)
    persist_object(val_scores, val_scores_path)
    if anomaly_data_dir:
        test_scores = gmm.score_samples(scaled_test_errors)
else:
    # Test mode: score the test set and reuse the persisted training scores.
    test_scores = gmm.score_samples(scaled_test_errors)
    try:
        train_scores = load_object(train_scores_path)
    except Exception:
        # Only ordinary failures are translated into the hint below; a bare
        # `except:` would also intercept KeyboardInterrupt and SystemExit.
        raise Exception('No train scores are found, please train to obtain them')

## Anomaly detection phase - EMD

In [None]:
# Dataset-wise comparison, executed in test mode only.
if not train:
    # For now, just compute and print the EMD between the train and test scores.
    emd_dists=compute_emd_distributions(train_scores,test_scores)
    print("Overall distributions EMD:", emd_dists)

In [None]:
# Per-split EMD values against the training scores.
val_emds_path = 'model/lstm/val_emds.pkl'
if train:
    # Train mode: compute and persist the validation EMDs.
    val_emds = compute_emd_split_samples(val_scores, train_scores)
    persist_object(val_emds, val_emds_path)
else:
    # Test mode: reuse the validation EMDs saved by a previous training run.
    val_emds = load_object(val_emds_path)
# Test EMDs are computed in test mode, and in train mode when an anomaly
# directory was supplied.
if anomaly_data_dir or not train:
    test_emds = compute_emd_split_samples(test_scores, train_scores)


# EMD detection experiment

In [None]:
# Test mode only: run the median-based detection on the validation and test EMDs.
if not train:
    detect_anomalies_median(val_emds,test_emds)

# Visualizations

In [None]:
# Only the alias `plt` is bound by this import; the bare name `pyplot` is not.
from matplotlib import pyplot as plt
# Number of clusters from the configuration; used for the histogram bins below.
num_clusters = conf['learning']['num_clusters']

In [None]:
# Histograms of the GMM cluster assignments: training errors on the left axis,
# test errors on the right axis (shared y-axis).
# matplotlib.pyplot is imported as `plt`; the bare name `pyplot` is undefined
# and raised a NameError here.
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(20, 8))
ax0, ax1 = ax.flatten()
# NOTE(review): np.arange(num_clusters) gives num_clusters - 1 bins; if labels
# range over 0..num_clusters-1 the two highest share the last bin -- confirm
# whether num_clusters + 1 edges were intended.
cluster_bins = np.arange(num_clusters)
if train:
    ax0.hist(gmm.predict(scaled_train_errors), bins=cluster_bins)
# Test errors exist in test mode, and in train mode when an anomaly directory
# was supplied.
if anomaly_data_dir or not train:
    ax1.hist(gmm.predict(scaled_test_errors), bins=cluster_bins)
plt.show()

In [None]:
# Plot the true output, the predicted output and their absolute difference for
# one test sample. Requires X_test / Y_test, i.e. test data must have been processed.
from utilities.learning import predict_one_sample
# matplotlib.pyplot is imported as `plt`; the bare name `pyplot` is undefined
# and raised a NameError here.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
ax0, ax1, ax2 = axes.flatten()

sample_index = 6436  # hard-coded index of the sample to inspect

pred = predict_one_sample(X_test[sample_index], model)

# Only the trailing seq_output_length steps of each sequence are shown.
true_tail = Y_test[sample_index][-seq_output_length:]
pred_tail = pred[-seq_output_length:]

ax0.set_title('true')
ax0.plot(true_tail)

ax1.set_title('pred')
ax1.plot(pred_tail)

ax2.set_title('diff')
ax2.plot(np.abs(pred_tail - true_tail))
plt.show()

In [None]:
# Overlaid histograms of the per-sample sum (axis=1) of the scaled errors:
# training in blue, test in red.
sum_train_errors = np.sum(scaled_train_errors, axis=1)
sum_test_errors = np.sum(scaled_test_errors, axis=1)
# matplotlib.pyplot is imported as `plt`; the bare name `pyplot` is undefined
# and raised a NameError here.
(n1, bins, _) = plt.hist(sum_train_errors, bins=100, color='b', alpha=0.5)
(n2, _, _) = plt.hist(sum_test_errors, bins=100, color='r', alpha=0.5)
# (n3, _ , _) = plt.hist(sum_test_errors_anomaly,bins=[x/100 for x in range(100)],color='y',alpha=0.5)
plt.show()

In [None]:
# Overlaid histograms of the GMM scores: training in blue, test in red.
# matplotlib.pyplot is imported as `plt`; the bare name `pyplot` is undefined
# and raised a NameError here.
(n1, bins, _) = plt.hist(train_scores, bins=200, color='b', alpha=0.5)
(n2, _, _) = plt.hist(test_scores, bins=200, color='r', alpha=0.5)
# (n3, _ , _) = plt.hist(test_scores_falseneg,bins=list(range(-200,100, 5)),color='y',alpha=0.5)
plt.show()

# Experiments

In [None]:
# Keep the first training input around for ad-hoc experiments.
sample = X_train[0]

In [None]:
# Combine index 0 and index 1 of the last axis into one complex array
# (index 0 as the real part, index 1 as the imaginary part).
X_train[:,:,0]+1j*X_train[:,:,1]

In [None]:
from scipy.fftpack import fft

In [7]:
def samples2complex(samples):
    """Combine a two-channel array into a single complex-valued array.

    Args:
        samples: array indexed as ``[:, :, k]``; channel 0 supplies the real
            part and channel 1 the imaginary part.

    Returns:
        The complex array ``samples[:, :, 0] + 1j * samples[:, :, 1]``.
    """
    real_part = samples[:, :, 0]
    imag_part = samples[:, :, 1]
    return real_part + 1j * imag_part

In [8]:
# Shape of the complex view of the training inputs. The recorded output of this
# cell is a NameError: X_train was not defined in the kernel when it was run.
samples2complex(X_train).shape

NameError: name 'X_train' is not defined