<a href="https://colab.research.google.com/github/ubermen/anomaly_detector/blob/master/log_quality_anomaly_detection_20181108.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Remove any previous checkout, then clone the project repository afresh.
!rm -rf anomaly_detector
!git clone https://github.com/ubermen/anomaly_detector.git
# Work from inside the clone; presumably this is what makes the project
# modules imported below resolvable -- confirm if the layout changes.
%cd anomaly_detector
!ls
# Project-local modules (expected to live in the cloned repository).
from preprocessor import Preprocessor
from testor import TestUtil
from vae_on_cnn import VariationalAutoencoder
from latent_gen_2d import LatentGen2D

import tensorflow as tf
# Short alias for the distributions module; tf.contrib only exists in TF 1.x.
tfd = tf.contrib.distributions
# Colab user authentication (presumably needed for the BigQuery reads in later cells).
from google.colab import auth
auth.authenticate_user()

In [0]:
# Data preprocessing: fix the model dimensions and load the test set.
# These three sizes are reused by the model definition and later cells.
sequence_length, encoding_size, code_size = 20, 128, 2

preprocess = Preprocessor(sequence_length, encoding_size)

# Pull the shuffled test table from BigQuery (result is echoed via print_result).
test_data = preprocess.extract_from_bigquery(
    'test',
    'bi-service-155107',
    'bigpi_test',
    'log_anomaly_globalsignin_devicemodel_20181004_test_shuffled',
    print_result=True)

In [0]:
# Define the model
# Builds the VAE graph: a placeholder for input batches, the prior, the
# encoder's posterior for that input, and one latent sample drawn from it.
vae = VariationalAutoencoder(sequence_length, encoding_size, code_size, kernel=(sequence_length,10))
# Input batches: any batch size, each item of shape (sequence_length, encoding_size).
data = tf.placeholder(tf.float32, [None, sequence_length, encoding_size])
prior = vae.make_prior()
posterior = vae.make_encoder(data)
code = posterior.sample()

# Define the loss.
# ELBO = mean(log p(data | code) - KL(posterior || prior)); training minimizes -ELBO.
decoder = vae.make_decoder(code)
likelihood = decoder.log_prob(data)
divergence = tfd.kl_divergence(posterior, prior)
elbo = tf.reduce_mean(likelihood - divergence)
optimize = tf.train.AdamOptimizer(learning_rate=0.001).minimize(-elbo)
# Anomaly score is the negated decoder log-probability of the input,
# so a lower log-probability gives a higher score.
anomaly_score = -likelihood

# checkpoint
# NOTE: despite the name, this value is passed directly as the save/restore
# path to the Saver in later cells (i.e. it acts as a checkpoint path prefix).
checkpoint_dir = './training_checkpoints'
# NOTE(review): the Saver is created here, after the optimizer; presumably it
# captures the variables that exist at this point -- confirm against TF docs.
saver = tf.train.Saver()

# Helper that evaluates anomaly_score for inputs fed through the data placeholder.
test_util = TestUtil(anomaly_score, data)

In [0]:
# Initialize all graph variables and write them out as the starting checkpoint,
# so that later cells can begin by restoring from checkpoint_dir.
with tf.Session() as sess:
  init_op = tf.global_variables_initializer()
  sess.run(init_op)
  save_path = saver.save(sess, checkpoint_dir)

In [0]:
# Train the VAE.
# Each epoch: score a small window of the test set, then stream the training
# table from BigQuery in chunks of extract_batch_size rows and run one
# optimizer step per mini-batch of learning_batch_size rows. The checkpoint
# is rewritten at the end of every epoch.
total_epochs = 1
extract_batch_size = 100000
learning_batch_size = 1000

with tf.Session() as sess:
  # Not read anywhere in this cell; kept in case other cells rely on them.
  is_training = True
  abnormal_samples = None
  saver.restore(sess, checkpoint_dir)
  for epoch in range(total_epochs):
    # Evaluate on a 100-row window of the test data that advances each epoch
    # (the start offset wraps around via the modulo).
    s = (epoch*100)%len(test_data)
    e = s + 100
    test_util.test('e_{:04d}'.format(epoch), sess, test_data[s:e])

    i = 0
    while True :
      e_start = i
      # NOTE(review): the end index is start + size - 1, so the extractor
      # presumably treats the range as inclusive -- confirm in Preprocessor.
      e_end = e_start + extract_batch_size - 1
      i = i + extract_batch_size
      train_batch_data = preprocess.extract_batch_from_bigquery('extract_batch', 'bi-service-155107', 'bigpi_test', 'log_anomaly_globalsignin_devicemodel_20181004_real', e_start, e_end)
      data_count = train_batch_data.shape[0]
      # An empty extract means the training table is exhausted for this epoch.
      if data_count == 0 : break
      # Python slices exclude the end index, so the end must be
      # start + batch size. The previous "- 1" dropped the last row of every
      # mini-batch, and the final row of each extract was never trained on.
      for l_start in range(0, data_count, learning_batch_size):
        l_end = l_start + learning_batch_size
        feed = {data: train_batch_data[l_start:l_end]}
        sess.run(optimize, feed)
    print('finish epoch',epoch)
    save_path = saver.save(sess, checkpoint_dir)

In [0]:
# Evaluate the restored model on the first 1000 rows of the test set.
with tf.Session() as sess:
  saver.restore(sess, checkpoint_dir)
  test_subset = test_data[:1000]
  test_util.test('test', sess, test_subset)

In [0]:
# Generation: decode latent codes sampled from the prior, score each decoded
# sample, and print (ascii, score) pairs ordered by ascending score.
gen_count = 10000
gen_codes = vae.make_decoder(prior.sample(gen_count)).mean()

with tf.Session() as sess:
  saver.restore(sess, checkpoint_dir)
  # Maps each decoded string to its score; a repeated string keeps the
  # score of its last occurrence.
  result = {}
  for _ in range(1):
    samples = sess.run(gen_codes)
    ascii_codes_list = test_util.convert_onehot_to_ascii(sess, samples)
    print(ascii_codes_list)
    score = test_util.get_score_list(sess, samples)
    result.update(
        (ascii_code, score[idx])
        for idx, ascii_code in enumerate(ascii_codes_list))
  sorted_by_value = sorted(result.items(), key=lambda pair: pair[1])
  print(sorted_by_value)

In [0]:
# Generation from manually created codes: decode latent codes supplied by
# LatentGen2D for every (horizontal, vertical) offset pair and print the
# decoded strings row by row.
manual_codes = tf.placeholder(tf.float32, [None, code_size])
gen_codes_manual = vae.make_decoder(manual_codes).mean()

with tf.Session() as sess:
  saver.restore(sess, checkpoint_dir)
  # Offsets -3..3 inclusive, used on both axes.
  offset_array = list(range(-3, 4))
  for h_offset in offset_array:
    print('------------------------------------------------')
    for v_offset in offset_array:
      latent_gen = LatentGen2D(horizontal_offset=h_offset, vertical_offset=v_offset)
      feed = {manual_codes: latent_gen.codes}
      samples = sess.run(gen_codes_manual, feed)
      ascii_codes_list = test_util.convert_onehot_to_ascii(sess, samples)
      latent_gen.print_row_by_row(ascii_codes_list, latent_gen.width)

In [0]:
# Generation from char sequences: one-hot encode a few strings, draw a
# latent code for each from the encoder's posterior, decode those codes,
# and pass the decoded samples to test_util.test.
charseq = ['iPad6,4', 'iPhone','Samsung']
onehot = preprocess.convert_str_array_to_onehot_ndarray('gen', charseq)
manual_codes = tf.placeholder(tf.float32, [None, code_size])
gen_codes_manual = vae.make_decoder(manual_codes).mean()
with tf.Session() as sess:
  saver.restore(sess, checkpoint_dir)
  # Fetching the tensor directly returns the same array as fetching a
  # one-element list and taking its first item.
  codes = sess.run(code, {data: onehot})
  samples = sess.run(gen_codes_manual, {manual_codes: codes})
  test_util.test('generate', sess, samples)