In [1]:
import numpy as np
import pandas as pd
import random
import os
from os.path import expanduser
import sys
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_context('poster')

code_base='./'
sys.path.insert(0, code_base)
sys.path.insert(0, code_base+'sampler')
sys.path.insert(0, code_base+'models')

from utility import data_transformation, transform_with_keys, generate_mixed_events
from utility import get_entity_samplers_and_noise_prob, get_entity_type_sampler_and_mappings

from sklearn.metrics import average_precision_score, roc_auc_score
from metrics_ranking import eval_multiple, eval_apk

import keras
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Layer, Input, Dense, Embedding, Flatten, Merge, AveragePooling1D, Merge, Permute, merge
from keras.regularizers import WeightRegularizer, l1, l2, activity_l2, activity_l1

Using TensorFlow backend.


# Data and utility preparations

In [2]:
# data locations
# Demo folder: per the original note it holds only toy data for demonstration
# and is not runnable as-is.
data_folder = 'demo_toy/'
# Build the file paths with os.path.join so the trailing slash of data_folder
# is not doubled (plain concatenation produced 'demo_toy//events.csv').
data_event_file = os.path.join(data_folder, 'events.csv')  # historical events without any label
data_test_file = os.path.join(data_folder, 'test.csv')     # test (future) events with additional label column

In [3]:
# load data
# NOTE(review): `home` is not referenced by any other cell visible in this notebook.
home = expanduser('~')
# Both CSV paths are relative (built from data_folder), so they are resolved
# against the current working directory.
table = pd.read_csv(data_event_file)
table_test = pd.read_csv(data_test_file)

# index the entities in data
# data_transformation comes from utility.py; presumably it replaces entity
# names with integer ids and returns the lookup tables — TODO confirm there.
table_transformed, id2type_and_name, type_and_name2id, type2range = data_transformation(table)
# The last column of the test table is excluded from the key columns here;
# eval_print later reads that last column as the label.
table_transformed_test = transform_with_keys(table_test, table_test.columns[:-1], type_and_name2id)
# drop rows in test with NaN if there are any (imputation can also be used here)
# NOTE(review): NaNs presumably come from test entities missing in type_and_name2id — verify.
table_transformed_test = table_transformed_test.dropna()

# sampler preparation
# The per-entity samplers are requested with neg_dist='unigram', while the
# entity-type sampler is requested with neg_dist='uniform'.
type2sampler, noise_prob = get_entity_samplers_and_noise_prob(table_transformed, noise_prob_cal='logkPn@10', neg_dist='unigram')
type2typeid, typeid2type, entity_type_sample, type_cad_dist = get_entity_type_sampler_and_mappings(table_transformed, neg_dist='uniform')

In [4]:
class DataSpec(object):
    """Size information about the indexed event table.

    Attributes:
        num_entity_type: number of columns in ``table_transformed``.
        num_entity: span of the entity ids, i.e. the largest id minus the
            smallest id plus one, taken over every range in ``type2range``.
    """

    def __init__(self, table_transformed, type2range):
        """Compute the two size attributes.

        Args:
            table_transformed: table exposing a ``.columns`` attribute.
            type2range: mapping whose values are non-empty, re-iterable
                collections of ids (only the values are used).

        Raises:
            ValueError: if ``type2range`` is empty or one of its ranges is empty.
        """
        self.num_entity_type = len(table_transformed.columns)
        # .values() replaces the Python-2-only .iteritems(); the keys were
        # never used, and .values() behaves the same on Python 2 and 3.
        type_ranges = list(type2range.values())
        highest_id = max(max(type_range) for type_range in type_ranges)
        lowest_id = min(min(type_range) for type_range in type_ranges)
        self.num_entity = highest_id - lowest_id + 1
            
# Size summary of the indexed training table; passed to get_model further down.
data_spec = DataSpec(table_transformed, type2range)

# Model specification and training

In [5]:
class Conf(object):
    """Hyper-parameters for the model and the training loop.

    Every setting is a keyword argument whose default is the value the
    notebook originally hard-coded, so ``Conf()`` is unchanged while e.g.
    ``Conf(emb_dim=32)`` allows experiments without editing the class.

    Attributes:
        max_epoch: number of epochs the training loop runs.
        batch_size: number of events per training batch.
        num_negatives: forwarded to generate_mixed_events by the training loop.
        emb_dim: presumably the embedding dimension used by ape.get_model
            — TODO confirm in ape.py.
        loss: loss name; 'skip-gram' by default, 'max-margin' was the
            alternative left commented out in the original cell.
        no_weight: not used in this notebook; presumably read by
            ape.get_model — TODO confirm.
        ignore_noise_dist: not used in this notebook; presumably read by
            ape.get_model — TODO confirm.
    """

    def __init__(self, max_epoch=10, batch_size=512, num_negatives=5, emb_dim=10,
                 loss='skip-gram', no_weight=False, ignore_noise_dist=False):
        self.max_epoch = max_epoch
        self.batch_size = batch_size
        self.num_negatives = num_negatives
        self.emb_dim = emb_dim
        self.loss = loss
        self.no_weight = no_weight
        self.ignore_noise_dist = ignore_noise_dist
        
conf = Conf()

# `reload` is a builtin only on Python 2; on Python 3.4+ it lives in importlib.
try:
    from importlib import reload
except ImportError:
    pass  # Python 2: importlib has no reload, the builtin is used instead

# Re-import ape each time this cell runs so that get_model reflects the current
# contents of the module, then build the model from the settings above.
import ape
reload(ape)
from ape import get_model
model = get_model(conf, data_spec)

In [6]:
# main training loop: mini-batch training on events mixed with sampled noise events

abandon_uneven_batch = True  # skip a trailing batch that holds fewer than batch_size events
batch_size = conf.batch_size
num_negatives = conf.num_negatives
events = np.array(table_transformed)
num_iters = np.ceil(events.shape[0] / float(batch_size)).astype(int)
if abandon_uneven_batch and events.shape[0] < batch_size:
    # Every batch would be skipped below, so each epoch would report cost 0
    # without training anything; make that visible instead of silent.
    print('[WARN] only %d events for batch_size %d: no batch will be trained'
          % (events.shape[0], batch_size))
for epoch in range(1, conf.max_epoch + 1):
    np.random.shuffle(events)  # in-place reshuffle, so batches differ between epochs
    cost = 0
    # one sampled entity-type id per batch of this epoch
    entity_type_assigns = entity_type_sample(num_iters)
    for it in range(num_iters):
        neg_entity_typeid = entity_type_assigns[it]
        events_batch = events[it * batch_size: (it + 1) * batch_size]
        if abandon_uneven_batch and events_batch.shape[0] != batch_size:
            continue
        events_batch_mixed, events_noise_prob, events_label = \
            generate_mixed_events(events_batch, neg_entity_typeid, num_negatives, type2sampler, typeid2type, noise_prob)
        # cost accumulates the per-batch value returned by train_on_batch (a sum, not a mean)
        cost += model.train_on_batch([events_batch_mixed, events_noise_prob], events_label)
    # root-mean-square of the first weight array returned by model.get_weights()
    weight_rms = np.sqrt(np.mean(model.get_weights()[0]**2))
    # A single pre-formatted string prints the same text on Python 2 and 3
    # (the old multi-argument print statement is Python-2-only syntax).
    print('[INFO] epoch %d, cost: %f norm %s' % (epoch, cost, weight_rms))

[INFO] epoch 1, cost: 0.340711 norm 0.246857
[INFO] epoch 2, cost: 0.190969 norm 0.348757
[INFO] epoch 3, cost: 0.167266 norm 0.423766
[INFO] epoch 4, cost: 0.164935 norm 0.464869
[INFO] epoch 5, cost: 0.164203 norm 0.501307
[INFO] epoch 6, cost: 0.151317 norm 0.532868
[INFO] epoch 7, cost: 0.158085 norm 0.552177
[INFO] epoch 8, cost: 0.154772 norm 0.577105
[INFO] epoch 9, cost: 0.138678 norm 0.600417
[INFO] epoch 10, cost: 0.152146 norm 0.614172


# Evaluation

In [7]:
# evaluation: anomaly detection quality on labelled events
def eval_print(table_transformed_w_label):
    """Print AP and AUC of the model's scores on labelled events.

    Args:
        table_transformed_w_label: 2-D table (anything ``np.array`` accepts
            that also exposes ``.shape``) whose last column is the label and
            whose remaining columns are fed to the model.

    Label handling: labels > 0 are clipped to 1 and then flipped
    (1 -> 0, 0 -> 1), so rows with label 0 form the positive class of both
    metrics. The score is the negated first output column of the
    module-level ``model`` (presumably because the model scores normal events
    higher — TODO confirm against ape.py).

    Prints ``truth_mean``, ``pred_mean``, ``AP`` and ``AUC``; returns None.
    The caller's table is not modified.
    """
    data = np.array(table_transformed_w_label)  # fresh copy, converted once for labels and inputs
    truth = data[:, -1]
    truth[truth > 0] = 1
    truth = truth * -1 + 1
    print('truth_mean %s' % (np.mean(truth),))

    # The second model input is an all-zero vector with one entry per row.
    model_inputs = [data[:, :-1], np.zeros(table_transformed_w_label.shape[0])]
    pred = -model.predict(model_inputs, batch_size=1024)[:, 0]
    # Tiny random jitter on the scores; presumably there to break ties between
    # identical scores — TODO confirm. It makes the metrics slightly non-deterministic.
    pred += np.random.random(pred.shape) * 1e-4
    # The unused shuffled copy of pred (a random baseline referenced only by
    # commented-out code) was removed.
    print('pred_mean %s' % (np.mean(pred),))
    print('AP %s' % (average_precision_score(truth, pred),))
    print('AUC %s' % (roc_auc_score(truth, pred),))

eval_print(table_transformed_test)

truth_mean 0.450419210033
pred_mean 5.68575
AP 0.936628846843
AUC 0.925326399191
