# In [3]:
import sys
sys.path.insert(0, '..')
sys.path.insert(0, 'plots')

import os

from IPython.display import display
from IPython.display import clear_output
import ipywidgets as widgets


import json

from core.Tokenizer import Tokenizer
from core.TFIDFVectorizer import TFIDFVectorizer
from sources.SplunkFileSource import SplunkFileSource
from core.KmeansCluster import KmeansCluster
from core.KmeansAnomalyDetector import KmeansAnomalyDetector
from core.IsolationForestClassifier import IsolationForestClassifier

import plotly as py
import pandas as pd
import numpy as np
import plot as plot

# Enable plotly's offline notebook rendering and HTML rendering of DataFrames.
py.offline.init_notebook_mode()
pd.set_option('display.notebook_repr_html', True)

# File names of the *.json data sources found by create_widgets().
data_sources_names = []

# Widget handles. They start out as None and are assigned by create_widgets();
# the change handlers below read them as module globals.
data_source_picker = None
time_range_picker = None
event_picker = None
event_text_area = None 
threshold_picker = None
new_time_range_picker = None

#create widgets
def create_widgets():
    """Build the input widgets and publish them through the module globals.

    Scans ``../../data`` for ``*.json`` files to populate the data-source
    dropdown, then creates the control/test time-range sliders, the
    threshold slider, the event dropdown and the event text area.

    Raises:
        OSError: if ``../../data`` cannot be listed.
    """
    global data_source_picker, time_range_picker, event_picker
    global event_text_area, threshold_picker, new_time_range_picker

    # Refill the shared list in place so calling this function again does not
    # duplicate entries; sort because os.listdir() returns names in arbitrary
    # order, which would make the default selection nondeterministic.
    data_sources_names[:] = sorted(
        name for name in os.listdir("../../data") if name.endswith(".json")
    )

    data_source_picker = widgets.Dropdown(
        options=data_sources_names,
        description='Data Source:',
        disabled=False,
        button_style=''  # 'success', 'info', 'warning', 'danger' or ''
    )

    # Both time-range sliders are identical apart from their description.
    # NOTE(review): slider_color/color and readout_format='i' look like
    # options of an older ipywidgets release - confirm against the installed
    # version before changing them.
    range_slider_options = {
        'value': [1, 2],
        'min': 0,
        'max': 100,
        'step': 1,
        'disabled': False,
        'continuous_update': False,
        'orientation': 'horizontal',
        'readout': True,
        'readout_format': 'i',
        'slider_color': 'white',
        'color': 'black',
    }
    time_range_picker = widgets.IntRangeSlider(
        description='Control Time:', **range_slider_options
    )
    new_time_range_picker = widgets.IntRangeSlider(
        description='Test Time:', **range_slider_options
    )

    threshold_picker = widgets.FloatSlider(
        value=0.5,
        min=0,
        max=1,
        step=0.1,
        description='Threshold:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='.1f',
        slider_color='white',
        color='black'
    )

    # Placeholder options; time_slider_change_handler() replaces them with
    # the indices of the currently usable events.
    event_picker = widgets.Dropdown(
        options=[1, 2, 3],
        description='Event Number:',
        disabled=False,
        button_style=''  # 'success', 'info', 'warning', 'danger' or ''
    )

    text_area_layout = widgets.Layout(height='40px', width='800px')
    event_text_area = widgets.Textarea(value='TA: height=40px', layout=text_area_layout)

# Event stores shared by the handlers below. They are cleared and refilled in
# place (never rebound), so every function keeps seeing the same list objects.
all_events = []  # every event loaded from the selected data source
usable_events = []  # events that fall inside the control and/or test time range
usable_control_events = []  # rows of [_raw, index into usable_events, cluster_count]
usable_test_events = []  # same row layout, for the test time range

def event_picker_change_handler(change):
    """Show the event selected in ``event_picker`` as JSON in the text area.

    Args:
        change: change notification from the widget observer. It is not
            inspected; the handler is also invoked manually with a
            placeholder argument.
    """
    selected = event_picker.value
    # Guard against an empty selection or an index that does not (yet) exist
    # in usable_events, e.g. when no event falls inside the chosen ranges.
    # Previously this raised TypeError/IndexError instead of clearing the box.
    if selected is None or not 0 <= selected < len(usable_events):
        event_text_area.value = ''
        return
    event_text_area.value = json.dumps(usable_events[selected])
    
# Handle data source change
def data_source_change_handler(change):
    """Load the selected data source and reset both time-range sliders.

    Reloads ``all_events`` from the file chosen in ``data_source_picker``,
    counts the minutes it covers, resizes the control and test sliders to
    that span and finally re-runs ``time_slider_change_handler``.

    Args:
        change: change notification from the widget observer; not inspected.
    """
    # Replace the contents in place: the other handlers read this same list.
    all_events[:] = SplunkFileSource.load_data('../../data/' + data_source_picker.value)

    # Each event whose cluster_label is '1' is counted as one more minute
    # (time_slider_change_handler advances its minute counter the same way).
    minutes = sum(1 for event in all_events if event.get('cluster_label') == '1')

    # Keep the upper bound at or above the lower bound of 1 so a source
    # without any minute marker cannot produce max < min.
    upper = max(minutes, 1)

    # Set the bounds BEFORE the values. Assigning the value first let the
    # slider clamp it to the previous bounds, so [minutes - 2, minutes] was
    # cut off at the old maximum (100 on first load).
    # NOTE(review): every assignment that changes a slider's value may fire
    # time_slider_change_handler through the observers registered at module
    # level, so clustering can run more than once per data-source change.
    for picker in (time_range_picker, new_time_range_picker):
        picker.max = upper
        picker.min = 1

    time_range_picker.value = [1, 2]
    new_time_range_picker.value = [minutes - 2, minutes]

    time_slider_change_handler(' ')
    
def time_slider_change_handler(change):
    """Rebuild the usable event lists for the selected ranges and re-cluster.

    Walks ``all_events`` while counting minutes (an event whose
    ``cluster_label`` is ``'1'`` starts the next minute) and collects the
    events inside the control range and inside the test range. An event that
    lies in both ranges is stored twice in ``usable_events``, once per range,
    so each row keeps its own index.

    Args:
        change: change notification from the widget observer; not inspected.
    """
    clear_output()
    del usable_events[:]
    del usable_control_events[:]
    del usable_test_events[:]

    control_start, control_end = time_range_picker.value
    test_start, test_end = new_time_range_picker.value

    minute = 0
    for event in all_events:
        if event.get('cluster_label') == '1':
            minute += 1
        if control_start <= minute <= control_end:
            usable_events.append(event)
            usable_control_events.append(
                [event.get('_raw'), len(usable_events) - 1, event.get('cluster_count')])
        # The upper bound used to be compared against value[0] as well, which
        # shrank the test window to its first minute only.
        if test_start <= minute <= test_end:
            usable_events.append(event)
            usable_test_events.append(
                [event.get('_raw'), len(usable_events) - 1, event.get('cluster_count')])

    event_picker.options = list(range(len(usable_events)))
    cluster()
    
def plot_cluster(dist, km, features):
    """Scatter-plot ``dist`` grouped by cluster, with one tooltip per cluster.

    Each tooltip lists the cluster number followed by the feature names
    looked up from the first 15 entries of that cluster's centroid row.

    Args:
        dist: distance matrix handed straight to the plotting helper.
        km: clustering object providing ``get_centriods()`` and
            ``get_clusters()``.
        features: sequence mapping a feature index to its name.
    """
    centroid_rows = km.get_centriods()
    assignments = km.get_clusters()
    cluster_total = len(set(assignments))

    tooltips = [
        'cluster = ' + str(cluster_no) + '<br>'
        + ''.join(features[term] + '<br>' for term in centroid_rows[cluster_no, :15])
        for cluster_no in range(cluster_total)
    ]
    plot.scatter_plot_groups(dist, assignments, tooltips)

    
def split(input, length, size):
    """Flatten ``input`` and break it into ``<br>``-separated chunks.

    Newlines and ``'\\tat'`` sequences are replaced by a single space, then
    the text is cut into pieces of ``size`` characters.

    Args:
        input: text to format.
        length: chunks start at offsets below this value; it is clamped to
            the length of the cleaned text. The final chunk may extend past
            ``length`` when ``length`` is not a multiple of ``size``.
        size: number of characters per chunk.

    Returns:
        The chunks joined with ``'<br>'``.
    """
    # str.replace returns a new string; the results used to be discarded, so
    # the text was never actually cleaned.
    text = input.replace('\n', ' ').replace('\tat', ' ')
    # Cleaning can shorten the text; clamp so no empty trailing chunks appear.
    usable = min(length, len(text))
    return '<br>'.join(text[start:start + size] for start in range(0, usable, size))

def create_combined_scatter_plot(control_events, test_events, labels, clusters, plot_3d=False):
    """Plot control and test events together by TF-IDF cosine distance.

    Args:
        control_events: rows of ``[raw text, event id, cluster count]``.
        test_events: rows with the same layout as ``control_events``.
        labels: one group label per combined event (control rows first).
            The 2D plot names the groups 'control', 'test' and
            'test-anomaly'.
        clusters: one cluster id per combined event, in the same order.
        plot_3d: when true, the third column of the events is stacked onto
            the distance matrix and the grouped "4d" plot is drawn instead
            of the 2D one.
    """
    combined_events = list(control_events) + list(test_events)
    # Convert once; column 0 feeds the vectorizer, column 2 the 3D plot.
    events_array = np.array(combined_events)

    vectorizer = TFIDFVectorizer(Tokenizer.default_tokenizer, 1, 1.0)
    tfidf_matrix = vectorizer.fit_transform(events_array[:, 0])
    combined_dist = vectorizer.get_cosine_dist_matrix(tfidf_matrix)

    # Control rows come first, so a row's position in the combined list is
    # also its index into `clusters`.
    tooltips = [
        split(event[0], min(1000, len(event[0])), 100)
        + '<br> id = ' + str(event[1])
        + '<br> cluster = ' + str(clusters[position])
        for position, event in enumerate(combined_events)
    ]

    if not plot_3d:
        plot.scatter_plot_groups(combined_dist, labels, tooltips,
                                 ['control', 'test', 'test-anomaly'],
                                 ['blue', 'orange', 'red'])
    else:
        combined_values = np.column_stack((combined_dist, events_array[:, 2]))
        plot.scatter_plot_groups_4d(combined_values, labels, clusters, tooltips,
                                    ['blue', 'orange', 'red'])


    
def cluster():
    """Cluster the control events and flag anomalies among the test events.

    Works on the module-level ``usable_control_events``,
    ``usable_test_events`` and ``usable_events`` lists and on the current
    ``threshold_picker`` value, and draws, in order:

    1. a scatter plot of the control events by TF-IDF cosine distance,
    2. the same plot grouped by k-means cluster,
    3. a combined control/test plot in which a test event is labelled 1
       when the k-means detector assigned it a cluster and 2 when the
       detector returned -1,
    4. a combined 3D plot in which the test events that were assigned a
       cluster are labelled 1 when the per-cluster IsolationForestClassifier
       predicts 1 and 2 otherwise.

    Returns early, after printing a message, when the control or the test
    selection is empty.
    """
    if not usable_control_events:
        print('No control events in the selected time range; nothing to cluster.')
        return

    print('plotting....')

    control_array = np.array(usable_control_events)
    tfidf_vectorizer = TFIDFVectorizer(Tokenizer.default_tokenizer, 1, 1.0)
    tfidf_matrix = tfidf_vectorizer.fit_transform(control_array[:, 0])
    dist = tfidf_vectorizer.get_cosine_dist_matrix(tfidf_matrix)
    plot.scatter_plot(dist, usable_control_events)

    print("clustering....")
    kmeans = KmeansCluster(tfidf_matrix, threshold_picker.value)
    kmeans.cluster_cosine_threshold()
    plot_cluster(dist, kmeans, tfidf_vectorizer.get_feature_names())

    if not usable_test_events:
        print('No test events in the selected time range; skipping anomaly detection.')
        return

    print("Detecting new anomaly")
    test_array = np.array(usable_test_events)
    tfidf_matrix_test = tfidf_vectorizer.transform(test_array[:, 0])
    detector = KmeansAnomalyDetector()
    predictions = np.array(detector.detect_kmeans_anomaly_cosine_dist(
        tfidf_matrix_test, kmeans, threshold_picker.value))

    control_count = len(usable_control_events)
    # 0 = control, 1 = test event that fits a cluster, 2 = test anomaly (-1).
    labels = [0] * control_count + [1 if p != -1 else 2 for p in predictions]
    # Build a new list instead of appending to the one returned by
    # get_clusters(), which may be the clusterer's own state (not visible
    # from here).
    combined_clusters = list(kmeans.get_clusters()) + list(predictions)

    create_combined_scatter_plot(usable_control_events, usable_test_events,
                                 labels, combined_clusters)

    print("Detecting existing anomaly")

    # Column layout after stacking: cluster, raw text, event id, cluster count.
    control_values = np.column_stack((kmeans.get_clusters(), control_array))
    # Only test events that were assigned a cluster are checked further.
    test_values = np.column_stack((predictions, test_array))[predictions != -1]

    control_df = pd.DataFrame(dict(x=control_values[:, 0], y=control_values[:, 3],
                                   label=control_values[:, 0], text=control_values[:, 1],
                                   idx=control_values[:, 2]))
    test_df = pd.DataFrame(dict(x=test_values[:, 0], y=test_values[:, 3],
                                label=test_values[:, 0], text=test_values[:, 1],
                                idx=test_values[:, 2]))

    classifier = IsolationForestClassifier()
    for name, group in control_df.groupby('label'):
        classifier.fit_transform(str(name), np.column_stack((group.x, group.y)))

    # Indexed by position in usable_events; 0 / -1 mean "not a checked test event".
    labels_test = np.zeros(len(usable_events), dtype=int)
    klusters_test = np.full(len(usable_events), -1, dtype=int)

    for name, group in test_df.groupby('label'):
        preds = classifier.predict(str(name), np.column_stack((group.x, group.y)))
        for event_id, pred in zip(group.idx.tolist(), preds):
            position = int(event_id)
            labels_test[position] = 1 if pred == 1 else 2
            klusters_test[position] = int(name)

    # The builtin int replaces np.int, an alias NumPy deprecated and later removed.
    labels = np.concatenate([np.zeros(control_count, dtype=int),
                             labels_test[labels_test > 0]])
    klusters = np.concatenate([kmeans.get_clusters(),
                               klusters_test[klusters_test > -1]])

    print('------')
    print(len(labels))
    print(len(klusters))
    print('-------')
    create_combined_scatter_plot(control_values[:, 1:4], test_values[:, 1:4],
                                 labels, klusters, True)


create_widgets()

    
# Set up event handlers: each one is invoked whenever its widget's value changes.
data_source_picker.observe(data_source_change_handler, names="value")
time_range_picker.observe(time_slider_change_handler, names="value")
new_time_range_picker.observe(time_slider_change_handler, names="value")
event_picker.observe(event_picker_change_handler, names="value")
#threshold_picker.observe(threshold_picker_change_handler, names="value")
# Initialize: load the default data source (which also runs the clustering)
# and show the initially selected event. The handlers ignore their argument.
data_source_change_handler('')
event_picker_change_handler('')

# Show the widgets, one row per HBox.
display(widgets.HBox([data_source_picker]))
display(widgets.HBox([time_range_picker, threshold_picker]))
display(widgets.HBox([new_time_range_picker]))
display(widgets.HBox([event_picker, event_text_area]))


 
    












# Captured output of the notebook cell above:
# plotting....
#
#
# clustering....
#
#
# Detecting new anomaly
#
#
# Detecting existing anomaly
#
#
#
# Data with input dtype object was converted to float64 by StandardScaler.

