In [1]:
import datetime
import os
import preprocessing
import BASE_LOG_ANALYSE
import pandas as pd
from multiprocess import Pool
import urllib3
from functools import partial
urllib3.disable_warnings()
import plotly.express as px
import ipywidgets as widgets
import ipydatagrid
from IPython.display import display, HTML

In [2]:
data = dict()
entropy = dict()
vectorizer = None

In [3]:
def load_data_log_entity(folder, entity_name, hostname):
    data1 = {'timestamp': [], 'message': []}
    for filename in BASE_LOG_ANALYSE.get_file_list_by_filename_filter(folder, entity_name):
        # Get file content - Start
        if '.gz' in filename:
            content = BASE_LOG_ANALYSE.read_gzip_file(filename)
        else:
            with open(filename, 'r', encoding="latin-1") as f_in:
                content = f_in.read()
        # Get file content - End
        # Parsing data to get time and info - Start
        if 'messages' in filename:
            for line in content.split('\n'):
                tokens = line.split(hostname)
                if len(tokens) == 2:
                    if '<' in tokens[0]:
                        tokens[0] = tokens[0].split()[1]
                    data1['timestamp'].append(tokens[0])
                    data1['message'].append(tokens[1])
        else:
            for line in content.split('\n'):
                tokens = line.split(hostname)
                if len(tokens) == 2:
                    if '<' in tokens[0]:
                        tokens[0] = tokens[0].split()[1]
                    data1['timestamp'].append(tokens[0])
                    data1['message'].append(tokens[1])
        # Parsing data to get time and info - End    
    return pd.DataFrame(data1)

In [4]:
def training_data(folder, start, end, hostname):
    df = load_data_log_entity(folder, 'messages*', hostname)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.sort_values(by=['timestamp'], inplace=True)
    global entropy, vectorizer, data
    data = dict()
    data['message'] = df
    training_data = {k: v[(v['timestamp'] >= start) & (v['timestamp'] < end)] for k, v in data.items()}
    entropy, vectorizer = preprocessing.preprocess_training_data(training_data)
    print("Training done!")
    js = "<script>alert('Training Done!');</script>"
    display(HTML(js))
    return 

def inspect_data(start, end):
    inspect = {k: v[(v['timestamp'] >= start) & (v['timestamp'] < end)] for k, v in data.items()}['message']
    return inspect

In [5]:
def call_training_data(b, folder=None, start=None, end=None, hostname=None):
    print("Run")
    training_data(folder.value, str(start.value), str(end.value), hostname.value)
    

In [6]:
def call_inspect_data(a, start, end):
    x = inspect_data(str(start.value), str(end.value))
    print(x.head())

In [7]:
path = widgets.Textarea(
    disabled=False,
    description='Log path'
)
hostname = widgets.Text(
    value='',
    placeholder='Type hostname',
    description='Hostname',
    disabled=False
)
start_training = widgets.DatetimePicker(
    description='Start training time',
    disabled=False
)
end_training = widgets.DatetimePicker(
    description='End training time',
    disabled=False
)
train_button = widgets.Button(
    description='Train data'
)

In [8]:
header = widgets.HTML("<h1>Unsupervised Log Anomaly Detection</h1>")
header.style.text_align='center'

In [9]:
start_testing = widgets.DatetimePicker(
    description='Start testing time',
    disabled=False
)
end_testing = widgets.DatetimePicker(
    description='End testing time',
    disabled=False
)
test_button = widgets.Button(
    description='Check test data'
)
inspect_button = widgets.Button(
    description='Show log'
)

In [10]:
train_button.on_click(partial(call_training_data, folder=path, start=start_training, end=end_training, hostname=hostname))

In [11]:
inspect_button.on_click(partial(call_inspect_data, start=start_testing, end=end_testing))

In [12]:
app = widgets.AppLayout(
    center=widgets.Textarea("test"),
    header=header,
    left_sidebar=widgets.VBox([
        path,
        hostname,
        start_training,
        end_training,
        train_button,
        start_testing,
        end_testing,
        widgets.HBox([test_button, inspect_button])
    ]),
    right_sidebar=widgets.Textarea("test2"),
    footer=ipydatagrid.DataGrid(pd.DataFrame({'a': [1,2,3]})),
    pane_widths=['500px', 1, 1],
    pane_heights=['10px', 4, 1],
    height='900px',
    grid_gap="30px"
)

In [13]:
display(app)

AppLayout(children=(HTML(value='<h1>Unsupervised Log Anomaly Detection</h1>', layout=Layout(grid_area='header'â€¦

Run


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['process'] = data['message'].map(preprocess)


Training done!


                              timestamp  \
672335 2021-03-01 11:00:00.001000+07:00   
672336 2021-03-01 11:00:00.001000+07:00   
672337 2021-03-01 11:05:00.005000+07:00   
672338 2021-03-01 11:10:00.005000+07:00   
672340 2021-03-01 11:15:00.001000+07:00   

                                                  message  
672335   /usr/sbin/cron 53581 - - (root) CMD (newsyslo...  
672336   /usr/sbin/cron 53582 - - (root) CMD (   /usr/...  
672337   /usr/sbin/cron 53585 - - (root) CMD (   /usr/...  
672338   /usr/sbin/cron 53588 - - (root) CMD (   /usr/...  
672340   /usr/sbin/cron 53593 - - (root) CMD (   /usr/...  
