In [57]:
from sklearn.metrics import classification_report
import numpy as np
import torch

# DeepCASE Imports
from deepcase.preprocessing   import Preprocessor
from deepcase.context_builder import ContextBuilder
import pandas as pd

In [58]:
# Relative path of the alert file that is read with pandas in the next cell.
fox_alerts = '../../../datasets/ait_alerts_csv/alerts_csv/fox_alerts.txt'
# Add the parent directory to the module search path before the DeepCASE
# imports below (presumably so a local checkout of `deepcase` is importable
# -- TODO confirm). This must stay ahead of the imports to have any effect.
import sys
sys.path.append('..')

# DeepCASE Imports
# NOTE(review): Preprocessor and ContextBuilder are already imported in the
# first cell; only Interpreter is new here.
from deepcase.preprocessing   import Preprocessor
from deepcase.context_builder import ContextBuilder
from deepcase.interpreter     import Interpreter

In [59]:
# Read the comma-separated alert file and rename its columns to the names the
# rest of the notebook works with: timestamp, machine, event and label.
data = pd.read_csv(fox_alerts, sep=',').rename(
    columns={
        "time":       "timestamp",
        "ip":         "machine",
        "short":      "event",
        "time_label": "label",
    }
)

In [60]:
# Sentinel event ID used to mark "no event" slots in a context window.
# Later cells look it up through `mapping_inverse[-1337]`.
NO_EVENT = -1337

# Create mapping of events: consecutive index -> original event value
mapping = {
    i: event for i, event in enumerate(np.unique(data['event'].values))
}

# Check that NO_EVENT is not in events. If it were, inverting the mapping
# below would silently merge a real event with the sentinel entry.
if NO_EVENT in mapping.values():
    raise ValueError(
        "NO_EVENT ('{}') is also a valid Event ID".format(NO_EVENT)
    )

# Reserve the last index for the sentinel and build the reverse lookup
# (original event value -> index).
mapping[len(mapping)] = NO_EVENT
mapping_inverse = {v: k for k, v in mapping.items()}
# Apply mapping: the 'event' column now holds indices instead of raw values
data['event'] = data['event'].map(mapping_inverse)

# Same index encoding for the labels (index -> label and label -> index).
mapping_label = {i: label for i, label in enumerate(np.unique(data['label'].values))}
mapping_label_inverse = {v:k for k,v in mapping_label.items()}
data['label'] = data['label'].map(mapping_label_inverse)
labels = torch.Tensor(data['label'].values).to(torch.long)

# Binary view of the labels: 0 for rows labelled 'false_positive', 1 for every
# other label. Raises KeyError if 'false_positive' never occurs in the data.
index_false_positive = mapping_label_inverse['false_positive']
# Vectorised comparison instead of a Python loop over every tensor element.
labels_binary = (labels.numpy() != index_false_positive).astype(int) # benign or malicious

In [61]:
# Encoded event column as a 1-D long tensor, one entry per row of `data`.
events = torch.Tensor(data['event'].values).long()

# Context matrix with one row per event and 10 slots per row, pre-filled with
# the index that stands for NO_EVENT.
context = torch.full(
    (data.shape[0], 10),
    mapping_inverse[-1337],
    dtype = torch.long,
)

In [62]:
# Sanity check: the context has one row per row of `data` and 10 columns.
context.shape

torch.Size([473104, 10])

In [None]:
# Put the rows in chronological order; the original index labels are kept.
data = data.sort_values('timestamp')

# One group per machine, each group still ordered by timestamp.
machine_grouped = data.groupby(by='machine')

In [None]:
# Build the context (the preceding events on the same machine) for every event.
context_length = context.shape[1]        # number of context slots per event
timeout        = 86400                   # max. timestamp gap for a context event
                                         # (one day, assuming timestamps are in
                                         # seconds -- TODO confirm)
no_event       = mapping_inverse[-1337]  # index that encodes NO_EVENT

# Group by machine
for machine, events_ in machine_grouped:
    # Get indices, timestamps and events
    # `indices` are the index labels of the rows in this group; they are used
    # below to write the result back into the matching rows of `context`.
    indices    = events_.index.values
    timestamps = events_['timestamp'].values
    events_    = events_['event'].values

    # Initialise context for single machine
    machine_context = np.full(
        (events_.shape[0], context_length),
        no_event,
        dtype = int,
    )

    # Loop over all parts of the context
    # Iteration i fills the column holding the event that occurred i+1
    # positions earlier; the most recent predecessor ends up in the last column.
    for i in range(context_length):
        # Compute time difference between context and event
        time_diff = timestamps[i+1:] - timestamps[:-i-1]
        # Check if time difference is larger than threshold
        timeout_mask = time_diff > timeout

        # Keep the earlier event unless it timed out, in which case the slot
        # is set to NO_EVENT. The masked column must not be overwritten with
        # the raw events afterwards, or the timeout is silently disabled.
        machine_context[i+1:, context_length-i-1] = np.where(
            timeout_mask,
            no_event,
            events_[:-i-1],
        )

    # Convert to torch Tensor
    machine_context = torch.Tensor(machine_context).to(torch.long)
    # Add machine_context to context
    context[indices] = machine_context

# arrange labels
# Pair each context row with the last context column of the following row and
# with the binary label of the following row, so all three have the same length.
# NOTE(review): these statements rebind `context` and `labels_binary`, so
# re-running this cell drops one more row each time.
labels_events = context[1:,-1]
context = context[:-1,:]
labels_binary = labels_binary[1:]

In [None]:
# Shape of the context after the last row was dropped in the previous cell.
context.shape

In [None]:
# (rows, columns) of the alert DataFrame, for comparison with the context shape.
data.shape

In [None]:
# Leftover loop variable: the encoded events of the last machine group that
# the context-building loop processed.
events_

In [None]:
if __name__ == "__main__":
    # ---------------------------- Loading data ---------------------------- #
    # NOTE(review): this block rebinds `context`, `events`, `labels` and
    # `mapping`, which earlier cells of this notebook also assign -- confirm
    # that replacing them with the HDFS data is intended.

    # Preprocessor with a context of 10 events; events older than one day
    # (60*60*24 = 86400 seconds) are ignored.
    preprocessor = Preprocessor(length=10, timeout=86400)

    # Read the text-formatted HDFS file.
    context, events, labels, mapping = preprocessor.text(
        path    = 'data/hdfs/hdfs_test_normal',
        verbose = True,
    )

    # Without labels, fall back to -1 for every event.
    # IMPORTANT: in that case set the labels manually before calling
    # interpreter.score_clusters; otherwise it raises an exception because
    # scores == NO_SCORE cannot be computed.
    if labels is None:
        labels = np.full(events.shape[0], -1, dtype=int)

    # Move both tensors to the GPU when one is available.
    if torch.cuda.is_available():
        events, context = events.to('cuda'), context.to('cuda')


In [None]:
# Show the shapes of the events and of the context, one per line.
print(events.shape, context.shape, sep='\n')

In [None]:
# Peek at the first ten labels.
labels[:10]

In [None]:
# Peek at the first ten context rows.
context[:10]

In [None]:
# Peek at the first ten events.
events[:10]

In [None]:
# Small scratch dictionary used to try out dict inversion below.
a = dict(a=1, b=2)

In [None]:
# View of the (key, value) pairs of the scratch dictionary `a`.
a.items()

In [None]:
# Invert the scratch dictionary: each value becomes a key that points back to
# its original key (the same pattern used for `mapping_inverse`).
b = {value: key for key, value in a.items()}
b