In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest

In [2]:
import numpy as np
import os
import nbimporter
from IPython.display import clear_output

In [3]:
import utils

Importing Jupyter notebook from utils.ipynb


### CREATE DATASET FROM COMPRESSED LOG FILES

In [4]:
# BUILD THE DATAFRAME FROM THE LOG FILES UNDER data/logs (LOADER COMES FROM utils.ipynb)
dataframe = utils.create_dataset('data/logs')

In [5]:
dataframe.head(5)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...


In [6]:
#dataframe.to_csv('data/may-dataset.csv')

In [7]:
#dataset = utils.load_csv('data/may-dataset.csv')

In [8]:
len(dataframe)

16191

### SEPARATE DATAFRAME INTO SEGMENTS

In [9]:
def separate(dataframe, blacklisted_phrases=None):
    """Split the log messages into normal and anomalous events.

    A message is labelled anomalous when it contains at least one blacklisted
    phrase as a substring; every other message is labelled normal. A summary
    of the resulting counts is printed as a side effect.

    Args:
        dataframe: Object indexable by ``'message'`` that yields an iterable
            of message strings (e.g. a pandas DataFrame).
        blacklisted_phrases: Optional iterable of substrings that mark a
            message as anomalous. When omitted, the built-in phrase list
            below is used.

    Returns:
        Tuple ``(normal_events, anomalous_events, messages)`` of lists, where
        ``messages`` contains every message in its original order.
    """

    # FALL BACK TO THE BUILT-IN BLACKLIST WHEN NONE IS PROVIDED
    if blacklisted_phrases is None:
        blacklisted_phrases = [
            'ropsten',
            'Created slice User',
            'Started Session c',
            'New USB device found',
            '@reboot jobs',
            'Started System Logging'
        ]
    else:
        # MATERIALIZE ONCE SO ONE-SHOT ITERABLES SURVIVE THE PER-MESSAGE SCAN
        blacklisted_phrases = list(blacklisted_phrases)

    # CONTAINERS
    normal_events, anomalous_events = [], []

    # EXTRACT THE MESSAGE COLUMN
    messages = list(dataframe['message'])

    # LOOP THROUGH THE MESSAGES
    for message in messages:

        # CHECK IF MESSAGE CONTAINS A BLACKLISTED PHRASE
        anomalous = any(phrase in message for phrase in blacklisted_phrases)

        # APPEND TO THE CORRECT CONTAINER
        if anomalous:
            anomalous_events.append(message)
        else:
            normal_events.append(message)

    # PRINT LENGTHS
    print('TOTAL EVENTS:\t\t', len(messages))
    print('UNIQUE EVENTS:\t\t', len(set(messages)), '\n')
    print('NORMAL EVENTS:\t\t', len(normal_events) )
    print('ANOMALOUS EVENTS:\t', len(anomalous_events))

    return normal_events, anomalous_events, messages

In [10]:
def contains(message, blacklist):
    """Report whether ``message`` includes any phrase from ``blacklist``.

    Each phrase is tested with the ``in`` operator, stopping at the first
    match. An empty blacklist yields False.
    """
    # SHORT-CIRCUITS ON THE FIRST BAD PHRASE FOUND IN THE MESSAGE
    return any(bad_phrase in message for bad_phrase in blacklist)

In [11]:
# SPLIT THE MESSAGES INTO NORMAL / ANOMALOUS SEGMENTS; THE THIRD VALUE IS THE FULL MESSAGE LIST
normal_dataset, anomalous_dataset, combined_dataset = separate(dataframe)

TOTAL EVENTS:		 16191
UNIQUE EVENTS:		 1354 

NORMAL EVENTS:		 15870
ANOMALOUS EVENTS:	 321


### VECTORIZE DATASET INTO FEATURES

In [12]:
def create_features(dataset):
    """Fit a fresh TF-IDF vectorizer on ``dataset`` and vectorize it.

    Args:
        dataset: Iterable of text documents passed to ``fit_transform``.

    Returns:
        Tuple ``(features, vectorizer)``: the transformed training features
        and the fitted vectorizer, so callers can transform further data
        with the same vocabulary.
    """
    tfidf = TfidfVectorizer()

    # LEARN THE VOCABULARY AND PRODUCE THE FEATURES IN ONE PASS
    return tfidf.fit_transform(dataset), tfidf

### TRAIN ISOLATION FOREST

In [13]:
def train_isolation(features, config):
    """Build an IsolationForest from ``config`` and fit it on ``features``.

    Args:
        features: Training features handed to ``IsolationForest.fit``.
        config: Mapping of keyword arguments for the ``IsolationForest``
            constructor.

    Returns:
        The fitted model.
    """
    # fit() RETURNS THE ESTIMATOR ITSELF, SO CONSTRUCTION AND TRAINING CHAIN
    return IsolationForest(**config).fit(features)

### PREDICTION ACCURACY

In [14]:
def set_accuracy(prediction_vector, dataset, anomalies):
    """Percentage of flagged samples that are known anomalies.

    Every prediction different from 1 is treated as an anomaly flag. A
    flagged sample is a hit when its message is present in ``anomalies`` and
    a miss otherwise.

    Args:
        prediction_vector: Iterable of predictions, aligned by position with
            ``dataset``.
        dataset: Indexable sequence of messages.
        anomalies: Iterable of messages known to be anomalous.

    Returns:
        ``hit / (hit + miss) * 100`` as a float in [0, 100], or ``0.0`` when
        no sample was flagged at all.
    """

    # INITIALIZE VARS
    hit, miss = 0, 0

    # CONVERT ANOMALIES INTO A SET FOR FASTER QUERYING
    known_anomalies = set(anomalies)

    # LOOP THROUGH PREDICTIONS
    for index, prediction in enumerate(prediction_vector):

        # SKIP EVERYTHING THAT WAS NOT FLAGGED AS AN ANOMALY
        if prediction == 1:
            continue

        # INCREMENT HIT/MISS BASED ON WHETHER MESSAGE EXISTS IN ANOMALIES
        if dataset[index] in known_anomalies:
            hit += 1
        else:
            miss += 1

    # NOTHING FLAGGED -- AVOID DIVIDING BY ZERO
    flagged = hit + miss
    if flagged == 0:
        return 0.0

    # ACCURACY PERCENTAGE: HITS OVER ALL FLAGGED SAMPLES (NOT HITS OVER MISSES,
    # WHICH CAN EXCEED 100 AND FAILS WHEN THERE ARE NO MISSES)
    return (hit / flagged) * 100

In [15]:
def length_accuracy(prediction_vector, dataset):
    """Percentage of ``dataset`` that was flagged as anomalous.

    Every prediction different from 1 counts as an anomaly flag; the count is
    divided by ``len(dataset)``.

    Args:
        prediction_vector: Iterable of predictions.
        dataset: Sized collection the predictions were made for.

    Returns:
        ``flagged / len(dataset) * 100`` as a float, or ``0.0`` when
        ``dataset`` is empty.
    """

    # EMPTY DATASET -- AVOID DIVIDING BY ZERO
    total = len(dataset)
    if total == 0:
        return 0.0

    # COUNT ALL NON-1 VALUES (ANOMALIES)
    flagged = sum(1 for prediction in prediction_vector if prediction != 1)

    # ACCURACY PERCENTAGE
    return (flagged / total) * 100

### EXPERIMENT #1 -- TRAIN WITH COMBINED DATASET

In [16]:
def combined_experiment(model_config=None):
    """Experiment #1: train and predict on the combined dataset.

    Vectorizes ``combined_dataset``, trains an isolation forest on it with a
    contamination equal to the share of ``anomalous_dataset`` in
    ``combined_dataset``, predicts on the same features and scores the
    result with ``set_accuracy``.

    Args:
        model_config: Optional mapping of extra model parameters. It is
            copied, so neither the caller's mapping nor a shared default is
            modified. ``random_state`` and ``contamination`` are always
            overridden.

    Returns:
        The value returned by ``set_accuracy``.
    """

    # VECTORIZE TRAINING DATASET
    training_features, _ = create_features(combined_dataset)

    # COPY THE PROVIDED CONFIG SO THE CALLER'S DICT IS NEVER MUTATED
    config = dict(model_config or {})

    # ADD STATIC MODEL CONFIG PARAMS TO THE COPY
    config['random_state'] = 200
    config['contamination'] = len(anomalous_dataset) / len(combined_dataset)

    # TRAIN THE MODEL
    model = train_isolation(training_features, config)

    # PREDICT WITH TRAINING FEATURES
    predictions = model.predict(training_features)

    # MEASURE & RETURN ACCURACY
    return set_accuracy(predictions, combined_dataset, anomalous_dataset)

In [17]:
# RUN EXPERIMENT #1 WITH 1000 ESTIMATORS; THE RETURNED ACCURACY IS THE CELL OUTPUT
combined_experiment({
    'n_estimators': 1000
})

0.0

### EXPERIMENT #2 -- TRAIN & TEST WITH SEPARATE SEGMENTS

In [18]:
def segmented_experiment(model_config=None):
    """Experiment #2: train on normal events, test on anomalous events.

    Fits the vectorizer and an isolation forest on ``normal_dataset``,
    transforms ``anomalous_dataset`` with the same vectorizer, predicts on it
    and scores the result with ``length_accuracy``.

    Args:
        model_config: Optional mapping of extra model parameters. It is
            copied, so neither the caller's mapping nor a shared default is
            modified. ``random_state`` and ``contamination`` are always
            overridden.

    Returns:
        The value returned by ``length_accuracy``.
    """

    # VECTORIZE NORMAL DATASET
    training_features, vectorizer = create_features(normal_dataset)

    # COPY THE PROVIDED CONFIG SO THE CALLER'S DICT IS NEVER MUTATED
    config = dict(model_config or {})

    # ADD STATIC MODEL CONFIG PARAMS TO THE COPY
    # NOTE(review): newer scikit-learn releases appear to accept only 'auto'
    # or a float in (0, 0.5] for contamination -- confirm 0 is still valid
    # for the installed version.
    config['random_state'] = 200
    config['contamination'] = 0

    # TRAIN THE MODEL
    model = train_isolation(training_features, config)

    # VECTORIZE ANOMALOUS DATASET
    testing_features = vectorizer.transform(anomalous_dataset)

    # PREDICT WITH ANOMALIES
    predictions = model.predict(testing_features)

    # MEASURE & RETURN ACCURACY
    return length_accuracy(predictions, anomalous_dataset)

In [19]:
# RUN EXPERIMENT #2 WITH 1000 ESTIMATORS; THE RETURNED ACCURACY IS THE CELL OUTPUT
segmented_experiment({
    'n_estimators': 1000
})

0.0

### GRID SEARCH

In [20]:
def grid_search(param, minimum, maximum, increment):
    """Run both experiments for every value of ``param`` on a grid.

    The grid is ``range(minimum, maximum, increment)``, so ``maximum`` itself
    is excluded. After each grid value the cell output is cleared and the
    progress plus the best accuracy seen so far (across both experiments)
    is printed.

    Args:
        param: Name of the model parameter to vary.
        minimum: First grid value.
        maximum: Exclusive upper bound of the grid.
        increment: Step between grid values.

    Returns:
        Tuple ``(grid, combined, segmented)``: the grid values and the
        accuracies of the combined and segmented experiments, in grid order.
    """

    # GRID LABELS
    grid = list(range(minimum, maximum, increment))

    # CONTAINERS
    combined, segmented = [], []

    # BEST ACCURACY
    best = 0

    # LOOP THROUGH THE GRID, TRACKING THE 1-BASED POSITION FOR PROGRESS OUTPUT
    for step, value in enumerate(grid, start=1):

        # RUN COMBINED EXPERIMENT & APPEND ACCURACY
        combined_accuracy = combined_experiment({ param: value })
        combined.append(combined_accuracy)

        # RUN SEGMENTED EXPERIMENT & APPEND ACCURACY
        segmented_accuracy = segmented_experiment({ param: value })
        segmented.append(segmented_accuracy)

        # UPDATE ACCURACY WHEN A BETTER RESULT IS FOUND
        best = max(best, combined_accuracy, segmented_accuracy)

        # CLEAR OLD OUTPUT & PRINT NEW
        # PROGRESS IS THE POSITION WITHIN THE GRID, WHICH STAYS CORRECT FOR
        # ANY MINIMUM / INCREMENT
        clear_output(wait=True)
        print('GRID:\t\t', step, '/', len(grid))
        print('BEST ACCURACY:\t', str(best) + '%')

    return grid, combined, segmented

In [21]:
# SWEEP n_estimators OVER range(1, 50, 1) AND COLLECT THE ACCURACIES OF BOTH EXPERIMENTS
estimators, combined, segmented = grid_search(
    param='n_estimators',
    minimum=1,
    maximum=50,
    increment=1
)

GRID:		 50 / 50
BEST ACCURACY:	 0%
