In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest

In [2]:
import pandas as pd
import math
import numpy as np

### LOAD DATASET

In [3]:
dataset = pd.read_csv('data/cleaned_data.csv', index_col='Unnamed: 0')

In [4]:
dataset.head(10)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...
5,1613207821,CRON,25771,(root) CMD ( cd / && run-parts --report /etc/c...
6,1613211421,CRON,32248,(root) CMD ( cd / && run-parts --report /etc/c...
7,1613214001,CRON,4465,(root) CMD (./home/wickstjo/scripts/ropsten.sh)
8,1613214001,CRON,4461,"(CRON) info (No MTA installed, discarding output)"
9,1613215021,CRON,6268,(root) CMD ( cd / && run-parts --report /etc/c...


### SPLIT INTO TRAIN & TESTSETS --- 80%

In [5]:
def split(features, percentage):
    
    # CALCULATE THE BREAKPOINT INDEX
    breakpoint = math.floor(len(dataset) * percentage)
    
    # SPLIT & RETURN AS TRAIN & TEST
    return np.array(features[:breakpoint]), np.array(features[breakpoint:])

In [6]:
train_set, test_set = split(dataset['message'], 0.8)

### VECTORIZE DATASETS INTO FEATURES

In [7]:
def vectorize(train_set):
    
    # CREATE THE VECTORIZER
    vectorizer = TfidfVectorizer()
    
    # FIT VECTORIZER & CREATE TRAIN FEATURES
    train_features = vectorizer.fit_transform(train_set)
    
    return train_features, vectorizer

In [8]:
train_features, vectorizer = vectorize(train_set)

In [9]:
test_features = vectorizer.transform(test_set)

### TRAIN ISOLATION FOREST

In [10]:
model_config = {
    'n_estimators': 1
}

In [11]:
def train_isolation(features):
    
    # CREATE THE MODEL
    model = IsolationForest(**model_config)
    
    # TRAIN IT
    model.fit(features)
    
    return model

In [12]:
model = train_isolation(train_features)

### PREDICT WITH TEST FEATURES

In [13]:
predictions = model.predict(test_features)

In [14]:
predictions[0:150]

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

### INSPECT THE ANOMALOUS EVENTS

In [15]:
for index, prediction in enumerate(predictions):
    
    # FIND ALL -1 VALUES (ANOMALIES)
    if prediction != 1:
        
        # PRINT ROW FROM TEST SET
        print(test_set[index])

High-Speed Isochronous Endpoints
usb 1-1: new high-speed USB device number 2 using dwc_otg
usb 1-1.1: new high-speed USB device number 3 using dwc_otg
usb 1-1.1.1: new high-speed USB device number 7 using dwc_otg
wlan0: adding address fe80::1013:35ee:ab9d:d76b
Joining mDNS multicast group on interface wlan0.IPv6 with address fe80::1013:35ee:ab9d:d76b.
Registering new address record for fe80::1013:35ee:ab9d:d76b on wlan0.*.
Leaving mDNS multicast group on interface wlan0.IPv6 with address fe80::1013:35ee:ab9d:d76b.
Withdrawing address record for fe80::1013:35ee:ab9d:d76b on wlan0.
