In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest

In [2]:
import pandas as pd
import numpy as np
import math
import random

### LOAD DATASET

In [3]:
dataframe = pd.read_csv('data/cleaned_data.csv', index_col='Unnamed: 0')

In [4]:
dataframe.head(10)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...
5,1613207821,CRON,25771,(root) CMD ( cd / && run-parts --report /etc/c...
6,1613211421,CRON,32248,(root) CMD ( cd / && run-parts --report /etc/c...
7,1613214001,CRON,4465,(root) CMD (./home/wickstjo/scripts/ropsten.sh)
8,1613214001,CRON,4461,"(CRON) info (No MTA installed, discarding output)"
9,1613215021,CRON,6268,(root) CMD ( cd / && run-parts --report /etc/c...


### SEPARATE GOOD & BAD EVENTS

In [5]:
dataset = np.array(dataframe['message'])

In [6]:
def blacklisted(row):
    
    # BLACKLISTED PHRASES
    blacklist = [
        'ropsten',
        'Created slice User',
        'Started Session c',
        'New USB device found',
        '@reboot jobs',
        'Started System Logging'
    ]
    
    # LOOP THROUGH PHRASES
    for phrase in blacklist:
        
        # IF THE PHRASE IS FOUND, RETURN TRUE
        if phrase in row:
            return True
        
    # IF NOTHING WAS FOUND, RETURN FALSE
    return False

In [7]:
good_dataset, bad_dataset = [], []

In [8]:
for row in dataset:
    
    # IF THE ROW CONTAINS A BLACKLISTED PHRASE
    if blacklisted(row):
        bad_dataset.append(row)
        
    # OTHERWISE..
    else:
        good_dataset.append(row)

### SPLIT GOOD EVENTS INTO TRAIN & TESTSETS --- 90%

In [9]:
def split(features, percentage):
    
    # CALCULATE THE BREAKPOINT INDEX
    breakpoint = math.floor(len(dataset) * percentage)
    
    # SPLIT & RETURN AS TRAIN & TEST
    return features[:breakpoint], features[breakpoint:]

In [10]:
train_set, test_set = split(good_dataset, 0.9)

### ADD BAD EVENTS TO TEST SET & RANDOMIZE

In [11]:
contamination = len(bad_dataset) / len(test_set)

In [12]:
test_set = test_set + bad_dataset

In [13]:
random.shuffle(test_set)

### VECTORIZE DATASETS INTO FEATURES

In [14]:
def vectorize(train_set, test_set):
    
    # CREATE THE VECTORIZER
    vectorizer = TfidfVectorizer()
    
    # FIT VECTORIZER & CREATE TRAIN FEATURES
    train_features = vectorizer.fit_transform(train_set)
    
    # TRANSFORM TEST FEATUERS
    test_features = vectorizer.transform(test_set)
    
    return train_features, test_features

In [15]:
train_features, test_features = vectorize(train_set, test_set)

### TRAIN ISOLATION FOREST

In [16]:
model_config = {
    'n_estimators': 1000,
    'random_state': 200,
    'contamination': contamination
}

In [17]:
def train_isolation(features):
    
    # CREATE THE MODEL
    model = IsolationForest(**model_config)
    
    # TRAIN IT
    model.fit(features)
    
    return model

In [18]:
model = train_isolation(train_features)

### PREDICT WITH TEST FEATURES

In [19]:
predictions = model.predict(test_features)

In [20]:
predictions

array([-1,  1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1, -1,  1, -1,
        1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,  1,
        1, -1, -1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1, -1,
       -1,  1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1,
        1, -1, -1,  1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,  1,  1,
       -1,  1,  1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1,
        1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,
       -1, -1, -1, -1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1, -1, -1,
        1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,
        1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,
       -1, -1, -1, -1,  1

### INSPECT THE ANOMALOUS EVENTS

In [21]:
for index, prediction in enumerate(predictions):
    
    # FIND ALL -1 VALUES (ANOMALIES)
    if prediction != 1:
        
        # PRINT ROW FROM TEST SET
        print(test_set[index])

(root) CMD ( cd / && run-parts --report /etc/cron.hourly)
(root) CMD ( cd / && run-parts --report /etc/cron.hourly)
(root) CMD ( cd / && run-parts --report /etc/cron.hourly)
Started Daily apt upgrade and clean activities.
Joining mDNS multicast group on interface wlan0.IPv6 with address 2001:708:170:360:93e7:82bf:af2c:bc6d.
apt-daily.timer: Adding 1h 24min 33.502020s random time.
Server startup complete. Host name is raspberrypi-2.local. Local service cookie is 929520076.
(root) CMD (test -x /usr/sbin/anacron || ( cd / && run-parts --report /etc/cron.daily ))
[origin software="rsyslogd" swVersion="8.24.0" x-pid="363" x-info="http://www.rsyslog.com"] rsyslogd was HUPed
Listening on GnuPG cryptographic agent and passphrase cache.
(root) CMD ( cd / && run-parts --report /etc/cron.hourly)
[origin software="rsyslogd" swVersion="8.24.0" x-pid="363" x-info="http://www.rsyslog.com"] rsyslogd was HUPed
(root) CMD ( cd / && run-parts --report /etc/cron.hourly)
Closed GnuPG cryptographic agent (a