In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest

In [2]:
import pandas as pd
import numpy as np
import math
import random

### LOAD DATASET

In [3]:
# READ THE CLEANED LOG EXPORT, USING THE 'Unnamed: 0' COLUMN AS THE ROW INDEX
dataframe = pd.read_csv(
    'data/cleaned_data.csv',
    index_col='Unnamed: 0',
)

In [4]:
# PREVIEW THE FIRST TEN ROWS
dataframe.head(n=10)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...
5,1613207821,CRON,25771,(root) CMD ( cd / && run-parts --report /etc/c...
6,1613211421,CRON,32248,(root) CMD ( cd / && run-parts --report /etc/c...
7,1613214001,CRON,4465,(root) CMD (./home/wickstjo/scripts/ropsten.sh)
8,1613214001,CRON,4461,"(CRON) info (No MTA installed, discarding output)"
9,1613215021,CRON,6268,(root) CMD ( cd / && run-parts --report /etc/c...


In [5]:
# COPY THE 'message' COLUMN INTO A NUMPY ARRAY -- THIS IS THE TEXT THE REST OF THE NOTEBOOK WORKS ON
dataset = np.array(dataframe.loc[:, 'message'])

### FIND ANOMALOUS EVENTS

In [6]:
# START WITH AN EMPTY COLLECTION OF KNOWN-ANOMALOUS ROWS
outliers = list()

In [7]:
# SUBSTRINGS THAT MARK A LOG MESSAGE AS A KNOWN ANOMALY (MATCHING IS CASE-SENSITIVE)
blacklisted_phrases = [
    'ropsten',
    'Created slice User',
    'Started Session c',
    'New USB device found',
    '@reboot jobs',
    'Started System Logging',
]

In [8]:
# COLLECT EVERY ROW THAT CONTAINS AT LEAST ONE BLACKLISTED PHRASE.
#
# any() STOPS AT THE FIRST MATCHING PHRASE, SO A ROW THAT CONTAINS SEVERAL
# BLACKLISTED PHRASES IS COUNTED ONCE INSTEAD OF ONCE PER PHRASE. THIS MATTERS
# BECAUSE len(outliers) IS LATER USED TO DERIVE THE CONTAMINATION RATIO.
#
# THE LIST IS REBUILT FROM SCRATCH (NOT APPENDED TO), SO RE-RUNNING THIS CELL
# DOES NOT DUPLICATE ENTRIES.
outliers = [
    row
    for row in dataset
    if any(phrase in row for phrase in blacklisted_phrases)
]

### VECTORIZE DATASETS INTO FEATURES

In [9]:
def create_features(dataset):
    """Fit a fresh TF-IDF vectorizer on `dataset` and vectorize it.

    Returns a `(features, vectorizer)` tuple: the result of calling
    `fit_transform(dataset)` and the fitted `TfidfVectorizer` itself.
    """
    tfidf = TfidfVectorizer()

    # FIT AND TRANSFORM IN ONE PASS, THEN HAND BACK BOTH THE MATRIX AND THE FITTED VECTORIZER
    return tfidf.fit_transform(dataset), tfidf

In [10]:
# VECTORIZE THE FULL MESSAGE ARRAY AND KEEP THE FITTED VECTORIZER
features, vectorizer = create_features(dataset=dataset)

### TRAIN ISOLATION FOREST

In [11]:
# SHARE OF ROWS FLAGGED BY THE PHRASE BLACKLIST, USED AS THE EXPECTED ANOMALY RATE.
# GUARD AGAINST AN EMPTY DATASET SO THIS CELL DOES NOT RAISE ZeroDivisionError.
outlier_ratio = len(outliers) / len(dataset) if len(dataset) else 0.0

model_config = {
    'n_estimators': 1000,
    'random_state': 200,

    # IsolationForest ONLY ACCEPTS 'auto' OR A FLOAT IN (0, 0.5], SO FALL BACK TO
    # 'auto' WHEN THE BLACKLIST MATCHED NOTHING OR MATCHED MORE THAN HALF THE ROWS
    'contamination': outlier_ratio if 0 < outlier_ratio <= 0.5 else 'auto'
}

In [12]:
def train_isolation(features):
    """Build an IsolationForest from the notebook-level `model_config` and fit it.

    The forest is trained on `features` and the fitted model is returned.
    """
    # fit() RETURNS THE ESTIMATOR ITSELF, SO CONSTRUCTION AND TRAINING CAN BE CHAINED
    return IsolationForest(**model_config).fit(features)

In [13]:
# TRAIN THE FOREST ON THE TF-IDF FEATURES
model = train_isolation(features=features)

### PREDICT WITH SAME FEATURES

In [14]:
# SCORE THE SAME FEATURES THE MODEL WAS TRAINED ON
predictions = model.predict(X=features)

### INSPECT THE ANOMALOUS EVENTS

In [15]:
# VISIT ONLY THE POSITIONS WHERE THE PREDICTION IS NOT 1
for index in np.flatnonzero(predictions != 1):

    # PRINT THE MESSAGE AT THAT POSITION (SAME DATASET THE MODEL WAS FITTED ON)
    print(dataset[index])

Error getting user list from org.freedesktop.Accounts: GDBus.Error:org.freedesktop.DBus.Error.ServiceUnknown: The name org.freedesktop.Accounts was not provided by any .service files
Error getting user list from org.freedesktop.Accounts: GDBus.Error:org.freedesktop.DBus.Error.ServiceUnknown: The name org.freedesktop.Accounts was not provided by any .service files
Error getting user list from org.freedesktop.Accounts: GDBus.Error:org.freedesktop.DBus.Error.ServiceUnknown: The name org.freedesktop.Accounts was not provided by any .service files
Kernel command line: coherent_pool=1M 8250.nr_uarts=1 bcm2708_fb.fbwidth=1824 bcm2708_fb.fbheight=984 bcm2708_fb.fbswap=1 vc_mem.mem_base=0x3ec00000 vc_mem.mem_size=0x40000000 dwc_otg.lpm_enable=0 console=tty1 console=ttyS0,115200 root=/dev/mmcblk0p7 rootfstype=ext4 elevator=deadline fsck.repair=yes rootwait splash plymouth.ignore-serial-consoles quiet
input: Logitech Optical USB Mouse as /devices/platform/soc/3f980000.usb/usb1/1-1/1-1.2/1-1.2:1.0