In [6]:
import pandas as pd
import numpy as np

In [35]:
from collections import Counter
import binascii

### LOAD DATAFRAME

In [8]:
# READ THE CLEANED EVENTS; THE CSV'S 'Unnamed: 0' COLUMN BECOMES THE INDEX
dataframe = pd.read_csv(
    'data/cleaned_data.csv',
    index_col='Unnamed: 0',
)

In [9]:
# PREVIEW THE FIRST FIVE ROWS OF THE LOADED FRAME
dataframe.head(n=5)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...


### CONVERT MESSAGES TO NUMPY ARRAY

In [10]:
# COPY THE 'message' COLUMN OUT OF THE DATAFRAME INTO A NUMPY ARRAY
dataset = np.array(dataframe['message'])

### BLACKLISTED PHRASES -- ANOMALIES

In [17]:
# SUBSTRINGS THAT MARK AN EVENT AS ANOMALOUS -- contains() LOOKS FOR EACH
# ONE VERBATIM (CASE-SENSITIVE) INSIDE THE MESSAGE
blacklist = [
    'ropsten',
    'Created slice User',
    'Started Session c',
    'New USB device found',
    '@reboot jobs',
    'Started System Logging'
]

### SPLIT INTO NORMAL & ANOMALOUS EVENTS

In [20]:
def split(dataset, blacklist):
    """Partition events into normal and anomalous ones.

    Each event is classified by ``contains(event, blacklist)``: a truthy
    verdict sends it to the anomalous list, anything else to the normal
    list. The input order is preserved within each list.

    Parameters
    ----------
    dataset : iterable
        Events to classify.
    blacklist : iterable
        Phrases passed straight through to ``contains``.

    Returns
    -------
    tuple of (list, list)
        ``(normal, anomalous)``.
    """
    # ONE BUCKET PER VERDICT: False -> NORMAL, True -> ANOMALOUS
    buckets = {False: [], True: []}

    # ROUTE EVERY EVENT INTO THE BUCKET MATCHING ITS VERDICT
    for event in dataset:
        flagged = bool(contains(event, blacklist))
        buckets[flagged].append(event)

    return buckets[False], buckets[True]

In [21]:
def contains(message, blacklist):
    """Tell whether a message includes at least one blacklisted phrase.

    Parameters
    ----------
    message : str
        Text to inspect (anything supporting the ``in`` operator works).
    blacklist : iterable
        Phrases to look for inside ``message``.

    Returns
    -------
    bool
        True as soon as one phrase is found in the message, False when
        none is found (including when the blacklist is empty).
    """
    # any() STOPS AT THE FIRST MATCHING PHRASE
    return any(phrase in message for phrase in blacklist)

In [30]:
# PARTITION THE MESSAGES: THOSE CONTAINING A BLACKLISTED PHRASE GO TO
# anomalous_dataset, ALL OTHERS TO normal_dataset
normal_dataset, anomalous_dataset = split(dataset, blacklist)

### CONVERT DATASETS TO SHINGLE HASHES

In [67]:
def to_shingles(msg, k=10):
    """Hash every k-character shingle of a message with CRC32.

    A shingle is a window of ``k`` consecutive characters; the window slides
    over the message one character at a time. Each shingle is UTF-8 encoded
    and hashed, and repeated hashes are kept only once.

    Parameters
    ----------
    msg : str
        Text to shingle.
    k : int, default 10
        Shingle length in characters; must be at least 1.

    Returns
    -------
    numpy.ndarray
        1-D ``int64`` array holding the distinct CRC32 values (each in the
        unsigned 32-bit range), in no particular order. The array is empty,
        but still ``int64``, when ``msg`` is shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # k < 1 WOULD SILENTLY HASH EMPTY OR WRAPPED-AROUND SLICES
    if k < 1:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))

    # A SET DROPS REPEATED SHINGLES AUTOMATICALLY; crc32 ALREADY RETURNS AN
    # UNSIGNED 32-BIT VALUE ON PYTHON 3, SO NO MASKING IS NEEDED
    shingles = {
        binascii.crc32(msg[start:start + k].encode('utf-8'))
        for start in range(len(msg) - k + 1)
    }

    # EXPLICIT DTYPE: WITHOUT IT AN EMPTY RESULT WOULD COME BACK AS float64
    # WHILE NON-EMPTY RESULTS ARE INTEGER
    return np.fromiter(shingles, dtype=np.int64, count=len(shingles))

In [68]:
# TRY THE SHINGLER ON THE NORMAL EVENT AT INDEX 5, USING 3-CHARACTER SHINGLES
foo = to_shingles(normal_dataset[5], k=3)

In [69]:
# DISPLAY THE SHINGLE HASHES STORED IN foo
foo

array([2402230784, 4010115586,   46379397, 2209542664, 2283854346,
       2539417229, 2705265040, 3039523217, 2346285456, 2395644693,
       3624532246,  339066393, 3663602841, 3764084251, 3183161246,
       1218807839, 1236802723, 3604977963,  993083564, 1351207084,
       4101391790, 1767766964, 4201251125, 4158576186, 1015804091,
       3549537981, 1349952704,  164075330,  678980936, 3905434184,
       3120258634,  783637705, 2041411532, 2073014609, 4231386708,
        685994070, 3271754455, 4001269591,  711719256, 1976880086,
       1375028830, 4138466142, 2479166307, 1937220836, 2464940133,
       1131402726, 2734266728, 2606898921, 3441890539, 3893286898,
       3770387442, 2995556597,  333081077,  152807927, 4117159806])