In [1]:
import pandas as pd
import numpy as np

In [2]:
import binascii
import itertools

In [3]:
from collections import Counter
import operator

### LOAD DATAFRAME

In [4]:
dataframe = pd.read_csv('data/cleaned_data.csv', index_col='Unnamed: 0')

In [5]:
dataframe.head(5)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...


### CONVERT MESSAGES TO NUMPY ARRAY

In [6]:
dataset = np.array(dataframe['message'])

### BLACKLISTED PHRASES -- DANGEROUS ANOMALIES

In [7]:
blacklist = [
    'ropsten',
    'Created slice User',
    'Started Session c',
    'New USB device found',
    '@reboot jobs',
    'Started System Logging'
]

### SPLIT INTO NORMAL & ANOMALOUS EVENTS

In [8]:
def split(dataset, blacklist):
    
    # CONTAINERS
    normal, anomalous = [], []
    
    # LOOP THROUGH DATASET
    for event in dataset:
        
        # CHECK IF EVENT CONTAINS BLACKLISTED PHRASES
        anomaly = contains(event, blacklist)
        
        # APPEND TO ANOMALOUS EVENTS
        if anomaly:
            anomalous.append(event)
            
        # APPEND TO NORMAL EVENTS
        else:
            normal.append(event)
        
    # RETURN UNIQUE EVENTS
    return list(set(normal)), list(set(anomalous))

In [9]:
def contains(message, blacklist):
    
    # LOOP THROUGH BLACKLIST
    for phrase in blacklist:
        
        # CHECK IF THE MESSAGE CONTAINS THE BAD PHRASE
        if phrase in message:
            return True
        
    return False

In [10]:
normal_dataset, anomalous_dataset = split(dataset, blacklist)

### CONVERT MESSAGE TO HASHED SHINGLES

In [11]:
def hashed_shingles(message, size=10):
    
    # SHINGLE CONTAINER -- USING SET TO ELIMINATE DUPLICATED
    shingles = set()

    # CRAWL THROUGH THE MESSAGE IN CHUNKS
    for i in range(len(message)-size+1):

        # CREATE SHINGLE & HASH IT TO A 32-BIT INTEGER
        shingle = message[i:i+size]
        crc = binascii.crc32(shingle.encode('utf-8'))

        # APPEND TO CONTAINER
        shingles.add(crc)

    return shingles

#### EXAMPLE - WITH SHINGLE SIZE 10

In [52]:
normal_dataset[0]

'BCM2708FB: allocated DMA channel 0'

In [54]:
len(normal_dataset[0])

34

In [53]:
hashed_shingles(normal_dataset[0])

{97016831,
 155059334,
 224570597,
 656957105,
 667840274,
 933119675,
 935589922,
 1017414130,
 1082190970,
 1232749889,
 1373320409,
 1434168141,
 1691671328,
 1951906778,
 2190387590,
 2277818445,
 2308618111,
 2365764720,
 2404013047,
 2428818993,
 3006685846,
 3214337224,
 3995843060,
 4153444613,
 4211946975}

In [55]:
len(hashed_shingles(normal_dataset[0]))

25

### JACCARD SIMILARITY SCORE

In [42]:
def jaccard_similarity(first_shingle, second_shingle):
    
    # FIND NUMBER OF HASHES THAT EXIST IN BOTH SHINGLES
    intersection_cardinality = len(first_shingle.intersection(second_shingle))
    
    # FIND NUMBER OF UNIQUE HASHES IN BOTH SHINGLES
    union_cardinality = len(first_shingle.union(second_shingle))
    
    # RETURN DIFFERENCE
    return intersection_cardinality / float(union_cardinality)

### FIND SIMILAR EVENTS

In [13]:
def find_similar(dataset, shingle_size=10, similarity_threshold=0.5):

    # CONTAINER
    candidates = []

    # LOOP THROUGH UNIQUE EVENT PAIRS
    for pair in itertools.combinations(dataset, 2):
        
        # GENERATE HASHED SHINGLES FOR PAIR
        first = hashed_shingles(pair[0], size=shingle_size)
        second = hashed_shingles(pair[1], size=shingle_size)
        
        # CALCULATE JACCARD SIMILARITY SCORE FOR PAIR
        score = jaccard_similarity(first, second)

        # IF THE SIMILARITY THRESHOLD IS BREACHED
        if score > similarity_threshold:
                
            # APPEND TO CONTAINER
            candidates.append(pair)
    
    # PRINT RESULTS
    print('SIMILARITY THRESHOLD:\t\t', similarity_threshold)
    print('NUMBER OF SIMILAR ITEMS:\t', len(candidates))
    
    return candidates

In [14]:
items = find_similar(
    normal_dataset,
    shingle_size=5,
    similarity_threshold=0.90
)

SIMILARITY THRESHOLD:		 0.9
NUMBER OF SIMILAR ITEMS:	 19


### CREATE EVENT CLUSTERS & CENTROIDS

In [16]:
def clusterize(data):
    
    # CONTAINERS
    clusters = []
    centroids = []
    
    # CREATE CLUSTERS
    if len(data) > 1:
        tmp = [data[0]]
        for i in range(1,len(data)):
            if data[i][0] == data[i-1][1] or data[i][1] == data[i-1][0] or data[i][1] == data[i-1][1] or data[i][0] == data[i-1][0]:
                tmp.append(data[i])
            else:
                clusters.append(tmp)
                tmp = [data[i]]
        clusters.append(tmp)
    else:
        clusters = data

    # CREATE CENTROIDS
    for elem in clusters:
        b = Counter([i for sub in elem for i in sub])
        mx = max(b.items(), key=operator.itemgetter(1))[0]
        centroids.append(mx)
        
    # PRINT RESULTS
    print('CLUSTER COUNT:\t\t\t', len(clusters))
    print('CENTROID COUNT:\t\t\t', len(centroids))

    return clusters, centroids

In [17]:
clusters, centroids = clusterize(items)

CLUSTER COUNT:			 10
CENTROID COUNT:			 10
