In [104]:
import pandas as pd
import numpy as np

In [105]:
import binascii
import itertools

### LOAD DATAFRAME

In [106]:
# LOAD data/cleaned_data.csv, USING ITS 'Unnamed: 0' COLUMN AS THE INDEX
dataframe = pd.read_csv('data/cleaned_data.csv', index_col='Unnamed: 0')

In [107]:
# PREVIEW THE FIRST FIVE ROWS AS A SANITY CHECK OF THE LOADED COLUMNS
dataframe.head(5)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...


### CONVERT MESSAGES TO NUMPY ARRAY

In [108]:
# PULL THE 'message' COLUMN OUT AS A NUMPY ARRAY (ONE ENTRY PER DATAFRAME ROW)
dataset = np.array(dataframe['message'])

### BLACKLISTED PHRASES -- ANOMALIES

In [109]:
# PHRASES THAT MARK AN EVENT AS ANOMALOUS -- contains() TESTS EACH ONE WITH A
# PLAIN `phrase in message` CHECK, SO A MATCH ANYWHERE IN THE MESSAGE COUNTS
blacklist = [
    'ropsten',
    'Created slice User',
    'Started Session c',
    'New USB device found',
    '@reboot jobs',
    'Started System Logging'
]

### SPLIT INTO NORMAL & ANOMALOUS EVENTS

In [120]:
def split(dataset, blacklist):
    """Partition events into unique normal and unique anomalous messages.

    An event is anomalous when `contains` reports that it includes at least
    one blacklisted phrase; every other event is normal.

    Parameters
    ----------
    dataset : iterable
        Event messages to classify. Each event must be hashable.
    blacklist : iterable
        Phrases whose presence marks an event as anomalous.

    Returns
    -------
    tuple of (list, list)
        ``(normal, anomalous)``. Each list holds every distinct event once,
        in the order it was first seen in `dataset`.
    """
    # DICT KEYS DE-DUPLICATE LIKE A SET BUT KEEP INSERTION ORDER, SO THE
    # RESULT IS THE SAME ON EVERY RUN (THE ITERATION ORDER OF A SET OF
    # STRINGS CAN CHANGE FROM ONE KERNEL SESSION TO THE NEXT)
    normal, anomalous = {}, {}

    # LOOP THROUGH DATASET
    for event in dataset:

        # ROUTE THE EVENT TO THE MATCHING CONTAINER
        bucket = anomalous if contains(event, blacklist) else normal
        bucket[event] = None

    # RETURN UNIQUE EVENTS IN FIRST-SEEN ORDER
    return list(normal), list(anomalous)

In [121]:
def contains(message, blacklist):
    """Report whether `message` includes at least one blacklisted phrase.

    Parameters
    ----------
    message : str
        Text to inspect.
    blacklist : iterable of str
        Phrases to look for, each tested with the `in` operator.

    Returns
    -------
    bool
        True as soon as one phrase is found in `message`, otherwise False
        (including when `blacklist` is empty).
    """
    # any() STOPS AT THE FIRST PHRASE FOUND IN THE MESSAGE
    return any(bad_phrase in message for bad_phrase in blacklist)

In [122]:
# PARTITION THE MESSAGES INTO DE-DUPLICATED NORMAL / ANOMALOUS LISTS
normal_dataset, anomalous_dataset = split(dataset, blacklist)

### CONVERT MESSAGE TO HASHED SHINGLES

In [129]:
def hashed_shingles(message, size=10):
    """Hash every `size`-character window of `message` with CRC-32.

    Parameters
    ----------
    message : str
        Text to break into overlapping character shingles.
    size : int, optional
        Number of characters per shingle (default 10).

    Returns
    -------
    set of int
        The distinct CRC-32 values of the UTF-8 encoded shingles. The set is
        empty when `message` is shorter than `size`.
    """
    # ONE WINDOW PER START OFFSET -- ZERO OR NEGATIVE MEANS NO WINDOW FITS
    window_count = len(message) - size + 1

    # A SET COMPREHENSION DROPS REPEATED HASHES AUTOMATICALLY
    return {
        binascii.crc32(message[start:start + size].encode('utf-8'))
        for start in range(window_count)
    }

### JACCARD SIMILARITY SCORE

In [130]:
def jaccard_similarity(first_shingle, second_shingle):
    """Return the Jaccard similarity of two shingle sets.

    The score is ``|A & B| / |A | B|``: 1.0 for identical non-empty sets and
    0.0 for sets with nothing in common.

    Parameters
    ----------
    first_shingle : set
        Hashed shingles of the first message.
    second_shingle : set
        Hashed shingles of the second message.

    Returns
    -------
    float
        Similarity in the range [0.0, 1.0]. When both sets are empty the
        ratio is undefined (0 / 0); 0.0 is returned so that two messages too
        short to produce any shingle are never reported as similar.
    """
    # CALCULATE THE UNION FIRST -- IT IS THE DENOMINATOR
    union_cardinality = len(first_shingle.union(second_shingle))

    # BOTH SETS EMPTY: NO SHINGLES MEANS NO EVIDENCE OF OVERLAP, AND
    # DIVIDING WOULD RAISE ZeroDivisionError
    if union_cardinality == 0:
        return 0.0

    intersection_cardinality = len(first_shingle.intersection(second_shingle))

    # RETURN THE RATIO (TRUE DIVISION ALREADY YIELDS A FLOAT)
    return intersection_cardinality / union_cardinality

### FIND SIMILAR EVENTS

In [159]:
def find_similar(dataset, shingle_size=10, similarity_threshold=0.5, debug=False):
    """Find every pair of events whose shingle sets are sufficiently similar.

    Each event is converted to hashed shingles with `hashed_shingles`, and
    every unordered pair of events is scored with `jaccard_similarity`.

    Parameters
    ----------
    dataset : iterable of str
        Event messages to compare against one another.
    shingle_size : int, optional
        Characters per shingle, forwarded to `hashed_shingles` (default 10).
    similarity_threshold : float, optional
        A pair is kept only when its score is strictly greater than this
        value (default 0.5).
    debug : bool, optional
        When True, print each matching pair as it is found.

    Returns
    -------
    list of tuple
        ``(first_event, second_event)`` for every matching pair, in the order
        produced by ``itertools.combinations(dataset, 2)``.

    Notes
    -----
    Prints the threshold and the number of matching pairs before returning.
    """
    # SHINGLE EVERY EVENT ONCE UP FRONT INSTEAD OF RE-HASHING IT FOR EACH PAIR
    shingled = [
        (event, hashed_shingles(event, size=shingle_size))
        for event in dataset
    ]

    # CONTAINER
    candidates = []

    # LOOP THROUGH UNIQUE EVENT PAIRS
    for (first_event, first), (second_event, second) in itertools.combinations(shingled, 2):

        # CALCULATE JACCARD SIMILARITY SCORE FOR PAIR
        score = jaccard_similarity(first, second)

        # IF THE SIMILARITY THRESHOLD IS BREACHED
        if score > similarity_threshold:
            pair = (first_event, second_event)

            # IF DEBUGGING, PRINT PAIR
            if debug:
                print(pair, '\n')

            # APPEND TO CONTAINER
            candidates.append(pair)

    # PRINT RESULTS
    print('SIMILARITY THRESHOLD:\t\t', similarity_threshold)
    print('NUMBER OF SIMILAR ITEMS:\t', len(candidates))

    return candidates

### EXECUTE

In [160]:
# SEARCH THE NORMAL EVENTS FOR NEAR-DUPLICATE PAIRS: 5-CHARACTER SHINGLES,
# KEEPING ONLY PAIRS THAT SCORE ABOVE 0.95
similar_items = find_similar(
    normal_dataset,
    shingle_size=5,
    similarity_threshold=0.95
)

SIMILARITY THRESHOLD:		 0.95
NUMBER OF SIMILAR ITEMS:	 6


In [161]:
# SHOW EACH MATCHING PAIR ON TWO LINES, FOLLOWED BY A BLANK SEPARATOR LINE
for first_event, second_event in similar_items:
    print(first_event, second_event, '', sep='\n')

bcm2835-codec bcm2835-codec: Device registered as /dev/video12
bcm2835-codec bcm2835-codec: Device registered as /dev/video11

bcm2835-codec bcm2835-codec: Device registered as /dev/video12
bcm2835-codec bcm2835-codec: Device registered as /dev/video10

NET: Registered protocol family 16
NET: Registered protocol family 1

NET: Registered protocol family 17
NET: Registered protocol family 1

NET: Registered protocol family 1
NET: Registered protocol family 10

bcm2835-codec bcm2835-codec: Device registered as /dev/video11
bcm2835-codec bcm2835-codec: Device registered as /dev/video10



In [162]:
# TODO: Reverse the similarity algorithm -- we're interested in finding the most dissimilar items, not the most similar ones.

In [None]:
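# PROBLEM: THE CURRENT SOLUTION BOTTOMS OUT AT A SCORE OF 0 -- EVERY PAIR WITH NO SHARED SHINGLES SCORES EXACTLY 0, SO THE MOST DISSIMILAR PAIRS CANNOT BE RANKED AGAINST EACH OTHER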