In [4]:
import gzip
import os
import re

import nbimporter
import pandas as pd

In [5]:
import utils
import machine_learning as ml

Importing Jupyter notebook from utils.ipynb
Importing Jupyter notebook from machine_learning.ipynb


### CREATE RAW LOG DATASET

In [13]:
def create_dataset(directory):
    """Load every gzip-compressed log file in ``directory`` into one DataFrame.

    Only regular files whose name ends in ``.gz`` are read; anything else in
    the directory (uncompressed logs, sub-directories, stray files) is
    skipped instead of making ``gzip`` raise. Files are visited in
    descending natural order of their names, so ``syslog.10.gz`` is read
    before ``syslog.2.gz`` (presumably oldest first under logrotate-style
    numbering -- confirm against the log source).

    Parameters
    ----------
    directory : str
        Path of the directory that holds the compressed log files.

    Returns
    -------
    pandas.DataFrame
        One row per log line with the columns ``timestamp``, ``module``,
        ``code`` and ``message``, filled from ``utils.parse_line``.
    """

    # NATURAL SORT KEY: DIGIT RUNS COMPARE AS NUMBERS, SO '10' SORTS ABOVE '2'
    # (re.split WITH A CAPTURING GROUP PUTS THE DIGIT RUNS AT ODD POSITIONS)
    def natural_key(name):
        tokens = re.split(r'([0-9]+)', name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    # FIND THE COMPRESSED LOG FILES, HIGHEST ROTATION NUMBER FIRST
    filenames = sorted(
        (name for name in os.listdir(directory) if name.endswith('.gz')),
        key=natural_key,
        reverse=True
    )

    # CONTAINER
    container = []

    # LOOP THROUGH LOG FILES
    for filename in filenames:

        # FILE PATH
        path = os.path.join(directory, filename)

        # SKIP DIRECTORIES THAT HAPPEN TO END IN .gz
        if not os.path.isfile(path):
            continue

        # OPEN & READ THE COMPRESSED FILE
        with gzip.open(path, 'rb') as handle:
            for line in handle:

                # DECODE AS STRING
                decoded = line.decode('utf-8')

                # PARSE LINE & EXTRACT PARAMS
                module, code, message, timestamp = utils.parse_line(decoded)

                # APPEND TO CONTAINER
                container.append([timestamp, module, code, message])

    # CREATE DATAFRAME
    return pd.DataFrame(container, columns=[
        'timestamp',
        'module',
        'code',
        'message'
    ])

In [14]:
# PARSE EVERY LOG FILE UNDER data/logs INTO ONE DATAFRAME (ONE ROW PER LOG LINE)
# NOTE(review): THE SAVED OUTPUT BELOW SHOWS A FILE LIST THAT create_dataset
# DOES NOT PRINT -- LOOKS LIKE STALE OUTPUT FROM AN EARLIER VERSION; CONFIRM
dataset = create_dataset('data/logs')

['syslog.7.gz', 'syslog.6.gz', 'syslog.5.gz', 'syslog.4.gz', 'syslog.3.gz', 'syslog.2.gz']


In [17]:
# PREVIEW THE FIRST 10 PARSED LOG LINES
dataset.head(10)

Unnamed: 0,timestamp,module,code,message
0,1613190301,libloggingstdlog,none,"[origin software=""rsyslogd"" swVersion=""8.24.0""..."
1,1613193421,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...
2,1613197021,CRON,6546,(root) CMD ( cd / && run-parts --report /etc/c...
3,1613200621,CRON,12800,(root) CMD ( cd / && run-parts --report /etc/c...
4,1613204221,CRON,19277,(root) CMD ( cd / && run-parts --report /etc/c...
5,1613207821,CRON,25771,(root) CMD ( cd / && run-parts --report /etc/c...
6,1613211421,CRON,32248,(root) CMD ( cd / && run-parts --report /etc/c...
7,1613214001,CRON,4465,(root) CMD (./home/wickstjo/scripts/ropsten.sh)
8,1613214001,CRON,4461,"(CRON) info (No MTA installed, discarding output)"
9,1613215021,CRON,6268,(root) CMD ( cd / && run-parts --report /etc/c...


### CREATE SANITIZED DATASET

In [62]:
def classify(directory):
    """Collapse the log lines in ``directory`` into unique events.

    Every line of every regular ``.gz`` file is parsed with
    ``utils.parse_line`` and hashed with ``utils.hash_data`` over its
    ``module`` and ``message`` (the ``code`` is not part of the hash).
    Lines sharing a hash count as the same event. Other directory entries
    are skipped. Files are visited in descending natural order of their
    names, so ``syslog.10.gz`` is read before ``syslog.2.gz`` (presumably
    oldest first under logrotate-style numbering -- confirm).

    Parameters
    ----------
    directory : str
        Path of the directory that holds the compressed log files.

    Returns
    -------
    tuple
        ``(dataframe, timelines)`` where ``dataframe`` is indexed by event
        hash, sorted by ``message``, with the columns ``occurrence``,
        ``module``, ``code``, ``message`` and ``timestamp`` (``code`` and
        ``timestamp`` come from the first line seen for that event), and
        ``timelines`` maps each event hash to the list of timestamps of
        all its lines in reading order. With no log lines the dataframe
        is empty but still has these columns.
    """

    # NATURAL SORT KEY: DIGIT RUNS COMPARE AS NUMBERS, SO '10' SORTS ABOVE '2'
    # (re.split WITH A CAPTURING GROUP PUTS THE DIGIT RUNS AT ODD POSITIONS)
    def natural_key(name):
        tokens = re.split(r'([0-9]+)', name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    # FIND THE COMPRESSED LOG FILES, HIGHEST ROTATION NUMBER FIRST
    filenames = sorted(
        (name for name in os.listdir(directory) if name.endswith('.gz')),
        key=natural_key,
        reverse=True
    )

    # CONTAINERS
    events = {}
    timelines = {}

    # ROW COUNT
    rows = 0

    # LOOP THROUGH LOG FILES
    for filename in filenames:

        # FILE PATH
        path = os.path.join(directory, filename)

        # SKIP DIRECTORIES THAT HAPPEN TO END IN .gz
        if not os.path.isfile(path):
            continue

        # OPEN & READ THE COMPRESSED FILE
        with gzip.open(path, 'rb') as handle:
            for line in handle:

                # INCREMENT ROWS
                rows += 1

                # DECODE AS STRING
                decoded = line.decode('utf-8')

                # PARSE LINE & EXTRACT PARAMS
                module, code, message, timestamp = utils.parse_line(decoded)

                # GENERATE AN EVENT HASH
                hash_id = utils.hash_data({
                    'module': module,
                    'message': message
                })

                # IF THE EVENT HAS OCCURRED BEFORE
                if hash_id in events:

                    # INCREMENT OCCURRENCE
                    events[hash_id]['occurrence'] += 1

                    # PUSH TIMESTAMP
                    timelines[hash_id].append(timestamp)

                # OTHERWISE, ADD PROPERTY TO CONTAINER
                # (THE HASH IS ALREADY THE DICT KEY / DATAFRAME INDEX)
                else:
                    events[hash_id] = {
                        'module': module,
                        'code': code,
                        'message': message,
                        'timestamp': timestamp,
                        'occurrence': 1
                    }

                    timelines[hash_id] = [timestamp]

    # CONSTRUCT A DATAFRAME INDEXED BY EVENT HASH
    dataframe = pd.DataFrame.from_dict(events, orient='index')

    # RE-ORDER COLUMNS (ALSO CREATES THEM WHEN NO EVENTS WERE FOUND)
    dataframe = dataframe.reindex(columns=[
        'occurrence',
        'module',
        'code',
        'message',
        'timestamp'
    ])

    # SORT BY MESSAGE COL
    dataframe = dataframe.sort_values(by=['message'])

    # PRINT LENGTHS
    print('TOTAL ROWS FOUND:\t', rows)
    print('DATAFRAME LENGTH:\t', len(dataframe))

    return dataframe, timelines

### CREATE DATAFRAME

In [63]:
# GROUP THE LOG LINES INTO UNIQUE EVENTS + COLLECT THE TIMESTAMPS OF EACH EVENT
# NOTE(review): THE FILE LIST IN THE SAVED OUTPUT BELOW IS NOT PRINTED BY
# classify -- LOOKS LIKE STALE OUTPUT FROM AN EARLIER VERSION; CONFIRM
dataframe, timelines = classify('data/logs')

['syslog.7.gz', 'syslog.6.gz', 'syslog.5.gz', 'syslog.4.gz', 'syslog.3.gz', 'syslog.2.gz']
TOTAL ROWS FOUND:	 3281
DATAFRAME LENGTH:	 802


In [64]:
# PREVIEW THE FIRST 5 UNIQUE EVENTS (INDEX = EVENT HASH)
dataframe.head(5)

Unnamed: 0,occurrence,module,code,message,timestamp
76cd1e55f2e8bf23e56048b524fcf533b9a2c1aa960c79dcceb507142e0b42d5,4,cron,325,(CRON) INFO (Running @reboot jobs),1613389600
53eac99acba6c33cfabdf631e4c7a0196f2e1647a0303190375ea680190510ed,4,cron,325,(CRON) INFO (pidfile fd = 3),1613389600
81759ec7de5efa9657b37800e1ec317692fe41c67ca7de759b23fed45015a931,6,CRON,4461,"(CRON) info (No MTA installed, discarding output)",1613214001
30129d78176e0d7a3e9fe5948b2df6bd3b4afd183f6b47f4c67a585239bf8fd7,144,CRON,32398,(root) CMD ( cd / && run-parts --report /etc/c...,1613193421
c4ec89aae91fea6f4c675192f0d774c5796e101500d50d7db77bcac276ee0a16,6,CRON,4465,(root) CMD (./home/wickstjo/scripts/ropsten.sh),1613214001


### SAVE THE DATAFRAME TO CSV (CURRENTLY DISABLED)

In [65]:
#dataframe.to_csv('data/all-files-ignore-code-kernel-fix.csv')

### VECTORIZE MESSAGES TO FEATURES

In [66]:
# BUILD THE FEATURES FROM THE MESSAGE COLUMN; THE VECTORIZER IS RETURNED AS WELL
features, vectorizer = ml.create_features(dataframe['message'])

### TRAIN ISOLATION FOREST

In [67]:
# TRAIN THE ISOLATION MODEL ON THE MESSAGE FEATURES
# NOTE(review): 'estimators' IS SET TO 1 -- PRESUMABLY THE NUMBER OF TREES;
# CONFIRM THAT A SINGLE ESTIMATOR IS INTENTIONAL
model = ml.train_isolation(dict(estimators=1, features=features))

### PREDICT ON DATA

In [68]:
# PREDICT ON THE SAME FEATURES THE MODEL WAS TRAINED ON
# (THE NEXT CELL TREATS A PREDICTION OF -1 AS AN ANOMALY)
predictions = model.predict(features)

### INSPECT THE PREDICTED ANOMALIES

In [None]:
# WALK THE PREDICTIONS AND REPORT EVERY ROW FLAGGED AS AN ANOMALY (-1)
for index, prediction in enumerate(predictions):

    # NOTHING TO REPORT FOR ROWS THAT WERE NOT FLAGGED
    if prediction != -1:
        continue

    # UNPACK THE DATAFRAME ROW AT THE SAME POSITION AS THE PREDICTION
    occurrence, module, code, msg, timestamp = dataframe.iloc[[index]].to_numpy()[0]

    # SHORT SUMMARY, MESSAGE CUT TO ITS FIRST 70 CHARACTERS
    print('OCCURRENCE:\t', occurrence)
    print('MODULE:\t\t', module)
    print('CODE:\t\t', code)
    print('MESSAGE:\t', msg[:70])
    print()

### ANALYZE TIMELINES

In [70]:
def find_deltas(data):
    """Compute the gaps between consecutive entries of every timeline.

    Parameters
    ----------
    data : dict
        Maps a key to an indexable sequence whose entries support
        subtraction (the notebook passes lists of timestamps).

    Returns
    -------
    dict
        Same keys as ``data``, in the same order. Each value is the list of
        ``sequence[i + 1] - sequence[i]`` differences, which is empty when
        the sequence has fewer than two entries.
    """

    # ONE LIST OF NEIGHBOUR DIFFERENCES PER KEY; STARTING AT POSITION 1
    # MEANS EVERY ENTRY IS COMPARED WITH THE ONE RIGHT BEFORE IT
    return {
        key: [
            data[key][position] - data[key][position - 1]
            for position in range(1, len(data[key]))
        ]
        for key in data.keys()
    }

In [71]:
# GAPS BETWEEN CONSECUTIVE TIMESTAMPS OF EVERY EVENT, KEYED BY EVENT HASH
foo = find_deltas(timelines)

In [74]:
# ONE ROW PER EVENT HASH, ONE COLUMN PER DELTA POSITION
# (SHORTER TIMELINES SHOW UP AS NaN IN THE TRAILING COLUMNS, SEE THE PREVIEW BELOW)
foo_df = pd.DataFrame.from_dict(foo, orient='index')

In [79]:
# PREVIEW THE DELTAS OF THE FIRST 5 EVENTS
foo_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,133,134,135,136,137,138,139,140,141,142
182ccd09b9f45118a38545c37bbf9baaebcfa0acc71d48e45d49d7c4dcb95725,86401.0,0.0,86400.0,0.0,,,,,,,...,,,,,,,,,,
30129d78176e0d7a3e9fe5948b2df6bd3b4afd183f6b47f4c67a585239bf8fd7,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0,3601.0,3599.0,...,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0,3600.0
c4ec89aae91fea6f4c675192f0d774c5796e101500d50d7db77bcac276ee0a16,86400.0,86400.0,86400.0,86400.0,86400.0,,,,,,...,,,,,,,,,,
81759ec7de5efa9657b37800e1ec317692fe41c67ca7de759b23fed45015a931,86400.0,86400.0,86400.0,86400.0,86400.0,,,,,,...,,,,,,,,,,
01b3fcf531f552e14b0cd7360f41186dde5b2596806bf363922f52482a2b1c23,86454.0,87440.0,3660.0,7638.0,86440.0,86420.0,86460.0,,,,...,,,,,,,,,,


In [80]:
#foo_df.to_csv('data/occurrence-tbl.csv')

In [72]:
# DISPLAY THE RAW DELTA DICT
# NOTE(review): THIS DUMPS EVERY DELTA OF EVERY EVENT INTO THE OUTPUT -- CONSIDER SHOWING A SLICE
foo

{'182ccd09b9f45118a38545c37bbf9baaebcfa0acc71d48e45d49d7c4dcb95725': [86401,
  0,
  86400,
  0],
 '30129d78176e0d7a3e9fe5948b2df6bd3b4afd183f6b47f4c67a585239bf8fd7': [3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3601,
  3599,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3601,
  3599,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3601,
  3599,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3601,
  3599,
  3601,
  3599,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3600,
  3

In [81]:
# LOOK UP THE RAW TIMESTAMPS RECORDED FOR ONE SPECIFIC EVENT HASH
timelines['90af297b91404dae33ab96d893b33a5757f77d4a01aab04c5b979e6f7ffe5fb3']

[1613391313,
 1613391313,
 1613391313,
 1613391313,
 1613391313,
 1613391348,
 1613391348,
 1613391348,
 1613391348,
 1613391348,
 1613391348,
 1613391348,
 1613391348,
 1613391348]

In [34]:
# PRINT THE EVENT HASH OF EVERY TIMELINE, ONE PER LINE
# NOTE(review): THE ORIGINAL CELL STOPPED AT AN UNFINISHED INNER `for`
# (A SyntaxError); IT IS COMPLETED HERE TO MATCH THE SAVED OUTPUT, WHICH
# LISTS ONE HASH PER LINE -- CONFIRM THIS WAS THE INTENT
for key in timelines.keys():
    print(key)

3953c66143c45d8bb46f3849d284ba660c546dd5373ed264d7c55510aacb0daf
30129d78176e0d7a3e9fe5948b2df6bd3b4afd183f6b47f4c67a585239bf8fd7
9cfa109eb45871ebdc2e06b54a1e2437bffd1c4de512f32c17454c57d2869fb3
734acb3f31c95e1aa855c6798109f101c34f3bc2840bada75368144cd51e892f
f08a39d08f15d3a282379764e9b9bb223cb97ab7ef63970f2bddd76483c050f1
0e6a6b6e47b3ad2199f48f055c73f09417120214063a5bf841d3e5f7ec1a24ca
c4ec89aae91fea6f4c675192f0d774c5796e101500d50d7db77bcac276ee0a16
81759ec7de5efa9657b37800e1ec317692fe41c67ca7de759b23fed45015a931
de6c92fc7669dd303d66fad3d08919d7d14d4e67b0d763ac9e4014b1ffff6f19
c63af0610b4d20ab385543e73b7f13237624ff5b723c9bc9fee717cb4a8997e0
f4a990c064743a050ed20cec455000e50d2f10e720fae9abc8a38f23b0c67c97
7beafcb1e8a4c8f88c0086517bdfbbf0e605da4e3e0c953cd75673be2d424300
8b4de8d29d864100b58b89922ab0be67940c26af6b640bc343b5737ba9352e0e
7e3a7dc974ec9c18605f1e841d8ebd46e6682407eaf55d44f3006c24c0ee36ad
31a1894e09fad6151d60c9309533cd305f3f4afa377c091f73cf67b81d2e7e4c
b74679d6b2e840ec416fff676