In [3]:
import gzip
import pandas as pd
import os
import nbimporter

In [4]:
import utils
import machine_learning as ml

### CLASSIFY EVENTS

In [5]:
def classify(directory):
    """Collapse the gzip-compressed log files in `directory` into unique events.

    Files are read in sorted filename order. Each line is decoded as UTF-8 and
    handed to `utils.parse_line`, whose result is unpacked as
    (module, code, message, timestamp). An event is identified by
    `utils.hash_data` over the module and message only, so lines that differ
    only in code or timestamp count as the same event; the code and timestamp
    stored are those of the first line seen for that event.

    Args:
        directory: Path of the directory that holds the compressed log files.
            Entries that are not regular files (e.g. subdirectories) are skipped.

    Returns:
        pandas.DataFrame indexed by event hash with the columns
        occurrence, module, code, message, timestamp, sorted by message.
        When no log lines are found the frame is empty but still carries
        those columns.
    """

    # OUTPUT COLUMN ORDER
    columns = ['occurrence', 'module', 'code', 'message', 'timestamp']

    # CONTAINERS
    events = {}

    # ROW COUNT
    rows = 0

    # LOOP THROUGH LOG FILES IN SORTED ORDER
    for filename in sorted(os.listdir(directory)):

        # FILE PATH
        path = os.path.join(directory, filename)

        # SKIP ANYTHING THAT IS NOT A REGULAR FILE (gzip.open WOULD FAIL ON A DIRECTORY)
        if not os.path.isfile(path):
            continue

        # OPEN & READ THE COMPRESSED FILE
        with gzip.open(path, 'rb') as handle:
            for line in handle:

                # INCREMENT ROWS
                rows += 1

                # DECODE AS STRING
                decoded = line.decode('utf-8')

                # PARSE LINE & EXTRACT PARAMS
                module, code, message, timestamp = utils.parse_line(decoded)

                # GENERATE AN EVENT HASH (CODE & TIMESTAMP ARE DELIBERATELY LEFT OUT)
                hash_id = utils.hash_data({
                    'module': module,
                    'message': message
                })

                # IF THE EVENT HAS OCCURRED BEFORE
                if hash_id in events:

                    # INCREMENT OCCURRENCE
                    events[hash_id]['occurrence'] += 1

                # OTHERWISE, ADD PROPERTY TO CONTAINER
                else:
                    events[hash_id] = {
                        'module': module,
                        'code': code,
                        'message': message,
                        'timestamp': timestamp,
                        'occurrence': 1
                    }

    # CONSTRUCT A DATAFRAME (THE HASH LIVES ON AS THE INDEX)
    dataframe = pd.DataFrame.from_dict(events, orient='index')

    # RE-ORDER COLUMNS -- ALSO GUARANTEES THEY EXIST WHEN NO EVENTS WERE FOUND
    dataframe = dataframe.reindex(columns=columns)

    # SORT BY MESSAGE COL
    dataframe = dataframe.sort_values(by=['message'])

    # PRINT LENGTHS
    print('TOTAL ROWS FOUND:\t', rows)
    print('DATAFRAME LENGTH:\t', len(dataframe))

    return dataframe

### CREATE DATAFRAME

In [6]:
# Build the de-duplicated event table from the log files under data/logs.
dataframe = classify('data/logs')

TOTAL ROWS FOUND:	 3281
DATAFRAME LENGTH:	 802


In [7]:
# Preview the first five rows of the event table.
dataframe.head(5)

Unnamed: 0,occurrence,module,code,message,timestamp
76cd1e55f2e8bf23e56048b524fcf533b9a2c1aa960c79dcceb507142e0b42d5,4,cron,325,(CRON) INFO (Running @reboot jobs),1613389600
53eac99acba6c33cfabdf631e4c7a0196f2e1647a0303190375ea680190510ed,4,cron,325,(CRON) INFO (pidfile fd = 3),1613389600
81759ec7de5efa9657b37800e1ec317692fe41c67ca7de759b23fed45015a931,6,CRON,15203,"(CRON) info (No MTA installed, discarding output)",1613646001
30129d78176e0d7a3e9fe5948b2df6bd3b4afd183f6b47f4c67a585239bf8fd7,144,CRON,6126,(root) CMD ( cd / && run-parts --report /etc/c...,1613625421
c4ec89aae91fea6f4c675192f0d774c5796e101500d50d7db77bcac276ee0a16,6,CRON,15207,(root) CMD (./home/wickstjo/scripts/ropsten.sh),1613646001


### SAVE IT

In [8]:
#dataframe.to_csv('data/all-files-ignore-code-kernel-fix.csv')

### VECTORIZE MESSAGES TO FEATURES

In [27]:
# Convert the message column into model features. ml.create_features also
# returns a vectorizer -- presumably the one fitted on these messages; confirm
# in machine_learning before reusing it on new data.
features, vectorizer = ml.create_features(dataframe['message'])

### TRAIN ISOLATION FOREST

In [28]:
# Train the isolation forest on the message features.
# NOTE(review): 'estimators' is 1 -- presumably the number of trees in the
# forest; a single tree is unusually low, so confirm this is intentional.
model = ml.train_isolation({
    'estimators': 1,
    'features': features
})

### PREDICT ON DATA

In [29]:
# Predict on the same features the model was trained with; the later
# inspection loop treats a prediction of -1 as an anomaly.
predictions = model.predict(features)

### INSPECT DETECTED ANOMALIES

In [None]:
# COLLECT THE ROW POSITIONS WHOSE PREDICTION IS -1 (ANOMALY)
anomaly_positions = [
    position
    for position, label in enumerate(predictions)
    if label == -1
]

# REPORT EACH FLAGGED EVENT
for position in anomaly_positions:

    # UNPACK THE ROW BY COLUMN ORDER (TIMESTAMP IS NOT PRINTED)
    occurrence, module, code, msg, _timestamp = dataframe.iloc[position]

    print('OCCURRENCE:\t', occurrence)
    print('MODULE:\t\t', module)
    print('CODE:\t\t', code)

    # ONLY THE FIRST 70 CHARACTERS OF THE MESSAGE ARE SHOWN
    print('MESSAGE:\t', msg[:70])
    print()