In [1]:
import gzip
import pandas as pd
import nbimporter

In [2]:
import utils

Importing Jupyter notebook from utils.ipynb


### CLASSIFY EVENTS

In [20]:
def classify(path):
    
    # CONTAINERS
    events = {}
    
    # OPEN & READ THE COMPRESSED FILE
    with gzip.open(path,'rb') as file:
        for line in file:
            
            # DECODE AS STRING
            decoded = line.decode("utf-8")
            
            # PARSE LINE & EXTRACT PARAMS
            module, code, message, timestamp = utils.parse_line(decoded)
            
            # GENERATE AN EVENT HASH
            hash_id = utils.hash_data({
                'module': module,
                'message': message
            })
            
            # IF THE EVENT HAS OCCURRED BEFORE
            if hash_id in events:
                
                # INCREMENT OCCURRENCE
                events[hash_id]['occurrence'] += 1
                
            # OTHERWISE, DD PROPERTY TO CONTAINER
            else:
                events[hash_id] = {
                    'module': module,
                    'code': code,
                    'message': message,
                    'timestamp': timestamp,
                    'hash': hash_id,
                    'occurrence': 1
                }
            
    # CONSTRUCT A DATAFRAME
    dataframe = pd.DataFrame.from_dict(events, orient='index')

    # DROP THE UNNECESSARY HASH COL (INDEX)
    dataframe.drop(['hash'], axis=1, inplace=True)
    
    # RE-ORDER COLUMNS
    dataframe = dataframe.reindex(columns=[
        'occurrence',
        'module',
        'code',
        'message',
        'timestamp'
    ])
    
    # SORT MY MESSAGE COL
    dataframe = dataframe.sort_values(by=['message'])
    
    return dataframe

### EXECUTE

In [21]:
dataframe = classify('data/iot-syslog.gz')

In [24]:
dataframe.loc[dataframe['module'] == 'kernel']

Unnamed: 0,occurrence,module,code,message,timestamp
272f526b1398f4b6a01a300398da1bdaa48501ed906b4bff8ed5864f9da63cf5,4,kernel,,[ 0.000000] .bss : 0x(ptrval) - 0x(ptrval) ( 8...,1613389600
d64119970eeab4153109eb8e0be66d6bd30684c068cdf98bc428e6b1146be6e9,1,kernel,,[ 0.000000] .data : 0x(ptrval) - 0x(ptrval) ( ...,1613389600
945fcff5388c81b7b3d4adbe8f70ab0666d89f6d802f657710e043dc7f13bfb4,3,kernel,,[ 0.000000] .data : 0x(ptrval) - 0x(ptrval) ( ...,1613392397
492a9a954b6f56be31e2cbff1266ad8af2339666d6588af2bdcd0280de3da58c,4,kernel,,[ 0.000000] .init : 0x(ptrval) - 0x(ptrval) (1...,1613389600
cfad33b78dc60a6b0ec0c0d8070a424faacfdbb9086a6744e321e60024a768c3,4,kernel,,[ 0.000000] .text : 0x(ptrval) - 0x(ptrval) (9...,1613389600
...,...,...,...,...,...
6c4e4d9c6caed5b676b45623dda1cd910aadbe8f7e07d8b8ece95022086f01ad,1,kernel,,[ 8.857613] IPv6: ADDRCONF(NETDEV_UP): eth0: l...,1613389602
1716c8a3cb7f63e4997159243e8b26146c590c285fd142f1fa195cee20deec87,1,kernel,,[ 8.907380] IPv6: ADDRCONF(NETDEV_UP): eth0: l...,1613392399
82667efb1ca188e52438836589caf5288cdab4b1aab336e006feeacfd48fdec0,1,kernel,,[ 9.140396] Adding 102396k swap on /var/swap. ...,1613389602
c1ee25cb94f45a52ac8472dc053cf7895a0c170714853687287ecf6b13b57d93,1,kernel,,[ 9.214176] Adding 102396k swap on /var/swap. ...,1613392399


In [22]:
dataframe.head(5)

Unnamed: 0,occurrence,module,code,message,timestamp
76cd1e55f2e8bf23e56048b524fcf533b9a2c1aa960c79dcceb507142e0b42d5,4,cron,325,(CRON) INFO (Running @reboot jobs),1613389600
53eac99acba6c33cfabdf631e4c7a0196f2e1647a0303190375ea680190510ed,4,cron,325,(CRON) INFO (pidfile fd = 3),1613389600
81759ec7de5efa9657b37800e1ec317692fe41c67ca7de759b23fed45015a931,1,CRON,1152,"(CRON) info (No MTA installed, discarding output)",1613386801
30129d78176e0d7a3e9fe5948b2df6bd3b4afd183f6b47f4c67a585239bf8fd7,24,CRON,18192,(root) CMD ( cd / && run-parts --report /etc/c...,1613366221
c4ec89aae91fea6f4c675192f0d774c5796e101500d50d7db77bcac276ee0a16,1,CRON,1156,(root) CMD (./home/wickstjo/scripts/ropsten.sh),1613386801


In [19]:
#dataframe.to_csv('data/ignore-code.csv')

### KERNEL CODE FIX

In [29]:
import re
from dateutil import parser

In [73]:
def kernel_fix(line):
    
    # TRUNCATE MULTI-SPACING
    line = re.sub(' +', ' ', line)
    
    # REMOVE LINEBREAK CHAR
    line = re.sub('\n', '', line)
    
    # FIND EACH OCCURRENCE OF SPACES IN LINE
    spaces = [i for i, ltr in enumerate(line) if ltr == ' ']
    
    # PROCESS MODULE
    raw_module = line[spaces[3] + 1:spaces[4] - 1]
    
    print(raw_module)
    
    # MODULE CODE
    raw_code = re.search(r"\[(\w+)\]", raw_module)
    
    # IF THERE IS NO CODE
    if raw_code == None:
        code = 'None'
        
    # OTHERWISE, EXTRACT IT
    else:
        code = raw_code.group(0)
    
    # MODULE NAME
    module = raw_module.replace('[' + str(code) + ']', '')
    
    # PROCESS TIMESTAMP
    raw_date = line[:spaces[2]]
    unix = parser.parse(raw_date).timestamp()
    
    # MODULE, CODE, MESSAGE, TIMESTAMP
    return module, code, line[spaces[4] + 1:], int(unix)

In [74]:
foob = 'Feb 15 13:46:40 raspberrypi kernel: [    0.000000] cma: Reserved 8 MiB at 0x37800000'

In [75]:
kernel_fix(foob)

kernel


('kernel', 'None', '[ 0.000000] cma: Reserved 8 MiB at 0x37800000', 1613389600)

In [63]:
re.search(r"\[(\w+)\]", foob)

In [64]:
foob

'Feb 15 13:46:40 raspberrypi kernel: [    0.000000] cma: Reserved 8 MiB at 0x37800000'

In [66]:
re.match(r"[^[]*\[([^]]*)\]", foob).groups()[0].replace(' ', '')

'0.000000'