In [15]:
import gzip
import re
from datetime import datetime

import nbimporter
import pandas as pd

In [16]:
import utils

### CLASSIFY EVENTS

In [17]:
def classify(path):
    """Group the lines of a gzip-compressed log file into distinct events.

    Every line is decoded as UTF-8 and passed to ``utils.parse_line``. Lines
    whose ``module``/``message`` pair produces the same ``utils.hash_data``
    value are counted as one event.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed log file.

    Returns
    -------
    pandas.DataFrame
        One row per distinct event, indexed by the event hash, with the
        columns ``occurrence``, ``module``, ``code``, ``message`` and
        ``timestamp``, sorted by ``message``. ``code`` and ``timestamp`` are
        taken from the first line seen for the event. A file without any
        lines yields an empty frame that still carries these columns.
    """

    # COLUMN ORDER OF THE RESULT
    columns = ['occurrence', 'module', 'code', 'message', 'timestamp']

    # CONTAINERS
    events = {}

    # OPEN & READ THE COMPRESSED FILE
    with gzip.open(path, 'rb') as file:
        for line in file:

            # DECODE AS STRING
            decoded = line.decode("utf-8")

            # PARSE LINE & EXTRACT PARAMS
            module, code, message, timestamp = utils.parse_line(decoded)

            # GENERATE AN EVENT HASH
            hash_id = utils.hash_data({
                'module': module,
                'message': message
            })

            # IF THE EVENT HAS OCCURRED BEFORE, INCREMENT OCCURRENCE
            if hash_id in events:
                events[hash_id]['occurrence'] += 1

            # OTHERWISE, ADD THE EVENT TO THE CONTAINER
            # (THE HASH IS THE KEY AND BECOMES THE INDEX, SO IT IS NOT
            # REPEATED INSIDE THE RECORD)
            else:
                events[hash_id] = {
                    'module': module,
                    'code': code,
                    'message': message,
                    'timestamp': timestamp,
                    'occurrence': 1
                }

    # CONSTRUCT A DATAFRAME
    dataframe = pd.DataFrame.from_dict(events, orient='index')

    # RE-ORDER COLUMNS (ALSO CREATES THEM WHEN NO EVENTS WERE FOUND)
    # & SORT BY MESSAGE COL
    return dataframe.reindex(columns=columns).sort_values(by=['message'])

### EXECUTE

In [18]:
# Run the classifier on the gzip-compressed log at data/iot-syslog.gz
dataframe = classify('data/iot-syslog.gz')

In [24]:
# Number of distinct events (classify returns one row per event hash)
len(dataframe)

769

In [23]:
# How many distinct events share each occurrence count
dataframe['occurrence'].value_counts()

4     467
1     144
2      47
3      31
5      19
8      19
19      7
6       7
12      6
7       5
23      4
13      3
15      3
14      3
18      1
24      1
25      1
26      1
Name: occurrence, dtype: int64

In [19]:
# Preview the first five rows (classify returns the frame sorted by message)
dataframe.head(5)

Unnamed: 0,occurrence,module,code,message,timestamp
76cd1e55f2e8bf23e56048b524fcf533b9a2c1aa960c79dcceb507142e0b42d5,4,cron,325,(CRON) INFO (Running @reboot jobs),1613389600
53eac99acba6c33cfabdf631e4c7a0196f2e1647a0303190375ea680190510ed,4,cron,325,(CRON) INFO (pidfile fd = 3),1613389600
81759ec7de5efa9657b37800e1ec317692fe41c67ca7de759b23fed45015a931,1,CRON,1152,"(CRON) info (No MTA installed, discarding output)",1613386801
30129d78176e0d7a3e9fe5948b2df6bd3b4afd183f6b47f4c67a585239bf8fd7,24,CRON,18192,(root) CMD ( cd / && run-parts --report /etc/c...,1613366221
c4ec89aae91fea6f4c675192f0d774c5796e101500d50d7db77bcac276ee0a16,1,CRON,1156,(root) CMD (./home/wickstjo/scripts/ropsten.sh),1613386801


In [20]:
#dataframe.to_csv('data/ignore-code-kernel-fix.csv')

### SAMPLE LOG LINES

In [10]:
# Sample line whose bracketed field is space-padded and separated from the
# module name by a space ("kernel: [    0.000000]")
first = 'Feb 15 13:46:40 raspberrypi kernel: [    0.000000] cma: Reserved 8 MiB at 0x37800000'

In [11]:
# Sample line whose bracketed field is attached directly to the module name
# ("systemd[576]:") -- presumably a process id, TODO confirm
second = 'Feb 15 13:02:29 raspberrypi systemd[576]: Stopped target Default.'

### PARSER PROTOTYPE

In [12]:
def smoothen(line):
    """Split one syslog-style line into ``(module, code, message, timestamp)``.

    The line is expected to look like
    ``'<Mon> <day> <HH:MM:SS> <host> <module>[<code>]: <message>'``. The
    bracketed code may also follow the module as a separate field, and the
    day may be space-padded (``'Feb  5'``).

    Parameters
    ----------
    line : str
        A single decoded log line.

    Returns
    -------
    tuple
        ``module`` -- the fifth whitespace-separated field, reduced to its
        ASCII letters.
        ``code`` -- content of the first ``[...]`` block with spaces
        removed, or ``''`` when the line has no such block.
        ``message`` -- everything after the module field, with runs of
        spaces collapsed to one.
        ``timestamp`` -- integer seconds since the epoch. The line carries
        no year, so the current year is assumed and the time is interpreted
        as local time.

    Raises
    ------
    ValueError
        If the line has fewer than five fields or the date cannot be parsed.
    """

    # FISH OUT THE CODE (NOT EVERY LINE HAS ONE)
    match = re.match(r"[^[]*\[([^]]*)\]", line)

    if match:
        code = match.group(1).replace(' ', '')

        # REMOVE ONLY THE MATCHED BRACKETED BLOCK FROM THE LINE, SO THAT AN
        # IDENTICAL BLOCK LATER IN THE MESSAGE SURVIVES
        line = line[:match.start(1) - 1] + line[match.end(1) + 1:]
    else:
        code = ''

    # SPLIT ON RUNS OF WHITESPACE, SO PADDED DAYS DO NOT SHIFT THE FIELDS
    fields = line.split(None, 5)

    if len(fields) < 5:
        raise ValueError('malformed log line: {!r}'.format(line))

    # FISH OUT THE DATE
    raw_date = ' '.join(fields[:3])
    stamped = '{} {}'.format(datetime.now().year, raw_date)
    timestamp = int(datetime.strptime(stamped, '%Y %b %d %H:%M:%S').timestamp())

    # FISH OUT THE MODULE
    module = re.sub(r'[^A-Za-z]', '', fields[4])

    # FISH OUT THE REMAINING MESSAGE
    message = fields[5].strip() if len(fields) > 5 else ''
    message = re.sub(' +', ' ', message)

    return module, code, message, timestamp

In [13]:
# Try the prototype parser on the kernel sample line
smoothen(first)

('kernel', '0.000000', 'cma: Reserved 8 MiB at 0x37800000', 1613389600)

### SCRATCH: SPACE POSITIONS

In [14]:
# Indices of every space character in the line under inspection.
# NOTE(review): `foob` is not defined anywhere in this notebook (the recorded
# run raised NameError); presumably one of the sample lines -- confirm.
spaces = [i for i, ltr in enumerate(foob) if ltr == ' ']

NameError: name 'foob' is not defined

In [None]:
# Display the space positions computed in the previous cell
spaces

### DATE

In [None]:
# Everything before the third space.
# NOTE(review): relies on the undefined `foob`; presumably this slice is the
# "Mon DD HH:MM:SS" prefix of a sample line -- confirm.
date = foob[:spaces[2]]

In [None]:
# Display the extracted date slice
date

In [None]:
# Content of the first [...] block in the kernel sample line
# (the captured text keeps its inner spaces)
result = re.match(r"[^[]*\[([^]]*)\]", first).groups()[0]

In [None]:
# Display the captured bracket content
result

### CODE

In [None]:
# Display the kernel sample line for reference
first

In [None]:
# Alternative pattern: only accept bracket content made of digits, dots and
# spaces that ends in a digit; tried here on the systemd sample line
re.findall(r"\[([0-9. ]*[0-9]+)\]", second)

In [None]:
# Same extraction as in the DATE section: content of the first [...] block
# in the kernel sample line
result = re.match(r"[^[]*\[([^]]*)\]", first).groups()[0]

In [None]:
# Check the type of the captured group
type(result)

In [None]:
# Display the captured bracket content
result

In [None]:
# re.match anchors at the start of the string and this pattern accepts a
# single digit or dot, so it cannot match `first` (which begins with 'F');
# the result is None
foo = re.match(r'[\d\.]', first)

In [None]:
# Check what the match attempt above returned
type(foo)

In [None]:
# Content of the first [...] block in the kernel sample line, shown directly
re.match(r"[^[]*\[([^]]*)\]", first).groups()[0]

In [None]:
# NOTE(review): `raw_code` is only a local variable inside smoothen(); no
# notebook-level definition is visible. `.groups()` implies a match object
# here, while later cells treat `raw_code` as a string -- confirm which
# kernel state this cell relied on.
raw_code.groups()

In [None]:
# Check the type of `raw_code` (not defined at notebook level -- see the
# cell above)
type(raw_code)

In [None]:
# Strip the padding spaces from the bracket content.
# NOTE(review): assumes `raw_code` is a string left over in the kernel; it
# is not defined at notebook level.
code = raw_code.replace(' ', '')

In [None]:
# Display the cleaned code
code

### MODULE

In [None]:
# Rebuild the bracketed block exactly as it appears in the line, so that it
# can be removed (depends on the kernel-state `raw_code`)
block = '[{}]'.format(raw_code)

In [None]:
# Remove the bracketed block from the kernel sample line
# (str.replace removes every occurrence, not only the first)
temp = first.replace(block, '')

In [None]:
# Display the line without its bracketed block
temp

In [None]:
# Text between the fourth and sixth space.
# NOTE(review): relies on the undefined `foob`; note that smoothen() slices
# spaces[3]:spaces[4] for the module instead.
foob[spaces[3]:spaces[5]]

In [None]:
# Pattern that only accepts word characters between the brackets; the block
# in `first` contains spaces and a dot, so this search finds nothing
re.search(r"\[(\w+)\]", first)

In [None]:
# NOTE(review): `foob` is not defined anywhere in this notebook
foob

In [None]:
# Content of the first [...] block in the kernel sample line (the pattern
# that smoothen() uses)
re.match(r"[^[]*\[([^]]*)\]", first).groups()[0]