In [None]:
# Common code

import re
import pickle
import sqlite3

class LogFile:
    """Tokenize a journal log file into an SQLite database.

    Every accepted log line is stored in ``log_lines``; its normalized tokens
    go to ``tokens`` (with a frequency counter) and the raw identifiers found
    in it (hashes, UUIDs, id-prefixed/suffixed names) go to ``identifiers``.
    The ``log_line_tokens`` and ``log_line_identifiers`` tables link lines to
    both.

    Only lines whose 5th space-separated field starts with ``kubenswrapper``
    and whose 6th field starts with ``i``, ``w`` or ``e`` are accepted
    (lines are casefolded before tokenizing).
    """

    __path = None
    # Dot-separated labels of letters, digits and inner dashes. The dot is escaped on
    # purpose: an unescaped '.' would accept any separator (e.g. 'user@example.com').
    __hostname_pattern = re.compile(r'^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$')
    __id_suffix_pattern = re.compile(r'^([a-z]+-)+[a-f0-9]+-.{5}$') # something-and-else-fb6afe71-ea3fd
    __id_prefix_pattern1 = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}-(.*)$') # fb6afe71-6f53-48dd-920f-743ab83e1c2f-something
    __id_prefix_pattern2 = re.compile(r'^[a-f0-9]{32}-(.*)$') # 32-digit hex number prefix
    __id_suffix_pattern3 = re.compile(r'^([a-z]+-)+[a-f0-9]{5}$') # something-else-ea3fd (currently not used by any method)
    # The whole dash-separated word prefix is captured as group 1; capturing the repeated
    # group itself would only keep its last repetition ('else-' instead of 'something-else-').
    __id_suffix_pattern4 = re.compile(r'^((?:[a-z]+-)+)[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$') # something-16258d4f-7f2d-47f2-a11a-ef10e66dfc12
    __id_token_pattern1 = re.compile(r'^[a-z-]+-') # just words separated by dashes
    __id_token_pattern2 = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$') # 16258d4f-7f2d-47f2-a11a-ef10e66dfc12
    __numeric_token_pattern = re.compile(r'^[+]?[0-9]+[.]*[0-9]*[mug]*i*[sb]*$') # +4.653ms and such
    # Anchored at both ends and restricted to hex digits, so only real hex strings qualify.
    __hex_num_pattern = re.compile(r'^[a-f0-9]+$')
    # Separator characters are turned into spaces, quotes are deleted.
    __separators = '{}[]()=:/\\,&?_'
    __ttable = str.maketrans(__separators, ' ' * len(__separators), '"\'')
    __db = None
    __dbc = None

    def __init__(self, path = None, db_path = None, clear = True):
        """Open (or create) the database and make sure the schema exists.

        path    -- log file to parse later with parse_log(); may be set afterwards via set_path()
        db_path -- SQLite database file; None keeps everything in memory
        clear   -- when true, wipe the results of a previous run so that re-processing
                   the same log file into the same database gives the same result
        """
        if db_path is None:
            self.__db = sqlite3.connect(":memory:")
        else:
            self.__db = sqlite3.connect(db_path)
        self.__dbc = self.__db.cursor()
        self.__dbc.execute("create table if not exists log_lines(ID integer primary key, line text not NULL)")
        self.__dbc.execute("create table if not exists tokens(ID integer primary key, token text unique not NULL, freq integer not null default 1)")
        self.__dbc.execute("create table if not exists identifiers(ID integer primary key, identifier text unique not NULL)")
        self.__dbc.execute("create table if not exists log_line_tokens(line_ID integer not null, token_ID integer not null, primary key (line_ID, token_ID))")
        self.__dbc.execute("create table if not exists log_line_identifiers(line_ID integer not null, identifier_ID integer not null, primary key (line_ID, identifier_ID))")
        if clear:
            # Clearing only log_lines would leave stale tokens/identifiers behind and make
            # token frequencies grow on every re-run; wipe everything derived from the log.
            # This still allows only one log file to be processed per database.
            for table in ("log_line_tokens", "log_line_identifiers", "tokens", "identifiers", "log_lines"):
                self.__dbc.execute(f"delete from {table}")
            self.__db.commit()
        self.set_path(path)

    def close(self):
        """Close the database connection. Handles returned by db_handle() become unusable."""
        if self.__db is not None:
            self.__db.close()

    def __fini__(self):
        # Kept for backward compatibility. Python never calls a method of this name
        # automatically; call close() explicitly when done with the database.
        self.close()

    def __is_hostname(self, s):
        """True if s contains at least one dot and looks like a dotted host name."""
        return s.count('.') > 0 and self.__hostname_pattern.match(s) is not None

    def __is_hash(self, s):
        """True for 32/64 character hex strings and for UUID-shaped tokens."""
        if len(s) in (32, 64) and self.__hex_num_pattern.match(s):
            return True
        return self.__id_token_pattern2.match(s) is not None

    def __is_numberlike(self, s):
        """True for plain numbers and numbers with a unit suffix such as '+4.653ms'."""
        return s.isnumeric() or self.__numeric_token_pattern.match(s) is not None

    # 'some-operator-69854dc866-sbpwz' > 'some-operator-id_suffix'
    def __replace_id_suffix(self, s):
        """Replace a generated id at the end of s by 'id_suffix'.

        Returns (new_string, True) when a suffix was replaced, (s, False) otherwise.
        Strings starting with '--' are never touched.
        """
        if s.startswith("--"):
            return (s, False)
        if self.__id_suffix_pattern.match(s):
            m = self.__id_token_pattern1.match(s)
            if m is not None:
                return (m.group() + 'id_suffix', True)
        m = self.__id_suffix_pattern4.match(s)
        if m is not None:
            return (m.group(1) + 'id_suffix', True)
        return (s, False)

    def __replace_id_prefix(self, s):
        """Replace a leading UUID or 32-digit hex id (followed by '-') by 'id-prefix-'.

        Returns (new_string, True) when a prefix was replaced, (s, False) otherwise.
        """
        for pattern in (self.__id_prefix_pattern1, self.__id_prefix_pattern2):
            m = pattern.match(s)
            if m is not None:
                return ('id-prefix-' + m.group(1), True)
        return (s, False)

    def __log_line_tokenize(self, line):
        """Split one (casefolded) log line into normalized tokens.

        Returns (tokens, identifiers): tokens is the list of normalized tokens, starting
        with the severity letter followed by '000'; identifiers is the list of distinct
        raw tokens that were recognized as hashes or carried an id prefix/suffix.

        Raises IndexError when the line does not have the expected fields or is
        filtered out; parse_log() uses that to skip the line.
        """
        fields = line.split(" ", 8)
        # Filter out things that can't be possibly kubernetes and even the k8s startup log lines
        if not (fields[4].startswith("kubenswrapper") and fields[5].startswith(("i", "w", "e"))):
            raise IndexError
        log_text = fields.pop(-1).translate(self.__ttable)
        words = [w.strip(". ?!*#,+") for w in log_text.split()]
        # Only the severity letter of the 6th field is kept, as e.g. 'i000'.
        tokens = [fields[5][0] + '000'] + words

        ret = []
        identifiers = []
        for t in tokens:
            if not t:
                # A punctuation-only word was stripped down to nothing; it carries no information.
                continue
            if self.__is_numberlike(t):
                continue
            if self.__is_hostname(t):
                tr = "_hostname_token"
            elif self.__is_hash(t):
                if t not in identifiers:
                    identifiers.append(t)
                tr = "_hash_token"
            else:
                (tr, prefix_found) = self.__replace_id_prefix(t)
                (tr, suffix_found) = self.__replace_id_suffix(tr)
                if (prefix_found or suffix_found) and t not in identifiers:
                    identifiers.append(t)
            ret.append(tr)
        return (ret, identifiers)

    def set_path(self, path):
        """Set the log file that parse_log() will read."""
        self.__path = path

    def parse_log(self, limit = 0):
        """Read the log file and fill the database tables.

        limit -- stop after this many lines of the file (counting skipped lines too);
                 0 means no limit.

        The 1-based line number in the file is used as log_lines.ID. Lines rejected by
        the tokenizer are skipped silently. Raises ValueError when no path was set.
        """
        if self.__path is None:
            raise ValueError("no log file path set; pass path to LogFile() or call set_path()")
        with open(self.__path, "r") as logfile:
            for line_num, raw_line in enumerate(logfile, start=1):
                if limit != 0 and line_num > limit:
                    break
                line = raw_line.casefold()
                try:
                    tokens, identifiers = self.__log_line_tokenize(line)
                except IndexError:
                    # Not a line we are interested in (see __log_line_tokenize).
                    continue
                self.__dbc.execute("insert into log_lines (ID, line) values (?, ?)", (line_num, line))
                for t in tokens:
                    if not t or t.isspace():
                        continue
                    # Explicit conflict target: SQLite older than 3.35 rejects DO UPDATE without one.
                    self.__dbc.execute("insert into tokens (token) values (?) on conflict(token) do update set freq = freq + 1", (t,))
                    t_id = self.__dbc.execute("select id from tokens where (token = ?)", (t,)).fetchone()[0]
                    self.__dbc.execute("insert or ignore into log_line_tokens (line_ID, token_ID) values (?,?)", (line_num, t_id))
                for t in identifiers:
                    self.__dbc.execute("insert or ignore into identifiers (identifier) values (?)", (t,))
                    i_id = self.__dbc.execute("select id from identifiers where (identifier = ?)", (t,)).fetchone()[0]
                    self.__dbc.execute("insert or ignore into log_line_identifiers (line_ID, identifier_ID) values (?,?)", (line_num, i_id))
        self.__db.commit()

    def export_freq_table(self, filename):
        """Write one '"token", freq' line per token to the text file filename."""
        with open(filename, "w") as fw:
            for r in self.__dbc.execute("select token, freq from tokens"):
                fw.write(f"\"{r[0]}\", {r[1]}\n")

    def export_words_table(self, filename):
        """Copy all tokens into the table 'words' of the SQLite database file filename.

        The target table is created if needed; words already present there are kept.
        """
        self.__db.commit()
        # The file name is bound as a parameter instead of being pasted into the SQL text.
        self.__dbc.execute("attach database ? as words_db", (str(filename),))
        try:
            self.__dbc.execute("create table if not exists words_db.words (id integer primary key, word text unique not null)")
            self.__dbc.execute("insert or ignore into words_db.words (word) select token from tokens")
            self.__db.commit()
        finally:
            # Detach again so the method can be called more than once on the same object.
            # The rollback is a no-op after a successful commit and ends the transaction
            # after a failure, which would otherwise keep the attached database locked.
            self.__db.rollback()
            self.__dbc.execute("detach database words_db")

    def import_words_table(self, filename):
        """Replace the content of the 'words' table by the lines of the text file filename.

        One word per line; trailing whitespace is removed, blank lines and duplicates
        are ignored.
        """
        self.__dbc.execute("create table if not exists words (id integer primary key, word text unique not null)")
        self.__dbc.execute("delete from words")
        with open(filename, "r") as fr:
            for line in fr:
                word = line.rstrip()
                if not word:
                    continue
                self.__dbc.execute("insert or ignore into words (word) values (?)", (word,))
        self.__db.commit()

    def get_words_count(self):
        """Return the number of rows in 'words'. The table must exist (see import_words_table)."""
        r = self.__dbc.execute("select count(*) from words")
        return r.fetchone()[0]

    def save_db(self, filename):
        """Write a compacted copy of the database to the new file filename (VACUUM INTO)."""
        self.__db.commit()
        self.__dbc.execute("vacuum into ?", (str(filename),))

    def db_handle(self):
        """Return the underlying sqlite3 connection for ad-hoc queries."""
        return self.__db

In [None]:
# Build the raw dictionary with token counts.
# The log is parsed into an in-memory SQLite database; the commented-out variant
# below passes 'efs-e2e.db' as the database file instead.
#logfile = LogFile("journal-aws-efs-operator-e2e.0", 'efs-e2e.db')
logfile = LogFile("journal-aws-efs-operator-e2e.0")
logfile.parse_log()

In [None]:
# Keep a connection and cursor around for the ad-hoc queries in the cells below.
con = logfile.db_handle()
cur = con.cursor()

# How many log lines were stored ...
(stored_lines,) = cur.execute("select count(ID) from log_lines").fetchone()
print(stored_lines)

# ... and what the first few of them look like.
sample_lines = cur.execute("select * from log_lines limit 20").fetchall()
for line_row in sample_lines:
    print(line_row)

In [None]:
# Number of distinct tokens, followed by the 20 most frequent ones.
(token_count,) = cur.execute("select count(ID) from tokens").fetchone()
print(token_count)

top_tokens = cur.execute("select * from tokens order by freq desc limit 20").fetchall()
for token_row in top_tokens:
    print(token_row)

In [None]:
# Dump every token with its frequency to frequencies.csv, one '"token", freq' line per token.
logfile.export_freq_table("frequencies.csv")

In [None]:
# Number of distinct identifiers found in the log, then the full list.
(identifier_count,) = cur.execute("select count(id) from identifiers").fetchone()
print(identifier_count)

all_identifiers = cur.execute("select * from identifiers").fetchall()
for identifier_row in all_identifiers:
    print(identifier_row)

In [None]:
# Size of the line <-> identifier link table and its first 20 rows by line.
(link_count,) = cur.execute("select count(line_ID) from log_line_identifiers").fetchone()
print(link_count)

first_links = cur.execute("select * from log_line_identifiers order by line_ID limit 20").fetchall()
for link_row in first_links:
    print(link_row)

In [None]:
# Size of the line <-> token link table and its first 20 rows by token.
(link_count,) = cur.execute("select count(line_ID) from log_line_tokens").fetchone()
print(link_count)

first_links = cur.execute("select * from log_line_tokens order by token_ID limit 20").fetchall()
for link_row in first_links:
    print(link_row)


In [None]:
# Copy the token dictionary into words.db so it can be reviewed outside the notebook.
logfile.export_words_table('words.db')
# 'if not exists' keeps this cell re-runnable: a plain 'create table' raises
# "table words already exists" the second time the cell is executed.
r = cur.execute("create table if not exists words (id integer primary key, word text unique not null)")

# Here: review the db / words table and export as csv to be imported as the label set

In [None]:
import torch
import torch.nn as nn

# Re-parse the log into test2.db, this time with the reviewed word list loaded
# from words.csv into the 'words' table.
logfile = LogFile("journal-aws-efs-operator-e2e.0", 'test2.db')
logfile.import_words_table('words.csv')
logfile.parse_log()

con = logfile.db_handle()
cur = con.cursor()

words_num = logfile.get_words_count()
print(f"Dictionary size: {words_num} words")

# Build one multi-hot vector of length words_num per (identifier, log line) pair:
# position word_id - 1 is set to 1.0 for every dictionary word on that line.
# Rows come out ordered by identifier and then by line.
word_rows_query = """
    select words.id as word_id, log_line_identifiers.identifier_ID, log_line_identifiers.line_ID
    from words
    join tokens on tokens.token = words.word
    join log_line_tokens on log_line_tokens.token_ID = tokens.ID
    join log_line_identifiers on log_line_identifiers.line_ID = log_line_tokens.line_ID
    order by log_line_identifiers.identifier_ID, log_line_identifiers.line_ID, word_id
"""

raw_dataset = []
line_words = None   # vector being filled for the current (identifier, line) pair
prev_key = None
for (word_id, identifier_id, line_id) in cur.execute(word_rows_query):
    key = (identifier_id, line_id)
    if key != prev_key:
        # A new pair starts: store the finished vector (if any) and begin a fresh one.
        # Comparing the identifier as well keeps a line that is shared by two
        # consecutive identifiers from being merged into a single row.
        if line_words is not None:
            raw_dataset.append(line_words)
        line_words = torch.zeros(words_num)
        prev_key = key
    line_words[word_id - 1] = 1.0
# The vector of the last pair is still pending after the loop.
if line_words is not None:
    raw_dataset.append(line_words)

# torch.stack() rejects an empty list, so fall back to an empty (0, words_num) tensor.
training_set = torch.stack(raw_dataset) if raw_dataset else torch.zeros((0, words_num))

print(training_set.shape)

In [None]:
class LSTM(nn.Module):
    """An nn.LSTM layer followed by a linear layer that maps back to the input width.

    io_dim -- size of each input vector and of each output vector
    h_dim  -- size of the LSTM hidden state
    """

    def __init__(self, io_dim, h_dim):
        super().__init__()
        # nn.LSTM is created with its default options (single layer, batch_first not set).
        self.lstm = nn.LSTM(input_size=io_dim, hidden_size=h_dim)
        self.linear = nn.Linear(in_features=h_dim, out_features=io_dim)

    def forward(self, x):
        """Run x through the LSTM and project every hidden output back to io_dim."""
        hidden_seq, _state = self.lstm(x)   # the final (h, c) state is not used
        return self.linear(hidden_seq)

