In [7]:
# Common code

import re
import pickle
import sqlite3

class LogFile:
    """Tokenize journal log lines and store the result in a SQLite database.

    Only lines whose fifth space-separated field starts with ``kubenswrapper``
    and whose sixth field starts with ``i``, ``w`` or ``e`` are kept. For each
    kept line the message text is split into tokens; numbers are dropped,
    hostnames and hashes are replaced by placeholder tokens, and id-like
    prefixes/suffixes are normalized. The original id-like strings are stored
    separately as "identifiers".

    Tables created (if missing): ``log_lines``, ``tokens`` (with a ``freq``
    counter), ``identifiers``, ``log_line_tokens`` and
    ``log_line_identifiers``.

    The object can be used as a context manager; ``close()`` releases the
    database connection.
    """

    __path = None
    # Dot-separated labels of letters, digits and inner dashes. The separator is an
    # escaped, literal dot (an unescaped '.' would accept any character there).
    __hostname_pattern = re.compile(r'^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$')
    __id_suffix_pattern = re.compile(r'^([a-z]+-)+[a-f0-9]+-.{5}$') # something-and-else-fb6afe71-ea3fd
    __id_prefix_pattern1 = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}-(.*)$') # fb6afe71-6f53-48dd-920f-743ab83e1c2f-something
    __id_prefix_pattern2 = re.compile(r'^[a-f0-9]{32}-(.*)$') # 32-digit hex number
    # NOTE(review): not referenced by any method of this class.
    __id_suffix_pattern3 = re.compile(r'^([a-z]+-)+[a-f0-9]{5}$') # something-else-ea3fd
    # Group 1 captures the WHOLE run of leading words (a repeated capturing group
    # would only retain its last repetition).
    __id_suffix_pattern4 = re.compile(r'^((?:[a-z]+-)+)[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$') # something-16258d4f-7f2d-47f2-a11a-ef10e66dfc12
    __id_token_pattern1 = re.compile(r'^[a-z-]+-')
    __id_token_pattern2 = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$') # 16258d4f-7f2d-47f2-a11a-ef10e66dfc12
    __numeric_token_pattern = re.compile(r'^[+]?[0-9]+[.]*[0-9]*[mug]*i*[sb]*$') # +4.653ms and such
    # Anchored and restricted to hex digits so the whole token must be hexadecimal.
    __hex_num_pattern = re.compile(r'^[a-f0-9]+$')
    # Turns 14 punctuation characters into spaces and deletes both quote characters.
    __ttable = str.maketrans('{}[]()=:/\\,&?_', ' ' * 14, '"\'')
    __db = None
    __dbc = None

    def __init__(self, path = None, db_path = None):
        """Open the database and make sure the schema exists.

        path    -- log file to read in parse_log(); may be set later via set_path().
        db_path -- SQLite database file; an in-memory database is used when None.

        Tables are created with "if not exists", so pointing db_path at a
        database written by an earlier run reopens it instead of raising.
        Parsing the same log again into such a database appends to it.
        """
        self.__db = sqlite3.connect(":memory:" if db_path is None else db_path)
        self.__dbc = self.__db.cursor()
        schema = (
            "create table if not exists log_lines(ID integer primary key autoincrement, line text not NULL)",
            "create table if not exists tokens(ID integer primary key, token text unique not NULL, freq integer not null default 1)",
            "create table if not exists identifiers(ID integer primary key, identifier text unique not NULL)",
            "create table if not exists log_line_tokens(line_ID integer not null, token_ID integer not null, primary key (line_ID, token_ID))",
            "create table if not exists log_line_identifiers(line_ID integer not null, identifier_ID integer not null, primary key (line_ID, identifier_ID))",
        )
        for statement in schema:
            self.__dbc.execute(statement)
        self.set_path(path)

    def close(self):
        """Close the database connection; calling it more than once is harmless."""
        if self.__db is not None:
            self.__db.close()
            self.__db = None
            self.__dbc = None

    def __fini__(self):
        # Kept for callers that invoke it explicitly. Python never calls a
        # method with this name on its own; use close() or a with-block.
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __is_hostname(self, s):
        """Return True when s contains a dot and matches the hostname pattern."""
        return '.' in s and self.__hostname_pattern.match(s) is not None

    def __is_hash(self, s):
        """Return True for 32/64-character hex strings and for UUID-shaped strings."""
        if len(s) in (32, 64) and self.__hex_num_pattern.match(s):
            return True
        return self.__id_token_pattern2.match(s) is not None

    def __is_numberlike(self, s):
        """Return True for plain numbers and number-with-unit strings such as '+4.653ms'."""
        return s.isnumeric() or self.__numeric_token_pattern.match(s) is not None

    # 'some-operator-69854dc866-sbpwz' > 'some-operator-id_suffix'
    def __replace_id_suffix(self, s):
        """Replace a trailing generated id by 'id_suffix'.

        Returns (new_string, True) when a suffix was replaced, (s, False)
        otherwise. Strings starting with '--' are never touched.
        """
        if s.startswith("--"):
            return (s, False)
        if self.__id_suffix_pattern.match(s):
            m = self.__id_token_pattern1.match(s)
            if m is not None:
                return (m.group() + 'id_suffix', True)
        m = self.__id_suffix_pattern4.match(s)
        if m is not None:
            # group(1) is the complete word prefix, e.g. 'foo-bar-'.
            return (m.group(1) + 'id_suffix', True)
        return (s, False)

    def __replace_id_prefix(self, s):
        """Replace a leading UUID or 32-digit hex id by 'id-prefix-'.

        Returns (new_string, True) when a prefix was replaced, (s, False) otherwise.
        """
        for pattern in (self.__id_prefix_pattern1, self.__id_prefix_pattern2):
            m = pattern.match(s)
            if m is not None:
                return ('id-prefix-' + m.group(1), True)
        return (s, False)

    def __log_line_tokenize(self, line):
        """Split one log line into normalized tokens and raw identifiers.

        Returns (tokens, identifiers). Raises IndexError when the line has too
        few fields or is not a kubenswrapper i/w/e line; parse_log() treats
        that as "skip this line".
        """
        fields = line.split(" ", 8)
        # Filter out things that can't be possibly kubernetes and even the k8s startup log lines
        if not (fields[4].startswith("kubenswrapper") and fields[5].startswith(("i", "w", "e"))):
            raise IndexError
        message = fields.pop(-1).translate(self.__ttable)
        words = [w.strip(". ?!*#,+") for w in message.split()]
        # Keep only the severity letter of e.g. 'i0712' and pad it to 'i000'.
        candidates = [fields[5][0] + '000'] + words

        ret = []
        identifiers = []
        for t in candidates:
            if self.__is_numberlike(t):
                continue
            if self.__is_hostname(t):
                tr = "_hostname_token"
            elif self.__is_hash(t):
                if t not in identifiers:
                    identifiers.append(t)
                tr = "_hash_token"
            else:
                (tr, found) = self.__replace_id_prefix(t)
                if found and t not in identifiers:
                    identifiers.append(t)
                (tr, found) = self.__replace_id_suffix(tr)
                if found and t not in identifiers:
                    identifiers.append(t)
            ret.append(tr)
        return (ret, identifiers)

    def set_path(self, path):
        """Set the log file that parse_log() will read."""
        self.__path = path

    def parse_log(self, limit = 0):
        """Read the log file, tokenize each accepted line and store it.

        limit -- maximum number of lines to read (accepted or not); 0 reads
                 the whole file.

        Lines are case-folded before processing. Everything is committed once
        at the end.
        """
        with open(self.__path, "r") as logfile:
            for count, raw_line in enumerate(logfile):
                if limit != 0 and count >= limit:
                    break
                line = raw_line.casefold()
                try:
                    tokens, identifiers = self.__log_line_tokenize(line)
                except IndexError:
                    # Not a line we are interested in. Only the tokenizer is
                    # guarded so database errors are not silently swallowed.
                    continue
                self.__dbc.execute("insert into log_lines (line) values (?)", (line,))
                l_id = self.__dbc.lastrowid
                for t in tokens:
                    # Stripping punctuation can leave an empty token; skip those too.
                    if not t or t.isspace():
                        continue
                    self.__dbc.execute("insert into tokens (token) values (?) on conflict(token) do update set freq = freq + 1", (t,))
                    t_id = self.__dbc.execute("select id from tokens where (token = ?)", (t,)).fetchone()[0]
                    self.__dbc.execute("insert or ignore into log_line_tokens (line_ID, token_ID) values (?,?)", (l_id, t_id))
                for t in identifiers:
                    self.__dbc.execute("insert or ignore into identifiers (identifier) values (?)", (t,))
                    i_id = self.__dbc.execute("select id from identifiers where (identifier = ?)", (t,)).fetchone()[0]
                    self.__dbc.execute("insert or ignore into log_line_identifiers (line_ID, identifier_ID) values (?,?)", (l_id, i_id))
        self.__db.commit()

    def export_freq_table(self, filename):
        """Write one '"token", freq' line per stored token to filename."""
        with open(filename, "w") as fw:
            for r in self.__dbc.execute("select token, freq from tokens"):
                fw.write(f"\"{r[0]}\", {r[1]}\n")

    def export_words_table(self, filename):
        """Merge all tokens into the 'words' table of the SQLite file filename.

        The target table is created when missing and existing words are left
        alone. The file name is passed as a bound parameter, and the database
        is detached afterwards so the method can be called repeatedly.
        """
        self.__db.commit()
        self.__dbc.execute("attach database ? as words_db", (filename,))
        try:
            self.__dbc.execute("create table if not exists words_db.words (id integer primary key, word text unique not null)")
            self.__dbc.execute("insert or ignore into words_db.words (word) select token from tokens")
            self.__db.commit()
        except Exception:
            # DETACH is refused inside an open transaction.
            self.__db.rollback()
            raise
        finally:
            self.__dbc.execute("detach database words_db")

    def save_db(self, filename):
        """Write a compacted copy of the database to filename (VACUUM INTO)."""
        self.__db.commit()
        self.__dbc.execute("vacuum into ?", (filename,))

    def db_handle(self):
        """Return the underlying sqlite3 connection (None after close())."""
        return self.__db

In [8]:
# Build the raw dictionary with token counts:
# tokenize the whole journal file (parse_log's default limit of 0 means "all lines")
# and store lines, tokens and identifiers in the SQLite file 'efs-e2e.db'.
logfile = LogFile("journal-aws-efs-operator-e2e.0", 'efs-e2e.db')
logfile.parse_log()

In [9]:
# Sanity check: number of stored log lines, then a sample of the first 20 rows.
con = logfile.db_handle()
cur = con.cursor()
(stored_lines,) = cur.execute("select count(ID) from log_lines").fetchone()
print(stored_lines)
for r in cur.execute("select * from log_lines limit 20"):
    print(r)

3837
(1, 'jul 12 00:48:06.737256 ip-10-0-63-36 kubenswrapper[2204]: i0712 00:48:06.737146    2204 server.go:205] "--pod-infra-container-image will not be pruned by the image garbage collector in kubelet and should also be set in the remote runtime"\n')
(2, 'jul 12 00:48:06.740300 ip-10-0-63-36 kubenswrapper[2204]: w0712 00:48:06.740283    2204 feature_gate.go:232] unrecognized feature gate: networkdiagnosticsconfig\n')
(3, 'jul 12 00:48:06.740300 ip-10-0-63-36 kubenswrapper[2204]: w0712 00:48:06.740298    2204 feature_gate.go:232] unrecognized feature gate: vspherestaticips\n')
(4, 'jul 12 00:48:06.740346 ip-10-0-63-36 kubenswrapper[2204]: w0712 00:48:06.740303    2204 feature_gate.go:232] unrecognized feature gate: baremetalloadbalancer\n')
(5, 'jul 12 00:48:06.740346 ip-10-0-63-36 kubenswrapper[2204]: w0712 00:48:06.740308    2204 feature_gate.go:232] unrecognized feature gate: gcplabelstags\n')
(6, 'jul 12 00:48:06.740346 ip-10-0-63-36 kubenswrapper[2204]: w0712 00:48:06.740312    2

In [10]:
# Number of distinct tokens, then the 20 most frequent ones.
(distinct_tokens,) = cur.execute("select count(ID) from tokens").fetchone()
print(distinct_tokens)
for r in cur.execute("select * from tokens order by freq desc limit 20"):
    print(r)
# Dump the token/frequency pairs to a file.
logfile.export_freq_table("frequencies.csv")

1584
(2, '_hostname_token', 7005)
(560, 'pod', 3216)
(1, 'i000', 2989)
(256, '_hash_token', 2219)
(398, 'for', 1627)
(22, 'feature', 1148)
(315, 'volume', 1084)
(800, 'uniquename', 898)
(663, 'uid', 882)
(596, 'failed', 744)
(591, 'started', 638)
(1027, 'probe', 630)
(636, 'reconciler', 607)
(21, 'w000', 577)
(24, 'gate', 568)
(23, 'unrecognized', 544)
(148, 'container', 422)
(5, 'not', 415)
(780, 'poduid', 399)
(476, 'event', 392)


In [11]:
# Write every token with its frequency to frequencies.csv, one '"token", freq' line each.
logfile.export_freq_table("frequencies.csv")

In [12]:
# Number of distinct identifiers (hashes, UUIDs, id-carrying names), then all of them.
(identifier_total,) = cur.execute("select count(id) from identifiers").fetchone()
print(identifier_total)
for r in cur.execute("select * from identifiers"):
    print(r)

459
(1, 'df2e783959ef7f4305a2f2cbe728f1d02e51824447fd561f5051cdbef24c4414')
(2, '212867cb-8ead-41c5-b4d0-b34ad64c9eb4')
(3, '4f22fe9c-056f-4e25-8bba-dca75a5931e2')
(4, 'ec271cea7f0cb75f9af55da9385edea8')
(5, 'ec271cea-7f0c-b75f-9af5-5da9385edea8')
(6, '5440253c-b32e-4946-80c8-a10ea8ea38d6')
(7, '50a60a6f0fa58f23f3c7111e344b8ab1')
(8, '50a60a6f0fa58f23f3c7111e344b8ab1-etc-kube')
(9, '50a60a6f0fa58f23f3c7111e344b8ab1-var-lib-kubelet')
(10, '24554ae5-4667-45e3-8e0a-732b2e0d0562')
(11, '0c62307d-d207-41f9-a86b-56ec6321ac3d')
(12, '3545d058-ac7d-482e-bc25-45cf79c2a785')
(13, '8179ae69-de5c-4b4f-bdcd-64303b54d925')
(14, 'ab4293d3-3bd3-43d4-903c-417321a82b46')
(15, 'b721e1a7-a54f-4167-be94-c280719b8080')
(16, '102eeb34-1249-4e0d-9c33-b1596446bb4e')
(17, '17b9af46-2e28-43b3-ae50-5b91fa93a218')
(18, 'd699f903-6c11-4bc0-97fe-4732e3844255')
(19, 'c6c3ba97-0f9d-49df-bb1c-08231f5a19a7')
(20, '0c62307d-d207-41f9-a86b-56ec6321ac3d-cnibin')
(21, '8179ae69-de5c-4b4f-bdcd-64303b54d925-host-run-netns')
(

In [13]:
# Size of the line <-> identifier link table, then its first 20 rows by line.
(link_total,) = cur.execute("select count(line_ID) from log_line_identifiers").fetchone()
print(link_total)
for r in cur.execute("select * from log_line_identifiers order by line_ID limit 20"):
    print(r)

3569
(179, 1)
(457, 2)
(457, 3)
(460, 4)
(460, 5)
(460, 6)
(718, 1)
(998, 2)
(998, 3)
(1000, 4)
(1000, 5)
(1000, 6)
(1099, 7)
(1102, 7)
(1102, 8)
(1103, 7)
(1103, 9)
(1104, 7)
(1104, 8)
(1105, 7)


In [14]:
# Size of the line <-> token link table, then its first 20 rows by token.
(link_total,) = cur.execute("select count(line_ID) from log_line_tokens").fetchone()
print(link_total)
for r in cur.execute("select * from log_line_tokens order by token_ID limit 20"):
    print(r)


49274
(1, 1)
(73, 1)
(74, 1)
(75, 1)
(76, 1)
(77, 1)
(78, 1)
(79, 1)
(80, 1)
(81, 1)
(82, 1)
(83, 1)
(84, 1)
(85, 1)
(86, 1)
(87, 1)
(88, 1)
(89, 1)
(90, 1)
(91, 1)


In [15]:
# Merge all tokens into the 'words' table of the SQLite file words.db
# (the table is created if missing; words already present are left alone).
logfile.export_words_table('words.db')