In [None]:
## Step 0 - Parameters and Libraries

import DrainMethod
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import pandas as pd
import regex as re
import contextlib
import pickle
from sklearn.metrics.pairwise import pairwise_distances 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
import pandas as pd 
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from umap import UMAP
from bertopic import BERTopic
from sklearn_extra.cluster import KMedoids
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer


## General parameters

# Name of the dataset being tested; it is also the prefix of the input file name
dataset = "bgl"

# Working folders, all resolved against the current working directory:
#   input_dir  - raw logs
#   output_dir - parsing results
#   vector_dir - converted (vectorized) logs
input_dir, output_dir, vector_dir = (
    os.path.join(os.getcwd(), folder_name)
    for folder_name in ("ground_truths", "results", "vectors")
)

# Name of the file to be parsed
logName = f"{dataset}_lines.txt"
# Layout of each log line; every <Field> becomes a dataframe column
log_format = '<Content>'
# Regex strings applied before/during Drain execution (none by default)
regex = []
# Directory and bare file name of the log, split out of logName
indir = os.path.join(input_dir, os.path.dirname(logName))
log_file = os.path.basename(logName)

In [None]:
## Auxiliary methods

# Loads the configured log file into a dataframe
def load_data():
    """Read the log file named by the module-level settings into a dataframe.

    Builds the line-splitting pattern from ``log_format`` and hands it,
    together with the path ``indir``/``logName``, to ``log_to_dataframe``.

    Returns:
        The dataframe produced by ``log_to_dataframe``.
    """
    column_names, line_pattern = generate_logformat_regex(log_format)
    source_path = os.path.join(indir, logName)
    return log_to_dataframe(source_path, line_pattern, column_names, log_format)

# Preprocesses dataframe with regexes, if necessary - more preprocessing to add
def preprocess_df(df_log, patterns=None):
    """Mask every match of the given patterns in the ``Content`` column with ``<*>``.

    The patterns are applied one after another to the same text, so the result
    of one substitution feeds the next. (Previously each substitution started
    again from the untouched text, which meant only the last pattern had any
    effect.)

    Args:
        df_log: Dataframe with a ``Content`` column of strings. It is modified
            in place.
        patterns: Iterable of regex strings. When omitted, the module-level
            ``regex`` list is used.

    Returns:
        The same dataframe object, with ``Content`` masked.
    """
    active_patterns = regex if patterns is None else patterns
    if not active_patterns:
        # Nothing to substitute: leave the dataframe untouched
        return df_log

    def _mask(content):
        for current_rex in active_patterns:
            content = re.sub(current_rex, '<*>', content)
        return content

    df_log["Content"] = df_log["Content"].map(_mask)
    return df_log

# Function to generate regular expression to split log messages
def generate_logformat_regex(log_format):
    """Turn a ``<Field>``-style log format into a compiled line-matching regex.

    The format is split on ``<Field>`` tokens. Each token becomes a lazy named
    group ``(?P<Field>.*?)``; in the literal text between tokens every run of
    spaces becomes ``\\s+``. The literal text is otherwise inserted into the
    pattern as-is (it is not escaped).

    Args:
        log_format: Format string such as ``'<Date> <Level>: <Content>'``.

    Returns:
        Tuple ``(headers, regex)``: the field names in order of appearance and
        the compiled pattern, anchored with ``^`` and ``$``.
    """
    headers = []
    pattern_text = ''
    # re.split with a capturing group alternates literal text (even positions)
    # and <Field> tokens (odd positions)
    for position, piece in enumerate(re.split(r'(<[^<>]+>)', log_format)):
        if position % 2 == 0:
            # Raw string: the replacement template `\\s+` emits a literal `\s+`
            # without relying on Python tolerating the invalid escape '\s'
            pattern_text += re.sub(' +', r'\\s+', piece)
        else:
            header = piece[1:-1]  # drop the surrounding '<' and '>'
            pattern_text += f'(?P<{header}>.*?)'
            headers.append(header)
    return headers, re.compile('^' + pattern_text + '$')

# Function to transform log file to dataframe 
def log_to_dataframe(log_file, regex, headers, logformat):
    """Parse a log file line by line into a dataframe.

    Each line is stripped and matched against ``regex``. Lines that do not
    match are skipped; any other error (for example a header that is not a
    group of ``regex``) now propagates instead of being silently swallowed.

    Args:
        log_file: Path of the text file to read.
        regex: Compiled pattern with one named group per entry in ``headers``.
        headers: Names of the groups to extract, in column order.
        logformat: Unused; kept for interface compatibility.

    Returns:
        Dataframe with a leading ``LineId`` column (1-based, counting only the
        lines that matched) followed by one column per header.
    """
    log_messages = []
    with open(log_file, 'r') as fin:
        # Iterate lazily instead of loading the whole file with readlines()
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                continue
            log_messages.append([match.group(header) for header in headers])
    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
    return logdf

# Transforms the dataset, creating raw vector file
def transform_dataset(raw_content):
    """Return the TF-IDF matrix for ``raw_content``, using an on-disk cache.

    The cache file is ``<vector_dir>/<logName>_vectors_TFIDF.vec``. If it
    exists it is unpickled and returned; otherwise a ``TfidfVectorizer`` is
    fitted on ``raw_content`` and the result is pickled to that path.

    The cache is keyed only by file name, so delete the file after changing
    the input or the preprocessing.

    Args:
        raw_content: Iterable of strings (one document per log line).

    Returns:
        The TF-IDF matrix, either loaded from the cache or freshly computed.
    """
    path = Path(os.path.join(vector_dir, logName + '_vectors_TFIDF.vec'))

    if path.is_file():
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that this notebook produced itself.
        with open(path, 'rb') as cache_file:
            vectors_tfidf = pickle.load(cache_file)
    else:
        # Using TFIDF Vectorizer
        print("Iniciando encode")
        tr_idf_model = TfidfVectorizer()
        vectors_tfidf = tr_idf_model.fit_transform(raw_content)
        # The vectors directory may not exist on a first run
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'wb') as cache_file:
            pickle.dump(vectors_tfidf, cache_file)

    print(type(vectors_tfidf))
    return vectors_tfidf

In [None]:
## Main methods

# Parse logs using Drain
def parse_logs(st=0.5, depth=5):
    """Run the Drain parser over the configured log file.

    Builds a ``DrainMethod.LogParser`` from the module-level settings
    (``log_format``, ``indir``, ``output_dir``, ``regex``) and parses
    ``log_file``.

    Args:
        st: Drain similarity threshold.
        depth: Max depth of the parsing tree.

    Returns:
        Path ``<output_dir>/<log_file>_structured.csv``, which the later steps
        of this notebook read as the parser's structured output. (Earlier
        versions computed this path and discarded it.)
    """
    print('\n=== Starting Drain Parsing ===')
    parser = DrainMethod.LogParser(log_format=log_format, indir=indir, outdir=output_dir,
                                   rex=regex, depth=depth, st=st)
    parser.parse(log_file)

    return os.path.join(output_dir, log_file + '_structured.csv')

# Builds the TF-IDF representation of the log file
def transform(logName):
    """Load, preprocess and vectorize the log file.

    Args:
        logName: File name shown in the progress message. The file actually
            read is chosen by ``load_data`` from the module-level settings.

    Returns:
        Whatever ``transform_dataset`` returns for the ``Content`` column.
    """
    source_path = os.path.join(input_dir, logName)
    print('Transforming file: ' + source_path)
    cleaned_df = preprocess_df(load_data())
    return transform_dataset(cleaned_df["Content"])

# Creates distance matrix, using Euclidean distance
def create_distance_matrix(vector_df):
    """Compute the min-max normalised Euclidean distance matrix of the rows.

    Args:
        vector_df: Feature matrix (one row per log line), e.g. the TF-IDF
            matrix.

    Returns:
        Square array of pairwise distances scaled to ``[0, 1]``. When every
        distance is identical (a single row, or all rows equal) the scaling
        would divide by zero, so an all-zero matrix is returned instead.
    """
    # Using Euclidean Distance between the rows of the TFIDF Matrix
    tfidf_distance = pairwise_distances(vector_df, metric="euclidean", n_jobs=-1)
    # Normalizes Distance Matrix with Min-Max
    min_val = np.min(tfidf_distance)
    value_range = np.max(tfidf_distance) - min_val
    if value_range > 0:
        tfidf_distance = (tfidf_distance - min_val) / value_range
    else:
        tfidf_distance = np.zeros_like(tfidf_distance, dtype=float)
    print("As dimensões da matriz de embeddings são {}".format(tfidf_distance.shape))
    return tfidf_distance

# Creates variable matrix, using Jaccard distance
def create_variable_matrix():
    """Compute pairwise Jaccard distances between the parameter lists of the log lines.

    Reads ``<output_dir>/<log_file>_structured.csv``, evaluates each
    ``ParameterList`` cell (a string holding a Python list literal) into a
    list, one-hot encodes the lists, and measures the Jaccard distance between
    rows.

    Returns:
        Square array of Jaccard distances, one row/column per line of the CSV.
    """
    # Same directory the parser writes to
    output_csv = os.path.join(output_dir, log_file + '_structured.csv')

    # Reads parameters list and breaks each string into a list, without
    # mutating the dataframe column element by element
    parameter_lists = pd.read_csv(output_csv)["ParameterList"].map(literal_eval)

    # Transforms variable list to variable sparse matrix
    mlb = MultiLabelBinarizer(sparse_output=True)
    var_matrix = mlb.fit_transform(parameter_lists)
    print ("A matrix parseada de variaveis tem o formato {}".format(var_matrix.shape))

    # Jaccard is a boolean metric: passing booleans avoids the implicit
    # conversion (and its warning) inside pairwise_distances
    dense_flags = var_matrix.toarray().astype(bool)
    var_distance = pairwise_distances(dense_flags, metric="jaccard", n_jobs=-1)
    return var_distance

def creates_closeness_matrix(tfidf_distance):
    """Build the normalised line-position distance matrix.

    Entry ``[x, y]`` is ``|x - y|`` (how many lines apart two log lines are),
    min-max scaled to ``[0, 1]``.

    Args:
        tfidf_distance: Only its length ``n`` is used, to size the result.

    Returns:
        Float array of shape ``(n, n)``. Earlier versions built the matrix but
        never returned it, so callers received ``None``.
    """
    n = len(tfidf_distance)
    line_ids = np.arange(n)

    # Using a Subtraction Distance using the line numbers as a Count Matrix
    # (broadcasting replaces the nested Python loops)
    count_distance = np.abs(line_ids[:, None] - line_ids[None, :]).astype(float)

    # Normalizes Distance Matrix with Min-Max. The minimum is always 0 (the
    # diagonal) and the maximum is n - 1; with fewer than two lines the range
    # is zero, so the all-zero matrix is kept as is.
    if n > 1:
        count_distance /= (n - 1)
    print("As dimensões da matriz de contadores são {}".format(count_distance.shape))
    return count_distance

def saves_matrices(distance_mat, variable_mat, closeness_mat):
    """Persist the three distance matrices in the current working directory.

    The files are named ``tfidf_distance_<logName>.csv``,
    ``var_distance_<logName>.csv`` and ``count_distance_<logName>.csv``.
    Despite the extension they hold NumPy's binary ``.npy`` format.

    Args:
        distance_mat: TF-IDF distance matrix.
        variable_mat: Variable (Jaccard) distance matrix.
        closeness_mat: Line-position distance matrix.
    """
    outputs = (
        ("tfidf_distance_", distance_mat),
        ("var_distance_", variable_mat),
        ("count_distance_", closeness_mat),
    )
    for prefix, matrix in outputs:
        # np.save appends ".npy" when given a string file name, which would
        # make the file impossible to find under the ".csv" name used when
        # loading. Writing through an open handle keeps the exact name.
        with open(prefix + logName + ".csv", "wb") as handle:
            np.save(handle, matrix)

def loads_matrices():
    """Load the three distance matrices saved for the current ``logName``.

    Each matrix is looked up as ``<prefix><logName>.csv``. If that file is
    missing but ``<prefix><logName>.csv.npy`` exists (the name ``np.save``
    produces when given a string file name), the latter is loaded instead.

    Returns:
        Tuple ``(tfidf_distance, count_distance, var_distance)``. Note the
        order: the count matrix comes second and the variable matrix last.
    """
    def _load(prefix):
        path = prefix + logName + ".csv"
        if not os.path.exists(path) and os.path.exists(path + ".npy"):
            path += ".npy"
        return np.load(path)

    tfidf_distance = _load("tfidf_distance_")
    count_distance = _load("count_distance_")
    var_distance = _load("var_distance_")
    return (tfidf_distance, count_distance, var_distance)

def joins_matrices(tfidf_distance, var_distance, count_distance, alpha, beta, gamma):
    """Combine the three distance matrices into one weighted sum.

    Computes ``alpha * tfidf_distance + beta * var_distance + gamma * count_distance``.
    The weights passed by the caller are honoured; earlier versions overwrote
    them with the fixed values 0.7 / 0.2 / 0.1.

    Args:
        tfidf_distance: TF-IDF distance matrix.
        var_distance: Variable (Jaccard) distance matrix.
        count_distance: Line-position distance matrix.
        alpha: Weight of ``tfidf_distance``.
        beta: Weight of ``var_distance``.
        gamma: Weight of ``count_distance``.

    Returns:
        The combined matrix as a NumPy array.

    Raises:
        ValueError: If the weights do not sum to 1 (within floating-point
            tolerance).
    """
    # isclose rather than ==: e.g. 0.7 + 0.2 + 0.1 is not exactly 1.0 in floats
    if not np.isclose(alpha + beta + gamma, 1.0):
        raise ValueError("Valores devem somar 1!")

    # New matrices, corrected by the weights
    tfidf_distance_wtd = alpha * np.asarray(tfidf_distance)
    var_distance_wtd = beta * np.asarray(var_distance)
    count_distance_wtd = gamma * np.asarray(count_distance)

    # Sums remaining matrices
    unified_matrix = np.asarray(tfidf_distance_wtd + var_distance_wtd + count_distance_wtd)
    return unified_matrix

def cluster_hdbscan(unified_matrix):
    """Cluster the log lines with HDBSCAN over a precomputed distance matrix.

    Args:
        unified_matrix: Square matrix of pairwise distances.

    Returns:
        The fitted ``hdbscan.HDBSCAN`` object; ``labels_`` holds one label per
        row, with ``-1`` marking outliers.
    """
    ## Clusters with HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=None, metric='precomputed',
                                cluster_selection_epsilon=0.75, alpha=1.0, leaf_size=40,
                                allow_single_cluster=False, cluster_selection_method='eom',
                                gen_min_span_tree=True)

    # NOTE(review): hdbscan appears to require float64 input for precomputed
    # distances; the cast is a no-op when the matrix already has that dtype.
    clusterer.fit(np.asarray(unified_matrix, dtype=np.float64))

    labels = clusterer.labels_
    # Labels run from 0 to max, so the number of clusters is max + 1
    # (0 when every point is an outlier and max is -1)
    n_clusters = labels.max() + 1

    print ("O numero de clusters e {}".format(n_clusters))
    print ("Os clusters de cada elemento são {}".format(labels))

    ## Checks number of outliers
    cont = np.count_nonzero(labels == -1)

    print("O número de outliers é {}".format(cont))
    print("O número de total de elementos é {}".format(len(labels)))
    return clusterer

def creates_lists(clusterer):
    """Group line indices and event templates by cluster label.

    Reads the ``EventTemplate`` column of
    ``<cwd>/results/<log_file>_structured.csv`` and distributes its rows
    according to ``clusterer.labels_``. Rows labelled ``-1`` are left out.

    Args:
        clusterer: Fitted clustering object exposing ``labels_``.

    Returns:
        Tuple ``(cluster_idxs, cluster_lines)``: two lists indexed by cluster
        label, holding the member row indices and the member templates.
    """
    ## General Parameters
    output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
    output_csv = os.path.join(output_dir, log_file + '_structured.csv')

    ## Code

    # Event template of every parsed line
    templates = pd.read_csv(output_csv)["EventTemplate"]

    # One empty bucket per cluster label (0 .. max)
    labels = clusterer.labels_
    bucket_count = labels.max() + 1
    cluster_idxs = [[] for _ in range(bucket_count)]
    cluster_lines = [[] for _ in range(bucket_count)]

    # Fill the buckets, skipping outliers
    for position, label in np.ndenumerate(labels):
        if label == -1:
            continue
        row = position[0]
        cluster_idxs[label].append(row)
        cluster_lines[label].append(templates[row])

    # Report the size of each cluster
    for number, members in enumerate(cluster_idxs):
        print("O tamanho do cluster {} é {}".format(number, len(members)))

    return (cluster_idxs, cluster_lines)

def find_topics_bertopic(cluster_list, cluster_number, num_topics):
    """Summarise one cluster of log templates as a string of topic words.

    Fits a BERTopic model (single-cluster KMedoids in place of HDBSCAN,
    randomly initialised UMAP, English stop-word removal, MiniLM sentence
    embeddings) on the documents of the selected cluster and joins the words
    of topic 0 with spaces.

    Args:
        cluster_list: List of document lists, one per cluster.
        cluster_number: Index into ``cluster_list`` of the cluster to summarise.
        num_topics: Unused; kept for interface compatibility.

    Returns:
        The top words of topic 0 joined by single spaces, or an empty string
        when the model has no topic 0.
    """
    umap_model = UMAP(init='random')
    cluster_model = KMedoids(n_clusters=1)
    vectorizer_model = CountVectorizer(stop_words="english")
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model,
                           vectorizer_model=vectorizer_model, umap_model=umap_model, top_n_words=10)

    # Applies BertTopic; the per-document assignments are not needed here
    topic_model.fit_transform(cluster_list[cluster_number])

    # Gets summary of topics. get_topic yields a falsy value when the topic
    # does not exist, which would otherwise crash the iteration below.
    top_topic = topic_model.get_topic(0)
    if not top_topic:
        return ""

    # Each entry is indexed as (word, ...); keep only the word
    return ' '.join(entry[0] for entry in top_topic)

def bertopic_previous_clustering(clusterer, cluster_lines):
    """Write one topic summary per log line to the dataset's topics file.

    Every line receives the summary of its cluster; outliers (label ``-1``)
    receive an empty line. Summaries are computed once per cluster with
    ``find_topics_bertopic`` and reused.

    Earlier versions indexed the cache with ``label - 1`` over a list that was
    one entry short, so cluster 0 and the last cluster shared a slot and
    outliers triggered topic extraction on an unrelated cluster.

    Args:
        clusterer: Fitted clustering object exposing ``labels_``.
        cluster_lines: List of document lists indexed by cluster label.
    """
    cluster_topic = {}  # cluster label -> summary, filled on first use
    topic_summaries = []

    for raw_label in clusterer.labels_:
        label = int(raw_label)

        # Outliers belong to no cluster: emit a blank summary and move on
        if label == -1:
            topic_summaries.append("")
            continue

        ## For each cluster, maps topics, and defines them as the summary
        if label not in cluster_topic:
            cluster_topic[label] = find_topics_bertopic(cluster_lines, label, 1)
        topic_summaries.append(cluster_topic[label])

    ## Writes external file with created topics
    with open ("ground_truths/" + dataset + "_bert_topics_local.txt", "w") as f:
        for line in topic_summaries:
            f.write(f"{line}\n")




In [None]:
## Pipeline of methods
## Each step consumes the result of the previous one, so the order matters.

# 1. Parse the raw log with Drain (similarity threshold 0.5, tree depth 5)
parse_logs(0.5,5)
# 2. Vectorize the log content with TF-IDF (cached on disk)
vector_df = transform(os.path.basename(logName))
# 3. Build the three distance matrices: TF-IDF, parsed variables, line position
distance_matrix = create_distance_matrix(vector_df)
variable_matrix = create_variable_matrix()
closeness_matrix = creates_closeness_matrix(distance_matrix)
# Optional: persist / reload the matrices instead of recomputing them.
# loads_matrices returns (tfidf, count, var), so unpack in that order.
#saves_matrices(distance_matrix, variable_matrix, closeness_matrix)
#distance_matrix, closeness_matrix, variable_matrix = loads_matrices()
# 4. Combine the matrices with weights alpha=0.7, beta=0.2, gamma=0.1
joint_matrix = joins_matrices(distance_matrix, variable_matrix, closeness_matrix, 0.7, 0.2, 0.1)
# 5. Cluster the lines and group their templates by cluster
clustering = cluster_hdbscan(joint_matrix)
cluster_idxs, cluster_lines = creates_lists(clustering)
# 6. Summarise each cluster with BERTopic and write the summaries to disk
bertopic_previous_clustering(clustering, cluster_lines)