In [1]:
## Step 0 - Parameters and Libraries

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances 
from sklearn.preprocessing import MultiLabelBinarizer
from sentence_transformers import SentenceTransformer
from sklearn_extra.cluster import KMedoids
from bertopic import BERTopic
from ast import literal_eval
from pathlib import Path
from umap import UMAP
import pandas as pd
import numpy as np
import regex as re
import DrainMethod
import contextlib
import hdbscan
import pickle
import sys
import os

## General parameters 

dataset = "bgl" # The name of the dataset being tested

input_dir = os.path.join(os.getcwd(), "ground_truths") # The input directory of raw logs
output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
vector_dir = os.path.join(os.getcwd(), "vectors")  # The vector directory of converted logs
logName = dataset + '_lines.txt' # Name of file to be parsed
log_format = '<Content>' # Format of the file, if there are different fields
regex = [] # Regex strings for Drain execution
indir = os.path.join(input_dir, os.path.dirname(logName))
log_file = os.path.basename(logName)

In [2]:
## Auxiliary methods

# Calls conversion from data to dataframe
def load_data():
    headers, regex = generate_logformat_regex(log_format)
    return log_to_dataframe(os.path.join(indir, logName), regex, headers, log_format)

# Preprocesses dataframe with regexes, if necessary - more preprocessing to add
def preprocess_df(df_log):
    for idx, content in df_log["Content"].items():
        for currentRex in regex:
            df_log.at[idx,'Content'] = re.sub(currentRex, '<*>', content)
    return df_log

# Function to generate regular expression to split log messages
def generate_logformat_regex(log_format):
    headers = []
    splitters = re.split(r'(<[^<>]+>)', log_format)
    regex = ''
    for k in range(len(splitters)):
        if k % 2 == 0:
            splitter = re.sub(' +', '\\\s+', splitters[k])
            regex += splitter
        else:
            header = splitters[k].strip('<').strip('>')
            regex += f'(?P<{header}>.*?)'
            headers.append(header)
    regex = re.compile('^' + regex + '$')
    return headers, regex

# Function to transform log file to dataframe 
def log_to_dataframe(log_file, regex, headers, logformat):
    log_messages = []
    linecount = 0
    with open(log_file, 'r') as fin:
        for line in fin.readlines():
            with contextlib.suppress(Exception):
                match = regex.search(line.strip())
                message = [match.group(header) for header in headers]
                log_messages.append(message)
                linecount += 1
    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', None)
    logdf['LineId'] = [i + 1 for i in range(linecount)]
    return logdf

# Transforms the dataset, creating raw vector file
def transform_dataset(raw_content):
    
    path_to_file = os.path.join(vector_dir, logName + '_vectors_TFIDF.vec')
    path = Path(path_to_file)
    vectors_tfidf = []

    if (path.is_file()):
        vectors_tfidf = pickle.load(open(path_to_file, 'rb'))
    else:
        # Using TFIDF Vectorizer 
        print("Iniciando encode")
        tr_idf_model  = TfidfVectorizer()
        vectors_tfidf = tr_idf_model.fit_transform(raw_content)
        pickle.dump(vectors_tfidf, open(path_to_file, 'wb'))
    
    return vectors_tfidf

def creates_lists(clusterer):
    ## General Parameters

    cluster_idxs = []
    cluster_lines = []
    output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
    output_csv = os.path.join(output_dir, log_file + '_structured.csv') 

    ## Code

    # Reads parameters list
    full_df = pd.read_csv(output_csv)
    elem_df = full_df["EventTemplate"]

    # Creates blank lists
    for elem in range (clusterer.labels_.max()+1):
        cluster_idxs.append([])
        cluster_lines.append([])

    # Populate the lists with cluster elements
    for idx, elem in np.ndenumerate(clusterer.labels_):
        if elem != -1:
            cluster_idxs[elem].append(idx[0])
            cluster_lines[elem].append(elem_df[idx[0]])
        
    return (cluster_idxs, cluster_lines)

In [13]:
## Main methods

# Parse logs using Drain

def parse_logs(st=0.5, depth=5):
    st = st # Drain similarity threshold
    depth = depth # Max depth of the parsing tree

    ## Code
    parser = DrainMethod.LogParser(log_format=log_format, indir=indir, outdir=output_dir, rex=regex, depth=depth, st=st)
    parser.parse(log_file)

    parsedresult=os.path.join(output_dir, log_file + '_structured.csv')   

# Creates embeddings for log file
def transform(logName):
    log_df = load_data()
    log_df = preprocess_df(log_df)
    return transform_dataset(log_df["Content"])

# Creates distance matrix, using Euclidean distance
def create_distance_matrix(vector_df):
    # Using Euclidean Distance between the rows of the TFIDF Matrix
    tfidf_distance = pairwise_distances(vector_df, metric="euclidean", n_jobs=-1)
    #Normalizes Distance Matrix with Min-Max
    min_val = np.min(tfidf_distance)
    max_val = np.max(tfidf_distance)
    tfidf_distance = (tfidf_distance - min_val) / (max_val - min_val)
    return (tfidf_distance)

# Creates variable matrix, using Jaccard distance
def create_variable_matrix():
    ## General Parameters
    output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
    output_csv = os.path.join(output_dir, log_file + '_structured.csv') 

    ## Code
    # Reads parameters list
    full_df = pd.read_csv(output_csv)
    var_df = full_df["ParameterList"]

    # Breaks the string into lists
    for i, line in var_df.items():
        var_df.at[i] = literal_eval(var_df.at[i])

    # Transforms variable list to variable sparse matrix
    mlb = MultiLabelBinarizer(sparse_output=True)
    var_df = mlb.fit_transform(var_df)
    var_distance = pairwise_distances(np.asarray(var_df.todense()), metric="jaccard", n_jobs=-1)
    return (var_distance)

def creates_closeness_matrix(tfidf_distance):
    # Creates Count Matrix using line numbers from log lines as the counter
    count_list = []
    n = len(tfidf_distance)
    count_distance = np.zeros(shape=(n, n), dtype=int)
    for i in range(n):
            count_list.append(i)

    # Using a Subtraction Distance using the line numbers as a Count Matrix
    count_array = np.array(count_list)
    for x in count_array:
        for y in count_array:
            count_distance[x,y] = abs(x-y)
    # Normalizes Distance Matrix with Min-Max
    min_val = np.min(count_distance)
    max_val = np.max(count_distance)
    count_distance = (count_distance - min_val) / (max_val - min_val)
    return (count_distance)

def saves_matrices(distance_mat, variable_mat, closeness_mat):
    np.save("tfidf_distance_" + logName + ".csv", distance_mat)
    np.save("var_distance_" + logName + ".csv", variable_mat)
    np.save("count_distance_" + logName + ".csv", closeness_mat)

def loads_matrices():
    tfidf_distance = np.load("tfidf_distance_" + logName + ".csv")
    count_distance = np.load("count_distance_" + logName + ".csv")
    var_distance = np.load("var_distance_" + logName + ".csv") 
    return (tfidf_distance, count_distance, var_distance)

def joins_matrices(tfidf_distance, var_distance, count_distance, alpha, beta, gamma):

    if alpha+beta+gamma > 1:
        raise Exception("Valores devem somar 1!")

    # New matrices, corrected by the weights
    tfidf_distance_wtd = np.dot(alpha,tfidf_distance)
    var_distance_wtd = np.dot(beta, var_distance)
    count_distance_wtd = np.dot(gamma, count_distance)

    # Sums remaining matrices
    unified_matrix = np.asarray(tfidf_distance_wtd + var_distance_wtd + count_distance_wtd)
    return (unified_matrix)

def cluster_hdbscan(unified_matrix, cluster_size, mn_samples, cluster_selection_epsilon):
    ## Clusters with HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=cluster_size,min_samples=mn_samples,metric='precomputed',
                                cluster_selection_epsilon=cluster_selection_epsilon, alpha=1.0, leaf_size=40, 
                                allow_single_cluster=False,cluster_selection_method='eom',
                                gen_min_span_tree=True)

    clusterer.fit(unified_matrix)

    ## Checks number of outliers
    cont = np.count_nonzero(clusterer.labels_ == -1)

    #print("O número de outliers é {}".format(cont))
    #print("O número de total de elementos é {}".format(len(clusterer.labels_)))
    return (clusterer)

def find_topics_bertopic(cluster_list, cluster_number, num_topics):
        
        umap_model = UMAP(init='random')
        cluster_model = KMedoids(n_clusters = 1)
        vectorizer_model = CountVectorizer(stop_words="english")
        topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", hdbscan_model=cluster_model, 
                               vectorizer_model=vectorizer_model, umap_model=umap_model, 
                               top_n_words=10,verbose=True)

        #Applies BertTopic
        topics, probs = topic_model.fit_transform(cluster_list[cluster_number])

        #Gets summary of topics
        topic_model.get_topic(0)
        top_topic = topic_model.get_topic(0)
        words = [i[0] for i in top_topic]
        summary = ' '.join(words)

        return (summary)

def bertopic_previous_clustering(clusterer):
    cluster_idxs, cluster_lines = creates_lists(clusterer)
    cluster_topic = []
    topic_summaries = []

    ## Creates list of boolean values, representing summarized topics
    for idx in range(clusterer.labels_.max()):
        cluster_topic.append(None)

    for i, elem in enumerate(clusterer.labels_):

        ## For each cluster, maps topics, and defines them as the summary
        if (cluster_topic[elem-1] == None):
            summary = find_topics_bertopic(cluster_lines, elem-1, 1)
            cluster_topic[elem-1] = summary
        
        if elem == -1:
            topic_summaries.append("")
        else:
            topic_summaries.append(cluster_topic[elem-1])

    return topic_summaries

def create_new_bertopic_model():
    lines = []
    with open('ground_truths/' + dataset + '_lines.txt', 'r') as line_file:
        for line in line_file:
            lines.append(line)

    umap_model = UMAP(init='random')
    vectorizer_model = CountVectorizer(stop_words="english")
    embedding_model = "all-mpnet-base-v2"
    topic_model = BERTopic(embedding_model=embedding_model, vectorizer_model=vectorizer_model, 
                        umap_model=umap_model, top_n_words=10, verbose=True)
    topics, probs = topic_model.fit_transform(lines)
    return (topic_model)

# def creates_lists_bertopic(topic_model):

#     ## General Parameters
#     cluster_idxs = []
#     cluster_lines = []
#     output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
#     output_csv = os.path.join(output_dir, log_file + '_structured.csv') 
#     ## Code

#     # Reads parameters list
#     full_df = pd.read_csv(output_csv)
#     elem_df = full_df["EventTemplate"]

#     # Creates blank lists
#     #for elem in range (clusterer.labels_.max()+1):
#     for elem in range (max(topic_model.topics_)+1):
#         cluster_idxs.append([])
#         cluster_lines.append([])

#     # Populate the lists with cluster elements
#     for idx, elem in np.ndenumerate(topic_model.topics_):
#         if elem != -1:
#             cluster_idxs[elem].append(idx[0])
#             cluster_lines[elem].append(elem_df[idx[0]])

#     #print(cluster_lines[10][9])

def bertopic_new_clustering():

    topic_model = create_new_bertopic_model()
    cluster_topic = []
    topic_summaries = []

    # ## Creates list of boolean values, representing summarized topics
    # for idx in range(clusterer.labels_.max()):
    #     cluster_topic.append(None)

    for elem in topic_model.topics_:
        
        line_topic = topic_model.get_topic(elem)
        words = [i[0] for i in line_topic]
        summary = ' '.join(words)
        topic_summaries.append(summary)

    ## Writes external file with created topics
    with open ("ground_truths/" + dataset + "_bert_topics_global.txt", "w") as f:
        for line in topic_summaries:
            f.write(f"{line}\n")

def calculates_metrics(target_file):
    
    from rouge import Rouge 
    rouge = Rouge()

    count_precision = 0
    count_recall = 0
    count_f1 = 0
    total_lines = 2000

    # Opens external files with ground truth summaries and created topics
    with open('ground_truths/' + dataset + '_summaries.txt', 'r') as summaries, \
        open('ground_truths/' + dataset + target_file, 'r') as topics:
        for line_summary, line_topic in zip(summaries, topics):
            line_summary = line_summary[:-2]
            line_summaries = line_summary.split(";")

            for summary in line_summaries:
                current_precision = 0
                current_recall = 0
                current_f1 = 0
                metrics = rouge.get_scores(line_topic, summary)[0]['rouge-1']    
                ## If the summary improves the f1 score, saves its metrics
                if (current_f1 < metrics['f']):
                    current_precision = metrics['p']
                    current_recall = metrics['r']
                    current_f1 = metrics['f']
            
            count_precision += current_precision
            count_recall += current_recall        
            count_f1 += current_f1

    final_precision = count_precision/total_lines
    final_recall = count_recall/total_lines
    final_f1 = count_f1/total_lines

    print("The precision is {}".format(final_precision))
    print("The recall is {}".format(final_recall))
    print("The f1 score is {}".format(final_f1))

In [15]:
## Pipeline of methods

drain_st = 0.5
drain_depth = 5
alpha = 0.7
beta = 0.2
gamma = 0.1
min_cluster_size = 5
min_samples = 5
cluster_selection_epsilon = 0.75

print("Parsing logs...")
parse_logs(drain_st, drain_depth)
print("Transforming data using TFIDF...")
vector_df = transform(os.path.basename(logName))
print("Creating distance matrix...")
distance_matrix = create_distance_matrix(vector_df)
print("Creating variable matrix...")
variable_matrix = create_variable_matrix()
print("Creating closeness matrix...")
closeness_matrix = creates_closeness_matrix(distance_matrix)
#print("Saving matrices...")
#saves_matrices(distance_matrix, variable_matrix, closeness_matrix)
#print("Loading matrices...")
#distance_matrix, variable_matrix, closeness_matrix = loads_matrices()
print("Joining matrices...")
joint_matrix = joins_matrices(distance_matrix, variable_matrix, closeness_matrix, 
                              alpha, beta, gamma)
print("Clustering using HDBSCAN...")
clustering = cluster_hdbscan(joint_matrix, min_cluster_size, min_samples, cluster_selection_epsilon)
print("Creating validation file, using BerTopic on previous clustering...")
topic_summaries = bertopic_previous_clustering(clustering)
print("Writing external file with topic summaries...")
target_file = "ground_truths/" + dataset + "_bert_topics_tests.txt"
with open (target_file, "w") as f:
        for line in topic_summaries:
            f.write(f"{line}\n")
print("Calculating metrics...")
calculates_metrics(target_file)
print("Done!")
#print("Creating validation file, using BerTopic on new clustering...")
#bertopic_new_clustering()

Parsing logs...
Parsing file: c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\LineTracker-OLD\LineTracker\ground_truths\bgl_lines.txt


Parsing Progress: 100%|██████████| 2000/2000 [00:00<00:00, 16993.72it/s]


Parsing done. [Time taken: 0:00:00.586319]
Transforming data using TFIDF...
Creating distance matrix...
Creating variable matrix...
Creating closeness matrix...
Joining matrices...
Clustering using HDBSCAN...
<class 'int'>
<class 'int'>
0.75
Creating validation file, using BerTopic on previous clustering...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-06 18:49:32,674 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:36,590 - BERTopic - Reduced dimensionality
2024-06-06 18:49:36,603 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2024-06-06 18:49:38,274 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:41,071 - BERTopic - Reduced dimensionality
2024-06-06 18:49:41,075 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-06-06 18:49:44,749 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:46,633 - BERTopic - Reduced dimensionality
2024-06-06 18:49:46,644 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-06 18:49:47,022 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:48,654 - BERTopic - Reduced dimensionality
2024-06-06 18:49:48,654 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-06 18:49:49,389 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:51,199 - BERTopic - Reduced dimensionality
2024-06-06 18:49:51,202 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-06-06 18:49:51,940 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:54,167 - BERTopic - Reduced dimensionality
2024-06-06 18:49:54,175 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-06 18:49:54,579 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:56,264 - BERTopic - Reduced dimensionality
2024-06-06 18:49:56,267 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-06-06 18:49:57,078 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:49:58,790 - BERTopic - Reduced dimensionality
2024-06-06 18:49:58,793 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-06 18:49:59,192 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:00,824 - BERTopic - Reduced dimensionality
2024-06-06 18:50:00,824 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-06-06 18:50:01,268 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:03,192 - BERTopic - Reduced dimensionality
2024-06-06 18:50:03,195 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-06 18:50:03,495 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:05,009 - BERTopic - Reduced dimensionality
2024-06-06 18:50:05,009 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-06-06 18:50:05,424 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:06,993 - BERTopic - Reduced dimensionality
2024-06-06 18:50:06,995 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-06-06 18:50:07,395 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:09,001 - BERTopic - Reduced dimensionality
2024-06-06 18:50:09,011 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-06-06 18:50:09,809 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:11,897 - BERTopic - Reduced dimensionality
2024-06-06 18:50:11,897 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2024-06-06 18:50:13,535 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:15,829 - BERTopic - Reduced dimensionality
2024-06-06 18:50:15,835 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-06-06 18:50:16,204 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:17,928 - BERTopic - Reduced dimensionality
2024-06-06 18:50:17,928 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-06-06 18:50:18,570 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:20,865 - BERTopic - Reduced dimensionality
2024-06-06 18:50:20,870 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-06-06 18:50:22,174 - BERTopic - Transformed documents to Embeddings
2024-06-06 18:50:24,924 - BERTopic - Reduced dimensionality
2024-06-06 18:50:24,924 - BERTopic - Clustered reduced embeddings


Writing external file with topic summaries...
Calculating metrics...


FileNotFoundError: [Errno 2] No such file or directory: 'ground_truths/bglground_truths/bgl_bert_topics_tests.txt'

In [None]:
## Evaluates Accuracy

## Step 12 - Calculates average recall, precision and f1

## Initial tests with rouge

from rouge import Rouge 
rouge = Rouge()

count_precision = 0
count_recall = 0
count_f1 = 0
total_lines = 2000
#target_file = "_luhn.txt"
#target_file = "_lsa.txt"
#target_file = "_local_topics.txt"
#target_file = "_global_topics.txt"
#target_file = "_lda_topics.txt"
#target_file = "_lexrank.txt"
#target_file = "_textrank.txt"
#target_file = "_bert_topics_local.txt"
target_file = "_bert_topics_global.txt"

# Opens external files with ground truth summaries and created topics
with open('ground_truths/' + dataset + '_summaries.txt', 'r') as summaries, \
     open('ground_truths/' + dataset + target_file, 'r') as topics:
    for line_summary, line_topic in zip(summaries, topics):
        line_summary = line_summary[:-2]
        line_summaries = line_summary.split(";")

        for summary in line_summaries:
            current_precision = 0
            current_recall = 0
            current_f1 = 0
            metrics = rouge.get_scores(line_topic, summary)[0]['rouge-1']    
            ## If the summary improves the f1 score, saves its metrics
            if (current_f1 < metrics['f']):
                current_precision = metrics['p']
                current_recall = metrics['r']
                current_f1 = metrics['f']
        
        count_precision += current_precision
        count_recall += current_recall        
        count_f1 += current_f1

final_precision = count_precision/total_lines
final_recall = count_recall/total_lines
final_f1 = count_f1/total_lines

print(final_precision)
print(final_recall)
print(final_f1)