In [2]:
## Step 0 - Parameters and Libraries

import DrainMethod
import sys
import os

## General parameters 

input_dir = os.path.join(os.getcwd(), "ground_truths") # The input directory of raw logs
output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
vector_dir = os.path.join(os.getcwd(), "vectors")  # The vector directory of converted logs
dataset = "bgl" # The name of the dataset being tested
logName = dataset + '_lines.txt' # Name of file to be parsed
log_format = '<Content>' # Format of the file, if there are different fields
regex = [] # Regex strings for Drain execution

In [3]:
## Step 1 - Log Parsing Using Drain

## Drain parameters

st = 0.5 # Drain similarity threshold
depth = 5 # Max depth of the parsing tree

## Code

print('\n=== Starting Drain Parsing ===')
indir = os.path.join(input_dir, os.path.dirname(logName))
print(indir)
log_file = os.path.basename(logName)

parser = DrainMethod.LogParser(log_format=log_format, indir=indir, outdir=output_dir, rex=regex, depth=depth, st=st)
parser.parse(log_file)

parsedresult=os.path.join(output_dir, log_file + '_structured.csv')   


=== Starting Drain Parsing ===
c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\CSL\CSL-1\ground_truths\
Parsing file: c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\CSL\CSL-1\ground_truths\bgl_lines.txt


Parsing Progress:   0%|          | 0/2000 [00:00<?, ?it/s]

Parsing Progress: 100%|██████████| 2000/2000 [00:00<00:00, 4641.92it/s]


Parsing done. [Time taken: 0:00:01.078581]


In [4]:
## Step 2 - Vector Creation Using TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import pandas as pd
import regex as re
import contextlib
import pickle

# Calls conversion from data to dataframe
def load_data():
    headers, regex = generate_logformat_regex(log_format)
    return log_to_dataframe(os.path.join(indir, logName), regex, headers, log_format)

# Preprocesses dataframe with regexes, if necessary - more preprocessing to add
def preprocess_df(df_log):
    for idx, content in df_log["Content"].items():
        for currentRex in regex:
            df_log.at[idx,'Content'] = re.sub(currentRex, '<*>', content)
    return df_log

# Function to generate regular expression to split log messages
def generate_logformat_regex(log_format):
    headers = []
    splitters = re.split(r'(<[^<>]+>)', log_format)
    regex = ''
    for k in range(len(splitters)):
        if k % 2 == 0:
            splitter = re.sub(' +', '\\\s+', splitters[k])
            regex += splitter
        else:
            header = splitters[k].strip('<').strip('>')
            regex += f'(?P<{header}>.*?)'
            headers.append(header)
    regex = re.compile('^' + regex + '$')
    return headers, regex

# Function to transform log file to dataframe 
def log_to_dataframe(log_file, regex, headers, logformat):
    log_messages = []
    linecount = 0
    with open(log_file, 'r') as fin:
        for line in fin.readlines():
            with contextlib.suppress(Exception):
                match = regex.search(line.strip())
                message = [match.group(header) for header in headers]
                log_messages.append(message)
                linecount += 1
    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', None)
    logdf['LineId'] = [i + 1 for i in range(linecount)]
    return logdf

# Transforms the dataset, creating raw vector file
def transform_dataset(raw_content):
    
    path_to_file = os.path.join(vector_dir, logName + '_vectors_TFIDF.vec')
    path = Path(path_to_file)
    vectors_tfidf = []

    if (path.is_file()):
        vectors_tfidf = pickle.load(open(path_to_file, 'rb'))
    else:
        # Using TFIDF Vectorizer 
        print("Iniciando encode")
        tr_idf_model  = TfidfVectorizer()
        vectors_tfidf = tr_idf_model.fit_transform(raw_content)
        pickle.dump(vectors_tfidf, open(path_to_file, 'wb'))
    
    print(type(vectors_tfidf))
    return vectors_tfidf

# Creates embeddings for log file
def transform(logName):
    print('Transforming file: ' + os.path.join(input_dir, logName))
    log_df = load_data()
    log_df = preprocess_df(log_df)
    return transform_dataset(log_df["Content"])

vector_df = transform(os.path.basename(logName))

Transforming file: c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\CSL\CSL-1\ground_truths\bgl_lines.txt
<class 'scipy.sparse._csr.csr_matrix'>


In [5]:
## Step 3 - Creates matrix of parsed items

from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
import pandas as pd 

## General Parameters

output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
output_csv = os.path.join(output_dir, log_file + '_structured.csv') 

## Code

# Reads parameters list
full_df = pd.read_csv(output_csv)
var_df = full_df["ParameterList"]

# Breaks the string into lists
for i, line in var_df.items():
    var_df.at[i] = literal_eval(var_df.at[i])

# Transforms variable list to variable sparse matrix
mlb = MultiLabelBinarizer(sparse_output=True)
var_df = mlb.fit_transform(var_df)
print ("A matrix parseada de variaveis tem o formato {}".format(var_df.shape))
print(type(var_df))

A matrix parseada de variaveis tem o formato (2000, 1395)
<class 'scipy.sparse._csr.csr_matrix'>


In [6]:
# Step 4 - Creates distance matrix 

from sklearn.metrics.pairwise import pairwise_distances 
import numpy as np

# Using Euclidean Distance between the rows of the TFIDF Matrix
tfidf_distance = pairwise_distances(vector_df, metric="euclidean", n_jobs=-1)
#Normalizes Distance Matrix with Min-Max
min_val = np.min(tfidf_distance)
max_val = np.max(tfidf_distance)
tfidf_distance = (tfidf_distance - min_val) / (max_val - min_val)
print("As dimensões da matriz de embeddings são {}".format(tfidf_distance.shape))

# Using Jaccard Distance between the rows of the Variable Matrix
var_distance = pairwise_distances(np.asarray(var_df.todense()), metric="jaccard", n_jobs=-1)
print("As dimensões da matriz de variáveis são {}".format(var_distance.shape))

# Creates Count Matrix using line numbers from log lines as the counter
count_list = []
n = len(tfidf_distance)
count_distance = np.zeros(shape=(n, n), dtype=int)
for i in range(n):
        count_list.append(i)

# Using a Subtraction Distance using the line numbers as a Count Matrix
count_array = np.array(count_list)
for x in count_array:
  for y in count_array:
    count_distance[x,y] = abs(x-y)
# Normalizes Distance Matrix with Min-Max
min_val = np.min(count_distance)
max_val = np.max(count_distance)
count_distance = (count_distance - min_val) / (max_val - min_val)
print("As dimensões da matriz de contadores são {}".format(count_distance.shape))

As dimensões da matriz de embeddings são (2000, 2000)




As dimensões da matriz de variáveis são (2000, 2000)
As dimensões da matriz de contadores são (2000, 2000)


In [6]:
## Saving matrices

print(type(tfidf_distance))
np.save("tfidf_distance_" + logName + ".csv", tfidf_distance)
print(type(var_distance))
np.save("var_distance_" + logName + ".csv", var_distance)
print(type(count_distance))
np.save("count_distance_" + logName + ".csv", count_distance)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
## Loads matrices

tfidf_distance = np.load("tfidf_distance_" + logName + ".csv")
count_distance = np.load("count_distance_" + logName + ".csv")
var_distance = np.load("var_distance_" + logName + ".csv") 

In [7]:
# Step 5 - Using alpha to define the weight of the TFIDF Matrix,  
# Beta to define the weight of the Variable Matrix,
# and Gamma to define the weight of the Count Matrix
alpha = 0.3
beta = 0.5
gamma = 0.2

if alpha+beta+gamma > 1:
   raise Exception("Valores devem somar 1!")

# New matrices, corrected by the weights
tfidf_distance_wtd = np.dot(alpha,tfidf_distance)
var_distance_wtd = np.dot(beta, var_distance)
count_distance_wtd = np.dot(gamma, count_distance)

# Sums remaining matrices
distance_matrix = np.asarray(tfidf_distance_wtd + var_distance_wtd + count_distance_wtd)

In [8]:
## Step 6 - Clustering with HDBScan Using Pre-defined Distance Matrix

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan

# min_cluster_size:int, optional (default=5)
# The minimum size of clusters; single linkage splits that contain fewer points than this will 
# be considered points “falling out” of a cluster rather than a cluster splitting into two new clusters.

# min_samples:int, optional (default=None)
# The number of samples in a neighbourhood for a point to be considered a core point.

# p :int, optional (default=None)
# p value to use if using the minkowski metric.

# alpha :float, optional (default=1.0)
# A distance scaling parameter as used in robust single linkage. See [3] for more information.

# cluster_selection_epsilon: float, optional (default=0.0)
# A distance threshold. Clusters below this value will be merged.
# See [5] for more information.

# algorithm :string, optional (default=’best’)
# Exactly which algorithm to use; hdbscan has variants specialised for different characteristics 
# of the data. By default this is set to best which chooses the “best” algorithm given the nature 
# of the data. You can force other options if you believe you know better. Options are: 'best',
# 'generic', 'prims_kdtree', 'prims_balltree', 'boruvka_kdtree' and 'boruvka_balltree'

# leaf_size: int, optional (default=40)
# If using a space tree algorithm (kdtree, or balltree) the number of points ina leaf node of the tree. 
# This does not alter the resulting clustering, but may have an effect on the runtime of the algorithm.

# cluster_selection_method :string, optional (default=’eom’)
# The method used to select clusters from the condensed tree. The standard approach for HDBSCAN is 
# to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can 
# instead select the clusters at the leaves of the tree – this provides the most fine grained and 
# homogeneous clusters. Options are: 'eom' and 'leaf'

# allow_single_cluster :bool, optional (default=False)
# By default HDBSCAN will not produce a single cluster, setting this to True will override this 
# and allow single cluster results in the case that you feel this is a valid result for your dataset.

## Clusters with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=5,min_samples=None,metric='precomputed',
                            cluster_selection_epsilon=0.75, alpha=1.0, leaf_size=40, 
                            allow_single_cluster=False,cluster_selection_method='eom',
                            gen_min_span_tree=True)


clusterer.fit(distance_matrix)

print ("O numero de clusters e {}".format(clusterer.labels_.max()))
print ("Os clusters de cada elemento são {}".format(clusterer.labels_))

## Checks number of outliers
cont = np.count_nonzero(clusterer.labels_ == -1)

print("O número de outliers é {}".format(cont))
print("O número de total de elementos é {}".format(len(clusterer.labels_)))

O numero de clusters e 10
Os clusters de cada elemento são [9 9 9 ... 9 9 9]
O número de outliers é 8
O número de total de elementos é 2000


In [8]:
## A TESTAR
## TESTAR TAMBÉM COM K-MEDOIDS

## Step 6.1 - Uses K-Means for clustering

from sklearn.cluster import KMeans

clusterer = KMeans(n_clusters=100, random_state=0, n_init=10)
clusterer.fit(distance_matrix)
cluster_num = clusterer.labels_.max()
print ("O numero de clusters e {}".format(cluster_num))
cluster_labels = clusterer.labels_
print ("Os clusters de cada elemento sao {}".format(cluster_labels))

O numero de clusters e 99
Os clusters de cada elemento sao [57 57 57 ...  6  6 96]


In [46]:
## Step 7 - Checks number of outliers

cont = 0

for elem in clusterer.labels_:
   if (elem == -1):
      cont += 1

print("O número de outliers é {}".format(cont))
print("O número de total de elementos é {}".format(len(clusterer.labels_)))

O número de outliers é 0
O número de total de elementos é 2000


In [9]:
## Step 8 - Creates a list of lists representing the clusters

import numpy as np

## General Parameters

cluster_idxs = []
cluster_lines = []
output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
output_csv = os.path.join(output_dir, log_file + '_structured.csv') 

## Code

# Reads parameters list
full_df = pd.read_csv(output_csv)
elem_df = full_df["EventTemplate"]

# Creates blank lists
for elem in range (clusterer.labels_.max()+1):
    cluster_idxs.append([])
    cluster_lines.append([])

# Populate the lists with cluster elements
for idx, elem in np.ndenumerate(clusterer.labels_):
  if elem != -1:
    cluster_idxs[elem].append(idx[0])
    cluster_lines[elem].append(elem_df[idx[0]])

# Check sizes of each cluster
for i in range(len(cluster_idxs)):
   print("O tamanho do cluster {} é {}".format(i,len(cluster_idxs[i])))

#print(cluster_lines[10][9])

O tamanho do cluster 0 é 62
O tamanho do cluster 1 é 20
O tamanho do cluster 2 é 26
O tamanho do cluster 3 é 20
O tamanho do cluster 4 é 20
O tamanho do cluster 5 é 8
O tamanho do cluster 6 é 49
O tamanho do cluster 7 é 30
O tamanho do cluster 8 é 39
O tamanho do cluster 9 é 32
O tamanho do cluster 10 é 20
O tamanho do cluster 11 é 30
O tamanho do cluster 12 é 80
O tamanho do cluster 13 é 11
O tamanho do cluster 14 é 42
O tamanho do cluster 15 é 40
O tamanho do cluster 16 é 60
O tamanho do cluster 17 é 45
O tamanho do cluster 18 é 18
O tamanho do cluster 19 é 41
O tamanho do cluster 20 é 20
O tamanho do cluster 21 é 20
O tamanho do cluster 22 é 3
O tamanho do cluster 23 é 20
O tamanho do cluster 24 é 20
O tamanho do cluster 25 é 20
O tamanho do cluster 26 é 20
O tamanho do cluster 27 é 20
O tamanho do cluster 28 é 10
O tamanho do cluster 29 é 10
O tamanho do cluster 30 é 24
O tamanho do cluster 31 é 60
O tamanho do cluster 32 é 20
O tamanho do cluster 33 é 21
O tamanho do cluster 34 é 

In [10]:
## Step 9 - Eliminates stopwords on each cluster

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import gensim.corpora as corpora
from pprint import pprint
import gensim

# Parameters
stop_words = stopwords.words('english')
stop_words.extend(['teste'])

# Converts sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

# Removes stopwords from each sentence
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

In [49]:
print(cluster_idxs)

[[600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 667, 669, 673, 675, 677, 678, 679], [680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699], [1340, 1341, 1342, 1343, 1344, 1345, 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458, 1459], [340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359], [1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959], [1081, 1083, 1085, 1088, 1091, 1096, 1326, 1335], [1640, 1641, 1644, 1650, 1653, 1655, 1656, 1657, 1658, 1660, 1661, 1668, 1671, 1672, 1673, 1674, 1677, 1679, 1680, 1683, 1684, 1685, 1688, 1689, 1691, 16

In [50]:
print(cluster_lines)



In [51]:
print(clusterer.labels_)

[57 57 57 ...  6  6 96]


In [11]:
## Step 10 - Finds topics of a given cluster

# Finds topic of a given cluster, defining the number of topics
def find_topics(cluster_list, cluster_number, num_topics):
    # Converts to words
    data_words = list(sent_to_words(cluster_list[cluster_number]))
    # Removes stop words
    #data_words = remove_stopwords(data_words)
    # Creates dictionary
    id2word = corpora.Dictionary(data_words)
    # Creates corpora
    corpus = [id2word.doc2bow(text) for text in data_words]
    # Builds LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
    return lda_model

topics = find_topics(cluster_lines, 9, 1)

# Gets word topics
x = topics.show_topics(num_topics=1, num_words=10,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Only Words 
for topic,words in topics_words:
    a =  " ".join(words)
    print(a)

ras to info kernel failed app fatal on ciod stream


In [12]:
## Step 11 - Builds new file with topic modeling summaries

cluster_topic = []
topic_summaries = []

## Creates list of boolean values, representing summarized topics
for idx in range(clusterer.labels_.max()):
    cluster_topic.append(None)

for i, elem in enumerate(clusterer.labels_):

    ## For each cluster, maps topics, and defines them as the summary
    if (cluster_topic[elem-1] == None):
        topics = find_topics(cluster_lines, elem-1, 1)
        x = topics.show_topics(num_topics=1, num_words=10,formatted=False)
        topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
        for topic,words in topics_words:
            summary = " ".join(words)
        cluster_topic[elem-1] = summary
    
    if elem == -1:
        topic_summaries.append("")
    else:
        topic_summaries.append(cluster_topic[elem-1])

## Writes external file with created topics
with open ("ground_truths/" + dataset + "_topics.txt", "w") as f:
     for line in topic_summaries:
          f.write(f"{line}\n")


In [28]:
## Initial tests with rouge

from rouge import Rouge 
rouge = Rouge()
hypothesis = "dear iar kernel exceptions info detected integer alignment ras"
reference = "Kernel detected integer alignment exceptions"

a, b, c = rouge.get_scores(hypothesis, reference)[0]['rouge-1']

print(rouge.get_scores(hypothesis, reference)[0]['rouge-1'])
print(type(rouge.get_scores(hypothesis, reference)[0]['rouge-1']))
metricas = rouge.get_scores(hypothesis, reference)[0]['rouge-1']
print(metricas['f'])
print(b)
print(c)

print(rouge.get_scores(hypothesis, reference))
print(rouge.get_scores(hypothesis, reference)[0]['rouge-1'])

{'r': 0.8, 'p': 0.4444444444444444, 'f': 0.5714285668367348}
<class 'dict'>
0.5714285668367348
p
f
[{'rouge-1': {'r': 0.8, 'p': 0.4444444444444444, 'f': 0.5714285668367348}, 'rouge-2': {'r': 0.5, 'p': 0.25, 'f': 0.33333332888888895}, 'rouge-l': {'r': 0.6, 'p': 0.3333333333333333, 'f': 0.4285714239795918}}]
{'r': 0.8, 'p': 0.4444444444444444, 'f': 0.5714285668367348}


In [29]:
## Step 12 - Calculates average recall, precision and f1

## Initial tests with rouge

from rouge import Rouge 
rouge = Rouge()

count_precision = 0
count_recall = 0
count_f1 = 0
total_lines = 2000

# Opens external files with ground truth summaries and created topics
with open('ground_truths/' + dataset + '_summaries.txt', 'r') as summaries, open('ground_truths/' + dataset + '_topics.txt', 'r') as topics:
    for line_summary, line_topic in zip(summaries, topics):
        line_summary = line_summary[:-2]
        line_summaries = line_summary.split(";")

        for summary in line_summaries:
            current_precision = 0
            current_recall = 0
            current_f1 = 0
            metrics = rouge.get_scores(line_topic, summary)[0]['rouge-1']            
            ## If the summary improves the f1 score, saves its metrics
            if (current_f1 < metrics['f']):
                current_precision = metrics['p']
                current_recall = metrics['r']
                current_f1 = metrics['f']
        
        count_precision += current_precision
        count_recall += current_recall        
        count_f1 += current_f1

final_precision = count_precision/total_lines
final_recall = count_recall/total_lines
final_f1 = count_f1/total_lines

print(final_precision)
print(final_recall)
print(final_f1)


        


## CORRIGIR ARQUIVO DE SUMMARY - ELE SÓ TEM 1981 LINHAS

ValueError: Reference is empty.