Step 0 - Loading Parameters and Libraries

In [50]:
from __future__ import division
import DrainMethod
import sys
import os

## General parameters

# Project folders, all resolved against the current working directory:
#   input_dir  - the input directory of raw logs
#   output_dir - the output directory of parsing results
#   vector_dir - the vector directory of converted logs
input_dir, output_dir, vector_dir = (
    os.path.join(os.getcwd(), folder) for folder in ("logs", "results", "vectors")
)

# Name of file to be parsed (the active choice comes first; alternatives stay commented out)
logName = 'ciena-mini.txt'
#logName = 'Ciena_error_lin0es_20220701-20220715.txt'
#logName = 'Andriod_2k.log'
#logName = 'trat3_production_1650_1700_20231411_raw.txt'

# Format of the file, if there are different fields
log_format = '<Content>'

# Regex strings for Drain execution
regex = []

Step 1 - Log Parsing Using Drain

In [51]:
from pathlib import Path
import pickle

## Drain parameters
st = 0.5   # Drain similarity threshold
depth = 5  # Max depth of the parsing tree

## Parses file, using DrainMethod
print('\n=== Starting Drain Parsing ===')

# logName may carry a directory part: that part extends input_dir,
# while the file part alone is handed to the parser.
indir = os.path.join(input_dir, os.path.split(logName)[0])
print(indir)
log_file = os.path.split(logName)[1]

parser = DrainMethod.LogParser(
    log_format=log_format,
    indir=indir,
    outdir=output_dir,
    rex=regex,
    depth=depth,
    st=st,
)
parser.parse(log_file)

# Path of the structured CSV for this log inside output_dir
# (presumably written by parser.parse - confirm against DrainMethod)
parsedresult = os.path.join(output_dir, log_file + '_structured.csv')


=== Starting Drain Parsing ===
c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\LineTracker\LineTracker\logs\
Parsing file: c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\LineTracker\LineTracker\logs\ciena-mini.txt


Parsing Progress: 100%|██████████| 15710/15710 [00:01<00:00, 11755.27it/s]


Parsing done. [Time taken: 0:00:03.776132]


Step 2 - Creating Embeddings Using TFIDF

In [52]:
## Step 2 - Vector Creation Using TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import pandas as pd
import regex as re
import contextlib
import pickle

# Calls conversion from data to dataframe
def load_data():
    """Read the configured log file into a DataFrame.

    Builds the header list and the line-splitting pattern from the module-level
    ``log_format`` and passes them to ``log_to_dataframe``.

    The file is located as ``indir`` joined with the base name of ``logName``.
    ``indir`` is already built from ``input_dir`` plus the directory part of
    ``logName``, so joining it with the full ``logName`` would repeat that
    directory part whenever ``logName`` contains one.

    Returns:
        The value returned by ``log_to_dataframe`` for the configured log file.
    """
    # Local name differs from the module-level ``regex`` list on purpose,
    # so the compiled pattern is not confused with the Drain regex strings.
    headers, line_pattern = generate_logformat_regex(log_format)
    log_path = os.path.join(indir, os.path.basename(logName))
    return log_to_dataframe(log_path, line_pattern, headers, log_format)

# Preprocesses dataframe with regexes, if necessary - more preprocessing to add
def preprocess_df(df_log, patterns=None):
    """Mask every match of the given patterns in the ``Content`` column with ``<*>``.

    The patterns are applied one after another to the same message, so the
    result carries the substitutions of all of them (not only the last one).

    Args:
        df_log: DataFrame with a ``Content`` column of log messages. The column
            is replaced on this same object.
        patterns: Iterable of regular expressions to mask. When omitted, the
            module-level ``regex`` list is used.

    Returns:
        ``df_log`` itself, with the masked ``Content`` column.
    """
    if patterns is None:
        patterns = regex
    patterns = list(patterns)

    # Nothing to mask: leave the frame untouched.
    if not patterns:
        return df_log

    def _mask(content):
        # Feed the output of each substitution into the next pattern.
        for current_rex in patterns:
            content = re.sub(current_rex, '<*>', content)
        return content

    df_log["Content"] = df_log["Content"].map(_mask)
    return df_log

# Function to generate regular expression to split log messages
def generate_logformat_regex(log_format):
    """Build the header list and compiled pattern used to split log lines.

    ``log_format`` is split on ``<Header>`` placeholders. Every placeholder
    becomes a lazy named group for that header. The text between placeholders
    is inserted into the pattern as-is (it is not escaped, so regex
    metacharacters in it are interpreted as regex), except that each run of
    spaces is replaced by a "one or more whitespace" token.

    Args:
        log_format: Format string such as ``'<Date> <Time> <Content>'``.

    Returns:
        A ``(headers, pattern)`` tuple: the header names in order of appearance
        and the compiled pattern, anchored at both ends of the line.
    """
    headers = []
    pattern = ''
    # re.split with a capturing group alternates: literal text at even
    # positions, '<Header>' placeholders at odd positions.
    for position, piece in enumerate(re.split(r'(<[^<>]+>)', log_format)):
        if position % 2 == 0:
            # Raw string avoids the invalid '\s' escape in a normal literal.
            # As a replacement template, r'\\s+' emits backslash, 's', '+'.
            pattern += re.sub(' +', r'\\s+', piece)
        else:
            header = piece.strip('<').strip('>')
            pattern += f'(?P<{header}>.*?)'
            headers.append(header)
    return headers, re.compile('^' + pattern + '$')

# Function to transform log file to dataframe 
def log_to_dataframe(log_file, regex, headers, logformat):
    """Read a log file into a DataFrame with one row per line that fits the format.

    Each line is stripped and searched with ``regex``. Lines that do not match
    are skipped. Any other error (for example a header that is not a group of
    the pattern) is raised instead of being silently swallowed.

    Args:
        log_file: Path of the log file to read.
        regex: Compiled pattern with one named group per header.
        headers: Names of the groups to extract, in column order.
        logformat: Not used by this function; kept so existing callers
            keep working.

    Returns:
        DataFrame with a leading ``LineId`` column (1, 2, ... over the kept
        lines, not the original file line numbers) followed by one column
        per header.
    """
    log_messages = []
    with open(log_file, 'r') as fin:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # Line does not fit the format: skip it rather than fail.
                continue
            log_messages.append([match.group(header) for header in headers])

    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
    return logdf

# Transforms the dataset, creating raw vector file
def transform_dataset(raw_content):
    """Return the TF-IDF matrix for ``raw_content``, using an on-disk cache.

    The cache file is ``<vector_dir>/<logName>_vectors_TFIDF.vec`` (both names
    are module-level settings). If it exists, it is unpickled and returned
    without looking at ``raw_content``. Otherwise a ``TfidfVectorizer`` is
    fitted on ``raw_content`` and the result is pickled to that path.

    Note:
        The cache is keyed by ``logName`` only. If the log file or the
        preprocessing changes, delete the cache file, or the old vectors
        are returned.

    Args:
        raw_content: Iterable of text documents (here the ``Content`` column).

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``, or the
        object previously pickled in the cache file.
    """
    cache_path = Path(os.path.join(vector_dir, logName + '_vectors_TFIDF.vec'))

    if cache_path.is_file():
        print("Using Previous Embeddings File")
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only load cache files produced by this notebook.
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    # Using TFIDF Vectorizer
    print("Starting Encoding")
    tr_idf_model = TfidfVectorizer()
    vectors_tfidf = tr_idf_model.fit_transform(raw_content)

    # Saves transformed file, for further executions. The target folder is
    # created first so that a fresh checkout does not fail on the dump.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(vectors_tfidf, cache_file)

    return vectors_tfidf

# Creates embeddings for log file
def transform(logName):
    """Load, preprocess and vectorize the configured log file.

    ``logName`` is only used for the progress message printed here;
    ``load_data`` takes no argument and reads the module-level settings.

    Returns:
        The result of ``transform_dataset`` on the ``Content`` column.
    """
    source_path = os.path.join(input_dir, logName)
    print('Transforming file: ' + source_path)

    cleaned_df = preprocess_df(load_data())
    return transform_dataset(cleaned_df["Content"])

# Builds the embedding matrix for the configured log and reports its size
vector_df = transform(os.path.basename(logName))
print("The dimensions of the embedding matrix are ({}, {})".format(*vector_df.shape[:2]))

Transforming file: c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\LineTracker\LineTracker\logs\ciena-mini.txt
Using Previous Embeddings File
The dimensions of the embedding matrix are (15710, 8224)


Step 3 - Creating Distance Matrix, Using Cosine Distance

In [53]:
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

# Pairwise cosine distance between all rows of the embedding matrix
tfidf_distance = cosine_distances(X=vector_df)
print("The dimensions of the distance matrix are {}".format(tfidf_distance.shape))

The dimensions of the distance matrix are (15710, 15710)


Step 3 (continued) - Creating Matrix of Parsed Items (Variable Matrix)

In [55]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
import pandas as pd 

## General Parameters
output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
output_csv = os.path.join(output_dir, log_file + '_structured.csv')

# Reads the structured parsing result
full_df = pd.read_csv(output_csv)

# Each ParameterList cell is read from the CSV as text; literal_eval turns it
# back into a Python list. The parsed column is written back explicitly so
# that full_df holds lists regardless of the pandas version, instead of
# depending on element-wise writes through a column view.
full_df["ParameterList"] = full_df["ParameterList"].map(literal_eval)

# Transforms variable list to variable SPARSE matrix
# (one row per log line, one column per distinct parameter value)
mlb = MultiLabelBinarizer(sparse_output=True)
var_df = mlb.fit_transform(full_df["ParameterList"])

# Code for Jaccard Distance
def jaccard_distance(mat):
    """Pairwise Jaccard distance between the rows of a sparse indicator matrix.

    For rows A and B the similarity is ``|A & B| / (|A| + |B| - |A & B|)``,
    where ``|A|`` is the number of stored entries of the row and ``|A & B|``
    is the corresponding entry of ``mat @ mat.T``.

    Assumes the stored values of ``mat`` are all 1 (as produced by a label
    binarizer), so that the product counts shared columns - TODO confirm for
    other callers.

    Note:
        A row without stored entries shares nothing with any row, so its
        distance to every row - itself included - is 1.

    Args:
        mat: SciPy sparse matrix (or sparse array) with one row per item.

    Returns:
        Dense ``n x n`` distance matrix, as returned by ``todense()``.
    """
    # CSR layout: data/indices are grouped by row and indptr delimits the rows.
    mat = mat.tocsr()
    rows_sum = np.diff(mat.indptr)  # stored entries per row, i.e. |A|

    # '@' is the matrix product for every sparse type; '*' is only the matrix
    # product for the legacy spmatrix classes.
    ab = (mat @ mat.T).tocsr()
    ab_row_counts = np.diff(ab.indptr)

    # For every stored entry (i, j) of ab: |row i| and |row j|
    aa = np.repeat(rows_sum, ab_row_counts)
    bb = rows_sum[ab.indices]

    # Calculates Jaccard similarity on the stored entries only
    similarities = ab.copy()
    similarities.data = similarities.data / (aa + bb - ab.data)

    # Calculates Jaccard distance (pairs without a stored entry end up at 1)
    distance = 1 - similarities.todense()

    return distance

# Pairwise Jaccard distance between the parameter sets of all log lines
var_distance = jaccard_distance(mat=var_df)
print("The dimensions of the variable matrix are {}".format(var_distance.shape))

The dimensions of the variable matrix are (15710, 15710)


Step 4 - Creates Closeness Matrix