In [17]:
## Start - Parameters and Libraries

import DrainMethod
import sys
import os

## General parameters

# All working directories are resolved against the directory the notebook runs from:
#   input_dir  - raw logs
#   output_dir - parsing results
#   vector_dir - converted logs (vectors)
input_dir, output_dir, vector_dir = (
    os.path.join(os.getcwd(), folder) for folder in ("logs", "results", "vectors")
)

# Name of the file to be parsed. Other inputs used previously:
#   'Ciena_error_lines_20220701-20220715.txt'
#   'ciena-mini.txt'
logName = 'Andriod_2k.log'

# Format of the file, if there are different fields
log_format = '<Content>'

# Regex strings for Drain execution
regex = []

In [18]:
## First Step - Log Parsing Using Drain

## Drain parameters

st, depth = 0.5, 5  # Drain similarity threshold, max depth of the parsing tree

## Code

print('\n=== Starting Drain Parsing ===')

# Split the configured name into the file itself and the directory that holds it.
log_file = os.path.basename(logName)
indir = os.path.join(input_dir, os.path.dirname(logName))
print(indir)

parser = DrainMethod.LogParser(
    log_format=log_format,
    indir=indir,
    outdir=output_dir,
    rex=regex,
    depth=depth,
    st=st,
)
parser.parse(log_file)

# Path of the structured CSV expected in output_dir
# (presumably written by parser.parse - verify against DrainMethod).
parsedresult = os.path.join(output_dir, log_file + '_structured.csv')


=== Starting Drain Parsing ===
c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\CSL\CSL-1\logs\
Parsing file: c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\CSL\CSL-1\logs\Andriod_2k.log


Parsing Progress: 100%|██████████| 2000/2000 [00:00<00:00, 13266.04it/s]


Parsing done. [Time taken: 0:00:00.582818]


In [2]:
## Second Step - Embedding Creation Using Transformers

from sentence_transformers import SentenceTransformer
from pathlib import Path
import pandas as pd
import regex as re
import contextlib
import pickle

## General Parameters

# NOTE: this cell uses `os` and `input_dir`, which are defined in the first
# (parameters) cell, so that cell has to run before this one.

# Name of file to be parsed.
# Alternative input: 'Ciena_error_lines_20220701-20220715.txt'
logName = 'ciena-mini.txt'

# The vector directory of converted logs
vector_dir = os.path.join(os.getcwd(), "vectors")

# Input directory
indir = os.path.join(input_dir, os.path.dirname(logName))

## Code

# Function to generate regular expression to split log messages
def generate_logformat_regex(log_format):
    """Build a line-matching regular expression from a ``<Field>``-style format.

    Each ``<Name>`` placeholder in ``log_format`` becomes a lazy named group
    ``(?P<Name>.*?)``; every run of spaces in the text between placeholders
    becomes ``\\s+``. The text between placeholders is otherwise inserted into
    the pattern as-is (it is not regex-escaped).

    Parameters
    ----------
    log_format : str
        Format description such as ``'<Date> <Time> <Content>'``.

    Returns
    -------
    tuple
        ``(headers, regex)`` where ``headers`` is the list of placeholder
        names in order of appearance and ``regex`` is the compiled pattern,
        anchored with ``^`` and ``$``.
    """
    headers = []
    pattern = ''
    # Splitting on a capturing group alternates literal text (even positions)
    # with the captured ``<Field>`` placeholders (odd positions).
    for position, token in enumerate(re.split(r'(<[^<>]+>)', log_format)):
        if position % 2 == 0:
            # Raw-string replacement template: ``\\`` expands to one literal
            # backslash, so the emitted text is ``\s+``. The previous non-raw
            # literal relied on the invalid string escape ``\s``.
            pattern += re.sub(' +', r'\\s+', token)
        else:
            header = token.strip('<').strip('>')
            pattern += f'(?P<{header}>.*?)'
            headers.append(header)
    return headers, re.compile('^' + pattern + '$')

# Function to transform log file to dataframe 
def log_to_dataframe(log_file, regex, headers, logformat):
    """Read a log file and return its matching lines as a DataFrame.

    Every line is stripped and searched with ``regex``. Lines that do not
    match are skipped; for matching lines the named groups listed in
    ``headers`` become the row values.

    Parameters
    ----------
    log_file : str
        Path of the log file to read.
    regex : compiled pattern
        Pattern exposing one named group per entry of ``headers``.
    headers : list of str
        Group names to extract; they also become the column names.
    logformat : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per matched line, with a leading ``LineId`` column numbered
        from 1. ``LineId`` counts matched lines only, so it differs from the
        position in the file whenever a line was skipped.
    """
    log_messages = []
    with open(log_file, 'r') as fin:
        # Iterate the handle directly so the whole file is never held in memory.
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # Skip only lines that do not follow the format, instead of
                # suppressing every possible exception.
                continue
            log_messages.append([match.group(header) for header in headers])
    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
    return logdf

# Preprocesses dataframe with regexes, if necessary - more preprocessing to add
def preprocess_df(df_log, rex=None):
    """Mask every regex match in the ``Content`` column with ``<*>``.

    The patterns are applied one after another, each on the result of the
    previous one. (Previously every pattern was applied to the original text
    and overwrote the cell, so only the last pattern had any effect.)

    Parameters
    ----------
    df_log : pandas.DataFrame
        Frame with a ``Content`` column of strings. It is modified in place.
    rex : iterable of str, optional
        Patterns to mask. When omitted, the module-level ``regex`` list is
        used, which is what existing callers rely on.

    Returns
    -------
    pandas.DataFrame
        The same ``df_log`` object, for call chaining.
    """
    patterns = regex if rex is None else rex
    if not patterns:
        # Nothing to mask: leave the frame untouched.
        return df_log

    def mask(content):
        # Feed each substitution's output into the next pattern.
        for current_rex in patterns:
            content = re.sub(current_rex, '<*>', content)
        return content

    df_log["Content"] = df_log["Content"].map(mask)
    return df_log

# Calls conversion from data to dataframe
def load_data():
    """Load the configured log file as a DataFrame.

    Reads the module-level ``indir``, ``logName`` and ``log_format`` settings,
    builds the line pattern with ``generate_logformat_regex`` and passes it,
    with the file path, to ``log_to_dataframe``.
    """
    source_path = os.path.join(indir, logName)
    field_names, line_pattern = generate_logformat_regex(log_format)
    return log_to_dataframe(source_path, line_pattern, field_names, log_format)

# Transforms the dataset, creating raw vector file
def transform_dataset(raw_content):
    """Return sentence embeddings for ``raw_content``, cached on disk.

    The cache file is ``<vector_dir>/<logName>_vectors.vec`` (both names are
    module-level settings). If it exists, its pickled content is returned and
    ``raw_content`` is not looked at; otherwise the texts are encoded with the
    ``all-mpnet-base-v2`` SentenceTransformer model and the result is pickled
    to that path.

    The cache is keyed by ``logName`` only: delete the file to force a
    re-encode after the log content or its preprocessing changes.

    Parameters
    ----------
    raw_content : sequence of str
        Log messages to embed.

    Returns
    -------
    The object returned by ``model.encode`` (or previously pickled from it).
    """
    cache_path = Path(os.path.join(vector_dir, logName + '_vectors.vec'))

    if cache_path.is_file():
        # SECURITY: unpickling can execute arbitrary code. Only load cache
        # files that were produced by this notebook.
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    # Using standard MPNet transformer
    model = SentenceTransformer('all-mpnet-base-v2')
    print("Iniciando encode")
    vectors = model.encode(raw_content)

    # Create the vector directory on first use, and close the handle
    # deterministically so the cache is fully flushed to disk.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(vectors, cache_file)

    return vectors

# Creates embeddings for log file
def transform(logName):
    """Run the whole embedding pipeline and return the vectors.

    Steps: load the log into a DataFrame (``load_data``), mask its content
    (``preprocess_df``), then embed the ``Content`` column
    (``transform_dataset``).

    Note: the ``logName`` argument is only used for the progress message
    printed here; the file that is read and the cache file name come from
    the module-level ``logName``.
    """
    print('Transforming file: ' + os.path.join(input_dir, logName))
    masked_df = preprocess_df(load_data())
    return transform_dataset(masked_df["Content"])

# Run the embedding pipeline for the configured log file. Despite the `_df`
# suffix, the inspection cell below reports the result as a numpy.ndarray.
vector_df = transform(os.path.basename(logName))


Transforming file: c:\Users\vbert\OneDrive\DOUTORADO Poly Mtl\Projeto\CSL\CSL-1\logs\Ciena_error_lines_20220701-20220715.txt


In [7]:
# Quick sanity check of the embeddings: container type, number of rows,
# number of columns, then two individual components.
print(type(vector_df))
row_count = len(vector_df)
print("O número de linhas do arquivo transformado é {}".format(row_count))
col_count = len(vector_df[0])
print("O número de colunas do arquivo transformado é {}".format(col_count))
for row, col in ((2, 1), (0, 0)):
    print(vector_df[row][col])

<class 'numpy.ndarray'>
O número de linhas do arquivo transformado é 15710
O número de colunas do arquivo transformado é 768
0.0185639
0.005103809


In [59]:
## Third Step - Creates matrix of parsed items

from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
import pandas as pd 

## General Parameters

output_dir = os.path.join(os.getcwd(), "results")  # The output directory of parsing results
logName = 'Andriod_2k.log' # Name of file to be parsed
# Build the CSV name from the logName set just above instead of `log_file`,
# which only exists when the Drain parsing cell has run in this kernel.
output_csv = os.path.join(output_dir, os.path.basename(logName) + '_structured.csv')

## Code

# Reads the structured parsing result
full_df = pd.read_csv(output_csv)

# Each ParameterList cell is read as a string and evaluated into a Python
# object (expected to be a list). A new Series is produced in one pass;
# full_df itself is left unmodified.
var_df = full_df["ParameterList"].apply(literal_eval)

# Transforms variables list to variable matrix: one row per log line,
# one indicator column per distinct parameter value.
mlb = MultiLabelBinarizer()
var_matrix = pd.DataFrame(mlb.fit_transform(var_df), columns=mlb.classes_)
print (var_matrix.shape)

(2000, 1839)
-1,
-2147483632
0, 0),
0|com.tencent.mm|4097|null|10112
0|com.tencent.mobileqq|121|null|10111
1
1,
10020
10027
10037,callingPid
10057
