In [None]:
## Transforms files for general topic modeling

import re

# Dataset whose ground-truth file is split into raw log lines and summaries.
dataset = "Zookeeper"
# Marker line that opens a group of raw log lines (a number between '#').
LINE_PATTERN = "#[0-9]+#"
# Marker line that opens the summary text of a group.
SUMMARY_PATTERN = "#summary:#"
line_file = []         # every raw log line, in file order
summary_file = []      # summary text, repeated once per log line of its group
line_match = True      # True while the current section holds log lines
summary_match = True   # True while the current section holds summary text
line_count = 0         # log lines collected since the last summary was emitted

# The context manager guarantees the input file is closed, even on error
# (the handle used to be left open for the rest of the kernel session).
with open("ground_truths/" + dataset + ".txt", "r") as file:
    for line in file:
        if len(line) == 1:
            # A bare newline terminates whatever section was being read.
            summary_match = False
            line_match = False
        elif re.match(LINE_PATTERN, line):
            line_match = True
            summary_match = False
        elif re.match(SUMMARY_PATTERN, line):
            # Leaving the log-line section here keeps summary text out of
            # line_file when no blank line precedes the summary marker.
            summary_match = True
            line_match = False
        elif line_match:
            line_file.append(line)
            line_count += 1
        elif summary_match:
            # Repeat the summary once per collected log line so both output
            # files stay aligned line by line.
            summary_file.extend([line.rstrip() + '\n'] * line_count)
            # Reset so further summary lines of the same group add nothing.
            line_count = 0

# Entries already end with a newline, so they are written verbatim.
with open("ground_truths/" + dataset + "_lines.txt", "w") as f:
    f.writelines(line_file)

with open("ground_truths/" + dataset + "_summaries.txt", "w") as f:
    f.writelines(summary_file)

In [None]:
## Parses the extracted log lines with Drain before topic modeling

## Drain parameters

import DrainMethod
import os

## Step 1 - Log Parsing Using Drain

# ---- Drain configuration ----
dataset = "Zookeeper"
# Raw logs are read from, and parsing results written to, the same folder.
input_dir = os.path.join(os.getcwd(), "ground_truths/")
output_dir = input_dir
# Layout of one log record; here the whole line is the content field.
log_format = '<Content>'
# File that Drain will parse.
logName = f"{dataset}_lines.txt"
# Extra regular expressions handed to Drain (none).
regex = []
# Maximum depth of the parsing tree.
depth = 5
# Drain similarity threshold.
st = 0.6

# ---- Run the parser ----
print('\n=== Starting Drain Parsing ===')
# Separate an optional sub-directory of logName from the bare file name.
log_subdir, log_file = os.path.split(logName)
indir = os.path.join(input_dir, log_subdir)
print(indir)

drain_options = {
    "log_format": log_format,
    "indir": indir,
    "outdir": output_dir,
    "rex": regex,
    "depth": depth,
    "st": st,
}
parser = DrainMethod.LogParser(**drain_options)
parser.parse(log_file)

# Path built from the parsed file name plus the '_structured.csv' suffix.
parsedresult = os.path.join(output_dir, f"{log_file}_structured.csv")


    


In [None]:
## Method to find the most representative line inside the cluster

from nltk.tokenize import WhitespaceTokenizer

tk = WhitespaceTokenizer()

## raw_lines = list of lines inside a LogSummary cluster
## word_list = list of tokens produced by LDA/BERTopic
def find_best_line(raw_lines, word_list):
    """Return the line of ``raw_lines`` that shares the most tokens with a topic.

    Each line is lower-cased and split on whitespace; every token that is one
    of the topic words counts once per occurrence. The first line reaching the
    highest count wins, and line 0 is returned when no line matches at all.
    The chosen index, its score and the line itself are printed.

    Args:
        raw_lines: Sequence of raw text lines belonging to one cluster.
        word_list: Topic words, either an iterable of tokens or a single
            whitespace-separated string of tokens.

    Returns:
        The selected element of ``raw_lines``, unchanged.

    Raises:
        IndexError: If ``raw_lines`` is empty.
    """
    # With a plain string, `token in word_list` is a substring test, so "on"
    # would count as a hit for the topic word "connection". Split it first so
    # only whole tokens match.
    if isinstance(word_list, str):
        word_list = word_list.split()
    topic_words = set(word_list)

    closest_line = 0
    max_similarity = 0
    for idx, line in enumerate(raw_lines):
        # str.split() tokenizes on whitespace, the same thing NLTK's
        # WhitespaceTokenizer does, without needing a module-level tokenizer.
        similar_tokens = sum(1 for token in line.lower().split() if token in topic_words)
        # Strict '>' keeps the earliest line on ties.
        if similar_tokens > max_similarity:
            max_similarity = similar_tokens
            closest_line = idx
    print("The closest line is {}, with {} identical tokens to the topic".format(closest_line, max_similarity))
    print("Line {} is: {}".format(closest_line, raw_lines[closest_line]))
    return raw_lines[closest_line]

#find_similar_lines(cluster_lines, 0, words)

In [None]:
## LDA Tests

import gensim.corpora as corpora
import pandas as pd
import gensim

# Dataset to analyse; the structured CSV path is derived from this name so
# switching datasets only requires changing this one line.
dataset = "Zookeeper"
csv = pd.read_csv("ground_truths/" + dataset + "_lines.txt_structured.csv")
# Column of the structured CSV that is fed to the topic model.
content = csv["EventTemplate"]
# Number of topics the LDA model is asked to build per batch.
num_topics = 10
line_file = []  # topic summaries, one entry per processed input line
line_set = []   # templates of the batch currently being collected

# Converts sentences to words
def sent_to_words(sentences):
    """Lazily yield one list of preprocessed tokens per input sentence.

    Every item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess``.
    """
    # Per the gensim documentation, deacc=True strips accent marks from tokens.
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Summarise the templates in consecutive batches of 20 lines with LDA.
# Only complete batches are processed; a shorter trailing batch is left over
# in line_set and produces no output lines.
for idx, line in enumerate(content):
    line_set.append(line + '\n')

    if (idx % 20 == 19):

        # Tokenize the batch, then build the dictionary and bag-of-words corpus
        data_words = list(sent_to_words(line_set))
        id2word = corpora.Dictionary(data_words)
        corpus = [id2word.doc2bow(text) for text in data_words]

        # Fit LDA on this batch only
        lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics)

        # Ask for a single topic and keep its 10 words (weights dropped)
        x = lda_model.show_topics(num_topics=1, num_words=10, formatted=False)
        topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
        topic, words = topics_words[0]
        summary = " ".join(words)

        # Finds the most representative line inside the cluster. The token
        # list is passed (not the joined summary string) so that matching is
        # done on whole tokens instead of substrings of the summary.
        best_line = find_best_line(line_set, words)

        # One summary entry per line of the batch keeps the output aligned
        # with the input lines.
        line_file.extend([summary] * len(line_set))

        line_set = []

## Writes external file with created topics
with open("ground_truths/" + dataset + "_lda_topics.txt", "w") as f:
    for line in line_file:
        f.write(f"{line}\n")

In [None]:
## Tests with BERTopic

from sklearn_extra.cluster import KMedoids
from bertopic import BERTopic
import pandas as pd
import numpy as np
import pickle
import os


# Dataset to analyse; the structured CSV path is derived from this name.
dataset = "Zookeeper"
csv = pd.read_csv("ground_truths/" + dataset + "_lines.txt_structured.csv")
# Column of the structured CSV that is fed to the topic model.
content = csv["EventTemplate"]
num_topics = 10
line_file = []  # topic summaries, one entry per processed input line
line_set = []   # templates of the batch currently being collected
# A single-cluster K-Medoids model is plugged into BERTopic's clustering slot.
cluster_model = KMedoids(n_clusters = 1)
topic_model = BERTopic(hdbscan_model=cluster_model)

# Summarise the templates in consecutive batches of 20 lines.
# Only complete batches are processed; a shorter trailing batch is left over
# in line_set and produces no output lines.
for idx, line in enumerate(content):

    line_set.append(line + '\n')

    if (idx % 20 == 19):

        # Progress message (Portuguese: "we reached idx ...")
        print("Chegamos ao idx {}".format(idx))

        # Applies BERTopic to this batch only
        topics, probs = topic_model.fit_transform(line_set)

        # Gets the words of topic 0 (fetched once) and joins them into a summary
        top_topic = topic_model.get_topic(0)
        words = [i[0] for i in top_topic]
        summary = ' '.join(words)

        # Finds the most representative line inside the cluster. The token
        # list is passed (not the joined summary string) so that matching is
        # done on whole tokens instead of substrings of the summary.
        best_line = find_best_line(line_set, words)

        # One summary entry per line of the batch keeps the output aligned
        # with the input lines.
        line_file.extend([summary] * len(line_set))

        line_set = []

## Writes external file with created topics
with open("ground_truths/" + dataset + "_bert_topics.txt", "w") as f:
    for line in line_file:
        f.write(f"{line}\n")

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required to download the meta-llama checkpoint
# loaded in a later cell — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
## Tests with BERTopic, using our own embeddings

from sklearn_extra.cluster import KMedoids
from bertopic import BERTopic
import pandas as pd
import numpy as np
import pickle
import os

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from sentence_transformers import SentenceTransformer
from torch import bfloat16
from torch import cuda
#from umap import UMAP
import umap.umap_ as UMAP
import transformers
import accelerate

# Dataset information; the structured CSV path is derived from the dataset
# name so switching datasets only requires changing one line.
dataset = "Zookeeper"
csv = pd.read_csv("ground_truths/" + dataset + "_lines.txt_structured.csv")
# Column of the structured CSV that is fed to the topic model.
content = csv["EventTemplate"]

# Model parameters
model_id = 'meta-llama/Llama-2-13b-chat-hf'
# Use the current CUDA device when one is available, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

# # Quantization to load an LLM with less GPU memory
# bnb_config = transformers.BitsAndBytesConfig(
#     load_in_4bit=True,  # 4-bit quantization
#     bnb_4bit_quant_type='nf4',  # Normalized float 4
#     bnb_4bit_use_double_quant=True,  # Second quantization after the first
#     bnb_4bit_compute_dtype=bfloat16  # Computation type
# )

# System prompt: sets the assistant's role (labeling topics).
# The prompt texts below are sent to the model verbatim, so their wording
# must not be edited casually.
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""
# One worked example (documents + keywords -> expected label) that shows the
# model the answer format.
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
# NOTE(review): the tags are presumably substituted by BERTopic's
# TextGeneration representation model for each topic — confirm.
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""
# Final prompt: system prompt, then the worked example, then the template
prompt = system_prompt + example_prompt + main_prompt
# Tokenizer matching the Llama 2 checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Llama 2 weights; quantization stays disabled (its config is commented out
# earlier in this cell).
# NOTE(review): trust_remote_code=True allows code shipped with the
# checkpoint to run locally — confirm this is intended.
model_options = {
    "trust_remote_code": True,
    "device_map": 'auto',
}
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **model_options)
model.eval()

# Text-generation pipeline wrapping the model and tokenizer
generation_options = {
    "temperature": 0.1,
    "max_new_tokens": 500,
    "repetition_penalty": 1.1,
}
generator = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    **generation_options,
)

# The three topic representations: KeyBERT-inspired keywords, MMR with
# diversity 0.3, and labels generated by Llama 2 from the final prompt.
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)
llama2 = TextGeneration(generator, prompt=prompt)

# All representation models, keyed by the name they are exposed under
representation_model = dict(
    KeyBERT=keybert,
    Llama2=llama2,
    MMR=mmr,
)
# Sentence-embedding model used for all documents
embedding_model = SentenceTransformer("BAAI/bge-small-en")
# Pre-calculate embeddings for the whole dataset. This line used to be
# commented out, which left `embeddings` undefined for the 2-D reduction
# below (NameError on a fresh kernel). A plain list is passed so the encoder
# receives ordinary Python strings.
embeddings = embedding_model.encode(list(content), show_progress_bar=True)
# 5-component UMAP handed to BERTopic for dimensionality reduction
umap_model = UMAP.UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Pre-reduce embeddings to 2 components for visualization purposes
reduced_embeddings = UMAP.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
# A single-cluster K-Medoids model is plugged into BERTopic's clustering slot
cluster_model = KMedoids(n_clusters = 1)

topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=cluster_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

num_topics = 10
line_file = []  # topic summaries, one entry per processed input line
line_set = []   # templates of the batch currently being collected

# Summarise the templates in consecutive batches of 20 lines.
# Only complete batches are processed; a shorter trailing batch is left over
# in line_set and produces no output lines.
for idx, line in enumerate(content):

    line_set.append(line + '\n')

    if (idx % 20 == 19):

        # Progress message (Portuguese: "we reached idx ...")
        print("Chegamos ao idx {}".format(idx))

        # Creates embeddings for this batch only
        embeddings = embedding_model.encode(line_set, show_progress_bar=True)

        # Applies BERTopic with the precomputed batch embeddings
        topics, probs = topic_model.fit_transform(line_set, embeddings)

        # Gets the words of topic 0 (fetched once) and joins them into a summary
        top_topic = topic_model.get_topic(0)
        words = [i[0] for i in top_topic]
        summary = ' '.join(words)

        # Finds the most representative line inside the cluster. The token
        # list is passed (not the joined summary string) so that matching is
        # done on whole tokens instead of substrings of the summary.
        best_line = find_best_line(line_set, words)

        # One summary entry per line of the batch keeps the output aligned
        # with the input lines.
        line_file.extend([summary] * len(line_set))

        line_set = []

## Writes external file with created topics
# NOTE(review): this is the same output path used by the earlier BERTopic
# cell, so running both overwrites the first result — confirm this is intended.
with open("ground_truths/" + dataset + "_bert_topics.txt", "w") as f:
    for line in line_file:
        f.write(f"{line}\n")

In [None]:
# Environment check: print the version of the installed accelerate package.
import accelerate as ac

print(ac.__version__)
# 2.0.1
# NOTE(review): the value above is presumably the output recorded when this
# cell was last run — confirm.