In [None]:
################################## LIBRARIES ##################################

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn_extra.cluster import KMedoids
from bertopic import BERTopic
from nltk.tokenize import WhitespaceTokenizer
from ast import literal_eval
from pathlib import Path
from tqdm import tqdm
from umap import UMAP
import pandas as pd
import numpy as np
import regex as re
import DrainMethod
import contextlib
import pickle
import os
from rouge import Rouge
import matplotlib.pyplot as plt

############################## AUXILIARY METHODS ##############################

# Reads the HuggingFace token from a local text file
def get_huggingface_token():
    """Return the contents of ``huggingface_token.txt`` without surrounding whitespace.

    The file is looked up relative to the current working directory.
    """
    token_file = Path("huggingface_token.txt")
    raw_token = token_file.read_text()
    return raw_token.strip()

# Parses the configured log file into a dataframe
def load_data():
    """Build the line-matching regex from ``log_format`` and parse the log file.

    Reads the module-level names ``log_format``, ``indir`` and ``logName``;
    the file that gets parsed is ``indir/logName``.

    Returns:
        Whatever ``log_to_dataframe`` returns for that file.
    """
    field_names, line_pattern = generate_logformat_regex(log_format)
    log_path = os.path.join(indir, logName)
    return log_to_dataframe(log_path, line_pattern, field_names, log_format)

# Preprocesses dataframe with regexes
def preprocess_df(df_log, patterns=None):
    """Mask every pattern match in the ``Content`` column with ``<*>``.

    The patterns are applied one after another, each on the result of the
    previous substitution, so all of them take effect on every message.

    Args:
        df_log: DataFrame with a ``Content`` column of strings. The column is
            replaced in place.
        patterns: Iterable of regular expressions to mask. When ``None``, the
            module-level ``regex`` collection is used.

    Returns:
        The same ``df_log`` object, with ``Content`` rewritten.
    """
    if patterns is None:
        patterns = regex

    def _mask(content):
        # Feed each substitution's output into the next one; substituting on
        # the untouched original each time would keep only the last pattern.
        for current_rex in patterns:
            content = re.sub(current_rex, '<*>', content)
        return content

    df_log["Content"] = df_log["Content"].map(_mask)
    return df_log

# Function to generate regular expression to split log messages
def generate_logformat_regex(log_format):
    """Turn a ``<Field>``-style log format into field names and a line regex.

    Each ``<Name>`` placeholder becomes a lazy named group ``(?P<Name>.*?)``.
    The text between placeholders is copied into the pattern as-is (it is not
    escaped), except that every run of spaces becomes ``\\s+``.

    Args:
        log_format: Format string such as ``"<Date> <Time> <Content>"``.

    Returns:
        Tuple ``(headers, pattern)``: the placeholder names in order of
        appearance, and the compiled pattern anchored with ``^`` and ``$``.
    """
    headers = []
    pattern_parts = []
    # Splitting on a capturing group alternates literal text (even positions)
    # with the captured placeholders (odd positions).
    for position, piece in enumerate(re.split(r'(<[^<>]+>)', log_format)):
        if position % 2 == 0:
            # Raw replacement template: `\\` emits one backslash, giving `\s+`
            # in the pattern without relying on an invalid string escape.
            pattern_parts.append(re.sub(' +', r'\\s+', piece))
        else:
            header = piece.strip('<>').strip()
            pattern_parts.append(f'(?P<{header}>.*?)')
            headers.append(header)
    return headers, re.compile('^' + ''.join(pattern_parts) + '$')

# Function to transform log file to dataframe
def log_to_dataframe(log_file, regex, headers, logformat):
    """Parse a log file line by line into a DataFrame.

    Lines that do not match ``regex`` are skipped. Every matching line
    contributes one row holding the value of each named group in ``headers``.

    Args:
        log_file: Path of the log file to read.
        regex: Compiled pattern with one named group per entry of ``headers``.
        headers: Group names, used as the DataFrame column names.
        logformat: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with a 1-based ``LineId`` column (numbering the kept rows,
        not the lines of the file) followed by one column per header.

    Raises:
        IndexError: If a header is not a named group of ``regex``.
    """
    log_messages = []
    with open(log_file, 'r') as fin:
        for line in fin:
            match = regex.search(line.strip())
            # Skip unparseable lines explicitly instead of swallowing every
            # exception, so genuine errors are no longer hidden.
            if match is None:
                continue
            log_messages.append([match.group(header) for header in headers])
    logdf = pd.DataFrame(log_messages, columns=headers)
    logdf.insert(0, 'LineId', range(1, len(logdf) + 1))
    return logdf

# Transforms the dataset, creating raw vector file
def transform_dataset(raw_content):
    """Return TF-IDF vectors for ``raw_content``, cached on disk with pickle.

    The cache file is ``<vector_dir>/<logName>_vectors_TFIDF.vec`` (both names
    are module-level). If it exists it is loaded and ``raw_content`` is not
    used; otherwise the vectors are computed and written to that file.

    Args:
        raw_content: Iterable of documents passed to
            ``TfidfVectorizer.fit_transform``.

    Returns:
        The TF-IDF matrix, either unpickled from the cache or freshly computed.
    """
    path_to_file = os.path.join(vector_dir, logName + '_vectors_TFIDF.vec')
    if Path(path_to_file).is_file():
        # NOTE: the cache is keyed only by logName, so it is reused even if
        # raw_content changed since it was written.
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that this pipeline wrote itself.
        with open(path_to_file, 'rb') as cache_file:
            return pickle.load(cache_file)

    vectors_tfidf = TfidfVectorizer().fit_transform(raw_content)
    # Context managers close the file handle even if (un)pickling fails.
    with open(path_to_file, 'wb') as cache_file:
        pickle.dump(vectors_tfidf, cache_file)
    return vectors_tfidf

# Creates distance matrix, using Euclidean distance
def create_distance_matrix(vector_df):
    """Compute pairwise Euclidean distances and min-max scale them to [0, 1].

    Args:
        vector_df: Sample vectors accepted by
            ``sklearn.metrics.pairwise.pairwise_distances``.

    Returns:
        Square array of normalised distances. When every distance is equal
        (for example a single sample, or identical samples) an all-zero array
        is returned instead of dividing by zero.
    """
    tfidf_distance = pairwise_distances(vector_df, metric="euclidean", n_jobs=-1)
    lowest = tfidf_distance.min()
    span = tfidf_distance.max() - lowest
    if span == 0:
        # A zero range would turn the whole matrix into NaN.
        return np.zeros_like(tfidf_distance)
    return (tfidf_distance - lowest) / span

# Creates variable matrix, using Jaccard distance
def create_variable_matrix():
    """Compute pairwise Jaccard distances between the rows' parameter lists.

    Reads the ``ParameterList`` column of
    ``<cwd>/results/<log_file>_structured.csv``, parses each cell with
    ``ast.literal_eval``, one-hot encodes the resulting collections and
    returns the pairwise Jaccard distance matrix of those encodings.

    Returns:
        Square array of Jaccard distances, one row/column per CSV row.
    """
    # NOTE(review): this reads a module-level `log_file`, while the other
    # helpers in this file use `logName` — confirm that global exists and is
    # the intended one.
    output_csv = os.path.join(os.getcwd(), "results", log_file + '_structured.csv')
    var_df = pd.read_csv(output_csv)["ParameterList"].apply(literal_eval)
    mlb = MultiLabelBinarizer(sparse_output=True)
    # toarray() yields a plain ndarray; todense() yields np.matrix, which
    # recent scikit-learn versions reject. The bool cast matches what the
    # Jaccard metric operates on.
    indicator = mlb.fit_transform(var_df).toarray().astype(bool)
    return pairwise_distances(indicator, metric="jaccard", n_jobs=-1)

# Creates closeness matrix
def creates_closeness_matrix(tfidf_distance):
    """Build a matrix of normalised positional distances between rows.

    Entry ``(i, j)`` is ``|i - j| / (n - 1)``, where ``n`` is
    ``len(tfidf_distance)``. Only the length of the argument is used.

    Args:
        tfidf_distance: Any sized object; its length sets the matrix size.

    Returns:
        ``(n, n)`` float array with values in [0, 1]. For ``n <= 1`` an
        all-zero array of that shape is returned.
    """
    n = len(tfidf_distance)
    if n <= 1:
        # No pair of distinct positions exists: avoid 0/0 (n == 1) and the
        # min() of an empty array (n == 0).
        return np.zeros((n, n))
    positions = np.arange(n)
    count_distance = np.abs(np.subtract.outer(positions, positions))
    # The minimum is always 0 (diagonal) and the maximum n - 1 (corners), so
    # min-max scaling reduces to a single division.
    return count_distance / (n - 1)

# Saves matrices to files
def saves_matrices(distance_mat, variable_mat, closeness_mat):
    """Write the three matrices to the current directory with ``np.save``.

    File names are built from the module-level ``logName``. ``np.save`` writes
    NumPy's binary format and appends ``.npy`` to names that lack it, so the
    files on disk end in ``.csv.npy`` and are not CSV text.
    """
    targets = (
        (f"tfidf_distance_{logName}.csv", distance_mat),
        (f"var_distance_{logName}.csv", variable_mat),
        (f"count_distance_{logName}.csv", closeness_mat),
    )
    for filename, matrix in targets:
        np.save(filename, matrix)

# Combines matrices using weights
def joins_matrices(tfidf_distance, var_distance, count_distance, alpha, beta, gamma):
    """Return the weighted sum of the three distance matrices.

    Args:
        tfidf_distance: Matrix weighted by ``alpha``.
        var_distance: Matrix weighted by ``beta``.
        count_distance: Matrix weighted by ``gamma``.
        alpha, beta, gamma: Weights; they must sum to 1.

    Returns:
        ``alpha * tfidf_distance + beta * var_distance + gamma * count_distance``.

    Raises:
        ValueError: If the weights do not sum to 1, whether above or below.
    """
    # Compare with a tolerance: sums such as 0.6 + 0.3 + 0.1 are not exactly
    # 1.0 in floating point.
    tolerance = 1e-9
    if abs(alpha + beta + gamma - 1.0) > tolerance:
        raise ValueError("Values must sum to 1!")
    return alpha * tfidf_distance + beta * var_distance + gamma * count_distance

# Clusters with KMedoids
def cluster_kmedoids(unified_matrix, cluster_num):
    """Fit a PAM K-Medoids model with random initialisation.

    Args:
        unified_matrix: Data handed to ``KMedoids.fit``.
        cluster_num: Number of clusters to form.

    Returns:
        The result of ``KMedoids(...).fit(unified_matrix)``.
    """
    # NOTE(review): no `metric` is passed, so the estimator's default is used;
    # if `unified_matrix` is already a distance matrix, `metric='precomputed'`
    # may be what is intended — confirm.
    # NOTE(review): no `random_state` is set together with init='random', so
    # results presumably vary between runs — confirm that this is acceptable.
    settings = {"n_clusters": cluster_num, "method": 'pam', "init": 'random'}
    estimator = KMedoids(**settings)
    return estimator.fit(unified_matrix)
