In [1]:
import json
import os
import sys
import time

import h5py
import numpy as np
import tqdm
from rich.console import Console

# Put the parent directory on the import path before linetracker is imported.
sys.path.append("..")
import linetracker.main as m
import linetracker.parser.parser as p
import linetracker.embeddings.llm as llm_embedding
import linetracker.embeddings.distances as d
import linetracker.line_distance as ld
import linetracker.parser.variables_matrix as ev
import linetracker.clustering.kmedoid as clustK

In [2]:
# Load the samples and order them by their key interpreted as an integer
# (the keys are strings in the JSON file, so a plain sort would be lexical).
with open("data.json") as fp:
    splits_samples = dict(
        sorted(json.load(fp).items(), key=lambda item: int(item[0]))
    )
print(list(splits_samples))

['2', '3', '10', '148']


In [3]:
# build the functions for the pipeline


def parser(logs):
    """Parse log entries by passing each entry's 'text' field to
    p.get_parsing_drainparser with the fixed settings used in this notebook."""
    return p.get_parsing_drainparser(
        [e['text'] for e in logs],
        depth=3,
        similarity_threshold=0.1,
        max_children=5,
    )


models_names = ["meta-llama/Llama-2-7b-chat-hf","WhereIsAI/UAE-Large-V1", "BAAI/bge-large-zh-v1.5"]
model_name = models_names[2]

# SECURITY: the Hugging Face token must never be hard-coded in the notebook.
# It is read from the HF_TOKEN environment variable instead. The token that was
# previously committed here had write permission and must be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this cell."
    )
init_embedder = llm_embedding.generate_embeddings_llm(model_name=model_name, token=hf_token, use_cpu=True)
pooling_fn = llm_embedding.get_pooling_function()


def embedder(logs):
    """Embed `logs` with the initialised model, using `pooling_fn`, a limit of
    100 tokens and float16 precision."""
    return init_embedder(logs, pooling_fn, limit_tokens=100, precision=np.float16)  # type: ignore


embedding_distance_fn = d.normalized_cosine_distance
line_distance_fn = ld.get_absolute_line_distance_matrix


def clustering_fn(combined_matrix):
    """Cluster the combined matrix with clustK.get_clustering_kmedoid."""
    return clustK.get_clustering_kmedoid(combined_matrix)


float_precision = np.float16
triplet_coefficient = m.TripletCoef(coef_variables_matrix=0.4, coef_embeddings_matrix=0.6, coef_count_matrix=0.0)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/robin/.cache/huggingface/token
Login successful


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/robin/.cache/huggingface/token
Login successful


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at BAAI/bge-large-zh-v1.5 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Time one full pipeline run per sample and report the total and per-line cost.
for size, (log_name, logs) in splits_samples.items():
    print(f"{size:-^100}")
    start = time.perf_counter()
    m.execute_full_pipeline(
        logs, triplet_coefficient, parser, embedder,
        embedding_distance_fn, line_distance_fn, clustering_fn, float_precision,
    )
    elapsed = time.perf_counter() - start
    print(f"duration {elapsed:.2f}s ({elapsed / len(logs):.2f}s/log line)")

-------------------------------------------------2--------------------------------------------------
duration 0.85s (0.42s/log line)
-------------------------------------------------3--------------------------------------------------
duration 1.30s (0.43s/log line)
-------------------------------------------------10-------------------------------------------------
duration 4.50s (0.45s/log line)
------------------------------------------------148-------------------------------------------------
duration 75.97s (0.51s/log line)


In [10]:
import pandas as pd
from datetime import timedelta

df = pd.read_json("../data/stats_dataset.json")

# One row per value of "name", with the number of dataset rows it contains.
counts_by_log_plan = (
    df.groupby("name")
    .size()
    .reset_index(name="count")
)

print(df.columns)
print(counts_by_log_plan.describe())

n_lines = float(counts_by_log_plan["count"].sum())
print(f"{n_lines=}")

# Estimated time for the whole dataset at 0.51 s per line
# (presumably the slowest per-line figure from the timing cell; confirm).
print(timedelta(seconds=n_lines * 0.51))

Index(['dup_id', 'event_id', 'group_id', 'line_num', 'log_name', 'planid',
       'raw', 'template', 'text', 'variables', 'name', 'build_log'],
      dtype='object')
              count
count  36599.000000
mean      18.587065
std       19.757756
min        1.000000
25%       10.000000
50%       15.000000
75%       21.000000
max      148.000000
n_lines=680268.0
4 days, 0:22:16.680000


In [12]:
n = 5
# The sample file read below was produced once with the two commented lines
# (up to `n` entries per distinct count); they are kept as a provenance note.
# sampled_df = counts_by_log_plan.groupby('count', group_keys=False).apply(lambda group: group.sample(min(n, len(group))))
# sampled_df.to_json("./sampled_samples.json",orient="records")
sampled_df = pd.read_json("./sampled_samples.json")

n_elements = sampled_df["count"].sum()
# Rough runtime estimate; 0.5 and 10 are hard-coded multipliers
# (presumably seconds per line and runs per log; TODO confirm).
print(timedelta(seconds=n_elements * 0.5 * 10))

sampled_df = sampled_df.sort_values(by=["count"])
sampled_df

22:11:50


Unnamed: 0,name,count
0,"243488, COREBASE_CTM_ppc",1
1,"244856, COREBASE_CTM_ppc",1
2,"242656, COREBASE69_SP3AUX_ppc",1
3,"241129, COREBASE69_SP2_ppc",1
4,"226431, wr-container-wr-hal-dnx",1
...,...,...
370,"245445, ddf_vx_simbc69",145
371,"240760, ddf_vx_simbc69",147
372,"240761, ddf_vx_simbc69",147
373,"240698, ddf_vx_simbc69",148


In [13]:
import itertools as it

# Enumerate every (a, b, c) over the candidate weights and keep only the
# combinations whose sum is 1 (within a 1e-2 tolerance).
l_triplet = []
for a, b, c in it.product([0, 0.25, 1/3., 0.5, 0.75, 1], repeat=3):
    if not abs(a + b + c - 1) < 1e-2:
        continue
    print(a, b, c)
    l_triplet.append(
        m.TripletCoef(
            coef_variables_matrix=a,
            coef_embeddings_matrix=b,
            coef_count_matrix=c,
        )
    )


0 0 1
0 0.25 0.75
0 0.5 0.5
0 0.75 0.25
0 1 0
0.25 0 0.75
0.25 0.25 0.5
0.25 0.5 0.25
0.25 0.75 0
0.3333333333333333 0.3333333333333333 0.3333333333333333
0.5 0 0.5
0.5 0.25 0.25
0.5 0.5 0
0.75 0 0.25
0.75 0.25 0
1 0 0


In [17]:
class Encoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of `obj`.

        Handles NumPy booleans, integers and floats of any width, and NumPy
        arrays. Anything else is delegated to `json.JSONEncoder.default`,
        which raises `TypeError`.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        # np.integer / np.floating cover every width (int8..int64, float16..),
        # not just int32/int64; float16 and float32 are not Python floats.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # json serialises list/dict natively and never passes them here; these
        # branches only matter when default() is called directly.
        if isinstance(obj, list):
            # A Python list has no .tolist(); iterate over it directly.
            return [self.default(e) for e in obj]
        if isinstance(obj, dict):
            return {k:self.default(v) for k,v in obj.items()}
        return json.JSONEncoder.default(self, obj)

# Resume from the results of a previous (possibly interrupted) run.
# On a first run there is no results file yet, so start from an empty list.
try:
    with open("./results.json") as fp:
        L = json.load(fp)
except FileNotFoundError:
    L = []
df.loc[:,'line_number'] = df['line_num']

build_logs = sampled_df['name'].unique()
# `i` is the position of the current (build_log, triplet) pair in the fixed
# iteration order. Pairs with i < len(L) are already in L and are skipped;
# after that, i and len(L) advance together.
i = 0
try:
    with tqdm.tqdm(total=len(build_logs)*len(l_triplet)) as pbar:
        for build_log in build_logs:
            logs = None  # built only if a pair for this build log still has to run
            for triplet_coefficient in l_triplet:
                if i == len(L):
                    if logs is None:
                        # Boolean mask instead of a string-built query, so names
                        # containing quotes cannot break the filter.
                        df_build_log = df[df["build_log"] == build_log]
                        logs = df_build_log[['text','event_id', "line_number"]].to_dict(orient="records")
                    clustering_output = m.execute_full_pipeline(
                        logs,
                        triplet_coefficient,
                        parser,
                        embedder,
                        embedding_distance_fn,
                        line_distance_fn,
                        clustering_fn,
                        float_precision,
                    )
                    L.append({"build_log": build_log, **clustering_output})
                pbar.update(1)
                i += 1
finally:
    # Always save what has been computed, including on KeyboardInterrupt or a
    # pipeline error, so the next run resumes instead of starting over.
    # Serialise first so a failure here cannot truncate the existing file.
    serialized = json.dumps(L, cls=Encoder)
    with open("./results.json", "w") as fp:
        fp.write(serialized)

  0%|          | 0/6000 [00:00<?, ?it/s]

 23%|██▎       | 1404/6000 [1:39:50<12:26:51,  9.75s/it]