In [13]:
# !module --ignore_cache load python/3.9.6
# !pip uninstall torch -y
# !pip uninstall transformers -y
# !rm -r ~/.local/lib/python3.9/site-packages/torch-2.0.1+computecanada.dist-info/
# !pip install ~/torch-2.0.1+computecanada-cp39-cp39-linux_x86_64.whl
# !pip install numpy==1.24.4 tqdm scikit-learn torch==2.0.1 transformers==4.37.1 rich

In [1]:
import sys
sys.path.append("..")
import linetracker.main as m
import linetracker.parser.parser as p
import linetracker.embeddings.llm as llm_embedding
import linetracker.embeddings.distances as d
import linetracker.line_distance as ld
import linetracker.parser.variables_matrix as ev
import linetracker.clustering.kmedoid as clustK

import numpy as np
import h5py
import json
import time
import tqdm
from rich.console import Console

## 1. Get example log files

```python
splits = m2.get_split_build_logs("../data/splitted_event_ids.json")# type: ignore
print("Sizes available: ",np.unique([len(s) for s in splits.values()]))
```
```
Sizes available:  [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  75
  77 127 129 130 131 132 145 147 148]
```

```python
# get log file of size specified
sizes = [2, 3, 10, 148]
splits_samples = {}
with h5py.File("../data/trat3_production_1650_1700_20231411_v1.hdf5") as fp:
    splits_samples = {}
    for k,split in splits.items():
        if len(split) not in sizes or len(split) in splits_samples:
            continue
        L = []
        for e in tqdm.tqdm(split):
            L.append({**fp[e].attrs})
        splits_samples[len(split)] = [k,L]
```
```
  0%|          | 0/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<00:00, 31.80it/s]
100%|██████████| 2/2 [00:00<00:00, 38.69it/s]
100%|██████████| 3/3 [00:00<00:00, 38.14it/s]
100%|██████████| 148/148 [00:03<00:00, 45.37it/s]
```

```python
class CustomEncoder(json.JSONEncoder):
    def default(self, obj):
        print(type(obj),obj)
        if isinstance(obj, np.int32):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return self.default(obj.tolist())
        if isinstance(obj, list):
            return [self.default(e) for e in obj.tolist()]
        if isinstance(obj, dict):
            return {k:self.default(v) for k,v in obj.items()}
        else:
            return obj

with open("./data.json", "w") as fp:
    json.dump(splits_samples, fp, cls=CustomEncoder)

## 2. Execute the pipeline

In [2]:
with open("data.json") as fp:
    splits_samples = json.load(fp)
splits_samples = {k:v for k,v in sorted(splits_samples.items(),key=lambda x:int(x[0]))}
print(list(splits_samples.keys()))

['2', '3', '10', '148']


We have the doc

```python
def execute_full_pipeline(
    logs: List[LogData],
    triplet_coefficient: TripletCoef,
    parser: Callable[[List[LogData]], List[p.ParsedLine]],
    embedder: Callable[[List[str]], Generator[np.ndarray, None, None]],
    embedding_distance_fn: Callable[[List[np.ndarray]], np.ndarray],
    line_distance_fn: Callable[[List[LogData]], np.ndarray],
    clustering_fn: Callable[[np.ndarray], c.ClusteringAlgorithmOutput],
    float_precision: type = np.float32,
) -> c.ClusteringAlgorithmOutput:
    """Cluster logs provided in argument into groups of related log lines
    # Arguments
    - logs: List[LogData], the log lines
    - triplet_coefficient: TripletCoef, the three coefficients to use to ponderate the matrices
    - parser: Callable[[List[LogData]], List[p.ParsedLine]], a function that from the list of logs lines can generate for each line
    - embedder: Callable[[List[str]], Generator[np.ndarray, None, None]], the function that can generate embeddings from logs
    - embedding_distance_fn: Callable[[List[np.ndarray]], np.ndarray], given all embeddings of each log lines of the same log file, generate the normalized (between 0 and 1) distances between all embeddings
    - line_distance_fn: Callable[[List[str]],np.ndarray], a function that can generate a matrix with the distance between each log line
    - clustering_fn:  Callable[[np.ndarray], c.ClusteringAlgorithmOutput], taking the combined matrix with the coefficients provided, clusters the logs
    - float_precision: type = np.float32, the precision to use for all floating point matrices
    """
```

In [3]:
# build the functions for the pipeline
parser = lambda logs:p.get_parsing_drainparser([e['text'] for e in logs],depth=3,similarity_threshold=0.1,max_children=5)
models_names = ["meta-llama/Llama-2-7b-chat-hf","WhereIsAI/UAE-Large-V1", "BAAI/bge-large-en-v1.5"]
model_name = models_names[2]
init_embedder = llm_embedding.generate_embeddings_llm(model_name=model_name,token="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", use_cpu=True)
pooling_fn = lambda embedding:embedding
embedder = lambda logs: init_embedder(logs, pooling_fn,limit_tokens=100,precision=np.float16)# type: ignore
embedding_distance_fn = d.normalized_cosine_distance
line_distance_fn = ld.get_absolute_line_distance_matrix
clustering_fn = lambda combined_matrix: clustK.get_clustering_kmedoid(combined_matrix)['clustering']
float_precision = np.float16
triplet_coefficient = m.TripletCoef(coef_variables_matrix=0.4, coef_embeddings_matrix=0.6, coef_count_matrix=0.0)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/robin/.cache/huggingface/token
Login successful
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/robin/.cache/huggingface/token
Login successful


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at BAAI/bge-large-zh-v1.5 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We will execute each step of the pipeline:
```python
    # 1. parse the logs
    parsed_logs: List[p.ParsedLine] = parser(logs)
    logs_texts = [e["text"] for e in logs]
    parsed_variables = [e["variables"] for e in parsed_logs]
    # 2. build the variable matrix (alreay normalized matrix as it has values between 0 and 1)
    variables_distance_matrix = e.get_variable_matrix(parsed_variables).astype(float_precision)
    # 3. build the embeddings
    embeddings: List[np.ndarray] = [embedding for embedding in embedder(logs_texts)]
    # 4. build the distance matrix
    embeddings_distance_matrix = embedding_distance_fn(embeddings).astype(
        float_precision
    )
    del embeddings
    # 5. build the count matrix
    count_matrix = line_distance_fn(logs).astype(float_precision)
    # 6. merge the matrices with triplet coefficient
    combined_matrix = combine_matrices(
        TripletMatrix(
            variables_matrix=variables_distance_matrix,
            embeddings_matrix=embeddings_distance_matrix,
            count_matrix=count_matrix,
        ),
        triplet_coef=triplet_coefficient,
    ).astype(float_precision)
    # note: values will be between 0 and 3 (addition of 3 matrices normalized between 0 and 3)
    del variables_distance_matrix
    del embeddings_distance_matrix
    # 7. run the clustering algorithm with the constraints
    clustering_output = clustering_fn(combined_matrix)
    # 8. return the result
    return clustering_output
```

First step:
```python
    # 1. parse the logs
    parsed_logs: List[p.ParsedLine] = parser(logs)
    logs_texts = [e["text"] for e in logs]
    parsed_variables = [e["variables"] for e in parsed_logs]
```

In [4]:
dict_parsed_variables = {}
# for each of our log file with different number of lines, we apply step 1 and save the result
for size,[build_log_name, logs] in splits_samples.items():
    print(f"{build_log_name:-^100}")
    parsed_logs = parser(logs)
    # Apply step 1
    logs_texts = [e['text'] for e in logs]
    parsed_variables = [e['variables'] for e in parsed_logs]
    # Show the result (truncated)
    for i,(text, variables) in enumerate(zip(logs_texts, parsed_variables)):
        print(variables,": ",text)
        if i > 10:
            print("...")
            break
    # And save the result
    dict_parsed_variables[size] = parsed_variables

----------------------------------------243909--TCSBASE6_ppc----------------------------------------
[] :  2023-11-03 06:51:16 sed: can't read dw_stubsA.d: No such file or directory

[] :  2023-11-03 06:52:10 ldppc: BFD 2.15 assertion fail ../../binutils-2.15/bfd/linker.c:619

------------------------------------244245--COREBASE69_SPAP3_sim------------------------------------
['14:20:06'] :  2023-11-05 14:20:06 make[2]: cleartool: Command not found

['14:20:07'] :  2023-11-05 14:20:07 make[2]: cleartool: Command not found

[] :  cp: cannot stat '/localdisk/6500_repo/ome/vobs/equinox_ne_build/basebuild/EQBASE/sim/COREBASE69_SPAP3/*elf': No such file or directory

------------------------------------------242344--otsc_ppc------------------------------------------
['sed:', "can't read *.h.temp:"] :  2023-10-27 07:05:36 sed: can't read *.h.temp: No such file or directory

['07:06:21'] :  2023-10-27 07:06:21 tput: No value for $TERM and no -T specified

['07:05:36'] :  2023-10-27 07:05:36 t

Then step2: 
```python
    # 2. build the variable matrix (alreay normalized matrix as it has values between 0 and 1)
    variables_distance_matrix = e.get_variable_matrix(parsed_variables).astype(float_precision)
```

In [5]:
dict_variables_distance_matrix = {}
for size,parsed_variables in dict_parsed_variables.items():
    variables_distance_matrix = ev.get_variable_matrix(parsed_events=parsed_variables)
    dict_variables_distance_matrix[size] = variables_distance_matrix
    print(f"{size:-^100}")
    for i,v in enumerate(parsed_variables):
        print(i,":",v)
        if i > 10:
            print("...")
            break
    print(dict_variables_distance_matrix[size])
    print("-"*100)
    

-------------------------------------------------2--------------------------------------------------
0 : []
1 : []
[[0. 0.]
 [0. 0.]]
----------------------------------------------------------------------------------------------------
-------------------------------------------------3--------------------------------------------------
0 : ['14:20:06']
1 : ['14:20:07']
2 : []
[[0. 1. 1.]
 [1. 0. 1.]
 [1. 1. 1.]]
----------------------------------------------------------------------------------------------------
-------------------------------------------------10-------------------------------------------------
0 : ['sed:', "can't read *.h.temp:"]
1 : ['07:06:21']
2 : ['07:05:36']
3 : ['07:05:36']
4 : ['07:05:14']
5 : ['07:05:35']
6 : ['07:05:14']
7 : ['07:06:21']
8 : ['mv:', "cannot stat '*.h':"]
9 : ['07:05:35']
[[0.66666667 1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         0.         1.         1.         1.         1.
  1.      

Then step 3
```python
    # 1. parse the logs
    parsed_logs: List[p.ParsedLine] = parser(logs)
    logs_texts = [e["text"] for e in logs]
    parsed_variables = [e["variables"] for e in parsed_logs]
    # 2. build the variable matrix (alreay normalized matrix as it has values between 0 and 1)
    variables_distance_matrix = e.get_variable_matrix(parsed_variables).astype(float_precision)
    # 3. build the embeddings
    embeddings: np.ndarray = np.array(
        [embedding for embedding in embedder(logs_texts)]
    ).astype(float_precision)
    # 4. build the distance matrix
    embeddings_distance_matrix = embedding_distance_fn(embeddings).astype(
        float_precision
    )
    del embeddings
```

In [6]:
%%time
dict_embeddings_distance_matrix = {}
for size,[build_log_name, logs] in splits_samples.items():
    print(f"{size:-^100}")
    logs_texts = [e['text'] for e in logs]
    start = time.perf_counter()
    embeddings = np.array(
        [embedding for embedding in embedder(logs_texts)]
    ).astype(float_precision)
    diff = time.perf_counter()-start
    print(f"{embeddings.shape=}, obtained in {diff} second ({diff/embeddings.shape[0]} s/embedding)")
    embeddings_distance_matrix = embedding_distance_fn(embeddings).astype(
        float_precision
    )
    print("*"*100)
    print(embeddings_distance_matrix)
    dict_embeddings_distance_matrix[size] = embeddings_distance_matrix
    del embeddings

-------------------------------------------------2--------------------------------------------------


embeddings.shape=(2, 21128), obtained in 1.0482574999999983 second (0.5241287499999991 s/embedding)
****************************************************************************************************
[[0.     0.1793]
 [0.1793 0.    ]]
-------------------------------------------------3--------------------------------------------------
embeddings.shape=(3, 21128), obtained in 0.8414668999999968 second (0.2804889666666656 s/embedding)
****************************************************************************************************
[[0.000e+00 2.649e-04 3.894e-01]
 [2.649e-04 0.000e+00 3.872e-01]
 [3.894e-01 3.872e-01 0.000e+00]]
-------------------------------------------------10-------------------------------------------------
embeddings.shape=(10, 21128), obtained in 2.7178889999999996 second (0.27178889999999994 s/embedding)
****************************************************************************************************
[[0.       0.1074   0.09576  0.09576  0.104    0.0978   0.


```python
    # 1. parse the logs
    parsed_logs: List[p.ParsedLine] = parser(logs)
    logs_texts = [e["text"] for e in logs]
    parsed_variables = [e["variables"] for e in parsed_logs]
    # 2. build the variable matrix (alreay normalized matrix as it has values between 0 and 1)
    variables_distance_matrix = e.get_variable_matrix(parsed_variables).astype(float_precision)
    # 3. build the embeddings
    embeddings: List[np.ndarray] = [embedding for embedding in embedder(logs_texts)]
    # 4. build the distance matrix
    embeddings_distance_matrix = embedding_distance_fn(embeddings).astype(
        float_precision
    )
    del embeddings
    # 5. build the count matrix
    count_matrix = line_distance_fn(logs).astype(float_precision)
```

In [7]:
%%time
splits_samples = {k:splits_samples[k] for k in sorted(splits_samples)}#type: ignore
dict_count_matrix = {}
for size,[build_log_name, logs] in splits_samples.items():
    print(f"{size:-^100}")
    count_matrix = line_distance_fn(logs).astype(float_precision)
    dict_count_matrix[size] = count_matrix
    print(count_matrix)

-------------------------------------------------10-------------------------------------------------
[[0.     0.1111 0.2222 0.3333 0.4443 0.5557 0.6665 0.778  0.8887 1.    ]
 [0.1111 0.     0.1111 0.2222 0.3333 0.4443 0.5557 0.6665 0.778  0.8887]
 [0.2222 0.1111 0.     0.1111 0.2222 0.3333 0.4443 0.5557 0.6665 0.778 ]
 [0.3333 0.2222 0.1111 0.     0.1111 0.2222 0.3333 0.4443 0.5557 0.6665]
 [0.4443 0.3333 0.2222 0.1111 0.     0.1111 0.2222 0.3333 0.4443 0.5557]
 [0.5557 0.4443 0.3333 0.2222 0.1111 0.     0.1111 0.2222 0.3333 0.4443]
 [0.6665 0.5557 0.4443 0.3333 0.2222 0.1111 0.     0.1111 0.2222 0.3333]
 [0.778  0.6665 0.5557 0.4443 0.3333 0.2222 0.1111 0.     0.1111 0.2222]
 [0.8887 0.778  0.6665 0.5557 0.4443 0.3333 0.2222 0.1111 0.     0.1111]
 [1.     0.8887 0.778  0.6665 0.5557 0.4443 0.3333 0.2222 0.1111 0.    ]]
------------------------------------------------148-------------------------------------------------
[[0.     0.0068 0.0136 ... 0.9863 0.993  1.    ]
 [0.0068 0.     0.

```python
    # 1. parse the logs
    parsed_logs: List[p.ParsedLine] = parser(logs)
    logs_texts = [e["text"] for e in logs]
    parsed_variables = [e["variables"] for e in parsed_logs]
    # 2. build the variable matrix (alreay normalized matrix as it has values between 0 and 1)
    variables_distance_matrix = e.get_variable_matrix(parsed_variables).astype(float_precision)
    # 3. build the embeddings
    embeddings: np.ndarray = np.array(
        [embedding for embedding in embedder(logs_texts)]
    ).astype(float_precision)
    # 4. build the distance matrix
    embeddings_distance_matrix = embedding_distance_fn(embeddings).astype(
        float_precision
    )
    del embeddings
    # 5. build the count matrix
    count_matrix = line_distance_fn(logs).astype(float_precision)
    # 6. merge the matrices with triplet coefficient
    combined_matrix = combine_matrices(
        TripletMatrix(
            variables_matrix=variables_distance_matrix,
            embeddings_matrix=embeddings_distance_matrix,
            count_matrix=count_matrix,
        ),
        triplet_coef=triplet_coefficient,
    ).astype(float_precision)
    # note: values will be between 0 and 3 (addition of 3 matrices normalized between 0 and 3)
    del variables_distance_matrix
    del embeddings_distance_matrix
```

In [8]:
dico_combined_matrix = {}
for size in dict_count_matrix:
    print(size)
    for i,mat in enumerate([dict_variables_distance_matrix[size],dict_embeddings_distance_matrix[size],dict_count_matrix[size]]):
        assert np.unique(np.diag(mat)).tolist() == [0], f"Error for matrix {i}\n{mat}"
    dico_combined_matrix[size] = m.combine_matrices(
        m.TripletMatrix(
            variables_matrix=dict_variables_distance_matrix[size],
            embeddings_matrix=dict_embeddings_distance_matrix[size],
            count_matrix=dict_count_matrix[size],
        ),
        triplet_coef=triplet_coefficient,
    ).astype(float_precision)
    

10


AssertionError: Error for matrix 0
[[0.66666667 1.         1.         1.         1.         1.
  1.         1.         1.         1.        ]
 [1.         0.         1.         1.         1.         1.
  1.         0.         1.         1.        ]
 [1.         1.         0.         0.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         0.         0.         1.         1.
  1.         1.         1.         1.        ]
 [1.         1.         1.         1.         0.         1.
  0.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         0.
  1.         1.         1.         0.        ]
 [1.         1.         1.         1.         0.         1.
  0.         1.         1.         1.        ]
 [1.         0.         1.         1.         1.         1.
  1.         0.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         0.66666667 1.        ]
 [1.         1.         1.         1.         1.         0.
  1.         1.         1.         0.        ]]

To finally have
```python
    # 1. parse the logs
    parsed_logs: List[p.ParsedLine] = parser(logs)
    logs_texts = [e["text"] for e in logs]
    parsed_variables = [e["variables"] for e in parsed_logs]
    # 2. build the variable matrix (alreay normalized matrix as it has values between 0 and 1)
    variables_distance_matrix = e.get_variable_matrix(parsed_variables).astype(float_precision)
    # 3. build the embeddings
    embeddings: np.ndarray = np.array(
        [embedding for embedding in embedder(logs_texts)]
    ).astype(float_precision)
    # 4. build the distance matrix
    embeddings_distance_matrix = embedding_distance_fn(embeddings).astype(
        float_precision
    )
    del embeddings
    # 5. build the count matrix
    count_matrix = line_distance_fn(logs).astype(float_precision)
    # 6. merge the matrices with triplet coefficient
    combined_matrix = combine_matrices(
        TripletMatrix(
            variables_matrix=variables_distance_matrix,
            embeddings_matrix=embeddings_distance_matrix,
            count_matrix=count_matrix,
        ),
        triplet_coef=triplet_coefficient,
    ).astype(float_precision)
    # note: values will be between 0 and 3 (addition of 3 matrices normalized between 0 and 3)
    del variables_distance_matrix
    del embeddings_distance_matrix
    # 7. run the clustering algorithm with the constraints
    clustering_output = clustering_fn(combined_matrix)
    # 8. return the result
    return clustering_output
```

In [10]:
dico_clustering_output = {}
for size,matrix in dico_combined_matrix.items():
    print(np.diag(matrix))
    dico_clustering_output[size] = clustering_fn(matrix)
    

[0.2666 0.     0.     0.     0.     0.     0.     0.     0.2666 0.    ]


ValueError: The precomputed distance matrix contains non-zero elements on the diagonal. Use np.fill_diagonal(X, 0).

Finally we show the text with each cluster

In [10]:
import colorsys

def generate_hsv_palette(num_colors, saturation=1.0, value=1.0):
    colors = []
    hue_step = 1.0 / num_colors

    for i in range(num_colors):
        hue = i * hue_step
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        rgb = tuple(int(x * 255) for x in rgb)
        colors.append(rgb)

    return colors



In [11]:
console = Console(color_system="auto", highlight=False, force_jupyter=True)
dico_clustering_output = {s:dico_clustering_output[s] for s in sorted(dico_clustering_output,key=lambda x:int(x))}
print(list(dico_clustering_output.keys()))
for size, _ in dico_clustering_output.items():
    console.print(f"{size:-^100}", style=f"white", end="" )
    clustering_output = list(dico_clustering_output[size].values())
    unique_clusters =  np.unique(clustering_output)
    mapping = {clust:col for clust,col in zip(unique_clusters,generate_hsv_palette(len(unique_clusters),saturation=0.75))}
    for line_id, (log,cluster) in enumerate(zip(splits_samples[size][1], clustering_output)):
        text = f"{line_id:03d}-{cluster}: {log['text']}"
        r,g,b = mapping[cluster]
        console.print(text, style=f"rgb({r},{g},{b})", end="" )

['2', '3', '10', '148']


In [9]:
import pandas as pd
df = pd.read_json("../data/stats_dataset.json")
counts_by_log_plan = df.groupby("name").size().reset_index(name='count')
print(counts_by_log_plan.describe())
from datetime import timedelta
print(timedelta(seconds=float(counts_by_log_plan['count'].sum())*0.40))


              count
count  36599.000000
mean      18.587065
std       19.757756
min        1.000000
25%       10.000000
50%       15.000000
75%       21.000000
max      148.000000


In [10]:
counts_by_log_plan['count'].sum()

680268