## How to evaluate the quality of embeddings for a given model

Overview:
1. Choose and prepare the model
2. Load the build log selected
3. Generate the embeddings
4. Evaluate the embedding qualitatively

In [1]:
import sys
sys.path.append("../..")
import linetracker.embeddings.llm as llm_embedding
import linetracker.embeddings.distances as d
import linetracker.pretty_print as pp
import linetracker.utils as u
from importlib import reload
reload(pp)
reload(d)
reload(u)
import re
import sklearn.preprocessing as prep
import numpy as np
import pandas as pd
import itertools as it
import random

## 1. Choose and prepare the model

(To load a pretrained model load the folder generated after training where there is a config.json file)

In [2]:
models_names = ["meta-llama/Llama-2-7b-chat-hf","WhereIsAI/UAE-Large-V1", "BAAI/bge-large-en-v1.5"]
model_name = models_names[2]
print(model_name)
init_embedder = llm_embedding.generate_embeddings_llm(model_name=model_name,token="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", use_cpu=True, local_files_only=True)

BAAI/bge-large-en-v1.5
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.


Token is valid (permission: write).
Your token has been saved to /home/robin/.cache/huggingface/token
Login successful
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/robin/.cache/huggingface/token
Login successful


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at BAAI/bge-large-en-v1.5 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
pooling_fn = lambda x:x
embedder = lambda logs: init_embedder(logs, pooling_fn,limit_tokens=100,precision=np.float32)# type: ignore

## 2. Load the build log selected

In [4]:
df = pd.read_json("../../data/stats_dataset.json")

Sample a random build log

In [5]:
random.seed(1)
build_log = random.choice(df['build_log'].unique())
build_log_df = df.query(f"build_log == '{build_log}'")[['text','group_id']]
build_log_df.loc[:, "text"] = build_log_df["text"].apply(u.remove_date_time).apply(lambda x:x.strip())
encoder = prep.LabelEncoder()
build_log_df.loc[:, "group_id"] = prep.LabelEncoder().fit_transform(build_log_df["group_id"])
pd.set_option('display.max_colwidth', None)
build_log_df

Unnamed: 0,text,group_id
11440,cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*load': No such file or directory,1
42688,/localdisk/6500_repo/ome/vobs/optnet_os/vxworks/vxworks-6.9/target/h/tool/common/private/toolMacrosP.h:54: #error unsupported toolchain,3
57150,cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*tar': No such file or directory,1
75141,tput: No value for $TERM and no -T specified,4
167475,tput: No value for $TERM and no -T specified,4
170054,sed: can't read *.h.temp: No such file or directory,2
198626,cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*boot*po': No such file or directory,1
214987,cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*raw': No such file or directory,1
269450,cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*elf': No such file or directory,1
269537,mv: cannot stat '*.h': No such file or directory,2


## 3. Generate the embeddings

In [6]:
%%time
embeddings = [embedding for embedding in embedder(build_log_df['text'])]

CPU times: user 6min 3s, sys: 0 ns, total: 6min 3s
Wall time: 56.5 s


In [12]:
%%time
distance_matrix = d.normalized_cosine_distance([np.mean(e,axis=0) for e in embeddings])
np.set_printoptions(linewidth=np.inf)
print(distance_matrix)
np.min(distance_matrix), np.max(distance_matrix)

[[0.00000000e+00 1.99072033e-01 1.51048005e-02 2.23111808e-01 2.23111808e-01 1.63023382e-01 1.69473290e-02 1.73785985e-02 1.78820789e-02 1.16445065e-01 1.47961348e-01 2.23111808e-01 1.44622624e-02 1.44613087e-02 2.10027993e-02 2.23111808e-01 1.26137137e-02 2.23111808e-01 1.35820508e-02 1.03432000e-01 2.23111808e-01]
 [1.99072033e-01 0.00000000e+00 2.00393796e-01 2.99144566e-01 2.99144566e-01 3.01710904e-01 2.01731443e-01 1.93873316e-01 1.87370837e-01 2.55364448e-01 1.51185215e-01 2.99144566e-01 1.91107005e-01 1.92977220e-01 1.97891057e-01 2.99144566e-01 1.86625987e-01 2.99144566e-01 1.89566225e-01 2.31598943e-01 2.99144566e-01]
 [1.51048005e-02 2.00393796e-01 0.00000000e+00 2.25687832e-01 2.25687832e-01 1.62751585e-01 1.69256926e-02 1.45833194e-02 1.34807229e-02 1.17883444e-01 1.51372343e-01 2.25687832e-01 1.41175687e-02 1.58300996e-02 1.81334615e-02 2.25687832e-01 1.06563568e-02 2.25687832e-01 1.34331584e-02 1.05134875e-01 2.25687832e-01]
 [2.23111808e-01 2.99144566e-01 2.25687832e-01

(0.0, 0.3017109)

In [15]:
# flatten the matrix an rank pairs of lines that are the most similar
texts = build_log_df['text'].tolist()
groups = build_log_df['group_id'].tolist()
sorted_distances = [
    (texts[i],texts[j],distance_matrix[i,j],groups[i],groups[j]) for i,j in it.combinations(range(len(embeddings)),2)
]
sorted_distances.sort(key=lambda x:x[-3],reverse=False)

In [16]:
print(f"{len(embeddings)} lines")
print("\n".join([f"{g}- {t}" for g,t in zip(groups,texts)]))
print("/"*100)
for (l1,l2,similarity,g1,g2) in sorted_distances:
    print("*"*100)
    print(f"{similarity:.3f} {g1==g2=}")
    print(l1)
    print(l2)
    print("-"*100)
    i=input("")
    

21 lines
1- cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*load': No such file or directory
3- /localdisk/6500_repo/ome/vobs/optnet_os/vxworks/vxworks-6.9/target/h/tool/common/private/toolMacrosP.h:54: #error unsupported toolchain
1- cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*tar': No such file or directory
4- tput: No value for $TERM and no -T specified
4- tput: No value for $TERM and no -T specified
2- sed: can't read *.h.temp: No such file or directory
1- cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*boot*po': No such file or directory
1- cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*raw': No such file or directory
1- cp: cannot stat '/localdisk/6500_repo/ome/vobs/viking_build/build/ome/txqoriq/sim/hybrid_100G_trib_R1200/*elf': No such file or d

KeyboardInterrupt: Interrupted by user