# Embedding baseline

This is an evaluation based on embeddings as baseline.


Simple example to test environment.


In [16]:
from sentence_transformers import SentenceTransformer, util
import torch


# For e5 models, adding role-specific prefixes significantly improves performance
def to_query(text):
    return f"query: {text}"


def to_passage(text):
    return f"passage: {text}"


# Load the embedding model (use "cpu" if you don't have a GPU)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)
model = SentenceTransformer("intfloat/e5-large-v2", device="cuda:1")

# Example: predefined labels and their explanatory paragraphs (translated to English)
test_labels = [
    ("Safety", "Policies and records related to operational safety and risk control."),
    ("Equipment", "Descriptions of equipment status, maintenance, and failures."),
    ("Geology", "Explanations of formation/reservoir properties and lithology."),
]

# 1) Encode label passages offline (one-time). We include the label name to strengthen identity.
label_texts = [to_passage(f"{name}. {desc}") for name, desc in test_labels]
label_emb = model.encode(label_texts, normalize_embeddings=True, convert_to_tensor=True)

# 2) Online classification: encode a short query
query = "Preventive measures for wellbore instability leading to downhole incidents"
q_emb = model.encode([to_query(query)], normalize_embeddings=True, convert_to_tensor=True)

# 3) Cosine similarity + Top-1 selection
scores = util.cos_sim(q_emb, label_emb)[0]  # shape: [num_labels]
best_idx = int(scores.argmax())
best_label, best_score = test_labels[best_idx][0], float(scores[best_idx])
print(best_label, best_score)


Using device: cuda
Safety 0.7543596029281616


## Evaluation on WITSML


In [17]:
from pathlib import Path
import sys

current_dir = Path().resolve()
project_root = current_dir.parent

sys.path.append(str(project_root))

import server.utils.configs.globals_config as glb  # noqa
import server.utils.filter as ft  # noqa


def label_dict_to_label_list(label_extraContent: dict) -> list[tuple[str, str]]:
    labels = []
    for key, meta in (label_extraContent).items():
        labels.append((key, str(meta)))
    return labels


# Kick out zzz#TBD, OutOfSet, Uncertain
kick_out_keys = ["zzz#TBD", "OutOfSetPrototypeData", "UncertainPrototypeData", "OutOfSetQuantity", "UncertainQuantity", "OutOfSetUnit", "UncertainUnit"]
for k in kick_out_keys:
    glb.quantity_fullList_extraContent.pop(k, None)
    glb.unit_fullList_extraContent.pop(k, None)
    glb.prototypeData_fullList_extraContent.pop(k, None)

labels_quantity = label_dict_to_label_list(glb.quantity_fullList_extraContent)
labels_unit = label_dict_to_label_list(glb.unit_fullList_extraContent)
labels_prototypedata = label_dict_to_label_list(glb.prototypeData_fullList_extraContent)

# 1) Encode label passages offline (one-time). We include the label name to strengthen identity.
label_texts_quantity = [to_passage(desc) for _, desc in labels_quantity]
label_emb_quantity = model.encode(label_texts_quantity, normalize_embeddings=True, convert_to_tensor=True)

label_texts_unit = [to_passage(desc) for _, desc in labels_unit]
label_emb_unit = model.encode(label_texts_unit, normalize_embeddings=True, convert_to_tensor=True)

label_texts_prototypedata = [to_passage(desc) for _, desc in labels_prototypedata]
label_emb_prototypedata = model.encode(label_texts_prototypedata, normalize_embeddings=True, convert_to_tensor=True)


print(label_texts_quantity)
print(label_texts_unit)
print(label_texts_prototypedata)

["passage: {'ddhub:Quantity': 'ForceRateOfChangeQuantity', 'rdfs:comment': ['A force rate of change is the time derivative of a force: $\\\\frac{dF}{dt}$, where $F$ is the mass density and $t$ is time.\\r\\nThe dimension of force rate of change is:\\r\\n$$[LMT^{-3}]$$.\\r\\nThe SI unit for **force rate of change** is: Newton per second with the associated unit label $\\\\frac{N}{s}$\\r\\n'], 'zzz:QuantityHasUnit': ['UncertainUnit', 'OutOfSetUnit'], 'zzz:PrototypeData': ['UncertainPrototypeData', 'OutOfSetPrototypeData']}", "passage: {'ddhub:Quantity': 'PressureRateOfChangeQuantity', 'rdfs:comment': ['A pressure rate of change is the time derivative of a pressure.\\r\\nThe dimension of pressure rate of change is:\\r\\n$$[ML^{-1}T^{-3}]$$.\\r\\nThe SI unit for **pressure rate of change** is: pascal per second with the associated unit label $\\\\frac{Pa}{s}$\\r\\n'], 'zzz:QuantityHasUnit': ['UncertainUnit', 'OutOfSetUnit'], 'zzz:PrototypeData': ['UncertainPrototypeData', 'OutOfSetPrototyp

An example on real WITSML metadata.


In [18]:
from typing import Union


def embedding_predict(sample: Union[dict, str], label_list: list, embedding_matrix: torch.Tensor, top_k=5):
    query = str(sample)
    q_emb = model.encode([to_query(query)], normalize_embeddings=True, convert_to_tensor=True)

    # Cosine similarity + Top-k selection
    scores = util.cos_sim(q_emb, embedding_matrix)[0]  # shape: [num_labels]
    idx = torch.argsort(scores, descending=True)[:top_k]
    cands = {label_list[i][0]: float(scores[i]) for i in idx}
    return cands


query = {"Mnemonic": "SPPA", "Description": "Standpipe pressure.", "Unit": "Pa"}
result = embedding_predict(query, labels_quantity, label_emb_quantity, top_k=5)
print(result)

{'PressureQuantity': 0.8400601148605347, 'PressureRateOfChangeQuantity': 0.8276913166046143, 'PressureLossConstantQuantity': 0.8253720998764038, 'PressureGradientPerLengthQuantity': 0.8240286111831665, 'FrequencyQuantity': 0.8195579051971436}


Now, run the recognition on real WITSML metadata.


In [19]:
from pathlib import Path
import json
import sys
import numpy as np

current_dir = Path().resolve()
project_root = current_dir.parent

sys.path.append(str(project_root))

import server.utils.configs.globals_config as glb  # noqa
import server.utils.filter as ft  # noqa


def softmax_dict(d: dict, temperature: float = 1.0) -> tuple:
    if temperature <= 0:
        raise ValueError("temperature must be > 0")

    values = np.array(list(d.values()), dtype=float)
    # subtract max for numerical stability, then divide by T
    scaled = (values - np.max(values)) / temperature
    exp_values = np.exp(scaled)
    softmax_values = exp_values / np.sum(exp_values)

    softmaxed = dict(zip(d.keys(), softmax_values))

    # key with max probability
    max_val = np.max(softmax_values)
    for k, v in softmaxed.items():
        if np.isclose(v, max_val):
            max_key = k
            break

    return softmaxed, max_key


query_files = [
    project_root / "data_store/test_data/new_labels/extracted_json/9-F-9_A+1+log+1+1+1+00001.json",
    project_root / "data_store/test_data/new_labels/extracted_json/9-F-9_A+1+log+2+1+1+00001.json",
    project_root / "data_store/test_data/new_labels/extracted_json/9-F-9_A+1+log+2+2+1+00001.json",
]

query_objs = []
for i in range(len(query_files)):
    with open(query_files[i], "r", encoding="utf-8") as f:
        data = json.load(f)
        query_tmp = {}
        for j in range(len(data)):
            mnemonic = data[j]["Mnemonic"]
            data[j].pop("true label PrototypeData")
            data[j].pop("true label Unit")
            data[j].pop("true label Quantity")
            query_tmp.update({mnemonic: data[j]})
        query_objs.append(query_tmp)
        filtered, rejected = ft.filter_metadata(query_objs[i])
        print("filtered count:", len(filtered))
        print("rejected count:", len(rejected))

    recognition_results = {}
    for k, v in filtered.items():
        result_quantity = embedding_predict(
            sample=v,
            label_list=labels_quantity,
            embedding_matrix=label_emb_quantity,
            top_k=5,
        )
        result_unit = embedding_predict(
            sample=v,
            label_list=labels_unit,
            embedding_matrix=label_emb_unit,
            top_k=5,
        )
        result_prototypeData = embedding_predict(
            sample=v,
            label_list=labels_prototypedata,
            embedding_matrix=label_emb_prototypedata,
            top_k=5,
        )
        softmax_temperature = 0.005
        recognition_results.update(
            {
                k: {
                    "Raw_content": str(v),
                    "Quantity_class": softmax_dict(result_quantity, softmax_temperature)[1],
                    "Quantity_class_candidates": softmax_dict(result_quantity, softmax_temperature)[0],
                    "Unit_class": softmax_dict(result_unit, softmax_temperature)[1],
                    "Unit_class_candidates": softmax_dict(result_unit, softmax_temperature)[0],
                    "PrototypeData_class": softmax_dict(result_prototypeData, softmax_temperature)[1],
                    "PrototypeData_class_candidates": softmax_dict(result_prototypeData, softmax_temperature)[0],
                }
            }
        )

    Path("Embedding").mkdir(parents=True, exist_ok=True)
    with open(f"./Embedding/recognition_results_{i + 1}.json", "w", encoding="utf-8") as f:
        json.dump(recognition_results, f, ensure_ascii=False, indent=2)


filtered count: 189
rejected count: 9


filtered count: 98
rejected count: 1
filtered count: 109
rejected count: 1


## Same evaluation on the public dataset clinc_oos

Evaluate the non-ID and ID classification performace using a public dataset [clinc_oos](https://huggingface.co/datasets/clinc/clinc_oos).
For simplication purpose, only test on the small/validation set, which including 3100 samples.


In [20]:
# Load dataset
import pandas as pd
import yaml

# dataset_path = project_root / "data_store/test_data/clinc_oos/small/validation-00000-of-00001.parquet"  # path to small/validation-00000-of-00001.parquet
dataset_path = project_root / "data_store/test_data/clinc_oos/small/reduced_validation_set.parquet"  # path to reduced dataset
df = pd.read_parquet(dataset_path)
true_label_dict = dict(zip(df["text"], df["intent"]))
print(true_label_dict)

labels_path = project_root / "data_store/test_data/clinc_oos/labels.yaml"
with open(labels_path, "r") as file:
    labels_obj = yaml.safe_load(file)
full_labels = labels_obj.get("names")

labels = {str(i): full_labels[str(i)] for i in range(30)}
labels.update({"42": "oos"})
splited_labels = [label.replace("_", " ") for label in labels.values()]
print(splited_labels)


def embedding_predict_for_clinc(sample: Union[dict, str], label_list: list, embedding_matrix: torch.Tensor, top_k=5):
    query = str(sample)
    q_emb = model.encode([to_query(query)], normalize_embeddings=True, convert_to_tensor=True)

    # Cosine similarity + Top-k selection
    scores = util.cos_sim(q_emb, embedding_matrix)[0]  # shape: [num_labels]
    idx = torch.argsort(scores, descending=True)[:top_k]
    cands = {label_list[i]: float(scores[i]) for i in idx}
    return cands


label_emb = model.encode(splited_labels, normalize_embeddings=True, convert_to_tensor=True)
result_clinc = {}
for i in range(len(df["text"].tolist())):
    q_str = df["text"].values[i]
    intent = str(true_label_dict[q_str])
    tmp = {"true_label": labels[intent], "candidates": embedding_predict_for_clinc(q_str, list(labels.values()), label_emb)}
    result_clinc.update({q_str: tmp})
print(result_clinc)

Path("Embedding").mkdir(parents=True, exist_ok=True)
with open("./Embedding/recognition_results_clinc.json", "w", encoding="utf-8") as f:
    json.dump(result_clinc, f, ensure_ascii=False, indent=2)

['restaurant reviews', 'nutrition info', 'account blocked', 'oil change how', 'time', 'weather', 'redeem rewards', 'interest rate', 'gas type', 'accept reservations', 'smart home', 'user name', 'report lost card', 'repeat', 'whisper mode', 'what are your hobbies', 'order', 'jump start', 'schedule meeting', 'meeting schedule', 'freeze account', 'what song', 'meaning of life', 'restaurant reservation', 'traffic', 'make call', 'text', 'bill balance', 'improve credit score', 'change language', 'oos']
