In [1]:
import pickle

In [2]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# MITRE embeddings: one pickle whose keys are read as technique ids later on,
# and one pickle that is iterated batch by batch later on.
mitre_embeddings_w_id = _read_pickle("datasets/mitre_embeddings.pickle")
mitre_embeddings = _read_pickle("datasets/nvidia-mitre-embeddings.pickle")

# Sentence embeddings for the TRAM train/test splits.
tram_train_embeddings = _read_pickle("datasets/nvidia-tram-train-embeddings.pickle")
tram_test_embeddings = _read_pickle("datasets/nvidia-tram-test-embeddings.pickle")

# Sentence embeddings for the Bosch train/test splits.
bosch_train_embeddings = _read_pickle("datasets/nvidia-bosch-train-embeddings.pickle")
bosch_test_embeddings = _read_pickle("datasets/nvidia-bosch-test-embeddings.pickle")

In [3]:
# Pair every embedding in `mitre_embeddings` (iterated as batches of vectors)
# with the technique id found at the same running position in
# `mitre_embeddings_w_id`.
ttp_ids = list(mitre_embeddings_w_id.keys())
i = 0
mitre_embeddings_fix = {}
for batch in mitre_embeddings:
    for emb in batch:
        # Explicit bounds check instead of a bare `except:`: the only expected
        # failure is having more embeddings than ids, and any other error
        # should surface rather than be swallowed.
        if i < len(ttp_ids):
            mitre_embeddings_fix[ttp_ids[i]] = emb
        else:
            print("can't find ttp id %s" % i)
        i += 1

can't find ttp id 637


In [4]:
import torch.nn.functional as F

device = "cuda:0"


def _unit_vector(vec):
    """Move `vec` to `device` and rescale it to unit L2 norm along dim 0."""
    return F.normalize(vec.to(device), p=2, dim=0)


# Train dictionaries: keys are converted with .item(); test dictionaries keep
# their keys exactly as stored in the pickle.
tram_train_embeddings = {key.item(): _unit_vector(vec) for key, vec in tram_train_embeddings.items()}
tram_test_embeddings = {key: _unit_vector(vec) for key, vec in tram_test_embeddings.items()}

bosch_train_embeddings = {key.item(): _unit_vector(vec) for key, vec in bosch_train_embeddings.items()}
bosch_test_embeddings = {key: _unit_vector(vec) for key, vec in bosch_test_embeddings.items()}

# The technique embeddings are only moved here; they are not normalised in this cell.
mitre_embeddings_fix = {ttp: vec.to(device) for ttp, vec in mitre_embeddings_fix.items()}

In [5]:
import torch

In [6]:
# Stack the per-technique vectors into one matrix (one row per entry of
# mitre_embeddings_fix, in insertion order) and L2-normalise every row.
ttp_embeddings = torch.vstack(list(mitre_embeddings_fix.values()))
ttp_embeddings = F.normalize(ttp_embeddings, p=2, dim=1)
# Tensor.to() returns the tensor on the target device instead of moving it in
# place, so the result has to be re-bound for the call to have any effect.
ttp_embeddings = ttp_embeddings.to(device)
ttp_embeddings

tensor([[ 0.0007, -0.0003, -0.0106,  ...,  0.0073,  0.0012,  0.0085],
        [ 0.0103, -0.0072, -0.0098,  ..., -0.0055,  0.0033,  0.0030],
        [-0.0071, -0.0036, -0.0189,  ...,  0.0305,  0.0080,  0.0192],
        ...,
        [ 0.0077,  0.0140,  0.0342,  ...,  0.0055, -0.0175,  0.0068],
        [ 0.0034,  0.0177,  0.0283,  ..., -0.0124,  0.0017,  0.0082],
        [-0.0005,  0.0029, -0.0014,  ...,  0.0169, -0.0038,  0.0150]],
       device='cuda:0')

In [7]:
from const import TRAM_TECHNIQUES_LABELS, BOSCH_TECHNIQUES_LABELS

def _positions_in(allowed):
    """Return the positions in `ttp_ids` whose technique id is contained in `allowed`."""
    return [pos for pos, ttp in enumerate(ttp_ids) if ttp in allowed]


# Sorted union of the two dataset label sets.
tram_bosch = sorted(set(BOSCH_TECHNIQUES_LABELS) | set(TRAM_TECHNIQUES_LABELS))

# Positions (into ttp_ids) of the techniques belonging to each label set.
ttp_ids_bosch = _positions_in(BOSCH_TECHNIQUES_LABELS)
ttp_ids_tram = _positions_in(TRAM_TECHNIQUES_LABELS)
ttp_ids_all = list(range(len(ttp_ids)))

## TRAM

In [25]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

# Threshold search on the TRAM training split: for each label set, find the
# similarity threshold tau that maximises the weighted F1 of
# "predict every technique whose similarity to the sentence exceeds tau".
tram_results = {}

# Positions into ttp_ids (and therefore rows of ttp_embeddings) per label set.
ids_per_label_set = {
    "tram_t": ttp_ids_tram,
    "bosch_t": ttp_ids_bosch,
    "all_mitre": ttp_ids_all # this contains ids
}

# Technique labels per label set.
labels_per_label_set = {
    "tram_t": TRAM_TECHNIQUES_LABELS,
    "bosch_t": BOSCH_TECHNIQUES_LABELS,
    "all_mitre": ttp_ids # this contains labels
}

df = pd.read_json("datasets/tram_train.json")

for label_set in tqdm(["tram_t", "all_mitre"]):

    labels = labels_per_label_set[label_set]
    label_ids = ids_per_label_set[label_set]
    allowed_labels = set(labels)  # constant-time membership when filtering gold labels

    lb = MultiLabelBinarizer()
    lb.fit([labels])

    # Boolean mask over the rows of ttp_embeddings selecting the techniques of
    # this label set. It does not depend on the sample or on tau, so it is built
    # once, directly on the device that holds the similarities.
    # Assumes every embedding is 1-D, i.e. `emb @ ttp_embeddings.T` yields one
    # similarity per technique.
    label_mask = torch.zeros(ttp_embeddings.shape[0], dtype=torch.bool, device=ttp_embeddings.device)
    label_mask[label_ids] = True

    # Similarities and gold labels are independent of tau: compute them once
    # per label set instead of once per candidate threshold.
    all_labels = []
    masked_similarities = []
    for i, emb in tram_train_embeddings.items():
        similarities = emb @ ttp_embeddings.T  # one similarity per technique
        similarities[~label_mask] = 0  # techniques outside the label set can never pass tau
        masked_similarities.append(similarities)

        # Keys of tram_train_embeddings are used as row positions in df.
        tram_labels = df.iloc[i].labels
        all_labels.append([l for l in tram_labels if l in allowed_labels])

    y_true = lb.transform(all_labels)

    best_f1 = -1
    best_tau = 0.5

    for tau in np.linspace(0.25, 0.75, num=20):

        # .tolist() copies the selected indices to the host in one transfer
        # rather than synchronising once per index.
        all_preds = [
            [ttp_ids[j] for j in torch.where(similarities > tau)[0].tolist()]
            for similarities in masked_similarities
        ]

        y_pred = lb.transform(all_preds)
        f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0.0)
        if f1 > best_f1:
            best_f1 = f1
            best_tau = tau

    tram_results[label_set] = {
        "best_f1": best_f1,
        "best_tau": best_tau
    }

    print("label_set=%s best_f1=%s best_tau=%s" % (label_set, best_f1, best_tau))

 50%|█████     | 1/2 [01:27<01:27, 87.34s/it]

label_set=tram_t best_f1=0.172825459730935 best_tau=0.48684210526315785


100%|██████████| 2/2 [04:59<00:00, 149.55s/it]

label_set=all_mitre best_f1=0.17282545973093494 best_tau=0.48684210526315785





## AnnoCTR

In [26]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score


# Threshold search on the Bosch training split: for each label set, find the
# similarity threshold tau that maximises the weighted F1 of
# "predict every technique whose similarity to the sentence exceeds tau".
bosch_results = {}

df = pd.read_json("datasets/bosch_train.json")

for label_set in tqdm(["bosch_t", "all_mitre"]):

    labels = labels_per_label_set[label_set]
    label_ids = ids_per_label_set[label_set]
    allowed_labels = set(labels)  # constant-time membership when filtering gold labels

    lb = MultiLabelBinarizer()
    lb.fit([labels])

    # Boolean mask over the rows of ttp_embeddings selecting the techniques of
    # this label set. It does not depend on the sample or on tau, so it is built
    # once, directly on the device that holds the similarities.
    # Assumes every embedding is 1-D, i.e. `emb @ ttp_embeddings.T` yields one
    # similarity per technique.
    label_mask = torch.zeros(ttp_embeddings.shape[0], dtype=torch.bool, device=ttp_embeddings.device)
    label_mask[label_ids] = True

    # Similarities and gold labels are independent of tau: compute them once
    # per label set instead of once per candidate threshold.
    all_labels = []
    masked_similarities = []
    for i, emb in bosch_train_embeddings.items():
        similarities = emb @ ttp_embeddings.T  # one similarity per technique
        similarities[~label_mask] = 0  # techniques outside the label set can never pass tau
        masked_similarities.append(similarities)

        # Keys of bosch_train_embeddings are used as row positions in df.
        bosch_labels = df.iloc[i].labels
        all_labels.append([l for l in bosch_labels if l in allowed_labels])

    y_true = lb.transform(all_labels)

    best_f1 = -1
    best_tau = 0.5

    for tau in np.linspace(0.25, 0.75, num=20):

        # .tolist() copies the selected indices to the host in one transfer
        # rather than synchronising once per index.
        all_preds = [
            [ttp_ids[j] for j in torch.where(similarities > tau)[0].tolist()]
            for similarities in masked_similarities
        ]

        y_pred = lb.transform(all_preds)
        f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0.0)
        if f1 > best_f1:
            best_f1 = f1
            best_tau = tau

    bosch_results[label_set] = {
        "best_f1": best_f1,
        "best_tau": best_tau
    }

    print("label_set=%s best_f1=%s best_tau=%s" % (label_set, best_f1, best_tau))


  0%|          | 0/2 [00:00<?, ?it/s]

 50%|█████     | 1/2 [00:24<00:24, 24.29s/it]

label_set=bosch_t best_f1=0.18945444005115686 best_tau=0.4342105263157895


100%|██████████| 2/2 [01:13<00:00, 36.59s/it]

label_set=all_mitre best_f1=0.18967108320730455 best_tau=0.4342105263157895





In [None]:
# Snapshot of the values printed by the two threshold-search cells above
# (TRAM and AnnoCTR), written as the dictionaries those cells build.
# NOTE(review): presumably kept so the slow search can be skipped by
# un-commenting these assignments - confirm before relying on it.
# tram_results = {'tram_t': {'best_f1': np.float64(0.172825459730935),
#   'best_tau': np.float64(0.48684210526315785)},
#  'all_mitre': {'best_f1': np.float64(0.17282545973093494),
#   'best_tau': np.float64(0.48684210526315785)}}

# bosch_results = {'bosch_t': {'best_f1': np.float64(0.18945444005115686),
#   'best_tau': np.float64(0.4342105263157895)},
#  'all_mitre': {'best_f1': np.float64(0.18967108320730455),
#   'best_tau': np.float64(0.4342105263157895)}}

In [27]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Document-level predictions on the TRAM test split, using the threshold
# selected for each label set by the training-split search (tram_results).
tram_out = {}

df = pd.read_json("datasets/tram_test.json")

# Row index -> embedding key lookup. Built once here; it used to be rebuilt
# from the dictionary for every single row.
tram_test_keys = list(tram_test_embeddings.keys())

for label_set in tqdm(["tram_t", "all_mitre"]):

    labels = labels_per_label_set[label_set]
    label_ids = ids_per_label_set[label_set]
    allowed_labels = set(labels)  # constant-time membership when filtering gold labels

    # Boolean mask over the rows of ttp_embeddings selecting the techniques of
    # this label set; identical for every row, so it is built once per label
    # set on the device that holds the similarities.
    # Assumes every embedding is 1-D, i.e. `emb @ ttp_embeddings.T` yields one
    # similarity per technique.
    label_mask = torch.zeros(ttp_embeddings.shape[0], dtype=torch.bool, device=ttp_embeddings.device)
    label_mask[label_ids] = True

    tau = tram_results[label_set]["best_tau"]
    tram_out[label_set] = {}

    for doc_name, df_doc in df.groupby("doc_title"):
        # Labels / predictions pooled over all rows of this document.
        all_labels = []
        all_preds = []

        for idx, row in df_doc.iterrows():

            tram_labels = [l for l in row.labels if l in allowed_labels]
            # The DataFrame index is used as a position into the embedding keys.
            emb = tram_test_embeddings[tram_test_keys[idx]].to(device)

            similarities = emb @ ttp_embeddings.T  # one similarity per technique
            similarities[~label_mask] = 0  # techniques outside the label set can never pass tau

            # .tolist() copies the selected indices to the host in one transfer.
            indices_above_threshold = torch.where(similarities > tau)[0].tolist()
            ttp_values = [ttp_ids[i] for i in indices_above_threshold]

            all_labels.extend(tram_labels)
            all_preds.extend(ttp_values)

        # De-duplicate: a technique counts once per document.
        tram_out[label_set][doc_name] = {
            "labels": list(set(all_labels)),
            "preds": list(set(all_preds))
        }

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:02<00:00,  1.37s/it]


In [28]:
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Document-level predictions on the Bosch test split, using the threshold
# selected for each label set by the training-split search (bosch_results).
bosch_out = {}

df = pd.read_json("datasets/bosch_test.json")

for label_set in tqdm(["bosch_t", "all_mitre"]):

    labels = labels_per_label_set[label_set]
    label_ids = ids_per_label_set[label_set]
    allowed_labels = set(labels)  # constant-time membership when filtering gold labels

    # Boolean mask over the rows of ttp_embeddings selecting the techniques of
    # this label set; identical for every row, so it is built once per label
    # set on the device that holds the similarities.
    # Assumes every embedding is 1-D, i.e. `emb @ ttp_embeddings.T` yields one
    # similarity per technique.
    label_mask = torch.zeros(ttp_embeddings.shape[0], dtype=torch.bool, device=ttp_embeddings.device)
    label_mask[label_ids] = True

    tau = bosch_results[label_set]["best_tau"]
    bosch_out[label_set] = {}

    for doc_name, df_doc in df.groupby("document"):
        # Labels / predictions pooled over all rows of this document.
        all_labels = []
        all_preds = []

        for idx, row in df_doc.iterrows():

            bosch_labels = [l for l in row.labels if l in allowed_labels]

            # Embeddings are looked up directly by the DataFrame index.
            emb = bosch_test_embeddings[idx]

            similarities = emb @ ttp_embeddings.T  # one similarity per technique
            similarities[~label_mask] = 0  # techniques outside the label set can never pass tau

            # .tolist() copies the selected indices to the host in one transfer.
            indices_above_threshold = torch.where(similarities > tau)[0].tolist()
            ttp_values = [ttp_ids[i] for i in indices_above_threshold]

            all_labels.extend(bosch_labels)
            all_preds.extend(ttp_values)

        # De-duplicate: a technique counts once per document.
        bosch_out[label_set][doc_name] = {
            "labels": list(set(all_labels)),
            "preds": list(set(all_preds))
        }

100%|██████████| 2/2 [00:01<00:00,  1.55it/s]


In [29]:
from test_common import calc_results_per_document

# Column-oriented accumulator: one list per column, one entry appended per
# (dataset, label_set) pair.
metric_columns = ("f1", "accuracy", "precision", "recall")
output = {
    column: []
    for column in ("model_name", "dataset_name", "label_set") + metric_columns
}

predictions_by_dataset = (("tram", tram_out), ("bosch", bosch_out))

for dataset, out in predictions_by_dataset:
    for label_set in out:
        results_df = calc_results_per_document(out[label_set])

        # Average each per-document metric over the documents of this label set.
        f1, accuracy, precision, recall = (
            getattr(results_df, metric).mean() for metric in metric_columns
        )

        record = {
            "model_name": "nvidia-embed",
            "dataset_name": dataset,
            "label_set": label_set,
            "f1": f1,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
        }
        for column, value in record.items():
            output[column].append(value)

In [31]:
# Build the summary table in one chain: order by f1, express f1 / precision /
# recall as percentages rounded to two decimals (accuracy is left untouched),
# then order by dataset and label set for display.
final_df = (
    pd.DataFrame(output)
    .sort_values(by="f1")
    .assign(
        f1=lambda frame: (frame.f1 * 100).round(2),
        precision=lambda frame: (frame.precision * 100).round(2),
        recall=lambda frame: (frame.recall * 100).round(2),
    )
    .sort_values(by=["dataset_name", "label_set"], ascending=[True, True])
)

# Columns shown in the rendered table.
final_df[["model_name", "dataset_name", "label_set", "f1", "precision", "recall"]]

Unnamed: 0,model_name,dataset_name,label_set,f1,precision,recall
3,nvidia-embed,bosch,all_mitre,9.5,5.85,63.19
2,nvidia-embed,bosch,bosch_t,29.51,24.17,62.45
1,nvidia-embed,tram,all_mitre,11.37,7.85,64.94
0,nvidia-embed,tram,tram_t,45.69,41.59,64.94
