In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
from tqdm.auto import tqdm
import plotly.express as px

In [2]:
# Folder holding the prototype-analysis JSON logs, given relative to the
# kernel's working directory.
log_directory = "PBN_transformer/prototypes_analysis_logs"
# Base directory against which the relative paths in this notebook are joined.
root_directory = os.getcwd()

In [3]:
# Turn the relative log folder into a path under the working directory.
# Re-running this cell is harmless: os.path.join returns the second argument
# unchanged once it is already absolute.
log_directory = os.path.join(root_directory, log_directory)
# os.listdir returns every entry in arbitrary order. Keep only the JSON logs
# (anything else would make json.load fail further down) and sort them so the
# processing order is the same on every run.
files = sorted(
    entry for entry in os.listdir(log_directory) if entry.endswith(".json")
)

In [None]:
# One independent accumulator list per field collected from the log files.
(
    all_architectures,
    all_p1s,
    all_p2s,
    all_p3s,
    all_paths,
    all_datasets,
    all_attacks,
    all_num_protos,
    all_metrics,
) = ([] for _ in range(9))

In [None]:
# Read every log file and append its run configuration to the accumulator
# lists. Note: `all_metrics` is not filled by this loop.
for file in tqdm(files):
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(os.path.join(log_directory, file)) as f:
        data = json.load(f)
    all_architectures.append(data["architecture"])
    all_p1s.append(data["p1_lamb"])
    all_p2s.append(data["p2_lamb"])
    all_p3s.append(data["p3_lamb"])
    all_num_protos.append(data["num_proto"])
    all_paths.append(file)
    all_datasets.append(data["dataset"])
    all_attacks.append(data["attack_type"])

  0%|          | 0/176 [00:00<?, ?it/s]

In [4]:
# Load the single run that the rest of the notebook inspects in detail.
file = "BART_dbpedia_textfooler_0.9_0.9_0.9_16.json"
# The `with` block closes the file on exit; no explicit close() is needed.
with open(os.path.join(log_directory, file)) as f:
    data = json.load(f)

In [29]:
# Unpack the loaded log (`data`) into the arrays used by the cells below.
attack_type = data["attack_type"]
results = data["results"]
prototypes = results["prototypes"]
tests = results["test"]
advs = results["adv"]

# Per-split result dictionaries, keyed by the attack recorded in the log.
test_split = tests[f"test_{attack_type}"]
adv_split = advs[f"adv_{attack_type}"]

# Look the weights up through `attack_type` like every other field, instead of
# a hard-coded "test_textfooler" key. After the transpose the matrix is indexed
# as [prototype][label] by the cells below.
weights_associated_with_protos = np.array(
    test_split["weights_associated_with_protos"]
).transpose()

# For each prototype: the texts of its best training examples, and the label
# that occurs most often among those examples.
best_train_egs = prototypes[0]["best_train_egs"]
prototypes_to_texts = {}
prototype_to_corresponding_label = {}
for i in range(len(best_train_egs)):
    # Entries are stored under string keys; each example is indexed as
    # [0] = text, [1] = label.
    examples = best_train_egs[str(i)]
    prototypes_to_texts[i] = [example[0] for example in examples]
    all_labels = [example[1] for example in examples]
    prototype_to_corresponding_label[i] = max(set(all_labels), key=all_labels.count)


def _unpack_split(split):
    """Convert one split's result dict into numpy arrays.

    Returns a tuple ``(predictions, correct_labels, best_proto_dists,
    best_protos, best_proto_labels)`` where ``best_proto_labels`` maps every
    prototype index in ``best_protos`` to its majority label.
    """
    predictions = np.array(split["all_predictions"])
    correct_labels = np.array(split["all_correct_labels"])
    best_proto_dists = np.array(split["best_protos_dists"])
    best_protos = np.array(split["best_protos"])
    best_proto_labels = np.array(
        [
            [prototype_to_corresponding_label[proto] for proto in prototypes_list]
            for prototypes_list in best_protos
        ]
    )
    return predictions, correct_labels, best_proto_dists, best_protos, best_proto_labels


(
    tests_predictions,
    tests_labels,
    tests_distances,
    tests_best_prototypes,
    tests_best_prototypes_labels,
) = _unpack_split(test_split)

(
    advs_predictions,
    advs_labels,
    advs_distances,
    advs_best_prototypes,
    advs_best_prototypes_labels,
) = _unpack_split(adv_split)

In [30]:
# Map each DBpedia class name to its integer id; ids follow the listing order.
dbpedia_classes = {
    class_name: class_id
    for class_id, class_name in enumerate(
        (
            "agent",
            "work",
            "place",
            "species",
            "unitofwork",
            "event",
            "sportsseason",
            "device",
            "topicalconcept",
        )
    )
}

In [31]:
# For every prototype, print its majority label followed by the
# (label, text) pairs of its best training examples.
best_examples_by_prototype = prototypes[0]["best_train_egs"]
for prototype_index in range(len(best_examples_by_prototype)):
    majority_label = prototype_to_corresponding_label[prototype_index]
    print("Prototype", prototype_index)
    print("label associated with the prototype", majority_label)

    # Examples are stored under the string form of the prototype index.
    for example in best_examples_by_prototype[str(prototype_index)]:
        example_text = example[0]
        example_label = example[1]
        print(example_label, example_text)

    print("---------------------------------")

Prototype 0
label associated with the prototype 4
4 Handly's Lessee v. Anthony, 18 U. S. 374 (1820), is a ruling by the Supreme Court of the United States which held that the proper boundary between the states of Indiana and Kentucky was the low-water mark on the western and northwestern bank of the Ohio River. Motion by the plaintiff, Handly's lessee, to eject inhabitants of a peninsula in the Ohio River (which was at times temporarily cut off from Indiana by high water) was denied.
4 Rasul v. Bush, 542 U.S. 466 (2004), is a landmark United States Supreme Court decision establishing that the U.S. court system has the authority to decide whether foreign nationals (non-U.S. citizens) held in Guantanamo Bay were wrongfully imprisoned. The 6–3 ruling on June 28, 2004, reversed a decision by the Court of Appeals for the D.C. Circuit, which held that the Judiciary had no jurisdiction to handle wrongful imprisonment cases involving foreign nationals who are held in Guantanamo Bay. Justice Jo

In [33]:
# Clean and adversarial versions of the test set, as paths relative to
# `root_directory`.
test_csv_file = "datasets/dbpedia_dataset/test_textfooler.csv"
adv_csv_file = "datasets/dbpedia_dataset/adv_textfooler.csv"

# Read both CSVs in the same way: clean test set first, adversarial set second.
test_csv_df, adv_csv_df = (
    pd.read_csv(os.path.join(root_directory, relative_path))
    for relative_path in (test_csv_file, adv_csv_file)
)

In [None]:
# Dump, for every test example and its adversarial counterpart, the nearest
# prototypes, their distances, the weight each of those prototypes carries
# for every label, and the prototypes' majority labels, followed by both texts.

# Number of nearest prototypes shown per example.
top_k = 4
# The weight matrix is indexed as [prototype][label], so its second axis
# gives the number of labels to report.
num_labels = weights_associated_with_protos.shape[1]

report_sections = (
    (
        "TEST prototypes and labels",
        tests_best_prototypes,
        tests_distances,
        tests_best_prototypes_labels,
    ),
    (
        "ADV prototypes and labels",
        advs_best_prototypes,
        advs_distances,
        advs_best_prototypes_labels,
    ),
)

# Use a dedicated loop variable so this cell does not overwrite the sampled
# `random_index` that other cells rely on.
for example_index in range(len(tests_labels)):
    for section_number, (title, best_protos, dists, proto_labels) in enumerate(
        report_sections
    ):
        if section_number > 0:
            # Separator between the TEST and ADV sections.
            print("---------------------------------")
        print(title)
        nearest_prototypes = best_protos[example_index][:top_k]
        print(nearest_prototypes)
        print(dists[example_index][:top_k])
        for label in range(num_labels):
            print("label ", label)
            print(
                [
                    weights_associated_with_protos[prototype][label]
                    for prototype in nearest_prototypes
                ]
            )
        print(proto_labels[example_index][:top_k])

    # Original text and label first, then the adversarial text and label.
    for frame in (test_csv_df, adv_csv_df):
        print(frame.loc[example_index, "text"], frame.loc[example_index, "label"])

In [42]:
# Select examples of class 2 that are classified correctly both before and
# after the attack, then draw one of them at random.
test_prediction_is_correct = tests_predictions == tests_labels
adv_prediction_is_correct = advs_predictions == advs_labels
true_label_is_two = tests_labels == 2

indices_where_both_test_prediction_and_adv_prediction_are_correct = np.where(
    test_prediction_is_correct & adv_prediction_is_correct & true_label_is_two
)[0]

random_index = np.random.choice(
    indices_where_both_test_prediction_and_adv_prediction_are_correct
)

In [47]:
# Report for the sampled example: true label and both predictions, then for
# the clean and the adversarial input the 4 nearest prototypes, their
# distances, their weights for each of the 9 labels, and their majority labels.
print("Random index: ", random_index)
true_label_of_example = tests_labels[random_index]
clean_prediction = tests_predictions[random_index]
perturbed_prediction = advs_predictions[random_index]
print(
    "The label of the test example is:",
    true_label_of_example,
    "with the prediction:",
    clean_prediction,
    "and perturbed prediction:",
    perturbed_prediction,
)

report_sections = (
    (
        "TEST prototypes and labels",
        tests_best_prototypes,
        tests_distances,
        tests_best_prototypes_labels,
    ),
    (
        "ADV prototypes and labels",
        advs_best_prototypes,
        advs_distances,
        advs_best_prototypes_labels,
    ),
)
for section_number, (title, best_protos, dists, proto_labels) in enumerate(
    report_sections
):
    if section_number > 0:
        # Separator between the TEST and ADV sections.
        print("---------------------------------")
    print(title)
    nearest_prototypes = best_protos[random_index][:4]
    print(nearest_prototypes)
    print(dists[random_index][:4])
    for label in range(9):
        print("label ", label)
        print(
            [
                weights_associated_with_protos[prototype][label]
                for prototype in nearest_prototypes
            ]
        )
    print(proto_labels[random_index][:4])

# Original text and label first, then the adversarial text and label.
for frame in (test_csv_df, adv_csv_df):
    row = frame.loc[random_index]
    print(row["text"], row["label"])

Random index:  150
The label of the test example is: 2 with the prediction: 2 and perturbed prediction: 2
TEST prototypes and labels
[ 7 14  5 11]
[-3.74387503 -0.12438459 -0.08675446  0.00837474]
label  0
[-0.07225818932056427, -0.2766049802303314, -0.33035922050476074, -0.01710103452205658]
label  1
[0.4024500846862793, 0.35161471366882324, -0.10004425048828125, -0.3217751681804657]
label  2
[-0.6278798580169678, -0.48109567165374756, -0.5777713060379028, -0.5818343162536621]
label  3
[-0.5178269743919373, 0.8407810926437378, 0.7405584454536438, -0.10318030416965485]
label  4
[0.4227420687675476, 0.3181793689727783, -0.29289817810058594, 0.16065558791160583]
label  5
[0.29405224323272705, -0.35102730989456177, -0.17219163477420807, 0.3944070637226105]
label  6
[0.3630494475364685, 0.2522026300430298, 0.006006683222949505, 0.6615841388702393]
label  7
[0.3050634264945984, -0.255790650844574, 0.35098108649253845, 0.209478497505188]
label  8
[0.22266632318496704, -0.1487235575914383, 0.

In [None]:
# Select examples whose clean input is classified correctly as `true_label`
# but whose adversarial input is misclassified as `adversarial_prediction`,
# then draw one of them at random.
true_label = 0
adversarial_prediction = 2

successful_attack_mask = (
    (tests_predictions == tests_labels)
    & (advs_predictions != advs_labels)
    & (tests_labels == true_label)
    & (advs_predictions == adversarial_prediction)
)

# NOTE(review): the variable name is kept so existing references keep working,
# but in this cell it holds successfully attacked examples (adversarial
# prediction wrong), not examples where both predictions are correct.
indices_where_both_test_prediction_and_adv_prediction_are_correct = np.where(
    successful_attack_mask
)[0]

# np.random.choice raises an opaque ValueError on an empty array; fail with a
# message that says which selection came back empty.
if indices_where_both_test_prediction_and_adv_prediction_are_correct.size == 0:
    raise ValueError(
        f"No example with true label {true_label} is classified correctly on the "
        f"clean input and misclassified as {adversarial_prediction} after the attack."
    )

random_index = np.random.choice(
    indices_where_both_test_prediction_and_adv_prediction_are_correct
)

In [None]:
# Short report for the sampled example: true label and both predictions, the
# 4 nearest prototypes with their majority labels for the clean and the
# adversarial input, then both texts with their labels.
print("Random index: ", random_index)
true_label_of_example = tests_labels[random_index]
clean_prediction = tests_predictions[random_index]
perturbed_prediction = advs_predictions[random_index]
print(
    "The label of the test example is:",
    true_label_of_example,
    "with the prediction:",
    clean_prediction,
    "and perturbed prediction:",
    perturbed_prediction,
)

# Clean input first, adversarial input second.
for best_protos, proto_labels in (
    (tests_best_prototypes, tests_best_prototypes_labels),
    (advs_best_prototypes, advs_best_prototypes_labels),
):
    print(best_protos[random_index][:4])
    print(proto_labels[random_index][:4])

for frame in (test_csv_df, adv_csv_df):
    row = frame.loc[random_index]
    print(row["text"], row["label"])

Random index:  992
The label of the test example is: 0 with the prediction: 0 and perturbed prediction: 0
[ 7 12  8  9]
[3 0 0 0]
[ 7 12  8  2]
[3 0 0 1]
Inta Ezergailis (11 September 1932 in Riga, Latvia – 1 January 2005, in Ithaca, New York), was a Latvian American professor emerita of German literature at Cornell University from 1969 to 1999, specializing in Thomas Mann and contemporary women writers. In 1965, she began graduate study at Cornell University and after earning the doctorate in 1969, she was appointed to the Cornell faculty as an assistant professor of German literature. In addition to books, she published numerous articles, in English and Latvian, in scholarly and intellectual periodicals. During the last decade of her life, she turned from academic prose to poetry. Posthumously, her poems have been published by Ulysses House in four volumes: Inta's poems I, II, and III, Alzheimer's Poems, the Vanishing of a Mother. The last volume is devoted to her mother who died of 