In [None]:
import joblib
import pandas as pd
import numpy as np
from transformers import BartTokenizer

In [None]:
import sys

MOD_FOLDER = "../"
# Put the parent directory on the module search path to enable imports from it.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again and again.
if MOD_FOLDER not in sys.path:
    sys.path.append(MOD_FOLDER)
print(sys.path)

In [None]:
# Tokenizer matching the BART-MNLI checkpoint.
tokenizer = BartTokenizer.from_pretrained("ModelTC/bart-base-mnli")

# Read the three dataset splits in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/finegrained/train.csv",
        "../data/finegrained/val.csv",
        "../data/finegrained/test.csv",
    )
)

# Plain Python lists of the "text" column (train and test only) ...
train_sentences, test_sentences = (
    split_df["text"].tolist() for split_df in (train_df, test_df)
)
# ... and of the "label" column for every split.
train_labels, val_labels, test_labels = (
    split_df["label"].tolist() for split_df in (train_df, val_df, test_df)
)

In [None]:
# Load the two precomputed artifacts in one pass.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
bestk_train_data_per_proto, best_protos_per_testeg = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../artifacts/bestk_train_data_per_proto.joblib",
        "../artifacts/best_protos_per_testeg.joblib",
    )
)

In [None]:
# Inspect the shape of the first element of the loaded artifact; the cells
# below index element [0] by prototype to obtain training-sample indices.
bestk_train_data_per_proto[0].shape

In [None]:
# Inspect the shape of the first element of the loaded artifact; the cells
# below iterate over element [0] with one row of prototypes per test sample.
best_protos_per_testeg[0].shape

#### Accuracy: fraction of test examples whose label appears among the labels of the training samples closest to their top-k best prototypes

In [None]:
# One entry per test sample:
#   (test label, [training labels around prototype 1, training labels around prototype 2, ...])
# Each inner list maps the training-sample indices stored for a prototype to
# their labels.
results = [
    (
        test_labels[sample_idx],
        [
            [train_labels[neighbor_idx] for neighbor_idx in bestk_train_data_per_proto[0][proto]]
            for proto in sample_protos
        ],
    )
    for sample_idx, sample_protos in enumerate(best_protos_per_testeg[0])
]

In [None]:
def get_accuracy_of_model_on_label(results, k=5):
    """Return the share of test samples whose label occurs near one of their first ``k`` prototypes.

    Args:
        results: Sequence of ``(test_label, prototypes_train_labels)`` pairs.
            ``prototypes_train_labels`` is a sliceable sequence holding one
            collection of training labels per prototype.
        k: Number of leading prototypes per test sample to take into account.

    Returns:
        The fraction of samples with at least one hit, rounded to 3 decimals.
        NaN is returned when ``results`` is empty (e.g. a label that has no
        test samples), where indexing the empty array used to raise IndexError.
    """
    # A sample counts as a hit when its label is found among the training
    # labels of any of its first k prototypes.
    hits = [
        any(test_label in labels for labels in prototypes_train_labels[:k])
        for test_label, prototypes_train_labels in results
    ]
    if not hits:
        return np.nan
    return np.round(np.mean(hits), 3)

In [None]:
from collections import defaultdict

# Labels that get their own report row. Built once, in order of first
# appearance, and without "O", so this list lines up one-to-one with the
# per-label accuracies appended below. Previously "O" was skipped in the loop
# but kept in report_labels, which left the "class" column one entry longer
# than the accuracy columns.
report_labels = [label for label in dict.fromkeys(train_labels) if label != "O"]

# accuracies[k] == [overall accuracy, accuracy for report_labels[0], ...]
accuracies = defaultdict(list)
for k in [5, 3, 1]:
    overall_accuracy = get_accuracy_of_model_on_label(results, k)
    accuracies[k].append(overall_accuracy)
    for label in report_labels:
        label_results = [result for result in results if result[0] == label]
        # A training label may have no test samples at all; report NaN for it
        # rather than computing an accuracy over nothing.
        label_specific_accuracy = (
            get_accuracy_of_model_on_label(label_results, k) if label_results else np.nan
        )
        accuracies[k].append(label_specific_accuracy)

In [None]:
# Assemble the report column by column: class names first ("overall" plus one
# row per entry of report_labels), then one accuracy column per value of k.
report_columns = {"class": ["overall", *report_labels]}
report_columns["k = 5"] = accuracies[5]
report_columns["k = 3"] = accuracies[3]
report_columns["k = 1"] = accuracies[1]
report_df = pd.DataFrame(report_columns)

In [None]:
# Display the accuracy table (one row per class, one column per k).
report_df

### Overlapping prototypes for each training label

In [None]:
# For every training label, gather the set of prototypes that occur among the
# first three prototypes of any training sample carrying that label.
best_protos_per_traineg = joblib.load("../artifacts/best_protos_per_traineg.joblib")
train_prototype_sentences = defaultdict(set)
for sample_idx, proto_row in enumerate(best_protos_per_traineg[0]):
    leading_protos = proto_row.tolist()[:3]
    for proto in leading_protos:
        train_prototype_sentences[train_labels[sample_idx]].add(proto)
# Print one "label: {prototypes}" line per label.
for key in train_prototype_sentences:
    values = train_prototype_sentences[key]
    print(f"{key}: {values}")

In [None]:
# Same grouping as for the training set, applied to the test samples: for every
# test label, gather the set of prototypes that occur among the first three
# prototypes of any test sample carrying that label.
best_protos_per_testeg = joblib.load("../artifacts/best_protos_per_testeg.joblib")
test_prototype_sentences = defaultdict(set)
for sample_idx, proto_row in enumerate(best_protos_per_testeg[0]):
    leading_protos = proto_row.tolist()[:3]
    for proto in leading_protos:
        test_prototype_sentences[test_labels[sample_idx]].add(proto)
# Print one "label: {prototypes}" line per label.
for key in test_prototype_sentences:
    values = test_prototype_sentences[key]
    print(f"{key}: {values}")

### Visualizing prototype tensors in 2-D (PCA, t-SNE, UMAP)

In [None]:
import seaborn as sns
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.manifold import TSNE
import numpy as np

In [None]:
import torch

# Deserialize straight onto the CPU. Without map_location, a tensor that was
# saved from a GPU cannot be loaded on a machine without that device, and a
# later .cpu() call comes too late to help.
# NOTE: torch.load unpickles the file, so only load artifacts from a trusted source.
all_protos = torch.load("../artifacts/all_protos.pt", map_location="cpu")
print(all_protos.shape)
num_protos = 50
# Flatten every prototype into a single row vector. reshape is used instead
# of view so the call also succeeds on a non-contiguous tensor.
all_protos = all_protos.reshape(num_protos, -1)
print(all_protos.shape)
# Value range of the prototype entries.
print(all_protos.min())
print(all_protos.max())

In [None]:
# Project the flattened prototypes onto their first two principal components.
pca = PCA(n_components=2)
all_protos_transformed = pca.fit_transform(all_protos.detach().numpy())
print(all_protos_transformed.shape)

proto_df = {
    "1st component": all_protos_transformed[:, 0].tolist(),
    "2nd component": all_protos_transformed[:, 1].tolist(),
    # 1-based prototype ids, derived from the number of projected rows so the
    # column length always matches the data (it was hard-coded to 1..50).
    "Prototypes": np.arange(1, all_protos_transformed.shape[0] + 1).tolist(),
}
ax = sns.scatterplot(data=proto_df, x="2nd component", y="1st component", hue="Prototypes")
# Title the figure so it can be told apart from the other projections.
ax.set_title("PCA projection of the prototypes");

In [None]:
# t-SNE with random initialisation is stochastic; fixing random_state makes
# the embedding (and therefore the figure) reproducible across runs.
tsne = TSNE(
    n_components=2,
    learning_rate="auto",
    init="random",
    perplexity=2,
    random_state=42,
)
all_protos_transformed = tsne.fit_transform(all_protos.detach().numpy())
print(all_protos_transformed.shape)

proto_df = {
    "X": all_protos_transformed[:, 0].tolist(),
    "Y": all_protos_transformed[:, 1].tolist(),
    # 1-based prototype ids, derived from the number of embedded rows so the
    # column length always matches the data (it was hard-coded to 1..50).
    "Label": np.arange(1, all_protos_transformed.shape[0] + 1).tolist(),
}
ax = sns.scatterplot(data=proto_df, x="Y", y="X", hue="Label")
# Title the figure so it can be told apart from the other projections.
ax.set_title("t-SNE embedding of the prototypes");

In [None]:
import umap

# UMAP is stochastic; fixing random_state makes the embedding (and therefore
# the figure) reproducible across runs.
reducer = umap.UMAP(random_state=42)
all_protos_transformed = reducer.fit_transform(all_protos.detach().numpy())
print(all_protos_transformed.shape)

proto_df = {
    "X": all_protos_transformed[:, 0].tolist(),
    "Y": all_protos_transformed[:, 1].tolist(),
    # 1-based prototype ids, derived from the number of embedded rows so the
    # column length always matches the data (it was hard-coded to 1..50).
    "Label": np.arange(1, all_protos_transformed.shape[0] + 1).tolist(),
}
ax = sns.scatterplot(data=proto_df, x="Y", y="X", hue="Label")
# Title the figure so it can be told apart from the other projections.
ax.set_title("UMAP embedding of the prototypes");