*"A box of mixed biscuit and biscuit mixture"*

In [None]:
# Build the de-duplicated vocabulary file from the raw word list.
with open('data/nouns_and_adjectives_unclean.txt') as f:
    words = [w.strip() for w in f]

## remove redundancy
# dict keys are unique and keep insertion order, so this drops repeated words
# while keeping each word at the position of its first occurrence.
word_list = list(dict.fromkeys(words))
word_set = set(word_list)

with open('data/nouns_and_adjectives.txt', 'w') as f:
    f.write('\n'.join(word_list))

In [None]:
# import jupyter_black

# jupyter_black.load()

In [None]:
# !tree -L 2

In [None]:
import os
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import torch
from IPython.display import clear_output
from matplotlib import colormaps
from tqdm.auto import tqdm
from umap import UMAP

import data_utils

# Apply both style sheets in a single call; matplotlib applies a list of
# styles from first to last, so the colorblind palette is layered over ggplot.
plt.style.use(["ggplot", "seaborn-v0_8-colorblind"])

In [None]:
# Directory that the 2-D projection arrays computed in this notebook are saved to.
dir_out = "my_data"

## Load Data

In [None]:
## Load vocabulary
# One entry per line of ./data/<vocabulary_name>.txt, surrounding whitespace removed.

vocabulary_name = '20k'
# vocabulary_name = 'nouns_and_adjectives'

with open(f"./data/{vocabulary_name}.txt") as f:
    vocabulary = list(map(str.strip, f))

# Show the first 20 entries as a quick sanity check.
", ".join(vocabulary[:20])

In [None]:
# Load the precomputed word embeddings for the chosen vocabulary as a NumPy array.
# map_location="cpu" lets the file load on a machine without a GPU even if the
# tensor was saved from a CUDA device; a trailing .cpu() cannot help there,
# because torch.load would already have failed while restoring the storage.
word_embedding = torch.load(
    f"./saved_activations/{vocabulary_name}_ViT-B16.pt", map_location="cpu"
).numpy()
word_embedding.shape

In [None]:
# Project the word embeddings to 2-D with UMAP.
# A fixed random_state makes the layout (and the .npy saved from it) identical
# across Restart & Run All. umap-learn runs single-threaded when a seed is set,
# so this trades some speed for reproducibility.
umap = UMAP(metric='euclidean', n_neighbors=100, min_dist=0, random_state=0).fit_transform(word_embedding)


In [None]:
## rotate umap through PCA
# Centre the layout, then express it in the basis of its right singular
# vectors so the direction of largest spread lies along the first axis.
mean = umap.mean(axis=0, keepdims=True)
umap_centered = umap - mean
# svd returns (U, S, Vh); the rows of Vh are the principal axes.
rot = np.linalg.svd(umap_centered, full_matrices=False)[2].T
umap = umap_centered @ rot

In [None]:
# Scatter of the first two UMAP coordinates, drawn with equal axis scaling.
umap_x, umap_y = umap[:, 0], umap[:, 1]
plt.scatter(umap_x, umap_y, s=0.1)
plt.axis("equal")

In [None]:
# Persist the rotated UMAP coordinates for the current vocabulary.
umap_out_path = f"{dir_out}/concepts_umap_{vocabulary_name}.npy"
np.save(umap_out_path, umap)

In [None]:
from sklearn.manifold import TSNE
# t-SNE draws on a random number generator, so without a seed every run gives
# a different layout. A fixed random_state keeps the saved projection reproducible.
tsne = TSNE(random_state=0).fit_transform(word_embedding)


In [None]:
# Scatter of the first two t-SNE coordinates, drawn with equal axis scaling.
tsne_x, tsne_y = tsne[:, 0], tsne[:, 1]
plt.scatter(tsne_x, tsne_y, s=0.1)
plt.axis("equal")

In [None]:
# Persist the t-SNE coordinates for the current vocabulary.
tsne_out_path = f"{dir_out}/concepts_tsne_{vocabulary_name}.npy"
np.save(tsne_out_path, tsne)

---

In [None]:
## color marks by each concept's maximum similarity score (max over axis 0)

In [None]:
# For each layer, colour every concept in the UMAP layout by its highest
# neuron-concept similarity, save the figure, and list the 20 top-scoring concepts.

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figs", exist_ok=True)

for layer in ['conv1', 'layer1', 'layer2', 'layer3', 'layer4']:
    neuron_concept_similarity = np.load(f"./my_data/neuron_concept_similarities_{layer}.npy")
    # Alternative score: mean of the positive part instead of the maximum.
    # concept_scores = neuron_concept_similarity.clip(0, np.inf).mean(axis=0)
    concept_scores = neuron_concept_similarity.max(axis=0)
    # Ascending order, so the highest-scoring concepts are drawn last (on top).
    argsort = concept_scores.argsort()

    plt.figure(figsize=[8,4])
    plt.scatter(umap[argsort, 0], umap[argsort, 1], s=5, c=concept_scores[argsort], vmin=0, vmax=0.5)  # cosine
    # plt.scatter(tsne[argsort, 0], tsne[argsort, 1], s=30, c=concept_scores[argsort])  # cosine
    plt.axis("equal")
    plt.colorbar()
    plt.title(f'concepts fired up at {layer}')
    plt.savefig(f'figs/concepts_fired_up_at_{layer}.png', dpi=200, bbox_inches='tight')
    plt.show()

    # Indices of the 20 highest-scoring concepts, best first.
    top = argsort[-20:][::-1]

    display([(vocabulary[concept_idx], concept_scores[concept_idx]) for concept_idx in top])
    # sleep(1)