In [None]:
# a box of mixed biscuit and biscuit mixture

In [None]:
# import jupyter_black

# jupyter_black.load(lab=False)

In [None]:
import torch
import numpy as np
from umap import UMAP

from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from matplotlib import colormaps

# Styles in the list are applied first to last, so the colour-blind-friendly
# palette is layered on top of the ggplot base theme.
plt.style.use(["ggplot", "seaborn-v0_8-colorblind"])

In [None]:
# Vocabulary: one entry per line of the text file.
with open("./data/20k.txt", encoding="utf-8") as f:
    vocabulary = [v.strip() for v in f]
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print("vocabulary preview:", ", ".join(vocabulary[:20]))

# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
# map_location="cpu" keeps the `.numpy()` conversions used later in the notebook
# working even if the tensors were saved from a GPU.
similarity_load = torch.load(
    "./my_data/all_layer_similarities.pt", map_location="cpu"
)
print("similarity_load", type(similarity_load), len(similarity_load))

# Each entry is indexed by "layer" and "similarities"; peek at the first one.
sim = similarity_load[0]["similarities"]
print("sim:", similarity_load[0]["layer"], sim.shape)

layer_names = [s["layer"] for s in similarity_load]
layer_widths = [s["similarities"].shape[0] for s in similarity_load]
print("layer_names", layer_names)
print("layer_widths", layer_widths)

# Stack the per-layer matrices along dim 0, and build a parallel vector that
# records, for every stacked row, the index of the layer it came from.
all_layer_similarity = torch.concat([s["similarities"] for s in similarity_load])
layer_numbers = torch.concat(
    [
        torch.full((width,), i, dtype=torch.int)
        for i, width in enumerate(layer_widths)
    ]
)
print("all_layer_similarity.shape =", all_layer_similarity.shape)

# Sanity check: should be a staircase with one step per layer.
plt.figure(figsize=[4, 2])
plt.plot(layer_numbers)
plt.title("layer_numbers")
plt.xlabel("row of all_layer_similarity")
plt.ylabel("layer index")
plt.show()

## Save concepts to file

In [None]:
# np.save("all_layer_similarity.npy", all_layer_similarity.numpy())

In [None]:
## Plot neuron-concept similarities per neuron

# Only the first few neurons of each layer are plotted to keep the output short.
# 21 reproduces the previous `neuron_index >= 20` cut-off (neurons 0..20 inclusive).
max_neurons_per_layer = 21

for layer_index, layer_name in enumerate(layer_names):
    sim = similarity_load[layer_index]["similarities"]

    # Slice before handing the rows to tqdm so the progress bar's total matches
    # the number of figures actually produced.
    for neuron_index, s in enumerate(
        tqdm(sim[:max_neurons_per_layer], desc=str(layer_name))
    ):
        plt.figure(figsize=[4, 3], dpi=80)
        # Hand matplotlib a NumPy array rather than a torch tensor.
        plt.hist(s.numpy(), bins=100)
        # Identify the neuron so each figure can be read on its own.
        plt.title(f"{layer_name}, neuron {neuron_index}")
        plt.xlabel("neuron-concept relatedness")
        plt.ylabel("Count of concepts (total = 20k words)")
        plt.show()

In [None]:
## Save top and bottom 20 concepts as text files


def _format_concepts(concepts, per_line=5):
    """Join `concepts` with ", ", starting a new line every `per_line` items."""
    return "\n".join(
        ", ".join(concepts[i : i + per_line])
        for i in range(0, len(concepts), per_line)
    )


top = 20  # how many concepts to list from each end of the ranking

for layer_index, layer_name in enumerate(layer_names):
    # Convert once per layer: NumPy rows make the argsort and the fancy indexing
    # below cheap, instead of building a 20k-element Python list per neuron.
    sim = similarity_load[layer_index]["similarities"].numpy()

    # Explicit encoding so the written file does not depend on the platform default.
    with open(f"neuron-concepts-{layer_name}.txt", "w", encoding="utf-8") as f:
        print(f"# {layer_name}\n")
        f.write(f"# {layer_name}\n")
        for neuron_index, s in enumerate(tqdm(sim)):
            # Vocabulary indices ordered from most to least similar.
            ranked_vocabulary_indices = np.argsort(s)[::-1]

            top_concepts = [vocabulary[r] for r in ranked_vocabulary_indices[:top]]
            top_concepts_str = _format_concepts(top_concepts)
            # The last `top` entries of the descending ranking (still descending).
            bottom_concepts = [
                vocabulary[r] for r in ranked_vocabulary_indices[-top:]
            ]
            bottom_concepts_str = _format_concepts(bottom_concepts)

            f.write(f"[{layer_name}, neuron {neuron_index}]\n")
            f.write("-" * 10 + "\n")
            f.write(f"Top {len(top_concepts)} concepts:\n")
            f.write(top_concepts_str + "\n")
            f.write("-" * 10 + "\n")
            f.write(f"Bottom {len(bottom_concepts)} concepts:\n")
            f.write(bottom_concepts_str + "\n")
            f.write("\n")

            # Similarities in ranked order, one stem per concept.
            plt.figure(figsize=[4, 3], dpi=80)
            plt.stem(s[ranked_vocabulary_indices])
            plt.title(f"{layer_name}, neuron {neuron_index}\n")
            plt.xlabel("concept rank")
            plt.ylabel("neuron-concept relatedness")
            plt.show()

# Every neuron of every layer is processed here (no early cut-off).




## Neuron UMAP

In [None]:
# Project every row of the stacked similarity matrix into 2-D for plotting.
# UMAP is stochastic: fixing random_state makes the layout repeatable across
# re-runs (at the cost of UMAP running single-threaded).
umap = UMAP(n_components=2, random_state=42).fit_transform(
    all_layer_similarity.numpy()
)

In [None]:
# One colour per layer. The palette is sized from the data rather than
# hard-coded to 5, so notebooks with more layers no longer hit an IndexError.
n_layers = len(layer_names)
cmap = colormaps["viridis"].resampled(n_layers).colors

# Compare in NumPy so the boolean mask has the same array type as `umap`.
layer_ids = layer_numbers.numpy()

# sorted(): deterministic legend order instead of relying on set iteration order.
for ln in sorted(set(layer_ids.tolist())):
    in_layer = layer_ids == ln
    plt.scatter(
        umap[in_layer, 0],
        umap[in_layer, 1],
        s=10,
        color=cmap[ln],
        label=f"layer {ln} ({layer_names[ln]})",
        linewidth=0.1,
        edgecolors="#333",
    )
plt.axis("equal")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.title("Neuron UMAP, coloured by layer")
plt.legend()
plt.show()

In [None]:
# TODO? SpectralCoclustering
# NOTE(review): `im` is not defined in any cell visible above, so on a fresh
# kernel this cell is expected to raise NameError -- confirm which matrix was
# meant before running it.

from sklearn.cluster import SpectralCoclustering

# Fit the co-clustering model on `im`.
clustering = SpectralCoclustering().fit(im)

# Index orderings that sort rows / columns by their assigned cluster label.
row = np.argsort(clustering.row_labels_)
col = np.argsort(clustering.column_labels_)
