In [None]:
# import jupyter_black

# jupyter_black.load()

In [None]:
# !tree -L 2

In [None]:
import os
from glob import glob
from itertools import product

import numpy as np
import torch
from matplotlib import colormaps
import matplotlib.pyplot as plt
from natsort import natsorted
from tqdm.auto import tqdm
from umap import UMAP

import data_utils

# Style sheets are applied left to right: "ggplot" first, then the
# colorblind-friendly seaborn palette on top of it.
plt.style.use(["ggplot", "seaborn-v0_8-colorblind"])

In [None]:
## Load vocabulary

# with open("./data/20k.txt") as f:
# with open("./data/nouns_and_adjectives.txt") as f:
# Explicit encoding so decoding does not depend on the platform's locale default.
with open("./data/wordnet_hierarchy.txt", encoding="utf-8") as f:
    # One entry per line with surrounding whitespace (incl. the newline) removed.
    # Blank lines are kept as empty strings, so list index i is line i of the file.
    vocabulary = [v.strip() for v in f]
# Preview of the first 20 entries.
", ".join(vocabulary[:20])

## Load Data

In [None]:
# Model output directories in natural sort order. The trailing "/" in the
# pattern restricts matches to directories; later cells extract the model name
# with `split('/')[-2]`, which relies on that trailing slash.
models = natsorted(glob('my_data/resnet*/'))
# Fail here with a clear message instead of at a later torch.cat on an empty list.
assert models, "no model directories matched 'my_data/resnet*/'"
models

In [None]:
# Gather the per-layer neuron-concept similarity tensors of every model.
#   model_layer_neurons: one [model_name, layer_name, n_neurons] entry per layer
#   sim_data:            all layers' 'similarities' tensors concatenated along dim 0
model_layer_neurons = []
layer_similarities = []
for model_dir in models:
    # map_location="cpu": the tensors are converted with .numpy() further down,
    # which requires CPU tensors even if the file was written from a GPU.
    sim = torch.load(f"./{model_dir}/all_layer_similarities.pt", map_location="cpu")
    # model_dir ends with "/", so the model name is the second-to-last component.
    model_name = model_dir.split('/')[-2]
    for layer in sim:
        model_layer_neurons.append(
            [model_name, layer['layer'], layer['similarities'].shape[0]]
        )
        layer_similarities.append(layer['similarities'])
sim_data = torch.cat(layer_similarities)

In [None]:
# Shape of the concatenated similarity tensor (all layers stacked along dim 0).
sim_data.shape

In [None]:
# One float value per neuron: every neuron of the i-th entry of
# `model_layer_neurons` gets the value i + 1.
# NOTE(review): `s` is computed exactly like `c`; confirm whether a different
# (e.g. marker-size) encoding was intended for it.
layer_labels = torch.cat(
    [
        torch.full((n_neurons,), float(idx + 1))
        for idx, (_, _, n_neurons) in enumerate(model_layer_neurons)
    ]
)
c = layer_labels.numpy()
# clone() keeps `s` an independent array instead of a view on the same memory as `c`.
s = layer_labels.clone().numpy()


In [None]:
# 2-D UMAP embedding of every row of `sim_data` (all models and layers together).
# A fixed random_state makes the embedding, and the files later written from it,
# reproducible across kernel restarts. Per the umap-learn documentation, setting
# a seed trades away multi-threaded execution.
UMAP_RANDOM_STATE = 42
xy = UMAP(
    n_neighbors=300,
    min_dist=0.3,
    n_components=2,
    random_state=UMAP_RANDOM_STATE,
).fit_transform(sim_data.numpy())


In [None]:
# Quick look at the joint embedding: one small dot per row of `xy`.
x_coords, y_coords = xy[:, 0], xy[:, 1]
plt.scatter(x_coords, y_coords, s=0.5)

In [None]:
# Output directory for the saved embeddings. Defined (and created) here so this
# cell works on a fresh kernel; the same value is assigned again in the
# "Save UMAP to file" section further down.
dir_out = 'my_data/neuron_umaps'
os.makedirs(dir_out, exist_ok=True)
dir_out

In [None]:
# One [model_name, layer_name, n_neurons] entry per loaded layer, in load order.
model_layer_neurons

In [None]:
# Split the joint embedding `xy` back into one array per (model, layer) and
# save each one as "<dir_out>/<model>_<layer>.npy".
vis = False  # set to True to also draw one scatter plot per layer

# Defined here so this cell does not depend on a later cell having been run first.
dir_out = 'my_data/neuron_umaps'
os.makedirs(dir_out, exist_ok=True)

n_neurons_per_layer = [entry[2] for entry in model_layer_neurons]
# Guard against a stale or overwritten `xy` whose rows no longer match the layers.
assert len(xy) == sum(n_neurons_per_layer), (
    f"xy has {len(xy)} rows but the layers hold {sum(n_neurons_per_layer)} neurons"
)

# Rows of `xy` follow the order of `model_layer_neurons`, so each layer owns a
# contiguous slice starting at the running total of the preceding layer sizes.
# (The cumulative sum has one trailing element more than needed; zip drops it.)
layer_starts = [0, *np.cumsum(n_neurons_per_layer)]
for start, (model_name, layer_name, n) in zip(layer_starts, model_layer_neurons):
    xy_layer = xy[start:start+n]

    fn = f'{model_name}_{layer_name}'
    # Cast to float16 before saving (lower precision, smaller files).
    np.save(f"{dir_out}/{fn}.npy", xy_layer.astype(np.float16))
    print(f"{dir_out}/{fn}.npy")

    if vis:
        plt.figure(figsize=[3,3])
        plt.scatter(
            xy_layer[:,0],
            xy_layer[:,1],
            s=1,
            label=f'{model_name}, {layer_name}'
        )
        plt.legend()
        plt.axis('equal')
        plt.show()

In [None]:

# Joint 2-D UMAP for every unordered pair of (model, layer) neuron sets.
# Each entry of `umaps` is [model_a, model_b, layer_a, layer_b, embedding], where
# the embedding rows are the neurons of the first computed side followed by the
# neurons of the second.
UMAP_RANDOM_STATE = 42  # fixed seed so the saved embeddings are reproducible

# Load every model's similarity file once instead of once per model pair.
# map_location="cpu" because the tensors are converted with .numpy() below.
similarities_by_model = {
    model_dir: torch.load(
        f"./{model_dir}/all_layer_similarities.pt", map_location="cpu"
    )
    for model_dir in models
}

umaps = []
for m1, m2 in product(models, models):
    if m1>m2:
        # The swapped model order is covered by the mirrored entry appended below.
        continue

    ## Load neuron-concept similarity
    sim1 = similarities_by_model[m1]
    sim2 = similarities_by_model[m2]
    layer_names1 = [layer['layer'] for layer in sim1]
    layer_names2 = [layer['layer'] for layer in sim2]
    same_model = m1 == m2

    m1, m2 = m1.split('/')[-2], m2.split('/')[-2]
    for l1, l2 in product(
        range(len(layer_names1)),
        range(len(layer_names2))
    ):
        # Swapping the layer indices only yields a redundant pair within a single
        # model. Across two different models (l1, l2) and (l2, l1) are distinct
        # pairs, so none of them may be skipped.
        if same_model and l1>l2:
            continue

        sims_a = sim1[l1]["similarities"]
        sims_b = sim2[l2]["similarities"]
        data = torch.cat([sims_a, sims_b]).numpy()
        print(data.shape)
        # Kept in its own name so the all-layer embedding `xy` is not overwritten.
        xy_pair = UMAP(
            n_components=2,
            min_dist=0.3,
            random_state=UMAP_RANDOM_STATE,
        ).fit_transform(data)
        ## num neurons
        n1 = sims_a.shape[0]
        n2 = sims_b.shape[0]
        plt.figure(figsize=[4,4])
        plt.scatter(
            xy_pair[:,0],
            xy_pair[:,1],
            # First side: larger markers, colour 0; second side: smaller markers, colour 1.
            s=np.concatenate([np.full(n1, 10.0), np.full(n2, 5.0)]),
            c=np.concatenate([np.zeros(n1), np.ones(n2)]),
            cmap='tab10'
        )
        plt.title([m1, m2, layer_names1[l1], layer_names2[l2]])
        plt.show()

        umaps.append([m1, m2, layer_names1[l1], layer_names2[l2], xy_pair])
        # Mirrored entry so the pair can be looked up in either order. Skipped when
        # both sides are the same layer of the same model, where it would be an
        # exact duplicate of the entry above.
        # NOTE(review): the mirrored entry shares the same array, whose rows still
        # list the (m1, l1) neurons first; confirm consumers do not assume the row
        # order follows the order of names in the entry.
        if not (same_model and l1 == l2):
            umaps.append([m2, m1, layer_names2[l2], layer_names1[l1], xy_pair])
        

### Neuron UMAP, each individual layer

In [None]:
# One figure per stored pair embedding.
for m1, m2, l1, l2, embedding in umaps:
    plt.figure()
    plt.scatter(
        embedding[:, 0],
        embedding[:, 1],
        s=10,
        linewidth=0.1,
        edgecolors="#333",
        # Without a labelled artist plt.legend() has nothing to draw and only
        # emits a warning.
        label=f'{m1}, {l1} + {m2}, {l2}',
    )
    plt.axis("equal")
    plt.legend()
    plt.show()

## Save UMAP to file

In [None]:
# Model name held in `m1`. Handles both a directory path with a trailing slash
# (e.g. 'my_data/resnet18/') and an already-extracted bare name (e.g. 'resnet18').
# The loops above leave a bare name in `m1`, for which `split('/')[-2]` raises
# IndexError.
m1.rstrip('/').split('/')[-1]

In [None]:
# Layer names of the last `sim1` handled by the pairwise loop above.
layer_names1

In [None]:
import os

In [None]:
# Folder that receives the saved embeddings; created together with any missing
# parent directories, and left alone when it already exists.
dir_out = 'my_data/neuron_umaps'
os.makedirs(name=dir_out, mode=0o777, exist_ok=True)

In [None]:

# Write every pair embedding to "<dir_out>/<m1>_<l1>_<m2>_<l2>.npy" and echo the path.
for m1, m2, l1, l2, embedding in umaps:
    fn = f'{m1}_{l1}_{m2}_{l2}'
    out_path = f"{dir_out}/{fn}.npy"
    np.save(out_path, embedding)
    print(out_path)

---

In [None]:
# #TODO? SpectralCoclustering

# from sklearn.cluster import SpectralCoclustering

# clustering = SpectralCoclustering().fit(im)

# row = np.argsort(clustering.row_labels_)
# col = np.argsort(clustering.column_labels_)


## Save neuron top-n concept indices to file

In [None]:
# for sim in similarity_load:
#     layer_name = sim['layer']
#     sim = sim['similarities'].argsort(descending=True)
#     sim = sim[:,:100] ## get top
#     sim = sim.type(torch.int32)
#     fn = f'{dir_out}/concepts_top100_{layer_name}.npy'
#     np.save(fn, sim.numpy())

## Copy vocabulary file

In [None]:
# !cp data/20k.txt {dir_out}/vocabulary_20k.txt