# Setup

In [1]:
# Mount Google Drive into the Colab filesystem; a later cell reads its input
# file from under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# %%capture
!pip install kmapper matplotlib numpy scikit_learn umap umap-learn

Collecting kmapper
  Downloading kmapper-2.1.0-py3-none-any.whl.metadata (4.9 kB)
Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading kmapper-2.1.0-py3-none-any.whl (126 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: umap
  Building wheel for umap 

In [3]:
import pickle

import kmapper as km
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.cluster
import sklearn.manifold as manifold
import umap
from kmapper.jupyter import display

# Load SAE features

In [7]:
# Choose which pickled weight file to analyse. `fn` is also reused further
# down as the prefix of the output file name.
# Alternative inputs:
# fn = 'ts-1L-21M_Wdec'
# fn = 'ts-2L-33M_Wdec'
fn = 'gpt2-small-8-res-jb_Wdec'
file_path = f'/content/drive/MyDrive/{fn}.pkl'

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at pickles from a trusted source.
with open(file_path, 'rb') as weights_file:
    feature_weights = pickle.load(weights_file)

In [8]:
# `feature_weights` is presumably a torch tensor (it exposes
# .detach()/.cpu()/.numpy()) -- TODO confirm against the pickled object.
# Drop autograd tracking and move it to host memory before converting.
weights_on_cpu = feature_weights.detach().cpu()

# NumPy view of the weights; this is the array handed to the Mapper pipeline.
data = weights_on_cpu.numpy()

# Show the array dimensions as a quick sanity check.
data.shape

(24576, 768)

# Load labels

In [9]:
import json

# Parse the feature-explanation export. The path is relative, so the file is
# looked up in the notebook's current working directory.
with open('gpt2-small-8-res-jb-explanations.json', 'rb') as explanations_file:
    feat_snip_dict = json.load(explanations_file)

In [10]:
# Build `labels` so that labels[i] is the description of feature i.
#
# The explanation records are not stored in feature order, so each description
# is placed at the position named by its own 'index' field instead of being
# appended in iteration order.
explanations = feat_snip_dict['explanations']
description_by_index = {
    int(record['index']): record['description'] for record in explanations
}

# Size the list to cover the largest index as well as the record count, so an
# export that skips some features does not raise IndexError on assignment.
n_labels = max(len(explanations), max(description_by_index, default=-1) + 1)

# Features with no explanation get an empty string rather than the integer 0,
# which would turn into the text '0' once the list is converted with np.array.
labels = [description_by_index.get(i, '') for i in range(n_labels)]

In [11]:
# Spot-check a single entry of the label list.
labels[41]

' specific technical terms and numbers'

In [12]:
# Array form of the label list; it is passed to mapper.visualize() below as
# the custom tooltips.
fList_model_A = np.array(labels)

# Mapper

In [13]:
# Mapper hyper-parameters, named so they can be tuned in one place.
ISOMAP_COMPONENTS = 100  # dimensionality of the intermediate Isomap embedding
LENS_COMPONENTS = 2      # dimensionality of the final UMAP lens
UMAP_SEED = 1            # fixed seed; UMAP warns that it forces n_jobs=1 when a seed is set

mapper = km.KeplerMapper(verbose=1)  # initialize mapper

# Project the data to a 2-D lens with a two-step pipeline: 1) Isomap, 2) UMAP.
lens_pipeline = [
    manifold.Isomap(n_components=ISOMAP_COMPONENTS, n_jobs=-1),
    umap.UMAP(n_components=LENS_COMPONENTS, random_state=UMAP_SEED),
]
projected_data = mapper.fit_transform(data, projection=lens_pipeline)

# Build the Mapper graph: map() receives the 2-D lens together with the
# original data and clusters with DBSCAN under the cosine metric.
# `sklearn.cluster` is a submodule that plain `import sklearn` does not load;
# the import cell imports it explicitly so this attribute access does not
# depend on another library having imported it as a side effect.
graph = mapper.map(
    projected_data,
    data,
    clusterer=sklearn.cluster.DBSCAN(metric="cosine"),
)

KeplerMapper(verbose=1)
..Composing projection pipeline of length 2:
	Projections: Isomap(n_components=100, n_jobs=-1)
		UMAP(random_state=1)
	Distance matrices: False
False
	Scalers: MinMaxScaler()
MinMaxScaler()
..Projecting on data shaped (24576, 768)

..Projecting data using: 
	Isomap(n_components=100, n_jobs=-1)


..Scaling with: MinMaxScaler()

..Projecting on data shaped (24576, 100)

..Projecting data using: 
	UMAP(random_state=1, verbose=1)

UMAP(n_jobs=1, random_state=1, verbose=1)
Wed Jul 24 18:27:37 2024 Construct fuzzy simplicial set
Wed Jul 24 18:27:37 2024 Finding Nearest Neighbors
Wed Jul 24 18:27:37 2024 Building RP forest with 13 trees


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Wed Jul 24 18:27:41 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	Stopping threshold met -- exiting after 4 iterations
Wed Jul 24 18:27:56 2024 Finished Nearest Neighbor Search
Wed Jul 24 18:27:59 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Jul 24 18:28:16 2024 Finished embedding

..Scaling with: MinMaxScaler()

Mapping on data shaped (24576, 768) using lens shaped (24576, 2)

Creating 100 hypercubes.

Created 215 edges and 612 nodes in 0:00:01.446255.


In [14]:
# Assemble a deliberately verbose identifier from the run settings, so several
# Mapper variants of the same dataset can be saved side by side.
meta = graph['meta_data']

# Each estimator entry is reduced to the text before its first '(' (the class
# name in the recorded output); the pieces are then joined with underscores.
fileID = '_'.join([
    fn,
    'projection=' + meta['projection'].split('(')[0],
    'n_cubes=' + str(meta['n_cubes']),
    'perc_overlap=' + str(meta['perc_overlap']),
    'clusterer=' + meta['clusterer'].split('(')[0],
    'scaler=' + meta['scaler'].split('(')[0],
])

fileID

'gpt2-small-8-res-jb_Wdec_projection=UMAP_n_cubes=10_perc_overlap=0.1_clusterer=DBSCAN_scaler=MinMaxScaler'

In [15]:
# Row indices 0..n-1 of `data` as an array.
# NOTE: this rebinds `labels`, which until now held the feature descriptions;
# `fList_model_A` still holds those descriptions.
labels = np.array(list(range(data.shape[0])))

In [16]:
# Render the Mapper graph to a standalone HTML file named after the run settings.
#
# visualize() returns the complete HTML document as a string. Binding the
# result to a name, instead of leaving the call as the cell's last expression,
# keeps that document from being echoed into the notebook output.
html_text = mapper.visualize(
    graph,
    path_html=fileID + ".html",
    title=fileID,
    # Tooltips show the feature descriptions; pass `labels` (row indices)
    # here instead to show feature ids.
    custom_tooltips=fList_model_A,
    # NOTE(review): 'test' looks like a placeholder legend name -- confirm.
    color_function_name='test',
    # Per-node aggregations offered for colouring the graph.
    node_color_function=np.array(['average', 'std', 'sum', 'max', 'min']),
)



Wrote visualization to: gpt2-small-8-res-jb_Wdec_projection=UMAP_n_cubes=10_perc_overlap=0.1_clusterer=DBSCAN_scaler=MinMaxScaler.html




In [17]:
from google.colab import files

# Send the HTML file written by the visualisation step to the browser as a download.
html_name = fileID + ".html"
files.download(html_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>