# Creating a dataset class with activations from the hooks and corresponding tokens

In [None]:
%pip install transformer_lens

In [2]:
from typing import Any

from tqdm import tqdm
import einops
import numpy as np
import torch
from datasets import Dataset, IterableDataset, load_dataset
from numpy.typing import NDArray
from pydantic import BaseModel, ConfigDict
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from transformer_lens import HookedTransformer

In [6]:
class DatasetConfig(BaseModel):
    """Settings describing which dataset to load and how its text column is tokenized.

    Instances are immutable (``frozen=True``) and unknown fields are rejected
    (``extra="forbid"``).
    """
    # Pydantic behaviour: forbid unknown fields and make instances read-only.
    model_config = ConfigDict(extra="forbid", frozen=True)
    # Dataset identifier; passed as the first argument to `load_dataset`.
    dataset_name: str
    # Presumably whether the dataset already holds token ids instead of raw
    # text; not read by the code visible in this file — TODO confirm.
    is_tokenized: bool = True
    # Tokenizer identifier; only referenced by a commented-out
    # `AutoTokenizer.from_pretrained` call in this file.
    tokenizer_name: str
    # Forwarded to `load_dataset(streaming=...)`.
    streaming: bool = True
    # Forwarded to `load_dataset(split=...)`.
    split: str
    # Presumably the context length in tokens; not read by the code visible
    # in this file — TODO confirm.
    n_ctx: int
    # Presumably a shuffling/random seed; not read by the code visible in this
    # file — TODO confirm.
    seed: int = 0
    column_name: str = "input_ids"
    """The name of the column in the dataset that contains the data (tokenized or non-tokenized).
    Typically 'input_ids' for datasets stored with e2e_sae/scripts/upload_hf_dataset.py, or "tokens"
    for datasets tokenized in TransformerLens (e.g. NeelNanda/pile-10k)."""

In [8]:
# Build a validated, immutable config *instance*.
# Assigning attributes on the DatasetConfig class itself would skip pydantic
# validation, replace the class-level `model_config`, and change the defaults
# seen by every other user of the class.
dataset_config = DatasetConfig(
    # Pre-tokenized alternative: 'apollo-research/roneneldan-TinyStories-tokenizer-gpt2'
    dataset_name='roneneldan/TinyStories',
    is_tokenized=False,  # True for the pre-tokenized alternative
    tokenizer_name='gpt2',
    # True - means you do not download the dataset https://huggingface.co/docs/datasets/en/stream
    streaming=True,
    split='train',  # ['train', 'validation']
    n_ctx=512,
    seed=0,
    # NOTE: not read by the cells in this file; the extraction loop indexes the
    # 'text' column directly.
    column_name="input_ids",
)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA instead of failing when the model is moved.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = HookedTransformer.from_pretrained("tiny-stories-1M").to(device)
# Dataset name, streaming mode and split all come from the config object.
dataset = load_dataset(
    dataset_config.dataset_name,
    streaming=dataset_config.streaming,
    split=dataset_config.split,
)
# tokenizer = AutoTokenizer.from_pretrained(dataset_config.tokenizer_name) # 'gpt2' for tiny-stories-1M

In [None]:
"""
Gather hidden representations of the residual stream after block 5
    (hook point: blocks.5.hook_resid_post)
"""

from itertools import islice

# Hook point whose output is collected: the residual stream after block 5.
hook_name = 'blocks.5.hook_resid_post'

n_seqs = 3200  # Num sequences to process
embs_list = []     # per-sequence activation arrays (BOS position removed)
tokens_list = []   # per-sequence token ids, aligned row-for-row with embs_list
text_id_list = []  # sequence index repeated once per activation row
seqs_saved = []    # raw text of every processed sequence

# Switching to eval mode is loop-invariant, so do it once up front.
model.eval()
with torch.no_grad():
    # islice stops after exactly n_seqs items; checking `i >= n_seqs` after
    # the loop body would process one sequence too many.
    for i, v in tqdm(enumerate(islice(dataset, n_seqs)), total=n_seqs):
        text_ = v['text']
        # Tokenize once and feed the same ids to the model, so the stored
        # tokens always line up with the stored activations (same truncation,
        # same BOS handling).
        tokens_ = model.to_tokens(text_, prepend_bos=True)
        # Only cache the single hook that is read below.
        logits_, cache_ = model.run_with_cache(tokens_, names_filter=hook_name)
        # Batch element 0; position 0 is the BOS token and is skipped.
        acts_ = cache_[hook_name][0, 1:].cpu().numpy()
        tokenized = tokens_[0, 1:].tolist()
        # Append to lists
        embs_list.append(acts_)
        tokens_list.append(tokenized)
        # In-place extend avoids re-copying the whole list every iteration.
        text_id_list.extend([i] * len(acts_))
        seqs_saved.append(text_)

In [10]:
from pathlib import Path
import pandas as pd

# Stack the per-sequence activation arrays into one (rows, features) matrix.
# np.concatenate does this in a single native call instead of iterating over
# every row in Python; the empty case keeps the result np.array([]) would give.
if embs_list:
    embs_flat = np.concatenate(embs_list, axis=0)
else:
    embs_flat = np.array([])
# Decode every token id on its own so there is one string per activation row.
tokens_str_list_flat = [
    model.tokenizer.decode(token_id)
    for seq_token_ids in tokens_list
    for token_id in seq_token_ids
]
# Sanity check: activations, token strings and sequence ids must stay aligned.
assert (len(embs_flat)==len(tokens_str_list_flat)==len(text_id_list)), 'Not equal len'

# Save embedding matrix
print(f"Embedding matrix takes {embs_flat.nbytes/1024**2:.3f} Mb")
datadir = Path('./')
filepath_acts = datadir / "emb_matrix.npy"
np.save(filepath_acts, embs_flat)

# Index table: row i of the embedding matrix <-> decoded token <-> source sequence.
df_acts_tokens = pd.DataFrame({
    'emb_id': np.arange(len(embs_flat), dtype=np.int32),
    'token_str': tokens_str_list_flat,
    'seq_id': np.asarray(text_id_list, dtype=np.int32),
})

In [None]:
# Preview the first rows of the activation/token index table.
df_acts_tokens.head()

In [12]:
# Play with sorting tokens by neuron activations
# qq = np.array(embs_flat)
# df_acts_tokens['nn'] = qq[:, 1]
# df_acts_tokens.sort_values(by='nn', ascending=False).head(20)

In [13]:
# Save the dataset of texts activations have been acquired from
df_texts = pd.DataFrame()
df_texts['seq'] = seqs_saved
df_texts['seq_id'] = df_texts.index

In [14]:
# # Left join activations dataframe with sequence dataframe
# df_acts_tokens.merge(df_texts, how='left', on='seq_id').info()

In [17]:
# Write both tables as CSV into datadir; the pandas index is not written.
acts_csv_path = datadir / "tinystories1M-TinyStories1_gpt2token-acts_id.csv"
texts_csv_path = datadir / "tinystories1M-TinyStories1_gpt2token-texts.csv"
df_acts_tokens.to_csv(acts_csv_path, index=False)
df_texts.to_csv(texts_csv_path, index=False)

In [None]:
# Drop the references to the large in-memory tables and the embedding matrix.
del df_acts_tokens, df_texts, embs_flat

In [311]:
# from pathlib import Path
# datadir = Path("./")

# # Load DataFrames with embedding ids and texts
# df_acts_filename = datadir / "tinystories1M-TinyStories1_gpt2token-acts_id.csv"
# df_texts_filename = datadir / "tinystories1M-TinyStories1_gpt2token-texts.csv"
# df_acts_tokens = pd.read_csv(df_acts_filename)
# df_texts= pd.read_csv(df_texts_filename)

# # Load embedding matrix
# with open(datadir / 'emb_matrix.npy', 'rb') as f:
#     embs_flat = np.load(f)
    
# # # Left join activations dataframe with sequence dataframe
# # df_acts_tokens_texts = df_acts_tokens.merge(df_texts, how='left', on='seq_id').info()