In [2]:
import pandas as pd
# Dataset families with a between-class top-k CSV; the first entry is the one loaded.
classes = ["empathetic_dialogue", "math", "mmlu", "programming"]
selected_class = classes[0]
path = "./datasets/dot_product/" + selected_class + "_between_class_clean_topk.csv"
# Alternative input: path = "./datasets/dot_product/negative_feelings.csv"

# utf-8-sig strips a leading BOM if the file has one.
df = pd.read_csv(path, encoding="utf-8-sig")
df.head(20)

Unnamed: 0,type,layer,class1,class2,Neuron_ID,Contribution,Explanation
0,mlp,24,apprehensive,trusting,282,15015042.0,hyperlinks and web addresses
1,att,22,apprehensive,trusting,14953,13710207.0,attends to specific significant tokens from s...
2,mlp,24,apprehensive,trusting,9329,12911080.0,legal terms and phrases related to court case...
3,att,12,apprehensive,trusting,2424,12815097.0,attends from tokens denoting an action or con...
4,att,7,apprehensive,trusting,10364,11636983.0,attends to instances of the token followed by...
5,res,1,apprehensive,trusting,12054,11112904.0,content related to medical conditions and trea...
6,mlp,7,apprehensive,trusting,6117,10844195.0,topics related to nutrition and health measur...
7,att,5,apprehensive,trusting,9670,10387046.0,attends to specific tokens lacking any conten...
8,att,23,apprehensive,trusting,5279,10378344.0,"attends to the token ""by"" from the surroundin..."
9,res,12,apprehensive,trusting,2620,9890904.0,"words associated with authority, control, and ..."


In [3]:
import requests

def get_feature(model_id, source, index, timeout=30):
    """Return the first explanation text Neuronpedia has for one feature.

    Sends a GET to
    ``https://www.neuronpedia.org/api/feature/{model_id}/{source}/{index}``
    and reads the ``description`` of the first entry in the response's
    ``explanations`` list.

    Args:
        model_id: Model segment of the URL (e.g. ``"gemma-2-2b"``).
        source: Source segment of the URL (e.g. ``"24-gemmascope-mlp-16k"``).
        index: Feature index segment of the URL.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled connection cannot block the whole fetch loop.

    Returns:
        The description string, or ``None`` when the request fails, the
        status is not successful, the body is not JSON, or the feature has
        no explanations. This function never raises for those cases.
    """
    url = f"https://www.neuronpedia.org/api/feature/{model_id}/{source}/{index}"
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()  # raises on 4xx/5xx responses
        feature = resp.json()
    except (requests.RequestException, ValueError):
        # Network error, timeout, bad status, or undecodable JSON body.
        return None

    # The list may be absent, null, or empty; treat all of those as "no explanation".
    explanations = feature.get("explanations") if isinstance(feature, dict) else None
    if not isinstance(explanations, list) or not explanations:
        return None

    first = explanations[0]
    return first.get("description") if isinstance(first, dict) else None

In [4]:
from tqdm.auto import tqdm

# Shared settings for the fetch cells.
model = "gemma-2-2b"   # model segment of the Neuronpedia feature URL
to_process = 50000     # number of leading rows the later cells slice off `df`
explanations = []

# Make sure the target column exists before any cell writes into it.
if "Explanation" not in df:
    df["Explanation"] = None

In [None]:
# Sequential fetch: fill in missing explanations one request at a time for the
# first `row_limit` rows. Rows that already have an explanation are skipped, so
# the cell can be re-run to resume.
row_limit = 25000
rows = df.head(row_limit)

try:
    for idx, row in tqdm(rows.iterrows(), total=len(rows), desc="Fetching features"):
        if pd.notna(row["Explanation"]):
            continue
        source = f'{row["layer"]}-gemmascope-{row["type"]}-16k'
        df.at[idx, "Explanation"] = get_feature(model, source, row["Neuron_ID"])
finally:
    # Persist whatever was fetched even if the loop is interrupted; otherwise
    # the skip-if-present resume logic has nothing to resume from.
    df.to_csv(path, index=False, encoding="utf-8-sig")

Fetching features:   0%|          | 0/25000 [00:00<?, ?it/s]

In [5]:
import pandas as pd
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

model = "gemma-2-2b"

def parallel_fetch_feature(args):
    """Fetch the explanation for one row; worker function for the thread pool.

    Args:
        args: ``(idx, row)`` pair, where ``row`` supports lookup by the keys
            ``"type"``, ``"layer"`` and ``"Neuron_ID"``.

    Returns:
        ``(idx, explanation)``. ``explanation`` is ``None`` when the lookup
        fails. The row label is always returned, so a failure can never make
        the caller write to a bogus ``None`` label.
    """
    # Unpack outside the try so idx is available in the error path.
    idx, row = args
    try:
        sae_type = row["type"]
        layer = row["layer"]
        neuron_id = row["Neuron_ID"]
        source = f"{layer}-gemmascope-{sae_type}-16k"

        return idx, get_feature(model, source, neuron_id)
    except Exception as e:
        # Best effort: report and leave this row's explanation missing.
        print(e)
        return idx, None

def parallel_approach(df, path, max_workers=8, limit=None):
    """Fill missing ``Explanation`` values concurrently and save ``df`` to ``path``.

    Only the first ``limit`` rows are considered. Rows whose ``Explanation``
    is already a non-empty, non-null value are skipped, so the function can be
    re-run to resume an earlier partial run.

    Args:
        df: DataFrame with ``type``, ``layer`` and ``Neuron_ID`` columns;
            modified in place (an ``Explanation`` column is added if absent).
        path: CSV path the frame is written to (``utf-8-sig``, no index).
        max_workers: Thread-pool size.
        limit: Number of leading rows to consider. Defaults to the
            module-level ``to_process`` when omitted.

    Returns:
        None. The CSV is written even when the run is interrupted or fails
        part-way, so completed lookups are not lost.
    """
    if "Explanation" not in df.columns:
        df["Explanation"] = None

    if limit is None:
        limit = to_process
    df_subset = df.head(limit)

    # A row needs fetching when its explanation is null/NaN or an empty string.
    existing = df_subset["Explanation"]
    needs_fetch = existing.isna() | existing.eq("")
    rows_to_process = list(df_subset[needs_fetch].iterrows())
    skipped_count = len(df_subset) - len(rows_to_process)

    print(f"Skipping {skipped_count} rows that already have explanations")
    print(f"Processing {len(rows_to_process)} rows out of {len(df_subset)} total")

    if not rows_to_process:
        print("All rows already have explanations!")
        df.to_csv(path, index=False, encoding="utf-8-sig")
        return

    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_idx = {executor.submit(parallel_fetch_feature, args): args[0]
                             for args in rows_to_process}
            try:
                for future in tqdm(as_completed(future_to_idx), total=len(rows_to_process), desc="Fetching features"):
                    # Write to the label recorded at submit time, not the one the
                    # worker returned, which may be a None failure sentinel.
                    idx = future_to_idx[future]
                    try:
                        _, feat = future.result()
                    except Exception as exc:
                        print(f'Row {idx} generated an exception: {exc}')
                        continue
                    df.at[idx, "Explanation"] = feat
            except BaseException:
                # Interrupted: drop queued work so pool shutdown does not
                # wait for every remaining request.
                for pending in future_to_idx:
                    pending.cancel()
                raise
    finally:
        df.to_csv(path, index=False, encoding="utf-8-sig")

# Run the threaded fetch over `df` and write the result back to `path`.
parallel_approach(df, path, max_workers=8)  # Recommended for I/O bound operations

Skipping 24999 rows that already have explanations
Processing 25001 rows out of 50000 total


Fetching features:   0%|          | 0/25001 [00:00<?, ?it/s]

In [None]:
# Manual save of the current in-memory frame back to the source CSV (no index column).
df.to_csv(path, index=False, encoding="utf-8-sig")


In [6]:
# Select the first `to_process` rows (the same slice the fetch cells work on).
subset = df.iloc[:to_process]

# Count None/NaN values in the 'Explanation' column.
none_count = subset["Explanation"].isna().sum()

# Report the actual slice size instead of a hard-coded row range.
print(f"Number of missing explanations in the first {len(subset)} rows: {none_count}")

Number of missing explanations in rows 0–24999: 3


In [7]:
# Take the first `to_process` rows (the same slice the fetch cells work on).
subset = df.iloc[:to_process]

# Keep only the rows whose Explanation is still missing.
missing = subset[subset["Explanation"].isna()]

# Last expression of the cell: rendered as a table by the notebook.
missing


      type  layer        class1       class2  Neuron_ID  Contribution  \
3971   att     24  apprehensive      excited       9145        2891.0   
4122   att     24    devastated    disgusted       9099        2754.0   
49999  mlp     12    devastated  sentimental       6384          63.0   

      Explanation  
3971         None  
4122         None  
49999        None  


In [None]:
# Cast both identifier columns to pandas' nullable integer dtype.
for id_column in ("layer", "Neuron_ID"):
    df[id_column] = df[id_column].astype("Int64")

# Within class

In [1]:
import pandas as pd

In [3]:
# Dataset families available for the within-class analysis.
# NOTE(review): "empethetic_dialogue" is spelled differently from the
# "empathetic_dialogue" used for the top-k file earlier in this notebook.
# Confirm which spelling the files on disk use before changing it.
DATASET_NAMES = ["programming", "mmlu", "math", "empethetic_dialogue"]
# Alternative set:
#   "negative_feelings", "law_and_policy", "positive_feelings", "philosophy_and_ethics"

# Index into DATASET_NAMES of the dataset loaded below.
DEPLOYMENT_NAME = 0

PATH = "./datasets/dot_product/{}_between_class_clean.csv".format(DATASET_NAMES[DEPLOYMENT_NAME])
print(PATH)

./datasets/dot_product/programming_between_class_clean.csv


In [4]:
# Load the selected dataset, forcing both identifier columns to plain ints.
column_types = {"layer": int, "Neuron_ID": int}
df = pd.read_csv(PATH, dtype=column_types)
df.head()

Unnamed: 0,type,layer,class1,class2,Neuron_ID,Contribution
0,mlp,7,external,codeforces,6117,694673600000.0
1,mlp,23,external,codeforces,10184,639242100000.0
2,mlp,24,external,codeforces,9329,611876100000.0
3,res,1,external,codeforces,12054,549871800000.0
4,mlp,4,external,codeforces,2793,468349900000.0


In [25]:
# List the column labels of the current `df`.
# NOTE(review): the recorded output includes an 'Explanation' column that the
# load cell's preview does not show, and the execution counts are out of
# order. This was presumably run against a different `df`; confirm on a
# fresh kernel.
df.columns

Index(['type', 'layer', 'class1', 'class2', 'Neuron_ID', 'Contribution',
       'Explanation'],
      dtype='object')

In [28]:
import sqlite3
import pandas as pd

# Annotate every dataset's CSV with a description looked up row by row in
# explanations.db. Assumes DATASET_NAMES is defined and the `explanations`
# table has columns typ, layer, neuron_index, description.
# NOTE(review): each source file name is built from its DATASET_NAMES entry,
# so the entries must match the file names on disk exactly.
lookup_sql = """
    SELECT description
      FROM explanations
     WHERE typ = ?
       AND layer = ?
       AND neuron_index = ?
"""

conn = sqlite3.connect("./datasets/explanations.db")
cur = conn.cursor()
try:
    for dataset_name in DATASET_NAMES:
        # Read THIS dataset's file. Indexing DATASET_NAMES with the fixed
        # DEPLOYMENT_NAME would annotate the same file once per dataset.
        src_path = f"./datasets/dot_product/{dataset_name}_between_class_clean.csv"
        out_path = f"./datasets/dot_product/{dataset_name}_with_explanations.csv"

        # Read with the right dtypes.
        df = pd.read_csv(src_path, dtype={"layer": int, "Neuron_ID": int})

        descriptions = []
        for typ, layer, neuron_id in zip(df["type"], df["layer"], df["Neuron_ID"]):
            # Bind plain Python ints so sqlite3 never sees a numpy scalar.
            cur.execute(lookup_sql, (typ, int(layer), int(neuron_id)))
            result = cur.fetchone()
            descriptions.append(result[0] if result else None)

        # Assign back into the DataFrame and write out (no index column).
        df["Explanations"] = descriptions
        df.to_csv(out_path, index=False)
        print(f"Wrote {out_path}")
finally:
    # Release the database even when the loop fails or is interrupted.
    cur.close()
    conn.close()


  df = pd.read_csv(src_path, dtype={ "layer": int, "Neuron_ID": int })


KeyboardInterrupt: 

In [5]:
from tqdm import tqdm
import sqlite3
import pandas as pd

# Load the whole explanations table once, then left-join it onto each dataset.
# The connection is only needed for the load, so it is closed straight away,
# including when the query fails.
conn = sqlite3.connect("./datasets/explanations.db")
try:
    expl_df = pd.read_sql_query(
        "SELECT typ, layer, neuron_index, description FROM explanations",
        conn,
        dtype={"typ": str, "layer": int, "neuron_index": int},
    )
finally:
    conn.close()

for dataset_name in tqdm(DATASET_NAMES, desc="Datasets"):
    # Read THIS dataset's file. Indexing DATASET_NAMES with the fixed
    # DEPLOYMENT_NAME would write the same data under every output name.
    df = pd.read_csv(f"./datasets/dot_product/{dataset_name}_between_class_clean.csv",
                     dtype={"type": str, "layer": int, "Neuron_ID": int})

    # Left join keeps every dataset row; rows without a match get a null
    # "Explanations" value.
    merged = (
        df
        .merge(
            expl_df,
            left_on=["type", "layer", "Neuron_ID"],
            right_on=["typ", "layer", "neuron_index"],
            how="left",
        )
        .drop(columns=["typ", "neuron_index"])
        .rename(columns={"description": "Explanations"})
    )

    merged.to_csv(f"./datasets/dot_product/{dataset_name}_with_explanations.csv",
                  index=False)


Datasets: 100%|██████████| 4/4 [00:09<00:00,  2.40s/it]


# Auto interpret between class

In [3]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('./datasets/explanations.csv')

# Create the dictionary mapping (typ, layer, neuron_index) -> description
mapping = df.set_index(['typ', 'layer', 'neuron_index'])['description'].to_dict()

# Display the resulting dictionary
mapping

{('res',
  0,
  11837): 'references to a specific individual or entity denoted by "Del."',
 ('res', 0, 11838): 'expressions of concern or indifference from the speaker',
 ('res', 0, 11839): ' instances of the word "notice" in various contexts',
 ('res',
  0,
  1184): 'references to specific technical or programming terms and their associated components',
 ('res', 0, 11840): ' patterns related to database row processing',
 ('res', 0, 11841): 'references to poetry and poetic forms',
 ('res',
  0,
  11842): 'terms and concepts related to statistics, analysis, and programming for technical applications',
 ('res',
  0,
  11843): 'phrases associated with achieving goals or reaching targets',
 ('res',
  0,
  11844): 'instances of the word "touch" in various forms and contexts',
 ('res', 0, 11845): 'references to environmental factors and measurements',
 ('res',
  0,
  11846): 'terms related to legal concepts, specifically judicial canons and their implications',
 ('res', 0, 11847): ' referenc

In [8]:
import pandas as pd

name = "math"
PATH = f"./datasets/dot_product/Judge/{name}_with_explanations.csv"
df = pd.read_csv(PATH)   # the "<name>_with_explanations" file for the chosen dataset

# Look up each row's (type, layer, Neuron_ID) key in `mapping`; rows without
# a match get None. Zipping the three columns avoids building a Series per
# row the way DataFrame.apply(axis=1) does.
df['Explanations'] = [
    mapping.get(key)
    for key in zip(df['type'], df['layer'], df['Neuron_ID'])
]

# Overwrites the input file in place.
df.to_csv(PATH, index=False)
df.head()


  type  layer       class1      class2  Neuron_ID  Contribution  \
0  mlp      7  precalculus    geometry       6117  4.830805e+09   
1  mlp     24  precalculus    geometry       9329  4.825307e+09   
2  mlp     23  precalculus    geometry      10184  4.382280e+09   
3  res      1  precalculus    geometry      12054  3.802484e+09   
4  att      4     geometry  prealgebra       4742  3.458361e+09   

                                        Explanations  
0   topics related to nutrition and health measur...  
1   legal terms and phrases related to court case...  
2  mathematical or analytical expressions related...  
3  content related to medical conditions and trea...  
4   attends to specific keywords marked with doub...  
