In [1]:
!nvidia-smi

Wed Jul  2 13:25:39 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 ...    On  | 00000000:86:00.0 Off |                  N/A |
| 27%   29C    P8              16W / 250W |      1MiB /  8192MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import pandas as pd
import ast
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Each dataset is read from "<name>_processed.csv" in the working directory.
DATASET_NAMES = ["emotion", "math", "mmlu", "programming"]
DATASET_PATHS = ["{}_processed.csv".format(name) for name in DATASET_NAMES]

# Read every CSV and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat(list(map(pd.read_csv, DATASET_PATHS)), ignore_index=True)



In [None]:
# Keep only rows with no missing value in any column.
# NOTE(review): this drops a row if *any* column is NaN; if the four CSVs do not
# share identical columns, concat fills the gaps with NaN and whole datasets
# could be removed here -- confirm the column sets match.
df = df.dropna()

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

In [None]:
# Building blocks of the per-layer feature-id column names, which follow the pattern
#   gemma-scope-2b-pt-{TYPE_SAE}-canonical-layer_{LAYER_NUM}/width_16k/canonical-token_feature_ids
TYPE_SAE = ["mlp", "att", "res"]
LAYER_NUM = list(map(str, range(26)))  # "0" .. "25", kept as strings
TYPE_NAMES = ['empathetic_dialogue', 'math', 'mmlu', 'programming']

In [None]:
# For layer 0, compute the Jaccard similarity within each class and between classes.
import pandas as pd
import ast
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
import os
import ast
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import pairwise_distances



In [None]:

# --- Plotting Function ---
def plot_similarity_over_layers(df_long, current_sae_type, metric_name="Cosine"):
    """
    Plot the average similarity per layer, one line per ordered class pair,
    show the figure and save it as a PNG under ``figures/``.

    Parameters
    ----------
    df_long : pandas.DataFrame
        Long-form scores; the columns 'layer', 'type1', 'type2' and 'avg_sim'
        are read. The frame is not modified.
    current_sae_type : str
        SAE type label; shown upper-cased in the title and used verbatim in the
        output file name.
    metric_name : str, optional
        Name of the similarity metric stored in 'avg_sim'. It is used in the
        title, the y-axis label and (lower-cased) the file name. The default
        "Cosine" reproduces the historical labels and file name; pass the metric
        that was actually computed (e.g. "Jaccard") so the figure is labelled
        correctly.

    Returns
    -------
    None
    """
    # Build the legend label on a copy: assigning into df_long directly would
    # silently add a 'pair' column to the caller's DataFrame.
    plot_df = df_long.assign(pair=df_long['type1'] + ' vs ' + df_long['type2'])

    # Explicit figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(13, 7))
    sns.lineplot(
        data=plot_df,
        x='layer', y='avg_sim',
        hue='pair',          # Each pair gets its own color/line
        palette='tab20',     # Palette with up to 20 distinct colors
        marker='o',          # Add a circle at each layer's data point
        linewidth=1.5,
        ax=ax,
    )

    ax.set_ylim(0, 1)
    ax.set_title(
        f"Average {metric_name} Similarity by Layer & Class-Pair - SAE Type: {current_sae_type.upper()}",
        fontsize=16,
    )
    ax.set_xlabel("Layer", fontsize=12)
    ax.set_ylabel(f"Avg. {metric_name} Similarity", fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Place legend outside the plot
    ax.legend(
        title="Class Pair",
        bbox_to_anchor=(1.02, 1),
        loc='upper left'
    )

    # Ensure the 'figures' directory exists
    os.makedirs("figures", exist_ok=True)
    metric_slug = metric_name.lower().replace(" ", "_")
    output_path = os.path.join("figures", f"{metric_slug}_similarity_by_layer-{current_sae_type}.png")

    # Use bbox_inches='tight' to ensure the legend is not cut off
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls (one per SAE type) do not pile up open figures.
    plt.close(fig)
    print(f"\nSaved plot to {output_path}")


In [None]:
import os
import pandas as pd
import ast
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Main Execution ---

# Assuming 'df' is your pre-loaded pandas DataFrame with all required columns
# df = pd.read_csv(...) 

# --- Settings ---
# Class labels that are compared against each other (matched on df['type']).
TYPE_NAMES = ['empathetic_dialogue', 'math', 'mmlu', 'programming']

# SAE types to process. Every entry -- including the commented-out ones -- keeps
# a trailing comma: without it, re-enabling a line would make Python glue the
# adjacent string literals together ("res" "mlp" -> "resmlp").
SAE_TYPES = [
    "res",
    # "mlp",
    # "att",
]

# Column-name template; {sae_type} and {L} (layer index) are filled in with str.format.
col_fmt = (
    'gemma-scope-2b-pt-{sae_type}-canonical-layer_{L}'
    '/width_16k/canonical-token_feature_ids'
)

# --- 1) Gather data and plot for each SAE Type ---
def jaccard_similarity_matrix(a, b):
    """
    Pairwise Jaccard similarity between the rows of two binary indicator matrices.

    Parameters
    ----------
    a, b : array-like of 0/1, shapes (n_a, n_features) and (n_b, n_features)
        Each row marks which features are present for one example.

    Returns
    -------
    numpy.ndarray of float32, shape (n_a, n_b)
        Entry [i, j] is |A_i & B_j| / |A_i | B_j|. When both rows are empty the
        union is 0; such pairs are reported as 1.0 (two empty sets are identical).
    """
    # float32 keeps the matrix product on the fast BLAS path; the counts involved
    # are small integers, which float32 represents exactly.
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    intersection = a @ b.T
    union = a.sum(axis=1)[:, None] + b.sum(axis=1)[None, :] - intersection
    similarity = np.ones_like(intersection)
    # Only divide where the union is non-empty; the remaining cells keep the 1.0 fill.
    np.divide(intersection, union, out=similarity, where=union > 0)
    return similarity


# The class membership of each row does not depend on the layer, so build the
# row masks once. Plain NumPy arrays make the positional indexing into bin_mat
# explicit (df's index is no longer contiguous after dropna).
type_masks = {t: (df['type'] == t).to_numpy() for t in TYPE_NAMES}
# Classes with no rows are skipped everywhere to avoid averaging empty matrices.
present_types = [t for t in TYPE_NAMES if type_masks[t].any()]

for sae_type in SAE_TYPES:
    print(f"--- Processing SAE Type: {sae_type.upper()} ---")
    rows = []

    for L in range(26):
        print(f"  Layer {L}...")
        # Correctly format the column name with the current SAE type and layer
        col = col_fmt.format(sae_type=sae_type, L=L)

        # Check if the column exists before proceeding
        if col not in df.columns:
            print(f"    Warning: Column '{col}' not found. Skipping.")
            continue

        # Parse the string lists into real Python lists
        lists = df[col].apply(ast.literal_eval)

        # Binarize all examples for the current layer (rows follow df's row order)
        bin_mat = MultiLabelBinarizer().fit_transform(lists)

        # Slice each class's rows once per layer instead of once per pair.
        per_type = {t: bin_mat[type_masks[t]] for t in present_types}

        # The mean over the (t1, t2) matrix equals the mean over (t2, t1), so
        # compute each unordered pair once and reuse it for both ordered rows.
        # NOTE(review): for t1 == t2 the mean includes every example compared
        # with itself (similarity 1), which raises the within-class average.
        pair_avg = {}
        for t1 in present_types:
            for t2 in present_types:
                key = frozenset((t1, t2))
                if key not in pair_avg:
                    sim = jaccard_similarity_matrix(per_type[t1], per_type[t2])
                    # Accumulate in float64 so the mean of a large float32 matrix stays accurate.
                    pair_avg[key] = float(sim.mean(dtype=np.float64))
                rows.append({
                    'layer': L,
                    'type1': t1,
                    'type2': t2,
                    'avg_sim': pair_avg[key],
                    'sae_type': sae_type,
                })

    # After processing all layers for the current SAE type, create the DataFrame
    if not rows:
        print(f"No data processed for SAE type '{sae_type}'. Skipping plot.")
        continue

    df_long = pd.DataFrame(rows)

    # --- 2) Call the plotting function ---
    plot_similarity_over_layers(df_long, sae_type)