In [None]:
import pandas as pd
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset, Dataset
import os
import random
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import json
from tqdm import tqdm
from einops import rearrange, repeat
import base64



#### change this stuff to fit your local machine

In [None]:

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
########### MODIFY THIS STUFF AS NEEDED!
# Everything in this cell is user configuration read by the cells below.

# Device the model and the ablation tensors are moved to.
device = "cuda"

# Batch size for the DataLoader and for batching ablated images.
batch_size = 256

# Seed applied to torch, numpy and random in the next cell.
seed = 42

MAX=500 # number of images to run on

use_image_samples = True # uses image sample indices path (otherwise just use all imagenet)
use_random_image_samples = False # uses random (up to seed) sample instead 

# When True, each layer's activation table is written to disk after collection.
save_activations = False

layer_nums = [7, 8, 9] # must be 0-9, the examples below assume their respective layers are specified up here

from path_util import data_path, my_draft_folder, imagenet_path # this is something on my machine, remove and replace the following with your own paths
# Template for the per-layer activation dump; `{0}` is filled with the layer number.
# NOTE(review): the extension is .npz but the file is written with DataFrame.to_parquet — confirm the intended format.
file_name = 'mlp_fc1_{0}.npz'
parquet_file_path = os.path.join(data_path, file_name)

# JSON mapping class-id strings to label entries; element [1] of each entry is used as the class name.
json_file_path = os.path.join(data_path,'imagenet_class_index.json')

image_sample_indices_path = os.path.join(my_draft_folder, 'imagenet_sample_indices.npy') # set to None to not use


# Figures are saved here; GPT transcripts (PDF) go into the "gpt" subfolder.
output_folder = os.path.join(my_draft_folder, 'outputs')

gpt_output_folder = os.path.join(output_folder, "gpt")


os.makedirs(output_folder, exist_ok=True)
os.makedirs(gpt_output_folder, exist_ok=True)
############################################################
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


In [None]:
# Seed every RNG used in this notebook so sampling is repeatable.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


# Column names of the long-format activation table (one row per image/patch/neuron).
BATCH_INDEX = 'batch_index'
IMAGE_INDEX = 'image_index'
GT_CLASS = 'gt_class'
PRED_CLASS = 'pred_class'
PATCH_INDEX = 'patch_index'
NEURON_INDEX = 'neuron_index'
ACTIVATION_VALUE = 'activation'

# Load the JSON file into a Python dictionary
with open(json_file_path, 'r') as file:
    num_to_word_dict = json.load(file)


# Get class names.
# Each JSON entry is indexed with [1] to obtain the readable label, so the
# fallback must also have two elements; the previous one-element fallback
# raised IndexError whenever a class id was missing from the file.
imagenet_class_nums = np.arange(0, 1000, 1)
imagenet_class_names = [
    str(num_to_word_dict.get(str(i), [None, "Unknown label"])[1])
    for i in imagenet_class_nums
]


#### Dataset

In [None]:

# Preprocessing applied to every image: resize to 224x224, then convert to a tensor.
data_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])


imagenet_data = datasets.ImageFolder(imagenet_path, transform=data_transforms)
if use_image_samples:

    if not use_random_image_samples:
        # Fixed, pre-computed list of dataset indices.
        image_samples = np.load(image_sample_indices_path)
    else:
        # Draw indices from the dataset itself. The previous code sampled from
        # len(imagenet_path), i.e. the number of characters in the path string.
        # Clamp the sample size because replace=False cannot oversample.
        num_available = len(imagenet_data)
        image_samples = np.random.choice(num_available, min(MAX, num_available), replace=False)

    imagenet_data = Subset(imagenet_data, image_samples)

class ReturnIndexDatasetWrapper(Dataset):
    """Dataset adapter that also returns the index of each sample.

    The wrapped dataset must yield ``(img, label)`` pairs; this wrapper
    yields ``(img, label, index)`` so downstream code can trace a batch
    element back to its position in the dataset.
    """

    def __init__(self, dataset):
        # Underlying dataset being decorated.
        self.dataset = dataset

    def __len__(self):
        # Same length as the wrapped dataset.
        return len(self.dataset)

    def __getitem__(self, index):
        pair = self.dataset[index]
        sample, target = pair
        return (sample, target, index)
    

# Wrap the (possibly subsetted) dataset so every item also carries its index,
# then batch it in dataset order (shuffle=False).
imagenet_data = ReturnIndexDatasetWrapper(imagenet_data)    
data_loader = DataLoader(imagenet_data, batch_size=batch_size, shuffle=False)


# Pull a single batch; the three names unpack (images, labels, indices).
imagestest, labelstest, index = next(iter(data_loader))


#### Model

In [None]:

class ClipWrapper(torch.nn.Module):
    """Bundle a TinyCLIP model with its processor for zero-shot classification.

    ``forward`` scores a batch of images against every name in
    ``imagenet_class_names`` and returns the raw CLIP model output.
    """

    def __init__(self, imagenet_class_names, device):
        super().__init__()
        checkpoint = "wkcn/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M"
        self.clip = CLIPModel.from_pretrained(checkpoint)
        # Make sure the do_rescale is false for pytorch datasets
        self.processor = CLIPProcessor.from_pretrained(checkpoint, do_rescale=False)
        self.imagenet_class_names = imagenet_class_names
        self.device = device

    def forward(self, x):
        encoded = self.processor(
            text=self.imagenet_class_names,
            images=x,
            return_tensors="pt",
            padding=True,
        )
        # processor returns cpu tensors even if input is cuda :/ Makes this whole wrapper ill-advised
        # TODO processor should run in Dataset instead
        on_device = {name: tensor.to(self.device) for name, tensor in encoded.items()}
        return self.clip(**on_device)


model = ClipWrapper(imagenet_class_names, device)

model.eval()
model.to(device)

# example of running model
images, labels, image_indices = next(iter(data_loader))

# Which element of the batch to display.
idx = 0

# Inference only: without no_grad the module-level `outputs` would keep the
# autograd graph of the whole batch alive for the rest of the session.
with torch.no_grad():
    outputs = model(images)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
predicted_indices = probs.argmax(dim=1)

# Images are channel-first; matplotlib wants channel-last.
plt.imshow(images.cpu()[idx].permute(1, 2, 0))
plt.title(f"Predicted: {imagenet_class_names[predicted_indices[idx].item()]}. True: {imagenet_class_names[labels[idx].item()]}")

plt.show()

#### Get the activations

In [None]:

# Shared buffer that forward hooks append captured activations to.
activations_list = []

#TODO do multiple layers at once
def register_hook(module, activations_list):
    """Attach a forward hook to ``module`` that records its output.

    Each forward pass appends a detached copy of the module output to
    ``activations_list``. Returns the handle from ``register_forward_hook``;
    call ``.remove()`` on it to stop recording.
    """
    def _capture(_module, _inputs, output):
        activations_list.append(output.detach())

    handle = module.register_forward_hook(_capture)
    return handle

def process_images(model, total_images, total_labels, image_indices, batch_idx, detach=True, flatten=True):
    """Run one batch through ``model`` and tabulate the hooked activations.

    A forward hook that appends to the module-level ``activations_list`` must
    already be registered; the first tensor it captures is unpacked as
    ``(batch, patch, neuron)``.

    Args:
        model: callable whose output exposes ``logits_per_image``.
        total_images: batch of images passed to ``model``.
        total_labels: tensor of ground-truth class ids, one per image.
        image_indices: per-image identifiers (tensor or sequence of ints).
        batch_idx: integer written into the batch-index column.
        detach: if True, the result is detached and moved to the CPU.
        flatten: if True, the result is reshaped to ``(batch*patch*neuron, 7)``;
            otherwise it stays ``(batch, patch, neuron, 7)``.

    Returns:
        A tensor whose last axis holds, in order: batch index, image index,
        ground-truth class, predicted class, patch index, neuron index and
        activation value. All columns share one dtype after concatenation,
        so the index columns are stored as floats.

    Raises:
        RuntimeError: if no activation was captured during the forward pass.
    """
    activations_list.clear()

    # Inference only. The hook already detaches what it stores and only the
    # argmax of the logits is used, so disabling autograd changes no result
    # and avoids holding a graph for every batch.
    with torch.no_grad():
        outputs = model(total_images)

    if not activations_list:
        raise RuntimeError(
            "No activations were captured; register a forward hook that "
            "appends to activations_list before calling process_images."
        )

    logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
    class_indices = probs.argmax(dim=1)
    total_labels = total_labels.to(class_indices.device)

    batch_activations = activations_list[0]

    # TODO Much better than looping but maybe could be restructured entirely?
    # (batch_idx, image_idx, class_name, predicted are constants per image and
    # integers; patch_idx and neuron_idx could be inferred from the shape.)
    # For now this mimics the exact layout of the old per-row list in one float tensor.
    b, p, n = batch_activations.shape
    dt = batch_activations.dtype
    dev = batch_activations.device

    patch_index = torch.arange(p, dtype=dt, device=dev)
    neuron_index = torch.arange(n, dtype=dt, device=dev)

    # Broadcast the per-axis indices to one entry per (image, patch, neuron).
    patch_index, neuron_index = torch.meshgrid(patch_index, neuron_index, indexing='ij')
    patch_index = repeat(patch_index, 'h w -> b h w 1', b=b)
    neuron_index = repeat(neuron_index, 'h w -> b h w 1', b=b)

    batch_indices = torch.full_like(neuron_index, fill_value=batch_idx)
    # as_tensor accepts both a tensor (from the DataLoader) and a plain list
    # without the copy-construct warning that torch.tensor(tensor) emits.
    image_indices = torch.as_tensor(image_indices, dtype=dt, device=dev)

    image_indices = repeat(image_indices, 'b -> b h w 1', h=p, w=n)
    gt_class_indices = repeat(total_labels, 'b -> b h w 1', h=p, w=n)
    pred_class_indices = repeat(class_indices, 'b -> b h w 1', h=p, w=n)

    activations_table = torch.cat(
        [
            batch_indices,
            image_indices,
            gt_class_indices,
            pred_class_indices,
            patch_index,
            neuron_index,
            batch_activations.unsqueeze(-1),
        ],
        dim=-1,
    )
    if flatten:
        activations_table = rearrange(activations_table, 'b h w c -> (b h w) c')
    if detach:
        activations_table = activations_table.detach().cpu()
    return activations_table

def tensor_activations_to_panda(activations, verbose=True, convert_to_string=False):
    """Convert a flattened activation table into a labelled DataFrame.

    Args:
        activations: 2-D table with seven columns in the order batch index,
            image index, ground-truth class, predicted class, patch index,
            neuron index, activation value (torch tensor or array-like).
        verbose: print progress messages during the optional name conversion.
        convert_to_string: if True, replace the two class-id columns with
            names looked up in ``imagenet_class_names`` (slow on large tables).

    Returns:
        DataFrame with integer index columns and the activation column kept
        in its original floating dtype.
    """
    # Hand pandas a NumPy array: building a DataFrame directly from a
    # torch.Tensor is slow and version-dependent.
    if isinstance(activations, torch.Tensor):
        activations = activations.detach().cpu().numpy()

    column_names = [BATCH_INDEX, IMAGE_INDEX, GT_CLASS, PRED_CLASS, PATCH_INDEX, NEURON_INDEX, ACTIVATION_VALUE]
    df_activations = pd.DataFrame(activations, columns=column_names)

    # convert everything but activation to int
    df_activations = df_activations.astype({name: int for name in column_names[:-1]})

    # convert class indices to string names (a bit slow)
    if convert_to_string:
        if verbose:
            print("converting to string...")
        #TODO only do this when actually displaying something
        df_activations[GT_CLASS] = df_activations[GT_CLASS].map(lambda x: imagenet_class_names[x])
        df_activations[PRED_CLASS] = df_activations[PRED_CLASS].map(lambda x: imagenet_class_names[x])
        if verbose:
            print("converting done.")

    return df_activations

def process_images_to_panda(model, total_images, total_labels, image_indices, batch_idx, verbose=False, convert_to_string=False):
    """Run ``process_images`` on one batch and return the result as a DataFrame."""
    activations_table = process_images(model, total_images, total_labels, image_indices, batch_idx)
    return tensor_activations_to_panda(
        activations_table,
        verbose=verbose,
        convert_to_string=convert_to_string,
    )


In [None]:

# layer number -> activation table in collection order / sorted by activation (descending)
activations_per_layer = {}
sorted_activations_per_layer = {}

# The loop below stops once `count` reaches MAX, i.e. after ceil(MAX / batch_size)
# batches (or earlier if the loader runs out). Floor division under-reported this.
num_batches = min(len(data_loader), -(-MAX // batch_size))

for layer_num in layer_nums:

    module = model.clip.vision_model.encoder.layers[layer_num].mlp.fc1 # Layer number here
    hook_handle = register_hook(module, activations_list)

    master_layer_activations = []
    count = 0

    try:
        for batch_idx, (total_images, total_labels, total_indices) in tqdm(enumerate(data_loader), total=num_batches):

            detailed_activations = process_images(model, total_images, total_labels, total_indices, batch_idx=batch_idx)
            master_layer_activations.append(detailed_activations)

            count += batch_size
            if count >= MAX:
                break
    finally:
        # Remove the hook even if a batch fails, so it cannot keep recording
        # into activations_list during later cells.
        hook_handle.remove()

    activations = torch.cat(master_layer_activations, dim=0)

    df_activations = tensor_activations_to_panda(activations)

    print("sorting..")
    #TODO pytorch sort?
    # Named df_sorted rather than `sorted` so the builtin is not shadowed for the rest of the session.
    df_sorted = df_activations.sort_values(by=[ACTIVATION_VALUE], ascending=False)
    print("sorting done")

    activations_per_layer[layer_num] = df_activations
    sorted_activations_per_layer[layer_num] = df_sorted
    if save_activations:
        #TODO option to load
        df_activations.to_parquet(parquet_file_path.format(layer_num), index=False)



### EXAMPLES

#### Visualization

In [None]:

    
def plot_image_patch_heatmap(images, heatmaps, titles=None, main_title="" ,width=4, alpha=0.4, save=None):
    """Show images in a grid, each overlaid with a per-patch heatmap.

    Args:
        images: sequence of channel-first image tensors (``permute(1, 2, 0)``
            is applied before display); the last axis is taken as the side length.
        heatmaps: sequence of square 2-D arrays/tensors, one per image; each
            cell is drawn as a block of ``image_size // heatmap_size`` pixels.
        titles: optional per-image titles (defaults to empty strings).
        main_title: figure-level title.
        width: maximum number of columns in the grid.
        alpha: opacity of the heatmap overlay.
        save: optional path; when given the figure is written there.

    Nothing is drawn or saved when ``images`` is empty.
    """
    if titles is None:
        titles = [''] * len(images)

    assert len(images) == len(heatmaps) == len(titles), "Images, heatmaps, and titles must have the same length."

    num_images = len(images)
    if num_images == 0:
        # plt.subplots(0, 0) raises, so there is nothing sensible to draw.
        return

    height = int(np.ceil(num_images/width))
    if height == 1:
        # A single row only needs as many columns as there are images.
        width = num_images
    # squeeze=False always yields a 2-D axes array, so no single-image special case is needed.
    fig, axes = plt.subplots(height, width, figsize=(5*width, 5*height), squeeze=False)
    flat_axes = axes.flatten()

    fig.suptitle(main_title, fontsize=12)

    for ax, image, heatmap, title in zip(flat_axes, images, heatmaps, titles):
        image_size = image.shape[-1]
        pixel_num = heatmap.shape[-1]
        patch_size = image_size // pixel_num

        # Create a heatmap overlay: blow each cell up to a patch_size x patch_size
        # block. Any remainder rows/columns (image_size not divisible by
        # pixel_num) stay zero, as before.
        heatmap_expanded = np.zeros((image_size, image_size))
        covered = pixel_num * patch_size
        cells = np.asarray(heatmap)[:pixel_num, :pixel_num]
        heatmap_expanded[:covered, :covered] = np.repeat(np.repeat(cells, patch_size, axis=0), patch_size, axis=1)

        # Plotting the image with the heatmap overlay
        ax.imshow(image.permute(1, 2, 0))
        # Honour the caller's alpha; it used to be ignored in favour of a hard-coded 0.6.
        ax.imshow(heatmap_expanded, cmap='viridis', alpha=alpha)
        ax.axis('off')
        ax.set_title(title)

    # Hide any remaining subplots that don't have data
    for ax in flat_axes[num_images:]:
        ax.axis('off')

    plt.tight_layout()

    if save is not None:
        plt.savefig(save)

    plt.show()

def get_activation_array(image_idx, neuron_idx, activations, pixel_num=14):
    """Return one neuron's activations on one image as a ``pixel_num`` x ``pixel_num`` grid.

    Rows of ``activations`` matching ``image_idx`` and ``neuron_idx`` are taken
    in their existing order; the first one is discarded and the rest reshaped.
    """
    is_image = activations[IMAGE_INDEX] == image_idx
    is_neuron = activations[NEURON_INDEX] == neuron_idx
    token_values = activations.loc[is_image & is_neuron, ACTIVATION_VALUE].to_numpy()

    # Skip the first entry (presumably the class token — TODO confirm) so that
    # only the patch grid remains.
    patch_values = token_values[1:]
    return patch_values.reshape(pixel_num, pixel_num)

def plot_activations_heatmap(image_indices, neuron_indices, titles, main_title="", activations=df_activations, dataset=imagenet_data , pixel_num=14, alpha=0.4, save=None):
    """Plot activation heatmaps for paired (image, neuron) indices.

    For every pair, the image is fetched from ``dataset`` and the neuron's
    patch grid from ``activations``; the results are handed to
    ``plot_image_patch_heatmap``.

    NOTE: the ``activations`` and ``dataset`` defaults are bound when this
    function is defined, not when it is called; pass them explicitly to use
    a different table or dataset.
    """
    pairs = list(zip(image_indices, neuron_indices))
    # Item 0 of a dataset entry is the image tensor.
    images = [dataset[image_idx][0] for image_idx, _ in pairs]
    heatmaps = [
        get_activation_array(image_idx, neuron_idx, activations, pixel_num)
        for image_idx, neuron_idx in pairs
    ]

    plot_image_patch_heatmap(images, heatmaps, titles, main_title=main_title, alpha=alpha, save=save)




In [None]:

# Number of top-activating images shown per neuron.
top_n = 10

# Hand-picked neurons to inspect, as (layer number, neuron index, human label).
# Every layer listed here must also appear in `layer_nums`, because the cells
# below look the layer up in `activations_per_layer`.
interesting_neurons = [
    # Layer 9
    (9, 145, 'Car. Details. Headlights and handles'),
    (9, 327, 'Cars'),
    (9, 398, 'Snowy white fluffy. Polysemantic.'),
    (9, 496, 'Beach'),
    (9, 179, 'Very polysemantic. faces, corners, coastal images, dogs...'),
    (9, 469, 'Snowy white fluffy'),
    (9, 659, 'Water, water animals'),
    
    # Layer 8
    (8, 79, 'Palm trees and fronds'),
    (8, 392, 'Curly text and designs'),
    (8, 490, 'Flames / glinty light reflections'),
    (8, 493, 'Text'),
    (8, 539, 'Faces (male?)'),
    (8, 699, 'Underwater backgrounds'),
    (8, 893, 'Faces'),
    (8, 927, 'Rims of hats, hats in general, hoods, and dog ears that may look like hats'),
    
    # Layer 7
    (7, 107, 'Side of the face / neck'),
    (7, 326, 'Text at corner of the image'),
    (7, 370, 'Top-of-the-head'),
    (7, 494, 'Pants/legs'),
    (7, 649, 'Logo'),
    (7, 670, 'Logo'),
]

#neuron_idx = 



In [None]:
#TODO could use speed ups from below
# For every hand-picked neuron, plot its activation heatmap on the top_n
# highest-activating images, keeping at most one image per ground-truth class.
for layer_num, neuron_idx, human_label in interesting_neurons:
    df_activations = activations_per_layer[layer_num]
    df_sorted = sorted_activations_per_layer[layer_num]

    #TODO why drop duplicates?
    neuron_rows = df_sorted[df_sorted[NEURON_INDEX] == neuron_idx]
    unique_top_entries = neuron_rows.drop_duplicates(subset=GT_CLASS).head(top_n)

    images = list(unique_top_entries[IMAGE_INDEX])
    titles = [
        f"Image: {image_index}\nClass: {class_name}"
        for image_index, class_name in zip(unique_top_entries[IMAGE_INDEX], unique_top_entries[GT_CLASS])
    ]

    main_title = f"ACTIVATIONS\nLayer/Neuron: {layer_num}/{neuron_idx}\n{human_label}"
    save_path = os.path.join(output_folder, f"{layer_num:02d}_{neuron_idx:04d}_activations.jpg")
    plot_activations_heatmap(
        images,
        [neuron_idx] * len(images),
        titles,
        main_title=main_title,
        activations=df_activations,
        dataset=imagenet_data,
        save=save_path,
    )

#### Finding key input patches

In [None]:

# this is a lot of computation, likely your notebook won't handle all neurons at once..

#TODO instead of zero-ing out a patch take a random patch from elsewhere?

#TODO this could use a cleaner version of process_images since much of it isn't used

# Number of patches along one side of the image; the ablation loop below
# visits num_patches_width x num_patches_width patches.
# TODO currently assuming square
num_patches_width = 14
#interesting_neurons = [
    # (7, 494, 'Pants/legs'),
    # (7, 649, 'Logo'),
    # (7, 670, 'Logo'),
    #(7, 326, 'Text at corner of the image'),

#]
# TODO these functions not really used much anymore
def image_to_batch(image):
    """Add a leading batch axis to ``image`` and return it as a tensor on ``device``."""
    batched = np.expand_dims(image, axis=0)
    return torch.from_numpy(batched).to(device)
def label_to_batch(label):
    """Wrap a single label in a one-element tensor on ``device``."""
    single = torch.tensor([label])
    return single.to(device)
#TODO some neurons share images so could do for both at once..
def register_hook_specific_neuron(module, activations_list, neuron_idx):
    """Attach a forward hook that records a single neuron of ``module``'s output.

    The output is indexed as ``[:, :, neuron_idx]`` and stored with a trailing
    axis of size 1, detached, in ``activations_list``. Returns the hook handle.
    """
    def _capture(_module, _inputs, output):
        selected = output.detach()[:, :, neuron_idx]
        activations_list.append(selected.unsqueeze(-1))

    handle = module.register_forward_hook(_capture)
    return handle

#master_data = []
# Row indices for a full batch; reused when pairing each ablated image with
# the token of the patch that was blanked in it.
arange = torch.arange(batch_size).to(device)
for layer_num, neuron_idx, human_label in  tqdm(interesting_neurons):

    # Rebind the module-level buffer read by process_images and record only
    # the neuron of interest.
    activations_list = []
    module = model.clip.vision_model.encoder.layers[layer_num].mlp.fc1
    hook_handle = register_hook_specific_neuron(module, activations_list, neuron_idx)
    # The hook keeps a single neuron, so it sits at index 0 of the neuron axis.
    dummy_neuron_idx = 0
    df_activations = activations_per_layer[layer_num]
    df_sorted = sorted_activations_per_layer[layer_num]

    unique_top_entries = df_sorted[df_sorted[NEURON_INDEX] == neuron_idx].drop_duplicates(subset=GT_CLASS).head(top_n)

    images, heatmaps, titles = [], [], []
    try:
        for image_index, class_name in zip(unique_top_entries[IMAGE_INDEX], unique_top_entries[GT_CLASS]):

            title = f"Image: {image_index}\nClass: {class_name}"
            image, label = imagenet_data[image_index][0:2]
            batch_image = image_to_batch(image)
            batch_label = label_to_batch(label)

            # Baseline: the neuron's activation at every token of the unmodified image.
            default_activations = process_images(model, batch_image, batch_label, [image_index], batch_idx=-1, detach=False, flatten=False)
            default_activations = default_activations[0, :, dummy_neuron_idx, -1]

            patch_size = image.shape[-1] // num_patches_width

            diffs = []
            gathering_images = []
            gathering_dummy_indices = []

            for i, j in np.ndindex((num_patches_width, num_patches_width)):
                # Blank out one patch of the input.
                zeroed_image = image.clone()
                zeroed_image[:, i*patch_size:(i+1)*patch_size, j*patch_size:(j+1)*patch_size] = 0

                gathering_images.append(zeroed_image)
                gathering_dummy_indices.append(i*num_patches_width + j)

                is_last_patch = (i == num_patches_width - 1 and j == num_patches_width - 1)
                if len(gathering_images) == batch_size or is_last_patch:

                    zeroed_torch_images = torch.stack(gathering_images, dim=0).to(device)
                    zeroed_labels = torch.tensor([label]*len(gathering_images)).to(device)

                    zeroed_activations = process_images(model, zeroed_torch_images, zeroed_labels, gathering_dummy_indices, batch_idx=-1, detach=False, flatten=False)
                    zeroed_activations = zeroed_activations[:, :, dummy_neuron_idx, -1]

                    # Per ablated image: how much every token's activation moved.
                    diff = torch.abs(zeroed_activations - default_activations)

                    # zero out the diff in which the input image is 0 since we expect it to be different
                    if diff.shape[0] == batch_size:
                        r = arange
                    else:
                        r = torch.arange(diff.shape[0]).to(device)
                    # The token axis still contains the leading token that
                    # get_activation_array drops with [1:], so the token of
                    # ablated patch k is k + 1. Without the +1 the neighbouring
                    # token was masked instead of the ablated patch's own.
                    diff[r, r + gathering_dummy_indices[0] + 1] = 0

                    # One significance score per ablated patch.
                    diff = diff.mean(dim=1)

                    diffs.append(diff)

                    gathering_images = []
                    gathering_dummy_indices = []

            diff = torch.cat(diffs, dim=0)

            diff = diff.reshape((num_patches_width, num_patches_width))
            diff = diff.detach().cpu()

            images.append(image)
            heatmaps.append(diff)
            titles.append(title)
    finally:
        # Always detach the hook, even if a forward pass fails, so it does not
        # keep firing in later cells.
        hook_handle.remove()

    main_title = f"INPUT SIGNIFICANCE\nLayer/Neuron: {layer_num}/{neuron_idx}\n{human_label}"
    plot_image_patch_heatmap(images, heatmaps, titles, main_title=main_title, save=os.path.join(output_folder, f"{layer_num:02d}_{neuron_idx:04d}_input_significance.jpg"))


### Asking GPT


In [None]:
#We assume OPENAI_API_KEY is an env key you can set it here
# os.environ['OPENAI_API_KEY'] = 'yourkey'


In [None]:
#pdf stuff

from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate,  Paragraph, Spacer
from reportlab.platypus import Image as PDFImage
from reportlab.lib.styles import getSampleStyleSheet
from PIL import Image 
import io 
def resize_image(image, max_width, max_height):
    """Scale ``image`` to fit within ``max_width`` x ``max_height``, keeping its aspect ratio.

    Args:
        image: object exposing ``size`` as ``(width, height)`` and a
            ``resize(size, resample)`` method (a PIL image).
        max_width: width of the bounding box in pixels.
        max_height: height of the bounding box in pixels.

    Returns:
        The result of ``image.resize`` at the new size.
    """
    src_width, src_height = image.size

    # The tighter of the two constraints decides the scale factor.
    scale = min(max_width / float(src_width), max_height / float(src_height))

    # Never let truncation produce a zero-sized side (very elongated images).
    width = max(1, int(src_width * scale))
    height = max(1, int(src_height * scale))

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the filter it aliased.
    return image.resize((width, height), Image.LANCZOS)
def create_pdf(content_list, file_name):
    """Write a sequence of text and image items to a PDF, in order.

    Args:
        content_list: items that are either ``str`` (rendered as a paragraph,
            newlines preserved) or ``np.ndarray`` (rendered as an RGB image,
            shrunk to fit the page if necessary).
        file_name: path of the PDF to create.

    Raises:
        ValueError: if an item is neither a string nor a numpy array.
    """
    # Local import: only this function needs it.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(file_name, pagesize=letter)
    elements = []
    style_sheet = getSampleStyleSheet()

    #TODO figure out how to do this better..
    max_image_width = 456#letter[0] - 50
    max_image_height = 636#letter[1] - 50
    for item in content_list:
        if isinstance(item, str):
            # Paragraph interprets its text as XML-like markup, so escape
            # &, < and > from free-form text (e.g. model output) first,
            # then turn newlines into explicit line breaks.
            item = escape(item).replace('\n', '<br/>')

            elements.append(Paragraph(item, style_sheet['BodyText']))
            elements.append(Spacer(1, 12))  # Add space after paragraph

        elif isinstance(item, np.ndarray):
            # Render the array as an RGB image and embed it as an in-memory PNG.
            image = Image.fromarray(item.astype('uint8'), 'RGB')
            if image.size[0] > max_image_width or image.size[1] > max_image_height:
                image = resize_image(image, max_image_width, max_image_height)
            image_buffer = io.BytesIO()
            image.save(image_buffer, format='PNG')
            image_buffer.seek(0)
            img = PDFImage(image_buffer)
            elements.append(img)

        else:
            raise ValueError("Content list must contain only strings and numpy arrays.")

    doc.build(elements)

In [None]:
import io
import requests
# converts a numpy array image to encoded bytes
def convert_to_base(vis:np.ndarray):
    """Encode an image array as JPEG and return it as a base64 string."""
    with io.BytesIO() as buffer:
        Image.fromarray(vis).save(buffer, format="JPEG")
        encoded = base64.b64encode(buffer.getvalue())
    return encoded.decode('utf-8')

# handler for messages to gpt-v
class ImageMessages:
    """Build a single-turn text+image message, send it to the OpenAI chat API, and keep a transcript.

    Two parallel lists are maintained:
      * ``content`` — the payload actually sent to the API;
      * ``human_readable_content`` — strings and raw image arrays used for
        the PDF / notebook transcript, including comments the model never sees.
    """

    # Seconds to wait for the API before giving up, so a stalled connection
    # cannot hang the notebook forever.
    request_timeout = 120

    def __init__(self, json_output=False ):
        # Stored for callers that set it; the request below does not yet
        # enable any JSON response mode.
        self.json_output = json_output
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"
          }

        self.content = []

        self.human_readable_content = []
        self.init_content()

    def init_content(self):
        """Discard the current message and transcript."""
        self.content = []
        self.human_readable_content = []


    def add_content_text(self,new_text):
        """Append a text part to the message and to the transcript."""
        self.content.append({
            "type": "text",
            "text": new_text})
        self.human_readable_content.append("USER: " + new_text)

    def add_content_image(self, vis, detail="high"):
        """Append an image (numpy array) to the message as a base64 JPEG data URL."""
        self.content.append({
                  "type": "image_url",
                  "image_url": {
                    "url": f"data:image/jpeg;base64,{convert_to_base(vis)}",
                    "detail": detail,
                  }})
        self.human_readable_content.append(vis)

    def add_comment(self, text):
        """Add a note to the transcript only; it is not sent to the model."""
        self.human_readable_content.append("###COMMENT (not seen by gpt): " + text + "###")

    def run_gpt(self):
        """Send the accumulated message and return the model's reply text.

        The reply is also appended to the transcript.

        Raises:
            RuntimeError: if the HTTP request fails or the response carries
                no ``choices`` (e.g. bad API key, rate limit, invalid payload).
        """
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": self.content
                }
            ],
            "max_tokens": 300
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=self.headers,
            json=payload,
            timeout=self.request_timeout,
        )
        try:
            body = response.json()
        except ValueError:
            body = None
        # Surface API errors with their message instead of an opaque KeyError.
        if not response.ok or not isinstance(body, dict) or not body.get('choices'):
            raise RuntimeError(
                f"OpenAI request failed (HTTP {response.status_code}): {response.text[:500]}"
            )

        text = body['choices'][0]['message']['content']
        self.human_readable_content.append("GPT: " + text)
        return text

    def convert_to_pdf(self, filename):
        """Write the transcript to ``filename`` as a PDF."""
        create_pdf(self.human_readable_content, filename)

    def jupyter_notebook_display(self):
        """Print transcript strings and show transcript images inline."""
        for entry in self.human_readable_content:
            if isinstance(entry, str):
                print(entry)
            else:
                plt.imshow(entry)
                plt.axis('off')
                plt.show()

#### Freeform
Here I just ask gpt what it sees

In [None]:
# in this test I'm just asking nicely

#TODO pass along the patch activations in some form to gpt 
#TODO pass the labels to gpt 
#TODO pass examples so it's not zero shot(!)
#TODO since these images are low resolution I think it's cheaper to send them individually, but in general that isn't true, so we might want to concatenate. Concatenating them may affect model answers though!
#here I'm concatenating mostly because gpt is very sensitive to prompt and this happens to work...
#TODO infinite number of possible tweaks..."



# Neurons to send to GPT, as (layer number, neuron index, human label).
# This re-assigns `interesting_neurons` with the same entries as the list in
# the visualization section, so any narrowing done for the ablation cell is undone here.
interesting_neurons = [
    # Layer 9
    (9, 145, 'Car. Details. Headlights and handles'),
    (9, 327, 'Cars'),
    (9, 398, 'Snowy white fluffy. Polysemantic.'),
    (9, 496, 'Beach'),
    (9, 179, 'Very polysemantic. faces, corners, coastal images, dogs...'),
    (9, 469, 'Snowy white fluffy'),
    (9, 659, 'Water, water animals'),
    
    # Layer 8
    (8, 79, 'Palm trees and fronds'),
    (8, 392, 'Curly text and designs'),
    (8, 490, 'Flames / glinty light reflections'),
    (8, 493, 'Text'),
    (8, 539, 'Faces (male?)'),
    (8, 699, 'Underwater backgrounds'),
    (8, 893, 'Faces'),
    (8, 927, 'Rims of hats, hats in general, hoods, and dog ears that may look like hats'),
    
    # Layer 7
    (7, 107, 'Side of the face / neck'),
    (7, 326, 'Text at corner of the image'),
    (7, 370, 'Top-of-the-head'),
    (7, 494, 'Pants/legs'),
    (7, 649, 'Logo'),
    (7, 670, 'Logo'),
]

# Image detail level forwarded to the API ("low" is the cheaper alternative).
detail = "auto"

start_text = f"""The following images from imagenet produced high activations from a neuron in TinyClip. I'm trying to determine what common feature(s) the neuron is reacting to."""

messages = ImageMessages()
# Set to an int to stop after that many neurons (handy for a cheap dry run).
hacky_max = None
count = 0
for layer_num, neuron_idx, human_label in tqdm(interesting_neurons):
    df_activations = activations_per_layer[layer_num]
    df_sorted = sorted_activations_per_layer[layer_num]

    neuron_rows = df_sorted[df_sorted[NEURON_INDEX] == neuron_idx]
    unique_top_entries = neuron_rows.drop_duplicates(subset=GT_CLASS).head(top_n)

    messages.init_content()
    messages.add_comment(f"Layer/Neuron: {layer_num}/{neuron_idx} human answer: {human_label}")
    messages.add_content_text(start_text)

    #TODO ignores requires if not given all together..
    # Stack the top images vertically into one tall uint8 image. The
    # zero-height strip fixes the width/dtype and keeps the result an array
    # even when no image was selected.
    strips = [np.empty((0, imagenet_data[0][0].shape[1], 3), dtype=np.uint8)]
    for image_index in unique_top_entries[IMAGE_INDEX]:
        chw = imagenet_data[image_index][0].cpu().numpy()
        # channel-first floats -> channel-last bytes
        strips.append(np.uint8(255 * np.transpose(chw, (1, 2, 0))))
    master_image = np.concatenate(strips, axis=0)
    messages.add_content_image(master_image, detail=detail)

    messages.run_gpt()

    pdf_path = os.path.join(gpt_output_folder, f"{layer_num:02d}_{neuron_idx:04d}_freeform_answer.pdf")
    messages.convert_to_pdf(pdf_path)
    messages.jupyter_notebook_display()

    count += 1
    if hacky_max is not None and count >= hacky_max:
        break



#TODO auto interpretability score!


#### Spearman rank correlation
The choice of samples here is very important. From the Anthropic paper:

Claude is asked to predict activations for sixty examples: six from the top activations; two from the other 12 intervals; ten completely random; and twenty top activating tokens out of context.

Here I'm just arbitrarily taking 7 samples at ranks [0, 3, 6, 9, 12, 15, 18] as examples and asking the model to rank the samples at [1, 4, 7, ...].
I have no solid justification for this. In fact, with this amount of data the task seems pointless.

Moreover, this is not a good way to rank things (it doesn't scale well). Presumably they ask the model to score images instead of directly asking it to rank them? I haven't actually checked.

But I just want to write up some basic code and get a sense of what it does...





In [None]:
from scipy.stats import pearsonr

# see markdown above about sample
# Positions (within a neuron's activation-sorted, class-deduplicated rows) shown to the model as ranked examples.
example_samples_indices = [0,3,6, 9, 12,15,18]

# The row right after each example is held out for the model to rank.
test_samples_indices = [n+1 for n in example_samples_indices]

messages = ImageMessages(json_output=True)

detail = "auto"

start_text =  f"""The following {len(example_samples_indices)} images from imagenet have been ranked based on their activation value from a neuron in TinyClip."""
question_text = f"""Here are {len(test_samples_indices)} more images. Based on the above ranking try to predict the rank each image. Each image has a name, A, B, C...
Format your answer as a json dict with two keys. Under key 'info', describe your thought process. Under key 'ranking' make a list of the names in order"""
final_text = f"Return your answer. Remember the json format"

# Set to an int to stop after that many neurons.
hacky_max = None
count = 0
r_values = []
p_values = []
for layer_num, neuron_idx, human_label in  tqdm(interesting_neurons):

    df_activations = activations_per_layer[layer_num]
    df_sorted = sorted_activations_per_layer[layer_num]

    #TODO I don't think it makes sense to drop classes for this but I just want to keep the examples the same as above
    unique_sorted = df_sorted[df_sorted[NEURON_INDEX] == neuron_idx].drop_duplicates(subset=GT_CLASS)
    example_samples = unique_sorted.iloc[example_samples_indices]
    # Present the held-out images in shuffled order so position gives nothing away.
    random_test_indices = random.sample(test_samples_indices, len(test_samples_indices))
    test_samples = unique_sorted.iloc[random_test_indices]
    messages.init_content()
    messages.add_comment(f"Layer/Neuron: {layer_num}/{neuron_idx} human answer: {human_label}")

    messages.add_content_text(start_text)

    # Ranked examples: the model sees rank, class name and activation value.
    for i, (image_index, class_name, act_val) in enumerate(zip(example_samples[IMAGE_INDEX], example_samples[GT_CLASS], example_samples[ACTIVATION_VALUE])):
        rank = i + 1
        class_name =imagenet_class_names[class_name]
        act_val = round(act_val, 2)
        image = imagenet_data[image_index][0].cpu().numpy()
        image = np.uint8(255*np.transpose(image, (1, 2, 0)))

        messages.add_content_text(f"Rank: {rank}. Class name: {class_name}. Neuron activation value: {act_val}.")
        messages.add_content_image(image, detail=detail)

    messages.add_content_text(question_text)
    gt_ranks = []
    names = []
    # Held-out images: the model only sees a letter name and the class name;
    # the true rank goes into a transcript comment it never receives.
    for i, (image_index, class_name, act_val) in enumerate(zip(test_samples[IMAGE_INDEX], test_samples[GT_CLASS], test_samples[ACTIVATION_VALUE])):
        name = "ABCDEFGHIJKLMNOP"[i] # there are some obvious limitations to this line lol
        names.append(name)
        class_name =imagenet_class_names[class_name]
        act_val = round(act_val, 2)
        image = imagenet_data[image_index][0].cpu().numpy()
        image = np.uint8(255*np.transpose(image, (1, 2, 0)))

        rank = test_samples_indices.index(random_test_indices[i])+1
        gt_ranks.append(rank)
        messages.add_content_text(f"Name: {name}. Class name: {class_name}")
        messages.add_comment(f"Rank: {rank}. Class name: {class_name}. Neuron activation value: {act_val}.")

        messages.add_content_image(image, detail=detail)
    messages.add_content_text(final_text)

    output = messages.run_gpt()

    #TODO figure out how to run in json mode! (does gpt-v support?)
    # Only parsing/format problems are treated as "model messed up"; anything
    # else (including KeyboardInterrupt) now propagates instead of being
    # swallowed by a bare except.
    try:
        try:
            output = json.loads(output)
        except json.JSONDecodeError:
            # remove chatgpts format ```json\n...\b```
            output = output.replace("```json","").replace("```","").strip()
            output = json.loads(output)

        pred_rank_names = output["ranking"]
        pred_ranks = [pred_rank_names.index(name) + 1 for name in names]
    except (ValueError, KeyError, TypeError, AttributeError):
        pred_ranks = None

        messages.add_comment(f"Model messed up output format!")


    if pred_ranks is not None:
        # Both inputs are ranks, so Pearson's r on them is the rank correlation.
        score = pearsonr(gt_ranks, pred_ranks)
        r_val, p_val = score[0], score[1]

        messages.add_comment(f"\nModel output: {pred_ranks}\nGround truth: {gt_ranks}\nFINAL AUTOINTERPRETABILITY SCORE: {r_val}\np val: {p_val}")
        # hack comment to top
        messages.human_readable_content = [messages.human_readable_content[-1]] + messages.human_readable_content[:-1]
        r_values.append(r_val)
        p_values.append(p_val)
    messages.convert_to_pdf(os.path.join(gpt_output_folder, f"{layer_num:02d}_{neuron_idx:04d}_auto_interp.pdf"))
   # messages.jupyter_notebook_display()

    count = count + 1
    if hacky_max is not None and count >= hacky_max:
        break
print(r_values)
print(p_values)
    


