In [1]:
from einops import rearrange
import torch
import torchvision.transforms as transforms
from torch import nn
import torch.nn.functional as F
import numpy as np
from torchvision.datasets import ImageFolder
from ncut_pytorch import NCUT, rgb_from_tsne_3d
from matplotlib import pyplot as plt
import os
import glob
import matplotlib.pyplot as plt
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoModel, AutoProcessor, CLIPTokenizer, CLIPTextModelWithProjection
from qwen_vl_utils import process_vision_info
import requests
from PIL import Image, ImageOps
import accelerate
import gc
from diffusers import StableDiffusion3Pipeline, AutoencoderKL, SD3Transformer2DModel
import functools

  from .autonotebook import tqdm as notebook_tqdm
2025-04-14 14:42:19.970072: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-14 14:42:19.970117: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-14 14:42:19.988090: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-14 14:42:20.041630: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from huggingface_hub import login
login("") # login token removed since this is going on a public repo

In [3]:
def resize(image, size=(448, 448), pad=(255, 255, 255)):
    image.thumbnail((size[0], size[1]), Image.Resampling.LANCZOS)

    resized = Image.new("RGB", size, pad)

    x_offset = (size[0] - image.size[0]) // 2
    y_offset = (size[1] - image.size[1]) // 2

    resized.paste(image, (x_offset, y_offset))

    return resized

In [4]:
image_files = sorted(glob.glob("data/*_base.png") + glob.glob("data/*_test.png"))

images = []
for image_file in image_files:
    image = Image.open(image_file).convert("RGB")
    image = resize(image, size=(512, 512))
    images.append(image)

In [5]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

model_id = "stabilityai/stable-diffusion-3.5-medium"
model_vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.bfloat16).to("cuda")

In [6]:
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])

image_tensors = []
for image in images:
    image_tensor = transform(image).to(torch.bfloat16).to(model_vae.device)
    image_tensors.append(image_tensor)

image_tensors = torch.stack(image_tensors, dim=0)

In [7]:
with torch.no_grad():
    latent_distributions = model_vae.encode(image_tensors)
    latents = latent_distributions.latent_dist.sample()

In [25]:
def nn_similarity(feat1, feat2):    
    sims1 = torch.zeros((feat1.shape[0]), device=feat1.device)
    for i in range(feat1.shape[0]):
        sims1[i] = torch.clip(F.cosine_similarity(feat1[i:i+1], feat2, dim=1).max(), -1, 1)

    sims2 = torch.zeros((feat2.shape[0]), device=feat2.device)
    for i in range(feat2.shape[0]):
        sims2[i] = torch.clip(F.cosine_similarity(feat2[i:i+1], feat1, dim=1).max(), -1, 1)

    return ((sims1.mean() + sims2.mean()) / 2).item()

In [26]:
sims = np.zeros((latents.shape[0], latents.shape[0]))

for i in range(latents.shape[0]):
    lat1 = torch.permute(latents[i], (1, 2, 0))
    lat1 = lat1.reshape(-1, lat1.shape[-1]).to("cuda:0")
    
    for j in range(latents.shape[0]):
        lat2 = torch.permute(latents[j], (1, 2, 0))
        lat2 = lat2.reshape(-1, lat2.shape[-1]).to("cuda:0")
        
        sims[i,j] = nn_similarity(lat1, lat2)

np.save("latent_similarities.npy", sims)

In [27]:
sims = np.load("latent_similarities.npy", allow_pickle=True)

In [28]:
illusionillusion_sim = []
otherillusion_sim = []

for i in range(0, 20, 2):
    illusionillusion_sim.append(sims[i][i+1])

    idx = list(range(0, 20, 2)).remove(i)
    otherillusion_sim.append(sims[i][idx].mean())

print("latents illusion / illusion-illusion:", np.mean(illusionillusion_sim), illusionillusion_sim)
print("latents illusion / other illusion:", np.mean(otherillusion_sim), otherillusion_sim)

latents illusion / illusion-illusion: 0.9943636894226074 [0.9960331916809082, 0.9732928276062012, 0.9966030120849609, 0.9925751686096191, 0.9979376792907715, 0.9969453811645508, 0.9980902671813965, 0.9995484352111816, 0.9933309555053711, 0.9992799758911133]
latents illusion / other illusion: 0.9530617746710777 [0.9669348955154419, 0.8939115881919861, 0.9482133358716964, 0.9688911437988281, 0.9725773841142654, 0.9645303130149842, 0.9393180638551712, 0.9479171335697174, 0.9534059286117553, 0.9749179601669311]
