# Check embedding cosine similarity between tokens


One thing I notice is that the cosine similarities between the hidden representations of different tokens are usually positive. Only very rarely can I find a pair with negative cosine similarity. This is interesting, since I would intuitively expect a mix of positive and negative values. Maybe the cosine similarities are already mostly positive right from the embedding layer.

Let's check.

In [1]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

from dawnet.inspector import LLMInspector
from dawnet.tokens import Tokens
from dawnet import op
from dawnet.prompts import get_words, Prompt

# Inference-only notebook: turn autograd off globally.
torch.set_grad_enabled(False)

# Prefer Apple's MPS backend (the original hard-coded choice), but fall back to
# CUDA and then CPU so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
  device = torch.device("mps")
elif torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [2]:
# Load the model to inspect through dawnet's LLMInspector wrapper.
insp = LLMInspector.from_hf("google/gemma-3-4b-it")
# Dump the module tree so the path to the token-embedding layer can be read off.
print(insp.model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Gemma3ForConditionalGeneration(
  (model): Gemma3Model(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
          (position_embedding): Embedding(4096, 1152)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-26): 27 x SiglipEncoderLayer(
              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (self_attn): SiglipAttention(
                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
              )
              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwi

In [3]:
def get_embedding_layer(insp):
  """Return the token-embedding module of the model wrapped by `insp`.

  The attribute path is chosen from substrings of `insp.model_id`
  (case-insensitive):

  - "qwen", "olmo" or "oss": `insp.model.model.embed_tokens`
  - "gemma": `insp.model.language_model.embed_tokens`

  Args:
    insp: object exposing `model_id` (a string) and `model`.

  Returns:
    The `embed_tokens` attribute found at the path for the model family.

  Raises:
    ValueError: if `model_id` matches none of the known families. Failing
      here is clearer than returning None and crashing later on `.weight`.
  """
  model_id = insp.model_id.lower()
  # These families all expose the embedding at the same path.
  if any(family in model_id for family in ("qwen", "olmo", "oss")):
    return insp.model.model.embed_tokens
  if "gemma" in model_id:
    return insp.model.language_model.embed_tokens
  raise ValueError(
    f"Don't know where the embedding layer lives for model {insp.model_id!r}"
  )

# Embedding module of the loaded model; later cells compare the rows of its `.weight`.
embed_layer = get_embedding_layer(insp)

In [4]:
def random_cosine_similarity(w, target_idx=None, verbose=False):
  """Cosine similarity between one row of `w` and every other row.

  Args:
    w: 2-D floating-point tensor whose rows are the vectors to compare (for
      example an embedding weight matrix). It needs at least two rows;
      otherwise there is nothing to compare against and the reductions fail.
    target_idx: row to compare against the rest. Drawn uniformly at random
      when None. Negative values count from the end.
    verbose: when True, print the target index and its decoded token (via the
      notebook-level `insp.tokenizer`), the shapes involved and the summary.

  Returns:
    Tuple `(c, min, max, mean, median, non_neg, neg, per)`:
    - `c`: tensor of shape (rows - 1, 1) with the similarities, in row order
      with the target row removed. Position i therefore corresponds to row i
      when i < target_idx and to row i + 1 otherwise.
    - `min`, `max`, `mean`, `median`: statistics of `c`, rounded to 2 decimals.
    - `non_neg`, `neg`: number of similarities that are >= 0 and < 0.
    - `per`: fraction of non-negative similarities, rounded to 4 decimals.
  """
  num_rows = w.shape[0]
  if target_idx is None:
    target_idx = random.randrange(num_rows)
  elif target_idx < 0:
    # Normalise so the two slices below never overlap.
    target_idx = target_idx + num_rows
  if verbose:
    print(target_idx, insp.tokenizer.decode(target_idx))
  v = w[target_idx]
  if verbose:
    print(v.shape)
    # Shape of the comparison set: every row except the target.
    print(torch.Size((num_rows - 1, *w.shape[1:])))

  # Score all rows in one pass and drop the target afterwards; this avoids
  # materialising a second copy of the whole matrix.
  norms = w.norm(p=2, dim=1) * v.norm(p=2)
  # An all-zero row has a zero dot product too, so clamping the denominator
  # yields similarity 0 instead of a NaN that would poison every statistic.
  sims = (w @ v) / norms.clamp_min(torch.finfo(norms.dtype).tiny)
  c = torch.cat([sims[:target_idx], sims[target_idx + 1:]]).unsqueeze(1)

  non_neg = (c >= 0).sum().cpu().item()
  neg = c.shape[0] - non_neg
  _min = round(c.min().cpu().item(), 2)
  _max = round(c.max().cpu().item(), 2)
  _mean = round(c.mean().cpu().item(), 2)
  _median = round(c.median().cpu().item(), 2)
  _per = round(non_neg / (non_neg + neg), 4)
  if verbose:
    print(
      "Min:", _min,
      "Max:", _max,
      "Mean:", _mean,
      "Median:", _median,
      "Non-negative:", non_neg,
      "Negative:", neg,
      "Percentage", _per
    )
  return c, _min, _max, _mean, _median, non_neg, neg, _per

In [5]:
# Average the per-target summary statistics over `times` randomly drawn rows.
times = 50
labels = ("Min:", "Max:", "Mean:", "Median:", "Non-negative:", "Negative:", "Percentage")
digits = (2, 2, 2, 2, 2, 2, 4)

totals = [0] * len(labels)
for _ in range(times):
  # The first returned element is the raw similarity tensor; only the scalar
  # statistics after it are accumulated.
  stats = random_cosine_similarity(embed_layer.weight)[1:]
  totals = [running + value for running, value in zip(totals, stats)]
t_min, t_max, t_mean, t_median, t_non_neg, t_neg, t_per = totals

# Interleave each label with its rounded average, then print on one line.
report = []
for label, total, ndigits in zip(labels, totals, digits):
  report.extend((label, round(total / times, ndigits)))
print(*report)

Min: -0.09 Max: 0.56 Mean: 0.03 Median: 0.03 Non-negative: 239564.38 Negative: 22642.62 Percentage 0.9136


In [6]:
# Baseline: a freshly initialised embedding table with the same shape as the
# model's embedding matrix.
vocab_size, hidden_dim = embed_layer.weight.shape
ran_embed = nn.Embedding(vocab_size, hidden_dim)

# Only the last value (the share of non-negative similarities) is reported.
baseline_stats = random_cosine_similarity(ran_embed.weight)
_, _min, _max, _mean, _median, _non_neg, _neg, _per = baseline_stats
print("% of positive cosine simlarity for random embedding layer", _per)

% of positive cosine simlarity for random embedding layer 0.501


### Zoom in on tokens that have negative similarity

In [7]:
# Draw the target here rather than inside the helper, so that positions in
# `cos` can be mapped back to token ids afterwards.
target_idx = random.randrange(embed_layer.weight.shape[0])
cos, *_ = random_cosine_similarity(embed_layer.weight, target_idx=target_idx, verbose=True)
_temp = (cos < 0).nonzero()
# `cos` has the target row removed, so every position at or after the target
# is one less than the real token id; shift those back by one.
_temp[:, 0] += (_temp[:, 0] >= target_idx).long()
print(_temp[:100,0])

73048 µ
torch.Size([2560])
torch.Size([151935, 2560])
Min: -0.11 Max: 0.82 Mean: 0.05 Median: 0.05 Non-negative: 148898 Negative: 3037 Percentage 0.98
tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  45,  46,  47,  49,  50,  51,  53,  54,  55,  56,  58,  59,
         60,  61,  62,  64,  65,  66,  67,  68,  69,  71,  72,  73,  74,  75,
         77,  78,  79,  81,  82,  83,  86,  87,  88,  90,  91,  92, 120, 149,
        151, 170, 197, 198, 201, 220, 222, 230, 231, 233, 234, 245, 253, 254,
        256, 257], device='mps:0')


In [9]:
# Decode the first 93 entries of `_temp` as token ids to see which tokens they are.
insp.tokenizer.decode(_temp[:93,0])

'!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLNOPRSTVWXY[\\]^_abcdefhijklnoprstwxy{|}����\t\n\r ���'

In [15]:
# Inspect one fixed token id. The result is discarded, so verbose=True is what
# makes this cell show anything.
_ = random_cosine_similarity(embed_layer.weight, target_idx=192, verbose=True)

192 
Min: -0.29 Max: 0.6 Mean: 0.04 Non-negative: 136608 Negative: 64479
