# Investigating Capitalization-Sensitive Dimensions in BERT

This notebook explores how token embeddings in `bert-base-cased` differ based on capitalization.  
Inspired by the repository [GPT-2 Embedding Edit Explorer](https://github.com/tiagomloeblein/gpt2-embedding-edit-explorer), this notebook extends that idea by identifying which embedding dimensions are most sensitive to casing (uppercase vs. lowercase).

## Goals:
- Compare static embeddings of capitalized vs non-capitalized tokens
- Identify which dimensions show the highest deviation due to casing
- Explore whether these dimensions are semantically meaningful or structurally consistent across other tokens


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objs as go


In [None]:
# Single source of truth for the checkpoint name, so the tokenizer and the
# model cannot accidentally be loaded from two different checkpoints.
MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

# Put the model in evaluation mode. `eval()` returns the module itself, so the
# trailing semicolon stops the notebook from echoing the full module repr.
model.eval();


In [None]:
tokens_to_compare = ['cat', 'Cat', 'God']


def first_piece_id(word):
    """Return the vocabulary id of the first piece `tokenizer` produces for `word`.

    Returns None when the tokenizer yields no pieces at all. Prints a notice
    when the word is split into several pieces or resolves to the tokenizer's
    unknown token, because in both cases the vector looked up afterwards does
    not represent the whole word.
    """
    pieces = tokenizer.tokenize(word)  # tokenize once and reuse the result
    if not pieces:
        print(f"Skipping {word!r}: the tokenizer produced no pieces.")
        return None
    if len(pieces) > 1:
        print(f"Note: {word!r} is split into {pieces}; only {pieces[0]!r} is used.")
    if pieces[0] == tokenizer.unk_token:
        print(f"Note: {word!r} maps to the unknown token {pieces[0]!r}.")
    return tokenizer.convert_tokens_to_ids(pieces[0])


# One id per entry of `tokens_to_compare` (None where no piece was produced).
token_ids = [first_piece_id(token) for token in tokens_to_compare]

# Input word-embedding matrix of the model. Reading the attribute performs no
# computation, so no `torch.no_grad()` context is needed here; the rows are
# detached from autograd below before conversion to NumPy.
embedding_weights = model.embeddings.word_embeddings.weight

# Map each word that received an id to its embedding row as a NumPy array.
token_vectors = {
    token: embedding_weights[token_id].detach().numpy()
    for token, token_id in zip(tokens_to_compare, token_ids)
    if token_id is not None
}


In [None]:
# Element-wise magnitude of the gap between the 'cat' and 'Cat' vectors.
diff_cat_Cat = np.abs(token_vectors['cat'] - token_vectors['Cat'])

# Order dimension indices from largest to smallest gap, then keep the first 15.
dims_by_gap_desc = np.argsort(diff_cat_Cat)[::-1]
top_dims = dims_by_gap_desc[:15]
top_diffs = diff_cat_Cat[top_dims]

# Tabulate the selected dimensions next to their gap for display.
capitalization_sensitive_dims = pd.DataFrame(
    {
        'Dimension': top_dims,
        'Abs. Difference |cat - Cat|': top_diffs,
    }
)
capitalization_sensitive_dims


In [None]:
# Draw the negated absolute cat/Cat difference for every dimension and mark
# each dimension listed in `top_dims` with a dashed vertical line and a label.
fig, ax = plt.subplots(figsize=(16, 5))

neg_abs_diff = -diff_cat_Cat
ax.plot(neg_abs_diff, label="Symmetry: -|cat - Cat|", color='gray')

for idx in top_dims:
    ax.axvline(x=idx, color='red', linestyle='--', alpha=0.5)
    # Label sits at the curve's value for this dimension, rotated to run vertically.
    ax.text(
        idx,
        neg_abs_diff[idx],
        f"{idx}",
        rotation=90,
        verticalalignment='bottom',
        fontsize=8,
    )

ax.set_title("Capitalization Sensitivity by Dimension – cat vs Cat")
ax.set_xlabel("Embedding Dimension")
ax.set_ylabel("Symmetry Measure")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Per-dimension absolute gap between 'God' and each casing of 'cat'.
diff_cat_God = np.abs(token_vectors['cat'] - token_vectors['God'])
diff_Cat_God = np.abs(token_vectors['Cat'] - token_vectors['God'])

# Overlay both gap curves on a single set of axes.
fig, ax = plt.subplots(figsize=(16, 5))
for gap_series, legend_label, line_color in (
    (diff_cat_God, '|cat - God|', 'blue'),
    (diff_Cat_God, '|Cat - God|', 'green'),
):
    ax.plot(gap_series, label=legend_label, color=line_color, alpha=0.6)

ax.set_title("Differences Between God and ['cat', 'Cat']")
ax.set_xlabel("Embedding Dimension")
ax.set_ylabel("Absolute Difference")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
