<a href="https://colab.research.google.com/github/vruddhis/semanticshift/blob/main/comparing_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Downgrade pandas to what Colab and dask-cudf expect
!pip install pandas==2.2.2 --quiet

# Downgrade transformers to match sentence-transformers
!pip install transformers==4.41.0 --quiet

# Downgrade torch to match torchvision/torchaudio/fastai
!pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.8.4 requires torch<2.9,>=1.10, but you have torch 2.9.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.9/887.9 MB[0m [31m703.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.4/322.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.6/155.6 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m


In [2]:
import json
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel


In [32]:
from google.colab import files
uploaded = files.upload()
file_path = list(uploaded.keys())[0]
print("Uploaded:", file_path)




Saving train.data.jl to train.data.jl
Uploaded: train.data.jl


In [33]:

data = []
with open(file_path, "r") as f:
    for line in f:
        data.append(json.loads(line))

print(f"Loaded {len(data)} instances")


Loaded 1428 instances


In [34]:
MODEL_NAME = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [35]:
def get_target_embedding(text, word, token_idx=None, char_start=None, char_end=None):
    inputs = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    offsets = inputs.pop("offset_mapping")[0]
    tokens = inputs["input_ids"]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state[0].cpu()

    if char_start is not None and char_end is not None:
        target_indices = [i for i, (s, e) in enumerate(offsets) if s <= char_start < e or s < char_end <= e]
    elif token_idx is not None:
        target_indices = [token_idx]
    else:
        return None

    if len(target_indices) == 0:
        return None

    emb = hidden_states[target_indices].mean(dim=0)
    return emb.numpy()


In [59]:
rows = []
for item in tqdm(data):
    try:
        w = item["word"]
        t1, t2 = item["tweet1"], item["tweet2"]

        emb1 = get_target_embedding(
            t1["text"], w,
            token_idx=t1.get("token_idx"),
            char_start=t1.get("text_start"),
            char_end=t1.get("text_end"),
        )
        emb2 = get_target_embedding(
            t2["text"], w,
            token_idx=t2.get("token_idx"),
            char_start=t2.get("text_start"),
            char_end=t2.get("text_end"),
        )

        if emb1 is not None and emb2 is not None:
            rows.append({
                "word": w,
                "text1": t1["text"],
                "text2": t2["text"],
                "date1": t1["date"],
                "date2": t2["date"],
                "emb1": emb1,
                "emb2": emb2
            })
    except Exception as e:
        print(f"Error on {item['id']}: {e}")
        continue

df = pd.DataFrame(rows)
print(df.head())


100%|██████████| 1428/1428 [11:25<00:00,  2.08it/s]

    word                                              text1  \
0  frisk  my new most wanted character in smash is frisk...   
1  frisk  imagine seeing qoute from cave story making it...   
2  frisk  frisk and sans are my two favorite undertale c...   
3  frisk  We don't like the search and frisk so this bit...   
4  frisk  Hey guys I'm wondering if anybody would draw m...   

                                               text2    date1    date2  \
0  I was surprised by how much applause Bloomberg...  2019-02  2020-02   
1  Bloomberg? Are you people for real?16 cases of...  2019-02  2020-02   
2  Today, in my wrongful convictions class we lis...  2019-02  2020-02   
3  who the fuck is listening to mike bloomberg ra...  2019-02  2020-02   
4  How about stop in frisk happened in mostly min...  2019-02  2020-02   

                                                emb1  \
0  [-0.41409454, -1.0239263, 0.96580446, 0.138374...   
1  [0.07960042, -0.688802, 0.5348002, 0.011716709...   
2  [-0.1




In [37]:
!pip install POT




In [60]:
def cosine_sim(emb1, emb2):
    if emb1 is None or emb2 is None:
        return None
    return cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]

df["cosine_similarity"] = df.apply(lambda row: cosine_sim(row["emb1"], row["emb2"]), axis=1)


In [61]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jensenshannon
from collections import Counter


def cosine_sim(emb1, emb2):
    if emb1 is None or emb2 is None:
        return None
    return cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]

def nn_overlap(embeddings_t1, embeddings_t2):
    c1 = np.mean(embeddings_t1, axis=0)
    c2 = np.mean(embeddings_t2, axis=0)
    return cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0]

def centroid_shift(embeddings_t1, embeddings_t2):
    c1 = np.mean(embeddings_t1, axis=0)
    c2 = np.mean(embeddings_t2, axis=0)
    return 1 - cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0]

def dispersion_change(embeddings_t1, embeddings_t2):
    def disp(embs):
        c = np.mean(embs, axis=0)
        return np.mean([np.linalg.norm(e - c) for e in embs])
    return disp(embeddings_t2) - disp(embeddings_t1)

def temporal_ref_similarity(embeddings_t1, embeddings_t2):
    sims = [cosine_similarity(e1.reshape(1, -1), e2.reshape(1, -1))[0][0]
            for e1 in embeddings_t1 for e2 in embeddings_t2]
    return np.mean(sims)

def contextual_entropy(texts):
    counts = Counter(texts)
    probs = np.array(list(counts.values())) / sum(counts.values())
    return -np.sum(probs * np.log2(probs + 1e-12))

def uot_shift(embeddings_t1, embeddings_t2):
    X = np.vstack(embeddings_t1)
    Y = np.vstack(embeddings_t2)
    a = np.ones(len(X)) / len(X)
    b = np.ones(len(Y)) / len(Y)
    M = ot.dist(X, Y, metric='cosine')
    G = ot.unbalanced.sinkhorn_unbalanced(a, b, M, reg=0.01, reg_m=0.1)
    return np.sum(G * M)


grouped = df.groupby("word").agg({
    "emb1": list,
    "emb2": list,
    "text1": list,
    "text2": list,
    "cosine_similarity": list
})


grouped["cosine_mean"] = grouped["cosine_similarity"].apply(lambda sims: np.mean(sims))
grouped["nn_overlap"] = grouped.apply(lambda row: nn_overlap(row["emb1"], row["emb2"]), axis=1)
grouped["centroid_shift"] = grouped.apply(lambda row: centroid_shift(row["emb1"], row["emb2"]), axis=1)
grouped["dispersion_change"] = grouped.apply(lambda row: dispersion_change(row["emb1"], row["emb2"]), axis=1)
grouped["temporal_ref_sim"] = grouped.apply(lambda row: temporal_ref_similarity(row["emb1"], row["emb2"]), axis=1)
grouped["entropy_t1"] = grouped["text1"].apply(contextual_entropy)
grouped["entropy_t2"] = grouped["text2"].apply(contextual_entropy)
grouped["entropy_change"] = grouped["entropy_t2"] - grouped["entropy_t1"]
grouped["uot_shift"] = grouped.apply(lambda row: uot_shift(row["emb1"], row["emb2"]), axis=1)



In [62]:
from sklearn.metrics.pairwise import cosine_similarity

def top_k_neighbors(embeddings, all_embeddings, k=10):
    sims = cosine_similarity(np.vstack(embeddings), np.vstack(all_embeddings))
    avg_sims = sims.mean(axis=0)
    return np.argsort(-avg_sims)[:k]

def rbo_score(list1, list2, p=0.9):
    overlap = 0.0
    for i, (a, b) in enumerate(zip(list1, list2), 1):
        if a in list2[:i]:
            overlap += 1 / i
    return (1 - p) * overlap



In [63]:
from sklearn.cluster import KMeans

def cluster_shift(embeddings_t1, embeddings_t2, n_clusters=2):
    X1 = np.vstack(embeddings_t1)
    X2 = np.vstack(embeddings_t2)
    kmeans1 = KMeans(n_clusters=min(n_clusters, len(X1))).fit(X1)
    kmeans2 = KMeans(n_clusters=min(n_clusters, len(X2))).fit(X2)
    centroids1 = kmeans1.cluster_centers_
    centroids2 = kmeans2.cluster_centers_
    shift = np.mean([min(np.linalg.norm(c1 - c2) for c2 in centroids2) for c1 in centroids1])
    return shift


In [64]:
from scipy.stats import wasserstein_distance

def wasserstein_shift(embeddings_t1, embeddings_t2):
    from sklearn.decomposition import PCA
    pca = PCA(n_components=1)
    all_embs = np.vstack(embeddings_t1 + embeddings_t2)
    pca.fit(all_embs)
    proj_t1 = pca.transform(np.vstack(embeddings_t1)).flatten()
    proj_t2 = pca.transform(np.vstack(embeddings_t2)).flatten()
    return wasserstein_distance(proj_t1, proj_t2)

from collections import Counter

def context_entropy(contexts):
    counts = Counter()
    for ctx in contexts:
        tokens = ctx.split()
        counts.update(tokens)
    probs = np.array(list(counts.values())) / sum(counts.values())
    return -np.sum(probs * np.log2(probs + 1e-12))

def entropy_shift(texts_t1, texts_t2):
    return context_entropy(texts_t2) - context_entropy(texts_t1)


In [65]:

grouped["cluster_shift"] = grouped.apply(lambda row: cluster_shift(row["emb1"], row["emb2"]), axis=1)
grouped["wasserstein_shift"] = grouped.apply(lambda row: wasserstein_shift(row["emb1"], row["emb2"]), axis=1)
grouped["context_entropy_shift"] = grouped.apply(lambda row: entropy_shift(row["text1"], row["text2"]), axis=1)


In [66]:
import seaborn as sns
import matplotlib.pyplot as plt

metrics = ["cosine_mean", "uot_shift", "wasserstein_shift", "context_entropy_shift"]

corr = grouped[metrics].corr()
print(corr)


                       cosine_mean  uot_shift  wasserstein_shift  \
cosine_mean               1.000000   0.102114          -0.275696   
uot_shift                 0.102114   1.000000          -0.147233   
wasserstein_shift        -0.275696  -0.147233           1.000000   
context_entropy_shift     0.104229  -0.211980          -0.274519   

                       context_entropy_shift  
cosine_mean                         0.104229  
uot_shift                          -0.211980  
wasserstein_shift                  -0.274519  
context_entropy_shift               1.000000  


In [71]:
top_n = 3
for m in metrics:
    print(f"\nTop {top_n} words by {m}:")
    if m in ["cosine_mean", "nn_overlap", "temporal_ref_sim"]:  # similarity metrics
        top_words = grouped.sort_values(m, ascending=True).head(top_n)
    else:
        top_words = grouped.sort_values(m, ascending=False).head(top_n)
    print(top_words.index.tolist())



Top 3 words by cosine_mean:
['folklore', 'epicenter', 'teargas']

Top 3 words by uot_shift:
['villager', 'turnip', 'pogrom']

Top 3 words by wasserstein_shift:
['entanglement', 'turnip', 'frisk']

Top 3 words by context_entropy_shift:
['ventilator', 'mask', 'frisk']


different metrics capture different types of shifts


* Cosine similarity is largely independent from UOT, Wasserstein, and context entropy. This means average embedding shift often does not align with distributional or context diversity shifts.
* UOT is capturing distributional/contextual shifts that other metrics mostly
miss.
* Context entropy aptures change in diversity of word contexts, which is mostly independent from embedding-based metrics.
