In [79]:
from transformers import GPT2Model, GPT2Tokenizer
import pandas as pd
import numpy as np
import torch

In [80]:
def cosine_similarity(tensor_list1, tensor_list2):
    if len(tensor_list1) != len(tensor_list2):
        raise ValueError("Both tensor lists must have the same length.")
    
    similarities = []
    for u, v in zip(tensor_list1, tensor_list2):
        # Ensure the tensors are floats for dot product calculation
        u, v = u.float(), v.float()
        dot_product = torch.dot(u, v)
        norm_u = torch.norm(u)
        norm_v = torch.norm(v)
        if norm_u.item() == 0 or norm_v.item() == 0:
            similarity = 0.0  # Avoid division by zero
        else:
            similarity = dot_product / (norm_u * norm_v)
        similarities.append(similarity)
    
    return similarities

In [81]:
# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)

In [82]:
df = pd.read_csv("5000_synonym.csv", index_col = "rank")
df.head()

Unnamed: 0_level_0,lemma,PoS,freq,synonym,synset
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26,say,v,4096416,tell,Synset('state.v.01')
31,go,v,3546732,belong,Synset('belong.v.03')
34,get,v,3347615,convey,Synset('bring.v.04')
39,know,v,2761628,,
49,would,v,2349400,,


In [89]:
# Tokenize input text
text = "go"  # replace 'example' with your specific word
inputs = tokenizer(text, return_tensors="pt")

# Get all hidden states
outputs = model(**inputs)
hidden_states = outputs.hidden_states

# Access the embeddings for the first token of the input
# For a single word input, this typically corresponds to the embeddings of the word
say_embeddings = [layer[0][0] for layer in hidden_states]

In [90]:
# Tokenize input text
text = "blend"  # replace 'example' with your specific word
inputs = tokenizer(text, return_tensors="pt")

# Get all hidden states
outputs = model(**inputs)
hidden_states = outputs.hidden_states

# Access the embeddings for the first token of the input
# For a single word input, this typically corresponds to the embeddings of the word
bedroom_embeddings = [layer[0][0] for layer in hidden_states]

In [91]:
cosine_similarity(say_embeddings, bedroom_embeddings)

[tensor(0.9063, grad_fn=<DivBackward0>),
 tensor(0.9731, grad_fn=<DivBackward0>),
 tensor(0.9991, grad_fn=<DivBackward0>),
 tensor(0.9999, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(1.0000, grad_fn=<DivBackward0>),
 tensor(0.9946, grad_fn=<DivBackward0>)]