**Finding Top Similar Words**

Author: Lucas Ma

In [14]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity
import numpy as np
import heapq

# Load the tokenizer and model.
# Both are read from the same location so the vocabulary matches the weights.
# NOTE: the code below only reads `last_hidden_state` from the model output, so
# a "pooler weights newly initialized" warning printed by from_pretrained does
# not affect the similarities computed in this cell.
model_name = './fine-tuned-MacBERTh'  # Path to your fine-tuned model
# Alternative kept for reference (presumably the hub checkpoint the local
# model was fine-tuned from -- confirm before switching):
# model_name = "emanjavacas/MacBERTh"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get word embedding in context
def get_contextual_embeddings(text, tokenizer, model):
    """Run ``text`` through ``model`` and return its per-token hidden states.

    Args:
        text: The input passed to ``tokenizer``.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors; it is called with ``return_tensors="pt"``,
            ``truncation=True`` and ``padding=True``.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` attribute.

    Returns:
        A ``(last_hidden_state, inputs)`` tuple, where ``inputs`` is the
        tokenizer output that was fed to the model.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: disabling autograd avoids building a graph for the
    # forward pass, so the returned tensor does not keep activations alive.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state, inputs

def find_top_similar_words(target_word, text, tokenizer, model, top_n=10):
    """Rank the distinct tokens of ``text`` by similarity to ``target_word``.

    Every token produced by the tokenizer for ``text`` is compared, via cosine
    similarity of its contextual embedding, with the embedding of
    ``target_word``. A token that occurs several times is reported once, with
    the highest similarity reached by any of its occurrences.

    Args:
        target_word: Word whose embedding is used as the query.
        text: Text whose tokens are ranked. Only the tokens returned by
            ``get_contextual_embeddings`` are scored (that helper tokenizes
            with truncation enabled).
        tokenizer: Tokenizer used for both ``text`` and ``target_word``.
        model: Model producing the embeddings.
        top_n: Maximum number of entries to return; values <= 0 yield ``[]``.

    Returns:
        A list of ``(similarity, token)`` tuples sorted by descending
        similarity, containing at most ``top_n`` entries.
    """
    # Get contextual embeddings for the entire text
    contextual_embeddings, inputs = get_contextual_embeddings(text, tokenizer, model)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Embedding of the query word (helper defined elsewhere in this file).
    # Assumed to be a 1-D vector of the model's hidden size, as the
    # unsqueeze(0) pairing with a single token embedding requires.
    target_embedding = get_word_embedding(target_word, tokenizer, model)

    # One batched call scores every token position against the target.
    scores = cosine_similarity(
        target_embedding.unsqueeze(0), contextual_embeddings[0], dim=1
    ).detach().tolist()

    # Keep the best score per distinct token. De-duplicating here, before
    # selecting the top entries, makes the result independent of the order in
    # which repeated tokens appear in the text.
    best_by_token = {}
    for token, score in zip(tokens, scores):
        if token not in best_by_token or score > best_by_token[token]:
            best_by_token[token] = score

    # nlargest returns its result already sorted from highest to lowest.
    return heapq.nlargest(
        top_n, ((score, token) for token, score in best_by_token.items())
    )

# Load the complete contents of a text file
def read_text_from_file(file_path):
    """Return everything stored in the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Path to the .txt file
file_path = 'data/A10010_short.txt'

# Read the text from the file
text = read_text_from_file(file_path)

# Target word to find similarities with
target_word = "natiue"

# Number of results to request; also used in the header so the two cannot drift
top_n = 10

# Find and print the most similar words
top_similar_words = find_top_similar_words(target_word, text, tokenizer, model, top_n=top_n)
print(f"Top {top_n} words most similar to '{target_word}':")
# Each entry is a (similarity, token) tuple, in that order
for similarity, word in top_similar_words:
    print(f"{word}: {similarity}")


Some weights of BertModel were not initialized from the model checkpoint at ./fine-tuned-MacBERTh and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Top 10 words most similar to 'natiue':
god: 0.9105949997901917
angry: 0.9092453718185425
world: 0.907972514629364
touch: 0.9073593616485596
john: 0.9065189361572266
wrought: 0.9064560532569885
none: 0.9043810367584229
find: 0.9040130376815796
argue: 0.9038161635398865
preach: 0.90345299243927
