**Finding Top Similar Words**

Author: Lucas Ma

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity
import heapq

# Load the tokenizer and model
model_name = './fine-tuned-MacBERTh'  # Path to your fine-tuned model
# model_name = "emanjavacas/MacBERTh"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get contextual embeddings for a chunk of text
def get_contextual_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state, inputs

# Function to split text into chunks of a given size
def split_text_into_chunks(text, tokenizer, chunk_size=512):
    tokens = tokenizer.tokenize(text)
    chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

# Function to get the embedding for a specific word
def get_word_embedding(word, tokenizer, model):
    inputs = tokenizer(word, return_tensors="pt", padding='max_length', max_length=512)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.squeeze()

def find_top_similar_words(target_word, text, tokenizer, model, top_n=10):
    # Split the text into chunks
    chunks = split_text_into_chunks(text, tokenizer)

    # Tokenize and get embedding for the target word
    target_embedding = get_word_embedding(target_word, tokenizer, model)

    # Collect similarities across all chunks
    similarities = []
    for chunk in chunks:
        contextual_embeddings, inputs = get_contextual_embeddings(chunk, tokenizer, model)
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        
        # Find embeddings for each word in context
        for i, (token, embedding) in enumerate(zip(tokens, contextual_embeddings[0])):
            if token in tokenizer.all_special_tokens:
                continue
            similarity = cosine_similarity(target_embedding.unsqueeze(0), embedding.unsqueeze(0)).item()
            similarities.append((token, similarity))
    
    # Use a heap to find the top N similar words
    top_entries = []
    seen_words = set()
    for token, similarity in similarities:
        if token not in seen_words:
            heapq.heappush(top_entries, (similarity, token))
            seen_words.add(token)

            if len(top_entries) > top_n:
                removed_similarity, removed_token = heapq.heappop(top_entries)
                seen_words.remove(removed_token)

    top_entries.sort(reverse=True, key=lambda x: x[0])
    return top_entries

# Function to read text from a file
def read_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Path to the .txt file
file_path = 'data/bible_full_text.txt'

# Read the text from the file
text = read_text_from_file(file_path)

# Target word to find similarities with
target_word = "god"

# Find and print top 10 similar words
top_similar_words = find_top_similar_words(target_word, text, tokenizer, model, top_n=10)
print(f"Top 10 words most similar to '{target_word}':")
for similarity, word in top_similar_words:
    print(f"{similarity}: {word}")


Some weights of BertModel were not initialized from the model checkpoint at ./fine-tuned-MacBERTh and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Top 10 words most similar to 'divine':
0.44577479362487793: #
0.4430214464664459: tooth
0.4426579773426056: dead
0.44222792983055115: death
0.4397338628768921: work
0.4395444691181183: grace
0.4383096694946289: equity
0.4339000880718231: write
0.4326639175415039: galath
0.4303591549396515: purpose
