<span style="font-family: Arial; font-size: 14pt;"><b>Finding Top Similar Words</b></span><br>
Author: Lucas Ma


This version of finding the top n words with the highest cosine similarity is able to take into account the contextual information, and it could do so in a strikingly efficient way.

<span style="font-family: Arial; font-size: 11pt;"><b>Note:</b></span><br>
The following code should not be run until you run the pretrainMac.ipynb--it will help you train a fine-tuned MacBERTh locally and store it in a folder (which will be created) called fine-tuned-MacBERTh, and you do not have to worry about the trained model being too large to be pushed onto GitHub. The folder that contains the model has been ignored by Git, which can be shown in the file .gitignore.

<span style="font-family: Arial; font-size: 11pt;">Essentially, go run the program retrainMac.ipynb and come back to run the following code. Feel free to git add or git commit or git push as per normal.</span>

In [21]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity
import heapq

# Load the tokenizer and model
model_name = './fine-tuned-MacBERTh'  # Path to your fine-tuned model
# model_name = "emanjavacas/MacBERTh"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get contextual embeddings for a chunk of text
def get_contextual_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state, inputs

# Function to split text into chunks of a given size
def split_text_into_chunks(text, tokenizer, chunk_size=512):
    tokens = tokenizer.tokenize(text)
    chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

# Function to get the embedding for a specific word
def get_word_embedding(word, tokenizer, model):
    inputs = tokenizer(word, return_tensors="pt", padding='max_length', max_length=512)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.squeeze()

def find_top_similar_words(target_word, text, tokenizer, model, top_n=10):
    # Split the text into chunks
    chunks = split_text_into_chunks(text, tokenizer)

    # Tokenize and get embedding for the target word
    target_embedding = get_word_embedding(target_word, tokenizer, model)

    # Collect similarities across all chunks
    similarities = []
    for chunk in chunks:
        contextual_embeddings, inputs = get_contextual_embeddings(chunk, tokenizer, model)
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        
        # Find embeddings for each word in context
        for i, (token, embedding) in enumerate(zip(tokens, contextual_embeddings[0])):
            if token in tokenizer.all_special_tokens:
                continue
            similarity = cosine_similarity(target_embedding.unsqueeze(0), embedding.unsqueeze(0)).item()
            similarities.append((token, similarity))
    
    # Use a heap to find the top N similar words
    top_entries = []
    #seen_words = set()
    for token, similarity in similarities:
        #if token not in seen_words:
            heapq.heappush(top_entries, (similarity, token))
            #seen_words.add(token)

            if len(top_entries) > top_n:
                removed_similarity, removed_token = heapq.heappop(top_entries)
                #seen_words.remove(removed_token)

    top_entries.sort(reverse=True, key=lambda x: x[0])
    return top_entries

# Function to read text from a file
def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

# Path to the .txt file
file_path = 'data/A10010.P4 copy.txt'

# Read the text from the file
text = read_text_from_file(file_path)

# Target word to find similarities with
target_word = "work"

# Find and print top 10 similar words
top_similar_words = find_top_similar_words(target_word, text, tokenizer, model, top_n=10)
print(f"Top 10 words most similar to '{target_word}':")
for similarity, word in top_similar_words:
    print(f"{similarity}: {word}")


Some weights of BertModel were not initialized from the model checkpoint at ./fine-tuned-MacBERTh and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Top 10 words most similar to 'work':
0.5531777739524841: #
0.5477403402328491: of
0.54087895154953: when
0.5402771830558777: #
0.5390589833259583: therefore
0.538948118686676: made
0.5377960205078125: have
0.5375989675521851: may
0.5355035066604614: it
0.533892810344696: ,
