<span style="font-family: Arial; font-size: 14pt;"><b>Finding Top Similar Words</b></span><br>
Author: Lucas Ma<br>
Edited by Jerry Zou

This version of finding the top n words with the highest cosine similarity is able to take into account the contextual information, and it could do so in a strikingly efficient way.

<span style="font-family: Arial; font-size: 11pt;"><b>Note:</b></span><br>
The following code should not be run until you run the pretrainMac.ipynb--it will help you train a fine-tuned MacBERTh locally and store it in a folder (which will be created) called fine-tuned-MacBERTh, and you do not have to worry about the trained model being too large to be pushed onto GitHub. The folder that contains the model has been ignored by Git, which can be shown in the file .gitignore.

<span style="font-family: Arial; font-size: 11pt;">Essentially, go run the program retrainMac.ipynb and come back to run the following code. Feel free to git add or git commit or git push as per normal.</span>

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch, string, json, heapq, pandas as pd, numpy as np
from torch.nn.functional import cosine_similarity
# from docx import Document
# from tqdm import tqdm
#import matplotlib.pyplot as plt
#import networkx as nx
#import seaborn as sns
from nltk.corpus import stopwords

In [6]:
# model_name = "/Users/Jerry/Desktop/finetunedBibleNonBible"  # Path to your fine-tuned model
# model_name = "emanjavacas/MacBERTh"
model_name = "fine-tuned-MacBERTh"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
stop_words = set(stopwords.words('english'))

with open("data/StopWord.txt", "r") as file:
    lines = file.readlines()
    for line in lines:
        stop_words.add(line.strip())

# Function to get contextual embeddings for a chunk of text
def get_contextual_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state, inputs

# Function to split text into chunks of a given size
def split_text_into_chunks(text, tokenizer, chunk_size=50):
    tokens = tokenizer.tokenize(text)
    chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

# Function to get the embedding for a specific word
def get_word_embedding(word, tokenizer, model):
    inputs = tokenizer(word, return_tensors="pt", padding='max_length', max_length=512)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.squeeze()

def find_top_similar_words(target_word, text, tokenizer, model, top_n=10):
    chunks = split_text_into_chunks(text, tokenizer)
    # Tokenize and get embedding for the target word
    target_embedding = get_word_embedding(target_word, tokenizer, model)

    # Collect similarities across all chunks
    similarities = []
    for chunk in chunks:
        contextual_embeddings, inputs = get_contextual_embeddings(chunk, tokenizer, model)
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        
        # Find embeddings for each word in context
        for i, (token, embedding) in enumerate(zip(tokens, contextual_embeddings[0])):
            if (token in tokenizer.all_special_tokens) or (token.lower() in stop_words) or (token in string.punctuation) or (token.startswith('##')) or (token=="•"):
                continue
            similarity = cosine_similarity(target_embedding.unsqueeze(0), embedding.unsqueeze(0)).item()
            similarities.append((token, similarity))
    
    # Use a heap to find the top N similar words
    top_entries = []
    #seen_words = set()
    for token, similarity in similarities:
        #if token not in seen_words:
            heapq.heappush(top_entries, (similarity, token))
            #seen_words.add(token)

            if len(top_entries) > top_n:
                removed_similarity, removed_token = heapq.heappop(top_entries)
                #seen_words.remove(removed_token)

    top_entries.sort(reverse=True, key=lambda x: x[0])
    return top_entries

# Function to read text from a file
def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

# Path to the .txt file
file_path = 'data/copland.txt'

text = read_text_from_file(file_path)

# Target word to find similarities with
target_word = "profit"

# Find and print top 10 similar words
top_similar_words = find_top_similar_words(target_word, text, tokenizer, model, top_n=20)
print(f"Top 20 words most similar to '{target_word}':")
for similarity, word in top_similar_words:
    print(f"{similarity}: {word}")

Some weights of BertModel were not initialized from the model checkpoint at fine-tuned-MacBERTh and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Top 20 words most similar to 'profit':
0.3997337222099304: pron
0.38985148072242737: best
0.38830727338790894: giuing
0.3871503472328186: num
0.3871023654937744: let
0.3826659023761749: doe
0.3803568184375763: giue
0.3793027102947235: brev
0.37927481532096863: waters
0.3782011568546295: verba
0.3778243064880371: quid
0.3776637315750122: ipsa
0.375434547662735: ano
0.3751353621482849: quint
0.37412822246551514: sp
0.3730430603027344: virginia
0.37267985939979553: ore
0.37196341156959534: un
0.3696487843990326: cotton
0.36941593885421753: bengal


----
Code below are merged code between Lucas's and Jerry's programs.

In [4]:
# Lucas-Jerry Merged Code for Top Similar Words in a document

# modelName = "/Users/Jerry/Desktop/finetunedBibleNonBible"  # Path to your fine-tuned model
# modelName = "emanjavacas/MacBERTh"
modelName = "fine-tuned-MacBERTh"

tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModel.from_pretrained(modelName)
stop_words = set(stopwords.words('english'))
stop_words.add("us")

def preprocess_text(document_text):
    with open(document_text, 'r') as file:
        text = file.read()
    return text.split()

# Function to get contextual embeddings for a chunk of text
def get_contextual_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state, inputs

# Function to split text into chunks using a sliding window approach
def split_text_into_chunks(text, tokenizer, chunk_size=50, overlap=256):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_tokens = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk_tokens))
    return chunks

# Function to get the embedding for a word in context
def get_word_embedding_in_context(word, context, tokenizer, model):
    inputs = tokenizer(context, return_tensors="pt", padding='max_length', max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    token_ids = inputs['input_ids'][0]
    word_token_id = tokenizer.convert_tokens_to_ids(word)
    for idx, token_id in enumerate(token_ids):
        if token_id == word_token_id:
            return outputs.last_hidden_state[0, idx, :]
    return None

# Function to encode text
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # print(f"Encoding word: {text}")
    return outputs.last_hidden_state.mean(dim=1)  # Use the mean of the hidden states as the embedding

def find_top_similar_words(keyword_embeddings, token_embeddings, top_n=10):
    similar_words = {}
    for keyword, keyword_emb in keyword_embeddings.items():
        similarities = []
        for token, token_emb in token_embeddings.items():
            sim = cosine_similarity(keyword_emb.unsqueeze(0), token_emb.unsqueeze(0)).item()
            similarities.append((token, sim))
        similarities.sort(key=lambda x: x[1], reverse=True)
        similar_words[keyword] = similarities[:top_n]
    return similar_words

file_path = 'data/copland.txt'
tokenized_text = preprocess_text(file_path)
token_text_embedding = {token: encode_text(token, tokenizer, model) for token in tokenized_text}

keywords = ["conversion"]
token_keyword_embeddings = {token: encode_text(token, tokenizer, model) for token in keywords}

# Find and print top 10 similar words
top_similar_words = find_top_similar_words(token_keyword_embeddings, token_text_embedding, top_n=50)
print("Top similar words:")
for keyword, similar_words in top_similar_words.items():
    print(f"Top words most similar to '{keyword}':")
    for similarity, word in similar_words:
        print(f"{similarity}: {word}")

# json_storage = "SimilarWords.json"
# with open(json_storage, "w") as file:
#     json.dump(top_similar_words, file)

Some weights of BertModel were not initialized from the model checkpoint at fine-tuned-MacBERTh and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:130] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 