<span style="font-family: Arial; font-size: 14pt;"><b>Finding Top Similar Words</b></span><br>
Author: Lucas Ma<br>
Edited by Jerry Zou

This version of finding the top n words with the highest cosine similarity is able to take into account the contextual information, and it could do so in a strikingly efficient way.

<span style="font-family: Arial; font-size: 11pt;"><b>Note:</b></span><br>
The following code should not be run until you run the pretrainMac.ipynb--it will help you train a fine-tuned MacBERTh locally and store it in a folder (which will be created) called fine-tuned-MacBERTh, and you do not have to worry about the trained model being too large to be pushed onto GitHub. The folder that contains the model has been ignored by Git, which can be shown in the file .gitignore.

<span style="font-family: Arial; font-size: 11pt;">Essentially, go run the program retrainMac.ipynb and come back to run the following code. Feel free to git add or git commit or git push as per normal.</span>

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch, string, json, heapq
import numpy as np
from torch.nn.functional import cosine_similarity
# from docx import Document
from tqdm import tqdm
#import matplotlib.pyplot as plt
#import networkx as nx
#import seaborn as sns
import pandas as pd
from nltk.corpus import stopwords

In [6]:
# Load the tokenizer and model
model_name = './fine-tuned-MacBERTh'  # Path to your fine-tuned model
# model_name = "emanjavacas/MacBERTh"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
stop_words = set(stopwords.words('english'))

# Function to get contextual embeddings for a chunk of text
def get_contextual_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state, inputs

# Function to split text into chunks of a given size
def split_text_into_chunks(text, tokenizer, chunk_size=512):
    tokens = tokenizer.tokenize(text)
    chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]
    return chunks

# Function to get the embedding for a specific word
def get_word_embedding(word, tokenizer, model):
    inputs = tokenizer(word, return_tensors="pt", padding='max_length', max_length=512)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.squeeze()

def find_top_similar_words(target_word, text, tokenizer, model, top_n=10):
    # Split the text into chunks
    chunks = split_text_into_chunks(text, tokenizer)

    # Tokenize and get embedding for the target word
    target_embedding = get_word_embedding(target_word, tokenizer, model)

    # Collect similarities across all chunks
    similarities = []
    for chunk in chunks:
        contextual_embeddings, inputs = get_contextual_embeddings(chunk, tokenizer, model)
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        
        # Find embeddings for each word in context
        for i, (token, embedding) in enumerate(zip(tokens, contextual_embeddings[0])):
            if token in tokenizer.all_special_tokens:
                continue
            similarity = cosine_similarity(target_embedding.unsqueeze(0), embedding.unsqueeze(0)).item()
            similarities.append((token, similarity))
    
    # Use a heap to find the top N similar words
    top_entries = []
    #seen_words = set()
    for token, similarity in similarities:
        #if token not in seen_words:
            heapq.heappush(top_entries, (similarity, token))
            #seen_words.add(token)

            if len(top_entries) > top_n:
                removed_similarity, removed_token = heapq.heappop(top_entries)
                #seen_words.remove(removed_token)

    top_entries.sort(reverse=True, key=lambda x: x[0])
    return top_entries

# Function to read text from a file
def read_text_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

# Path to the .txt file
file_path = 'data/A10010.P4 copy.txt'

# Read the text from the file
text = read_text_from_file(file_path)

# Target word to find similarities with
target_word = "work"

# Find and print top 10 similar words
top_similar_words = find_top_similar_words(target_word, text, tokenizer, model, top_n=10)
print(f"Top 10 words most similar to '{target_word}':")
for similarity, word in top_similar_words:
    print(f"{similarity}: {word}")

Some weights of BertModel were not initialized from the model checkpoint at ./fine-tuned-MacBERTh and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Top 10 words most similar to 'work':
0.6188908219337463: of
0.5968559384346008: his
0.5933787226676941: us
0.5915705561637878: made
0.5914732813835144: comes
0.5902810096740723: us
0.589349627494812: my
0.5881844758987427: and
0.5880509614944458: #
0.5873427391052246: is


----
Code below are merged code between Lucas's and Jerry's programs.

In [None]:
# In progress

# Lucas-Jerry Merged Code for Top Similar Words in a document
modelName = "emanjavacas/MacBERTh"
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModel.from_pretrained(modelName)

def preprocess_text(docx):
    content = Document(docx)
    lowercaseWords = []
    translator = str.maketrans('', '', string.punctuation)

    for paragraph in content.paragraphs:
        for run in paragraph.runs:
            text = run.text
            words = text.split()
            for word in words:
                word = word.translate(translator)
                lowercaseWords.append(word.lower())
    joinedList = " ".join(lowercaseWords)
    return tokenizer.tokenize(joinedList)
# FOR DEBUG: print(preprocess_text("/Users/Jerry/Desktop/test.docx")[:40])

def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    #tokens = tokenizer.tokenize(text)
    #combined_word = tokenizer.convert_tokens_to_string(tokens)
    print(f"Encoding word: {text}")
    return outputs.last_hidden_state.mean(dim=1)  # Use the mean of the hidden states as the embedding

document_text = "/Users/Jerry/Desktop/test.docx"
tokenizedText = preprocess_text(document_text)
tokenTextEmbedding = {token: encode_text(token, tokenizer, model) for token in tokenizedText}

keywords = ["ciuilitie", "Sathan", "school", "instruction"]
tokenKeywordEmbeddings = {token: encode_text(token, tokenizer, model) for token in keywords}

# for token, embedding in tokenTextEmbedding.items():
#     print(f"Token: {token}, Embedding: {embedding}")


def find_similar_words(keyword_embeddings, token_embeddings):
    similar_words = {}
    for keyword, keyword_emb in keyword_embeddings.items():
        similarities = []
        for token, token_emb in token_embeddings.items():
            sim = cosine_similarity(keyword_emb, token_emb)
            similarities.append((token, float(sim[0][0])))
        # Sort by similarity score in descending order
        similarities.sort(key=lambda x: x[1], reverse=True)
        similar_words[keyword] = similarities
    return similar_words

similar_words = find_similar_words(tokenKeywordEmbeddings, tokenTextEmbedding)

shortenedSimilarWords = {}

for keyword, words in similar_words.items():
    #print(f"Words similar to '{keyword}':")
    for word, similarity in words[:50]:
        #print(f" {word}: {similarity}")
        shortenedSimilarWords[keyword] = words[:12]

print(shortenedSimilarWords)

jsonStorage = "SimilarWords.json"
with open(jsonStorage, "w") as file:
    json.dump(shortenedSimilarWords, file)