<span style="font-family: Arial; font-size: 14pt;"><b>Finding Top Similar Words as Base Words</b></span><br>
Author: Lucas Ma<br>

In [2]:
from transformers import AutoTokenizer, AutoModel
import string, heapq, numpy as np
from torch.nn.functional import cosine_similarity
from nltk.corpus import stopwords
import os, json
import torch
import nltk
from nltk.tokenize import sent_tokenize

# Make sure to download the punkt tokenizer
nltk.download('punkt')

model_name = "emanjavacas/MacBERTh"
# model_name = "fine-tuned-MacBERTh"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/lucasma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def read_sentence_document(document_path):
    encodings = ['utf-8', 'latin-1', 'cp1252']
    for encoding in encodings:
        try:
            with open(document_path, 'r', encoding=encoding) as f:
                text = f.read()
            break  # Exit the loop if no exception was raised
        except UnicodeDecodeError:
            continue
    else:
        raise UnicodeDecodeError("Failed to read the file with any of the tried encodings.")
        
    text = text.lower()

    sentences = sent_tokenize(text)

    # Further split the sentences if there are more than 450 words
    split_sentences = []
    for sentence in sentences:
        words = sentence.split()
        while len(words) > 400:
            split_sentences.append(' '.join(words[:400]))
            words = words[400:]
        split_sentences.append(' '.join(words))

    # Strip whitespace and filter out empty sentences
    final_sentences = [s.strip() for s in split_sentences if s.strip()]
    
    return final_sentences


# Function to get word embeddings
def get_word_embedding(chunks, tokenizer, model):
    word_times = {}
    word_embeddings = {}
    for chunk in chunks:

        inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state
        for i, word in enumerate(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])):
            if word not in word_times:
                word_times[word] = 1
            else:
                word_times[word]+=1
            if word not in word_embeddings:
                word_embeddings[word] = embeddings[0, i, :].numpy()
            else:
                word_embeddings[word] = (word_embeddings[word] * (word_times[word]-1) + embeddings[0, i, :].numpy()) / word_times[word]
    return word_embeddings

def get_single_embedding(word, tokenizer, model):
    word_embeddings = {}
    chunk = word
    inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state
    for i, word in enumerate(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])):
        if word not in word_embeddings:
            word_embeddings[word] = embeddings[0, i, :].numpy()
    return word_embeddings

# Find the substitute words for the original keyword, by iterating over the standard word list
def substitute_word(word):
    json_dir = "/Users/lucasma/Documents/The States/ECBC/Code/ECBCData2024/standardizedwords.json"
    with open (json_dir, "r") as f:
        standardWord = json.load(f)
    ret = []
    for term, equals in standardWord.items():
        for spell in equals:
            if spell==word:
                ret.append(term)
    return ret

def clean_embedding(embeddings):
    clean = {}
    for token, embedding in embeddings.items():
        if (token in tokenizer.all_special_tokens) or (token.lower() in stop_words) or (token in string.punctuation) or (token.startswith('##')) or (token=="•"):
            clean[token] = embedding
    return clean

# Main function
def find_top_similar_words(target_words, sentences, tokenizer, model, top_n):
    
    embeddings = clean_embedding(get_word_embedding(sentences, tokenizer, model))

    key_sim_word = {}

    for keyWord in target_words:
        key_embeddings = {}
        presence = ""
        wordFound = False

        if keyWord in embeddings:
            presence = keyWord
            wordFound = True
        else:
            for equal in substitute_word(keyWord):
                wordFound = wordFound or equal in embeddings
                if wordFound:
                    presence = equal
                    break
        
        if not wordFound:
            key_embeddings[keyWord] = get_single_embedding(keyWord, tokenizer, model)
        else:
            key_embeddings[keyWord] = embeddings[presence]

        similarities = {}

        for token, embedding in embeddings.items():
            similarities[token] = np.dot(embedding, key_embeddings[keyWord]) / ((np.linalg.norm(embedding)) * (np.linalg.norm(key_embeddings[keyWord])))
    
        # Use a heap to find the top N similar words
        top_entries = []
        #seen_words = set()
        for token, similarity in similarities:
                heapq.heappush(top_entries, (token, similarity))

                if len(top_entries) > top_n:
                    removed_token, removed_similarity = heapq.heappop(top_entries)

        top_entries.sort(reverse=True, key=lambda x: x[0])

        key_sim_word[keyWord] = top_entries

    return key_sim_word


# Path to the .txt file
# file_path = '/Users/lucasma/Documents/The\ States/ECBC/Code/ECBCData2024/data/VirginiaTotal.txt'
file_path = "data/A10010_short.txt"

sentences = read_sentence_document(file_path)

# Target word to find similarities with
target_words = ["money", "christ", "light", "darkness", "clothes", "naked"]

# Find and print top 10 similar words
top_similar_words = find_top_similar_words(target_words, sentences, tokenizer, model, 50)

for word, words in top_similar_words.items():
    print(f"Top 50 words most similar to '{word}':")
    for relevantWord, sim in words.items():
        print(f"{relevantWord}: {sim}")


TypeError: unsupported operand type(s) for *: 'float' and 'dict'