 MacBERTh program reserves

This MacBERTh program uses the transformers library to find words used in a text that are similar in semantics and meaning as certain pre-selected keywords.

Author: Jerry Zou

In [19]:
from transformers import AutoTokenizer, AutoModel
import torch, string
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
from tqdm import tqdm

In [20]:
modelName = "emanjavacas/MacBERTh"
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModel.from_pretrained(modelName)



In [21]:
def preprocess_text(docx):
    content = Document(docx)
    lowercaseWords = []
    translator = str.maketrans('', '', string.punctuation)


    for paragraph in content.paragraphs:
        for run in paragraph.runs:
            text = run.text
            words = text.split()
            for word in words:
                word = word.translate(translator)
                lowercaseWords.append(word.lower())
    joinedList = " ".join(lowercaseWords)
    return tokenizer.tokenize(joinedList)
# FOR DEBUG: print(preprocess_text("/Users/Jerry/Desktop/test.docx")[:40])

In [28]:
def encode_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    #tokens = tokenizer.tokenize(text)
    #combined_word = tokenizer.convert_tokens_to_string(tokens)
    print(f"Encoding word: {text}")
    return outputs.last_hidden_state.mean(dim=1)  # Use the mean of the hidden states as the embedding

document_text = "/Users/Jerry/Desktop/test.docx"
tokenizedText = preprocess_text(document_text)
tokenTextEmbedding = {token: encode_text(token, tokenizer, model) for token in tokenizedText}

keywords = ["ciuilitie", "Sathan", "school", "instruction"]
tokenKeywordEmbeddings = {token: encode_text(token, tokenizer, model) for token in keywords}

# for token, embedding in tokenTextEmbedding.items():
#     print(f"Token: {token}, Embedding: {embedding}")


Encoding word: standing
Encoding word: above
Encoding word: the
Encoding word: coast
Encoding word: that
Encoding word: has
Encoding word: watched
Encoding word: fifteen
Encoding word: generations
Encoding word: of
Encoding word: maritime
Encoding word: ambition
Encoding word: and
Encoding word: sp
Encoding word: ##illed
Encoding word: blood
Encoding word: the
Encoding word: stone
Encoding word: statue
Encoding word: of
Encoding word: lud
Encoding word: ##mi
Encoding word: ##lla
Encoding word: fel
Encoding word: ##isc
Encoding word: ##hen
Encoding word: ##a
Encoding word: greet
Encoding word: ##ed
Encoding word: travel
Encoding word: ##ers
Encoding word: from
Encoding word: the
Encoding word: los
Encoding word: angel
Encoding word: ##es
Encoding word: direction
Encoding word: with
Encoding word: an
Encoding word: eternal
Encoding word: smile
Encoding word: on
Encoding word: her
Encoding word: face
Encoding word: and
Encoding word: a
Encoding word: torch
Encoding word: in
Encoding word:

In [29]:
def find_similar_words(keyword_embeddings, token_embeddings):
    similar_words = {}
    for keyword, keyword_emb in keyword_embeddings.items():
        similarities = []
        for token, token_emb in token_embeddings.items():
            sim = cosine_similarity(keyword_emb, token_emb)
            similarities.append((token, sim[0][0]))
        # Sort by similarity score in descending order
        similarities.sort(key=lambda x: x[1], reverse=True)
        similar_words[keyword] = similarities
    return similar_words

similar_words = find_similar_words(tokenKeywordEmbeddings, tokenTextEmbedding)

for keyword, words in similar_words.items():
    print(f"Words similar to '{keyword}':")
    for word, similarity in words[:10]:
        print(f" {word}: {similarity}")

Words similar to 'ciuilitie':
 goodnesse: 0.9822680950164795
 goodnes: 0.9791624546051025
 happinesse: 0.9781022667884827
 greatnesse: 0.9777007699012756
 liberalitie: 0.9775600433349609
 feare: 0.977512001991272
 countrey: 0.976071834564209
 heauen: 0.976024866104126
 dyet: 0.975933313369751
 tongue: 0.975925624370575
Words similar to 'Sathan':
 sathan: 0.9999999403953552
 satan: 0.9975593686103821
 sin: 0.9938597083091736
 sins: 0.9937807321548462
 sicke: 0.993764340877533
 cures: 0.9937357902526855
 interim: 0.9936264157295227
 qui: 0.9934446811676025
 amic: 0.9934303164482117
 bene: 0.9932165741920471
Words similar to 'school':
 society: 0.990674614906311
 wish: 0.9898231029510498
 quite: 0.9898009300231934
 bill: 0.9897968769073486
 qualified: 0.9896359443664551
 described: 0.9896279573440552
 doing: 0.9895373582839966
 enable: 0.9895321130752563
 measure: 0.9893690347671509
 labourers: 0.9888454079627991
Words similar to 'instruction':
 labourers: 0.9926059246063232
 quite: 0.990