In [19]:
from transformers import AutoTokenizer, AutoModel
import torch, string
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
import pandas as pd
import json
import nltk

In [6]:
# Checkpoint identifier handed to both from_pretrained calls below.
modelName = "emanjavacas/MacBERTh"
# Tokenizer and encoder are loaded from the same identifier. Both become
# notebook globals: `tokenizer` is read directly by preprocessTextByWord, and
# both are passed explicitly to encode_token.
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModel.from_pretrained(modelName)





In [None]:
# Preprocess by word
def preprocessTextByWord(sentence):
    """Normalise a sentence and split it with the notebook-level tokenizer.

    Every character listed in ``string.punctuation`` is deleted, the remainder
    is lowercased, and the result is passed to ``tokenizer.tokenize``.

    Args:
        sentence: The text to normalise and tokenize.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the cleaned text.
    """
    # Mapping each punctuation character to None makes translate() delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = sentence.translate(drop_punctuation).lower()
    return tokenizer.tokenize(cleaned)
# FOR DEBUG: print(preprocessTextByWord("We are met together for another purpose.")[:40])

In [13]:
# Preprocess by sentence
def preprocessTextBySentence(sentence):
    """Return ``sentence`` with punctuation removed and letters lowercased.

    Args:
        sentence: The text to normalise.

    Returns:
        A new string in which every character from ``string.punctuation`` has
        been dropped and the remaining characters lowercased. Whitespace is
        left untouched.
    """
    punctuation_chars = set(string.punctuation)
    kept = [ch for ch in sentence if ch not in punctuation_chars]
    return "".join(kept).lower()

def sentenceSegmentation(text):
    """Split ``text`` into sentences by delegating to ``nltk.sent_tokenize``.

    Args:
        text: The passage to segment.

    Returns:
        The value produced by ``nltk.sent_tokenize(text)``.
    """
    return nltk.sent_tokenize(text)

In [26]:
def encode_token(token, tokenizer, model, max_length=30, verbose=True):
    """Encode a piece of text into a single mean-pooled embedding.

    The text is tokenized with padding/truncation to ``max_length``, run
    through ``model`` without gradient tracking, and the final hidden states
    are averaged over the positions that the attention mask marks as real
    tokens. Padding positions are excluded, so the result does not depend on
    how much padding was added.

    Args:
        token: The text to encode (a word or a whole sentence).
        tokenizer: Callable that returns a mapping containing at least an
            ``"attention_mask"`` tensor alongside the model inputs.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        max_length: Length the input is padded/truncated to. Defaults to 30,
            the value that was previously hard-coded.
        verbose: When true (the default), print the ENCODING/ENCODED progress
            lines. Pass ``False`` to keep notebook output quiet.

    Returns:
        The masked mean of ``last_hidden_state`` along dimension 1; for a
        (batch, seq_len, hidden) tensor that is a (batch, hidden) tensor.
    """
    inputs = tokenizer(
        token,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    if verbose:
        print(f"ENCODING: {token}, {inputs}")
    with torch.no_grad():
        outputs = model(**inputs)
    if verbose:
        print(f"ENCODED: {token}")

    hidden_states = outputs.last_hidden_state
    # Add a trailing axis so the mask broadcasts across the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # Guard against division by zero if a row has no attended positions.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts
# Sample passage used to exercise the segmentation + encoding pipeline.
documentText = "You will say, what needeth all this Discourse, touching the Danger of Sea-men. We are met together for another purpose, to giue thanks vnto God?"
sentences = sentenceSegmentation(documentText)
# Key: the raw segmented sentence. Value: the embedding of its normalised
# (punctuation-stripped, lowercased) form.
sentence_embeddings = dict(
    (raw_sentence, encode_token(preprocessTextBySentence(raw_sentence), tokenizer, model))
    for raw_sentence in sentences
)

ENCODING: you will say what needeth all this discourse touching the danger of seamen, {'input_ids': tensor([[    2,   969,   976,  1312,  1023, 16787,   924,   905,  5997,  3653,
           828,  2187,   834, 12435,     3,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])}


ENCODED: you will say what needeth all this discourse touching the danger of seamen
ENCODING: we are met together for another purpose to giue thanks vnto god, {'input_ids': tensor([[   2,  915,  929, 2347, 1770,  869, 1419, 2761,  839, 2691, 5381, 1465,
          960,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])}
ENCODED: we are met together for another purpose to giue thanks vnto god


In [None]:
# Build the sentence -> embedding mapping with an explicit loop.
# Relies on `sentences`, `tokenizer` and `model` having been defined by earlier cells.
sentence_embeddings = {}
for sentence in sentences:
    # Normalise first: preprocessTextBySentence strips punctuation and lowercases.
    processed_sentence = preprocessTextBySentence(sentence)
    sentence_embedding = encode_token(processed_sentence, tokenizer, model)
    # Keyed by the raw sentence, so a repeated sentence overwrites its earlier entry.
    sentence_embeddings[sentence] = sentence_embedding