In [8]:
from bert_predict_class import bert_predict
from numpy import dot
from numpy.linalg import norm
import softmax_and_log_dicts
from nltk.stem import WordNetLemmatizer
import softmax
import re

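We first define some utility functions: a cosine-similarity helper and a function that generates one variant of a sentence per token, with that token replaced by `[UNK]`.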

In [4]:
def get_cosine_sim(A: list[float], B: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    The value is the dot product of ``A`` and ``B`` divided by the product of
    their norms. There is no guard for a zero vector: if either norm is zero,
    the division is by zero.
    """
    dot_product = dot(A, B)
    norm_product = norm(A) * norm(B)
    return dot_product / norm_product

def get_sub_sentences(sentence: str) -> list[str]:
    """Build one variant of ``sentence`` per token, with that token set to [UNK].

    The sentence is split on single spaces after removing one trailing
    sentence terminator ('.', '!' or '?'), if there is one. Variants are
    returned in left-to-right token order and every variant ends with '.'.

    A ``[MASK]`` token is never replaced by ``[UNK]``; the variant for that
    position is therefore the unmodified sentence.

    Args:
        sentence: Space-separated sentence, optionally ending in '.', '!' or '?'.

    Returns:
        A list with one sub-sentence per token; an empty list when the
        sentence contains no text besides the optional terminator.
    """
    # Only strip a real sentence terminator. Cutting the last character
    # unconditionally would damage the final token of a sentence that has no
    # closing punctuation (e.g. "... [MASK]" would become "... [MASK").
    if sentence.endswith(('.', '!', '?')):
        body = sentence[:-1]
    else:
        body = sentence

    if not body:
        return []

    tokens = body.split(' ')
    sub_sentences = []
    for i, token in enumerate(tokens):
        # Keep [MASK] intact so the model still has a position to predict.
        replacement = token if token == '[MASK]' else '[UNK]'
        variant = tokens[:i] + [replacement] + tokens[i + 1:]
        sub_sentences.append(' '.join(variant) + '.')

    return sub_sentences

In [13]:
import numpy as np

# lemmatize a verb
def get_verb_lemma(verb: str) -> str:
    """Return the lemma of ``verb``, lemmatized as a verb (``pos='v'``).

    A new ``WordNetLemmatizer`` is created on every call.
    """
    return WordNetLemmatizer().lemmatize(verb, pos='v')

# compare the predictions against the class dictionaries
def predict_class(bert_predictions, class_dicts=softmax_and_log_dicts.LOG_DICTS):
    """Score the BERT predictions against every class dictionary.

    For each entry of ``class_dicts`` the function sums, over all predictions,
    ``entry[0][lemma] * prediction['score']``, where ``lemma`` is the verb
    lemma of ``prediction['token_str']``. Predictions that raise ``KeyError``
    (for instance because the lemma is not in the dictionary) add nothing.

    Args:
        bert_predictions: Re-iterable collection of mappings with the keys
            ``'token_str'`` and ``'score'``; it is traversed once per class.
        class_dicts: Collection of indexable entries whose element ``0`` maps
            a verb lemma to a numeric weight.

    Returns:
        The result of ``softmax.softmax`` applied to the array of per-class
        scores, in the order of ``class_dicts``.
    """
    class_scores = []
    for class_dict in class_dicts:
        total = 0
        for pred in bert_predictions:
            try:
                lemma = get_verb_lemma(pred['token_str'])
                total += class_dict[0][lemma] * pred['score']
            except KeyError:
                # Nothing to add for this prediction. Note that a missing
                # 'token_str' or 'score' key is skipped the same way.
                continue
        class_scores.append(total)

    # Normalise the raw per-class scores with the project's softmax helper.
    return softmax.softmax(np.array(class_scores))

In [11]:
# Optional: record the positions of the tagged words by providing the original
# sentence with XML tags. A token counts as tagged when it contains both the
# opening and the closing <e1>/<e2> tag.
orig_sentence = "Their <e1>composer</e1> has sunk into <e2>oblivion</e2>."
xml_indices = [
    position
    for position, word in enumerate(orig_sentence.split(' '))
    if re.search('<e[12]>.*</e[12]>', word) is not None
]

In [None]:
sentence = "Their composer has [MASK] into oblivion."

# Baseline: class prediction for the unmodified sentence. Every perturbed
# prediction below is compared against this one.
bert_pred = bert_predict(sentence)
class_pred = predict_class(bert_pred)

# One class prediction per sub-sentence, i.e. per token position of `sentence`.
class_predictions = [
    predict_class(bert_predict(sub_sentence))
    for sub_sentence in get_sub_sentences(sentence)
]

In [15]:
# Cosine similarity between each perturbed class prediction and the baseline
# prediction of the unmodified sentence.
cosine_similarities = [get_cosine_sim(p, class_pred) for p in class_predictions]

# Calculate min and max (the defaults keep an empty list from raising)
X_min = min(cosine_similarities, default=0.0)
X_max = max(cosine_similarities, default=0.0)
X_range = X_max - X_min

# Min-max scale to [0, 1]: 0 marks the prediction that differs most from the
# baseline, 1 the one that differs least.
if X_range > 0:
    scaled_similarities = [(x - X_min) / X_range for x in cosine_similarities]
else:
    # All similarities are identical (or there is only one): dividing by the
    # zero range would give NaN. No token stands out, so use 1.0 throughout.
    scaled_similarities = [1.0 for _ in cosine_similarities]

In [16]:
# Show the min-max scaled similarities, one value per sub-sentence.
scaled_similarities

[0.9972942570641264,
 0.9948524434381655,
 0.7415548887139448,
 1.0,
 0.0,
 0.7219479625770411]

In [8]:
from IPython.display import HTML

from html import escape

# Render the sentence with one coloured <div> per token. The green and blue
# channels follow the scaled similarity, so 0 is drawn in red (255, 0, 0) and
# 1 in white (255, 255, 255).
# Drop the final character and split on single spaces so that token i lines up
# with scaled_similarities[i].
tokens = sentence[:-1].split(' ')
if len(tokens) != len(scaled_similarities):
    # Usually stale kernel state: `sentence` was changed without re-running
    # the prediction and scaling cells.
    raise ValueError(
        f"{len(tokens)} tokens but {len(scaled_similarities)} similarity scores; "
        "re-run the prediction and scaling cells for the current sentence"
    )

token_divs = []
for index, (token, similarity) in enumerate(zip(tokens, scaled_similarities)):
    # Whole-number colour channels; a NaN score is formatted as "nan" rather
    # than raising.
    channel = f"{similarity * 255:.0f}"
    # Tokens are separated by a non-breaking space; the last token is followed
    # directly by the full stop instead.
    spacer = "&nbsp;" if index < len(tokens) - 1 else ""
    token_divs.append(
        f"<div style='color: rgb(255, {channel}, {channel})'>{escape(token)}{spacer}</div>"
    )

STR = (
    "<div style='display: flex; flex-direction: row; font-size: xx-large; '>"
    + "".join(token_divs)
    + ".</div>"
)

HTML(STR)

In [174]:
# Demo: pass raw HTML containing a <script> and a button to HTML(). Clicking
# the button calls the JavaScript function defined in the snippet, which shows
# an alert.
# NOTE(review): whether the script actually runs presumably depends on the
# notebook front end and its trust settings - confirm.
html_str = """
<script>
function js_in_ipynb_lul(){
    alert("brrrrrrrrrrt fuer die welt!")
}
</script>
<button onclick="js_in_ipynb_lul()">click me</button>
"""

HTML(html_str)