In [None]:
import sys
sys.path.append('../Performance')

import bert_predict_class
import softmax_and_log_dicts
import numpy as np
from numpy import dot
from numpy.linalg import norm
import re

In [2]:
# Cosine similarity
def get_cosine_sim(A: list[float], B: list[float]) -> float:
    """Return the cosine similarity of the vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.

    Args:
        A: First vector.
        B: Second vector; must be compatible with ``A`` for ``dot``.

    Returns:
        The dot product divided by the product of both vector norms.

    Note:
        The division is not guarded: a vector with norm zero makes the
        denominator zero.
    """
    inner_product = dot(A, B)
    magnitude_product = norm(A) * norm(B)
    return inner_product / magnitude_product

# Takes a sentence and returns a list of sentences. Each of them has exactly one
# token replaced by [UNK], moving from left to right.
# A [MASK] token is never mapped to [UNK].
def get_sub_sentences(sentence: str) -> list[str]:
    """Build one variant of ``sentence`` per token, with that token hidden.

    The last character of ``sentence`` is dropped (the caller is expected to
    pass a sentence ending in '.') and the remainder is split on single
    spaces. For the i-th token a copy of the sentence is produced in which
    that token is replaced by '[UNK]'; if the token is '[MASK]' it is kept
    as '[MASK]', so that variant equals the input tokens unchanged. Every
    variant is terminated with '.'.

    Args:
        sentence: Space-separated sentence whose final character is dropped.

    Returns:
        A list with as many variants as there are tokens, ordered by the
        position of the replaced token.
    """
    words = sentence[:-1].split(' ')

    def _variant(hidden_position: int) -> str:
        # Replace only the token at ``hidden_position``; [MASK] stays [MASK].
        pieces = [
            word if position != hidden_position
            else ('[MASK]' if word == '[MASK]' else '[UNK]')
            for position, word in enumerate(words)
        ]
        return ' '.join(pieces) + '.'

    return [_variant(hidden_position) for hidden_position in range(len(words))]

In [3]:
# Compare the BERT predictions against the per-class dictionaries
def predict_class(bert_predictions, class_dicts=softmax_and_log_dicts.LOG_DICTS):
    """Score every class dictionary against a list of BERT predictions.

    For each entry of ``class_dicts`` a score is accumulated over all
    predictions: the prediction's 'token_str' is passed through
    ``bert_predict_class.get_verb_lemma``, the result is looked up in
    ``entry[0]`` and multiplied by the prediction's 'score'. A prediction
    that raises ``KeyError`` anywhere in that step is skipped for that entry.

    Args:
        bert_predictions: Iterable of mappings providing the keys
            'token_str' and 'score'.
        class_dicts: Sequence whose elements are indexed with ``[0]`` to get
            the lookup table used for scoring. Defaults to
            ``softmax_and_log_dicts.LOG_DICTS``.

    Returns:
        The result of ``bert_predict_class.softmax_basic`` applied to the
        array of per-entry scores, in the order of ``class_dicts``.
    """
    class_totals = []
    for class_entry in class_dicts:
        total = 0
        for candidate in bert_predictions:
            try:
                lemma = bert_predict_class.get_verb_lemma(candidate['token_str'])
                total += class_entry[0][lemma] * candidate['score']
            except KeyError:
                # This prediction contributes nothing to the current entry.
                continue
        class_totals.append(total)

    # Apply softmax to the raw scores; the order of class_dicts is kept (no sorting).
    return bert_predict_class.softmax_basic(np.array(class_totals))

In [4]:
# Optional: highlight the words that were marked with XML tags in the output sentence.
# May be left empty!
orig_sentence = "The <e1>bacterial aerosol</e1> was generated from an up-draft <e2>nebulizer</e2>."

# Collect the index of every space-separated token that belongs to an
# <e1>/<e2> entity. An entity may span several tokens
# ("<e1>bacterial aerosol</e1>"), so the open/close state is tracked across
# tokens instead of requiring both tags inside a single token.
xml_indices = []
inside_entity = False
for index, token in enumerate(orig_sentence.split(' ')):
    tags = re.findall('</?e[12]>', token)
    opens_here = any(not tag.startswith('</') for tag in tags)
    if inside_entity or opens_here:
        xml_indices.append(index)
    if tags:
        # The last tag in the token decides whether the entity is still open.
        inside_entity = not tags[-1].startswith('</')

In [None]:
# Enter the sentence for whose class prediction the token importances should be computed.
sentence = "The bacterial aerosol was [MASK] from an up-draft nebulizer."

# Prediction for the original sentence
bert_pred = bert_predict_class.bert_predict(sentence)
class_pred = predict_class(bert_pred)

# One class prediction per sub-sentence, in the order returned by
# get_sub_sentences (one replaced token per sub-sentence, left to right).
class_predictions = [
    predict_class(bert_predict_class.bert_predict(variant))
    for variant in get_sub_sentences(sentence)
]

In [6]:
# Cosine similarity between the class prediction of each sub-sentence and the
# class prediction of the original sentence.
cosine_similarities = [
    get_cosine_sim(sub_prediction, class_pred) for sub_prediction in class_predictions
]

# Compute min and max
X_min = min(cosine_similarities)
X_max = max(cosine_similarities)

# Min-max scale the cosine similarities to [0, 1]: the smallest similarity
# maps to 0 and the largest to 1.
similarity_range = X_max - X_min
if similarity_range > 0:
    scaled_similarities = [(x - X_min) / similarity_range for x in cosine_similarities]
else:
    # All similarities are equal (e.g. a single-token sentence), so dividing
    # by the range would be a division by zero. No token stands out; give
    # every token 1.0, the value the largest similarity gets in the regular case.
    scaled_similarities = [1.0 for _ in cosine_similarities]

In [7]:
# Build and render the HTML output
import html

from IPython.display import HTML

# One <div> per token. Red stays at 255 while green and blue follow the scaled
# similarity, so a value of 0 renders pure red and a value of 1 renders white.
tokens = sentence[:-1].split(' ')
last_index = len(tokens) - 1

token_divs = []
for index, token in enumerate(tokens):
    channel = scaled_similarities[index] * 255
    # Tokens are separated by a non-breaking space; the last token is followed
    # directly by the closing period instead.
    spacer = "" if index == last_index else "&nbsp;"
    # Escape the token so characters such as '<' or '&' are shown literally,
    # and emit whole-number colour channels.
    token_divs.append(
        f"<div style='color: rgb(255, {channel:.0f}, {channel:.0f})'>"
        f"{html.escape(token)}{spacer}</div>"
    )

STR = (
    "<div style='display: flex; flex-direction: row; font-size: xx-large; '>"
    + "".join(token_divs)
    + ".</div>"
)

HTML(STR)