In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import re

**Read semeval train data and create pandas dataframe**



In [25]:
# first, we create a function that turns our class labels, which are string,
# into integers, so we can later use hot end encoding
def get_int_class(long_class: str, allow_other: bool = False) -> int:
    if long_class == 'Component-Whole(e2,e1)' or long_class == 'Component-Whole(e1,e2)':
        return 0
    if long_class == 'Instrument-Agency(e2,e1)' or long_class == 'Instrument-Agency(e1  ,e2)':
        return 1
    if long_class == 'Member-Collection(e1,e2)' or long_class == 'Member-Collection(e2,e1)':
        return 2
    if long_class == 'Cause-Effect(e2,e1)' or long_class == 'Cause-Effect(e1,e2)':
        return 3
    if long_class == 'Entity-Destination(e2,e1)' or long_class == 'Entity-Destination(e1,e2)':
        return 4
    if long_class == 'Content-Container(e2,e1)' or long_class == 'Content-Container(e1,e2)':
        return 5
    if long_class == 'Message-Topic(e2,e1)' or long_class == 'Message-Topic(e1,e2)':
        return 6
    if long_class == 'Product-Producer(e2,e1)' or long_class == 'Product-Producer(e1,e2)':
        return 7
    if long_class == 'Entity-Origin(e2,e1)' or long_class == 'Entity-Origin(e1,e2)':
        return 8
    if long_class == 'Other':
        if allow_other:
          return 9

def purify_sentence(sentence: str) -> str:
  return re.sub(r"</?e[12]>", "", sentence)

semeval_tuples = list()
temp_tuple = dict()
with open('TRAIN_FILE.TXT', 'r') as file:
    for index, line in enumerate(file.readlines()):
      # the documents is structured in blocks of 4 lines each so we use % 4
      if index % 4 == 0:
        regex_results = re.search(r"\"(.*)\"", line.strip())
        if regex_results:
          sentence = regex_results.group(1)
          temp_tuple['sentence'] = purify_sentence(sentence)
      if index % 4 == 1:
        temp_tuple['label'] = get_int_class(line.strip(), allow_other=False)
      if index % 4 == 2:
        semeval_tuples.append(temp_tuple)
        temp_tuple = dict()

df = pd.DataFrame(semeval_tuples)
df = df.dropna()

In [26]:
df.head()

Unnamed: 0,sentence,label
0,The system as described above has its greatest...,0.0
2,The author of a keygen uses a disassembler to ...,1.0
4,The student association is the voice of the un...,2.0
6,The current view is that the chronic inflammat...,3.0
7,People have been moving back into downtown.,4.0


In [27]:
# we make a 60 / 20 / 20 split for training, validation and testing data
train, val, test = np.split(df.sample(frac=1), [(int(0.6 * len(df))), (int(0.8 * len(df)))])

In [28]:
# function to convert pandas dataframe into tensorflow dataset
def df_to_dataset(dataframe, shuffle=False, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('label')
  # Convert the labels to one-hot encoding
  labels = tf.one_hot(labels, depth=9)
  df = df['sentence']
  ds = tf.data.Dataset.from_tensor_slices((df, labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

In [29]:
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)

#### Model and embeddings

In [30]:
# we use a embedder trained on a 7B word corpus
embedding = 'https://tfhub.dev/google/nnlm-en-dim50/2'
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)

In [31]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(9, activation='softmax'))

In [32]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

In [33]:
model.evaluate(valid_data)



[2.2081480026245117, 0.08314087986946106]

In [34]:
history = model.fit(train_data, epochs=5, validation_data=valid_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Get model performance:

In [None]:
model.evaluate(test_data)



[1.3866945505142212, 0.5850654244422913]

In [None]:
preds = model.predict(test_data)



GET RESULTS AS CSV TO VISUALIZE


In [None]:
y = np.concatenate([y for x, y in test_data], axis=0)

In [None]:
gold_labels = []
for _, batch in test_data:
  for y in batch:
    gold_labels.append(np.argmax(y))

predictions = []
for _ in preds:
  predictions.append(np.argmax(_))

In [None]:
def get_short_name(label: int) -> str:
    if label == 0:
        return 'CW'
    if label == 1:
        return 'IA'
    if label == 2:
        return 'MC'
    if label == 3:
        return 'CE'
    if label == 4:
        return 'ED'
    if label == 5:
        return 'CC'
    if label == 6:
        return 'MT'
    if label == 7:
        return 'PP'
    if label == 8:
        return 'EO'

In [None]:
with open('results.csv', 'a') as f:
  for index, (gold, pred) in enumerate(zip(gold_labels, predictions)):
    f.write(f"{index},{get_short_name(gold)},{get_short_name(pred)}\n")

VISUALISE TOKEN IMPORTANCE

Sadly we have to do it here in this notebook because saving and loading tf models doesn't work as intended.

In [37]:
from numpy import dot
from numpy.linalg import norm

In [38]:
# Cosinus Ähnlichkeit
def get_cosine_sim(A: list[float], B: list[float]) -> float:
    return dot(A, B)/(norm(A)*norm(B))

# Nimmt einen Satz und gibt einen Array mit Sätzen. Jeder von ihnen hat ein [UNK] Token von links nach rechts.
# [MASK] wird nicht zu [UNK] gemapped.
def get_sub_sentences(sentence: str) -> list[str]:
    sub_sentences = list()
    tokens = sentence[:-1].split(' ')

    for i in range(len(tokens)):
        s = ""
        for j in range(len(tokens)):
            if i == j:
                if tokens[j] == '[MASK]':
                    s += '[MASK] '
                else:
                    s += '[UNK] '
            else:
                s += tokens[j] + ' '

        sub_sentences.append(s[:-1] + '.')

    return sub_sentences

In [39]:
# optional: highlighte die Wörter, die mit XML-Tags markiert waren, im Output-Satz
# kann leer gelassen werden!
orig_sentence = "The <e1>bacterial aerosol</e1> was generated from an up-draft <e2>nebulizer</e2>."

xml_indices = []
for index, token in enumerate(orig_sentence.split(' ')):
    if re.search('<e[12]>.*</e[12]>', token):
        xml_indices.append(index)

In [43]:
# Gib hier den Satz an, fuer dessen Klassenvorhersage die Token wichtigkeiten berechnet werden sollen.
sentence = "The bacterial aerosol was generated from an up-draft nebulizer."
# Vorhersage für den originalen Satz
sub_sentences = get_sub_sentences(sentence)

In [46]:
sub_tuples = [{ 'sentence': sentence, 'label': -1 }]
for sub in sub_sentences:
  sub_tuples.append({'sentence': sub, 'label': -1 })

In [51]:
df_sub = pd.DataFrame(sub_tuples)

In [52]:
sub_data = df_to_dataset(df_sub)

In [55]:
sub_preds = model.predict(sub_data)



In [57]:
# prediction for sentence with all tokens
pred_full_sentence = sub_preds[0]

# predictions for sentences with [UNK] tokens
pred_sub_sentences = sub_preds[1:]

In [62]:
cosine_similarities = []
for p in pred_sub_sentences:
    cosine_similarities.append(get_cosine_sim(p, pred_full_sentence))

In [64]:
# Berechne min und max
X_min = min(cosine_similarities)
X_max = max(cosine_similarities)

# Cosinusaehnlichkeiten skalieren
scaled_similarities = [(x - X_min) / (X_max - X_min) for x in cosine_similarities]

In [65]:
# HTML output erstellen und rendern
from IPython.display import HTML

STR = "<div style='display: flex; flex-direction: row; font-size: xx-large; '>"
for index, token in enumerate(sentence[:-1].split(' ')):
    STR += f"<div style='color: rgb(255, {scaled_similarities[index] * 255}, {scaled_similarities[index] * 255})'>{token}&nbsp</div>"
STR = STR[:-11] + "</div>.</div>"

HTML(STR)