In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import re

**Read the SemEval training data and create a pandas DataFrame**



In [None]:
# Convert the textual class labels into integer class ids.
# The position of a relation name in this tuple is its class id.
_RELATION_NAMES = (
    'Component-Whole',
    'Instrument-Agency',
    'Member-Collection',
    'Cause-Effect',
    'Entity-Destination',
    'Content-Container',
    'Message-Topic',
    'Product-Producer',
    'Entity-Origin',
)

# Both argument orders of a relation map to the same class id. Generating the
# keys avoids hand-typed variants drifting apart (a stray space in one literal
# previously made 'Instrument-Agency(e1,e2)' unmatchable).
_RELATION_IDS = {
    f'{name}({args})': class_id
    for class_id, name in enumerate(_RELATION_NAMES)
    for args in ('e1,e2', 'e2,e1')
}


def get_int_class(long_class: str, allow_other: bool = False) -> "int | None":
    """Map a relation label string to its integer class id.

    Args:
        long_class: Label such as ``'Cause-Effect(e1,e2)'`` or ``'Other'``.
            The match is exact, so callers should strip whitespace first.
        allow_other: When True, ``'Other'`` maps to class 9; otherwise it is
            treated like an unknown label.

    Returns:
        An id in 0..8 for the nine directed relations (direction is ignored),
        9 for ``'Other'`` if ``allow_other`` is set, and ``None`` for anything
        else.
    """
    if not isinstance(long_class, str):
        return None
    if long_class == 'Other':
        return 9 if allow_other else None
    return _RELATION_IDS.get(long_class)

def purify_sentence(sentence: str) -> str:
    """Return ``sentence`` with the ``<e1>``/``<e2>`` opening and closing tags removed."""
    entity_tag = re.compile(r"</?e[12]>")
    return entity_tag.sub("", sentence)

# Parse the training file into one {'sentence', 'label'} record per example.
semeval_tuples = list()
temp_tuple = dict()
with open('TRAIN_FILE.TXT', 'r') as file:
    # Iterate the file object directly; no need to load every line up front.
    for index, line in enumerate(file):
        # The file is laid out in groups of four lines per example.
        position = index % 4
        if position == 0:
            # First line of a group: the sentence is the text between the quotes.
            regex_results = re.search(r"\"(.*)\"", line.strip())
            if regex_results:
                sentence = regex_results.group(1)
                temp_tuple['sentence'] = purify_sentence(sentence)
        elif position == 1:
            # Second line of a group: the relation label ('Other' maps to None here).
            temp_tuple['label'] = get_int_class(line.strip(), allow_other=False)
        elif position == 2:
            # The record is complete once the label line has been consumed.
            semeval_tuples.append(temp_tuple)
            temp_tuple = dict()

# Fixing the columns keeps the frame well-formed even if nothing was parsed.
# dropna() removes records without a sentence or with an unmapped label; the
# None labels force the column to a NaN-capable dtype, so cast it back to
# integers afterwards (tf.one_hot expects integer indices).
df = (
    pd.DataFrame(semeval_tuples, columns=['sentence', 'label'])
    .dropna()
    .astype({'label': 'int64'})
)

In [None]:
# Preview the first rows of the parsed frame.
df.head()

Unnamed: 0,sentence,label
0,The system as described above has its greatest...,0.0
2,The author of a keygen uses a disassembler to ...,1.0
4,The student association is the voice of the un...,2.0
6,The current view is that the chronic inflammat...,3.0
7,People have been moving back into downtown.,4.0


In [None]:
# 60 / 20 / 20 split into training, validation and test data.
# A fixed seed makes the shuffle (and therefore every reported metric's data
# partition) reproducible after a kernel restart.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.6 * len(shuffled))
val_end = int(0.8 * len(shuffled))
# Positional slices give the same three pieces np.split produced, without
# passing a DataFrame through NumPy (which triggers a deprecation warning on
# recent pandas versions).
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [None]:
def df_to_dataset(dataframe, shuffle=False, batch_size=32, num_classes=9):
    """Convert a pandas DataFrame into a batched ``tf.data.Dataset``.

    Args:
        dataframe: Frame with a 'sentence' column (model input) and a 'label'
            column holding class ids. It is not modified.
        shuffle: If True, shuffle with a buffer covering the whole frame.
        batch_size: Number of examples per batch.
        num_classes: Depth of the one-hot label encoding.

    Returns:
        A prefetching dataset of ``(sentence_batch, one_hot_label_batch)`` pairs.
    """
    # Work on a copy so popping the label column does not touch the caller's frame.
    frame = dataframe.copy()
    # tf.one_hot needs integer indices; the column can arrive as float when it
    # has been through NaN handling, so cast explicitly.
    label_ids = frame.pop('label').astype('int64').to_numpy()
    # Convert the labels to a one-hot encoding.
    labels = tf.one_hot(label_ids, depth=num_classes)
    sentences = frame['sentence']
    ds = tf.data.Dataset.from_tensor_slices((sentences, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
# Build one batched dataset per partition, using df_to_dataset's defaults.
train_data, valid_data, test_data = (
    df_to_dataset(partition) for partition in (train, val, test)
)

#### Model and embeddings

In [None]:
# We use an embedder that was trained on a 7B-word corpus.
embedding = 'https://tfhub.dev/google/nnlm-en-dim50/2'
hub_layer = hub.KerasLayer(
    embedding,
    dtype=tf.string,
    trainable=True,
)

In [None]:
# Embedding layer, two small ReLU layers, then a 9-way softmax output.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(9, activation='softmax'),
])

In [None]:
# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Evaluate on the validation set before any fit() call — presumably as an
# untrained baseline to compare the trained model against.
model.evaluate(valid_data)

In [None]:
# Train for 4 epochs, reporting validation metrics after each epoch.
history = model.fit(train_data, epochs=4, validation_data=valid_data)

Model performance:

In [None]:
# Evaluate the model on the test dataset.
model.evaluate(test_data)

In [None]:
# Model outputs for every test example (one row of softmax scores per example).
preds = model.predict(test_data)

Resultate als CSV Datei für die Veranschaulichung

In [None]:
# Join the label batches of the test dataset into a single array.
y = np.concatenate(
    [label_batch for _, label_batch in test_data],
    axis=0,
)

In [None]:
# Recover class ids from the one-hot gold labels, batch by batch.
# Distinct loop names avoid rebinding the module-level `y` array, and argmax
# along axis 1 handles a whole batch at once.
gold_labels = []
for _, label_batch in test_data:
    gold_labels.extend(np.argmax(label_batch, axis=1).tolist())

# The predicted class is the index of the highest score in each output row.
predictions = np.argmax(preds, axis=1).tolist()

In [None]:
def get_short_name(label: int) -> str:
    """Return the two-letter abbreviation for a class id.

    Ids 0..8 map to 'CW', 'IA', 'MC', 'CE', 'ED', 'CC', 'MT', 'PP', 'EO' in
    that order; any other value falls through and yields ``None``.
    """
    short_names = ('CW', 'IA', 'MC', 'CE', 'ED', 'CC', 'MT', 'PP', 'EO')
    # Compare with == (not indexing) so numpy integers and equal-valued
    # floats are matched the same way as plain ints.
    for class_id, short_name in enumerate(short_names):
        if label == class_id:
            return short_name
    return None

In [None]:
# Write one "index,gold,predicted" row per test example.
# Mode 'w' replaces the file on every run; appending would stack rows from
# earlier runs under repeating indices whenever the notebook is re-executed.
with open('results.csv', 'w') as f:
    for index, (gold, pred) in enumerate(zip(gold_labels, predictions)):
        f.write(f"{index},{get_short_name(gold)},{get_short_name(pred)}\n")