In [81]:
import re
from typing import Optional

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

**Read the SemEval training data and create a pandas DataFrame**



In [93]:
# First, we create a function that turns our class labels, which are strings,
# into integers, so we can later use one-hot encoding.
_RELATION_NAMES = (
    'Component-Whole',
    'Instrument-Agency',
    'Member-Collection',
    'Cause-Effect',
    'Entity-Destination',
    'Content-Container',
    'Message-Topic',
    'Product-Producer',
    'Entity-Origin',
)

# Both argument orders of a relation share one id, i.e. direction is discarded.
_RELATION_CLASS_IDS = {
    f'{relation}({arguments})': class_id
    for class_id, relation in enumerate(_RELATION_NAMES)
    for arguments in ('e1,e2', 'e2,e1')
}

# 'Other' gets the id right after the nine named relations.
_OTHER_CLASS_ID = len(_RELATION_NAMES)


def get_int_class(long_class: str, allow_other: bool = False) -> Optional[int]:
    """Map a relation label string to its integer class id.

    Args:
        long_class: Label such as 'Cause-Effect(e1,e2)' or 'Other'. It must
            match exactly; no whitespace normalisation is performed.
        allow_other: If True, 'Other' maps to 9. If False, 'Other' yields None.

    Returns:
        An id in 0..8 for the nine named relations (either argument order),
        9 for 'Other' when ``allow_other`` is True, and None for a disallowed
        'Other' or any unrecognised label.
    """
    if long_class == 'Other':
        return _OTHER_CLASS_ID if allow_other else None
    # Unknown labels fall through to None so callers can filter them out.
    return _RELATION_CLASS_IDS.get(long_class)

# Parse TRAIN_FILE.TXT into one record per example. The document is structured
# in blocks of 4 lines each: line 0 holds the quoted sentence, line 1 the
# relation label, and the record is closed on line 2.
semeval_tuples = list()
temp_tuple = dict()
with open('TRAIN_FILE.TXT', 'r') as file:
    # Iterate the file object directly instead of loading all lines up front.
    for index, line in enumerate(file):
        position = index % 4
        if position == 0:
            # Keep only the text between the outermost double quotes.
            regex_results = re.search(r"\"(.*)\"", line.strip())
            if regex_results:
                temp_tuple['sentence'] = regex_results.group(1)
        elif position == 1:
            # Labels that are not mapped (e.g. 'Other') become None here.
            temp_tuple['label'] = get_int_class(line.strip(), allow_other=False)
        elif position == 2:
            semeval_tuples.append(temp_tuple)
            temp_tuple = dict()

# Explicit columns keep the frame well-formed even if no record was parsed.
# Rows with a missing sentence or label are dropped. Because the label column
# held missing values, pandas stores it as float, so it is cast back to
# integers: tf.one_hot only accepts integer class indices.
df = (
    pd.DataFrame(semeval_tuples, columns=['sentence', 'label'])
    .dropna()
    .astype({'label': 'int64'})
)

In [89]:
# Preview the first five parsed rows.
df.head(n=5)

Unnamed: 0,sentence,label
0,The system as described above has its greatest...,"[1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,The <e1>author</e1> of a keygen uses a <e2>dis...,"[0, 1, 0, 0, 0, 0, 0, 0, 0]"
4,The <e1>student</e1> <e2>association</e2> is t...,"[0, 0, 1, 0, 0, 0, 0, 0, 0]"
6,The current view is that the chronic <e1>infla...,"[0, 0, 0, 1, 0, 0, 0, 0, 0]"
7,<e1>People</e1> have been moving back into <e2...,"[0, 0, 0, 0, 1, 0, 0, 0, 0]"


In [94]:
# Shuffle once with a fixed seed so the split is reproducible, then cut the
# rows 60% / 20% / 20% into train / validation / test.
SPLIT_SEED = 42
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.6 * len(shuffled_df))
val_end = int(0.8 * len(shuffled_df))
# Positional slicing replaces np.split, which is deprecated for DataFrames.
train = shuffled_df.iloc[:train_end]
val = shuffled_df.iloc[train_end:val_end]
test = shuffled_df.iloc[val_end:]

In [98]:
# Function to convert a pandas DataFrame into a TensorFlow dataset.
def df_to_dataset(dataframe, shuffle=True, batch_size=32, num_classes=9):
    """Build a batched ``tf.data.Dataset`` of (sentence, one-hot label) pairs.

    Args:
        dataframe: DataFrame with a 'sentence' column and a 'label' column of
            class ids. Labels must not contain missing values. The frame
            itself is not modified.
        shuffle: If True, shuffle with a buffer as large as the frame.
        batch_size: Number of examples per batch.
        num_classes: Width of the one-hot label vectors (default 9).

    Returns:
        A dataset of (sentences, one_hot_labels) batches with prefetching.
    """
    sentences = dataframe['sentence']
    # tf.one_hot only accepts integer indices, but pandas stores a label
    # column that ever held missing values as float, so cast explicitly.
    class_ids = dataframe['label'].astype('int64').to_numpy()
    labels = tf.one_hot(class_ids, depth=num_classes)
    ds = tf.data.Dataset.from_tensor_slices((sentences, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [106]:
# Only the training set is shuffled (the default). Validation and test keep
# their row order so predictions line up with the rows of `val` and `test`.
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val, shuffle=False)
test_data = df_to_dataset(test, shuffle=False)

#### Model and embeddings

In [100]:
# We use an embedder trained on a 7B-word corpus. The layer takes raw strings
# as input and its weights are fine-tuned during training (trainable=True).
embedding = 'https://tfhub.dev/google/nnlm-en-dim50/2'
hub_layer = hub.KerasLayer(
    handle=embedding,
    dtype=tf.string,
    trainable=True,
)

In [86]:
# Sentence embedding, followed by two 16-unit ReLU layers and a 9-way softmax
# output (one unit per relation class).
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(9, activation='softmax'),
])

In [103]:
# Categorical cross-entropy matches the one-hot labels and softmax output.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
crossentropy_loss = tf.keras.losses.CategoricalCrossentropy()
model.compile(
    optimizer=adam_optimizer,
    loss=crossentropy_loss,
    metrics=['accuracy'],
)

In [107]:
# Evaluate on the validation set; returns [loss, accuracy].
model.evaluate(x=valid_data)



[2.191575765609741, 0.14395688474178314]

In [108]:
# Train for 10 epochs, reporting validation metrics alongside training ones.
history = model.fit(
    x=train_data,
    validation_data=valid_data,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [109]:
# Final check on the held-out test set; returns [loss, accuracy].
model.evaluate(x=test_data)



[1.7258697748184204, 0.5650500655174255]