# Entity-aware Relation Classification

In [None]:
%tensorflow_version 1.x

In [None]:
import os
import sys
import time
import json
from pathlib import Path
import numpy as np
import pandas as pd
import datetime
import nltk
nltk.download('punkt')
import re
import tensorflow as tf
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Choose the project root depending on where the notebook is running.
# NOTE(review): get_ipython is not imported in this file -- presumably it is
# provided by the IPython/Jupyter runtime; confirm before running as a script.
if 'google.colab' in str(get_ipython()):
  print('Running on Google Colab')
  root = '/content/drive/My Drive/Colab Notebooks/'
else:
  print('Running locally')
  # Parent of the current working directory (a pathlib.Path here, a str on Colab).
  root = Path(os.getcwd()).parent

# Extend sys.path with the project's code directory before the
# `model.entity_att_lstm` / `utils` imports that follow.
basepath = os.path.join(root, 'relation-extraction/')
sys.path.append(os.path.join(basepath, 'entity-aware-relation-classification/code'))

from model.entity_att_lstm import EntityAttentionLSTM
import utils

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

Switch for data usage: if `True`, the FewRel data is used; if `False`, the Future Engineering data is used.

In [None]:
# Dataset switch read by the configuration cell: True selects the files under
# 'fewrel-training-data', False the files under 'fe-training-data'.
use_fewrel_data=False

In [None]:
# Fixed document length (in tokens) handed to the VocabularyProcessor instances.
max_sent_length = 90

# Select the dataset files, the run directory name and the mapping from
# relation name to integer class id.
if use_fewrel_data:
    data_dir = os.path.join(root, 'fewrel-training-data')
    run_dir = 'runs_fewrel'

    # NOTE(review): the training path points at the 'dev_...' file and the
    # validation path at the 'train_...' file -- confirm this is intentional.
    train_path_fewrel = os.path.join(data_dir, 'fewrel/dev_7_classes_disjoint.json')
    test_path_fewrel = os.path.join(data_dir, 'fewrel/test_7_classes_disjoint.json')
    val_path_fewrel = os.path.join(data_dir, 'fewrel/train_7_classes_disjoint.json')

    class2label = {'P105':0, 'P135':1, 'P155':2, 'P31':3, 'P800':4, 'P921':5, 'NOTA':6}
else:
    data_dir = os.path.join(root, 'fe-training-data')
    run_dir = 'runs_fe'

    # The '*_fewrel' variable names are kept because later cells read them.
    train_path_fewrel = os.path.join(data_dir, 'train_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json')
    test_path_fewrel = os.path.join(data_dir, 'test_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json')
    val_path_fewrel = os.path.join(data_dir, 'val_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json')

    # The backslashes are part of the key. They are written as '\\' so the
    # literal no longer relies on the invalid escape sequences '\[' and '\]';
    # the resulting string is unchanged.
    # NOTE(review): verify that the labels in the data files really contain
    # the backslashes.
    class2label = {'NOTA':0, 'A manufactures product B':1, 'A operates B':2, 'A operates \\[something\\] in location B':3, 'A orders B':4, 'A uses/employs charging technology B':5, 'A orders something from B':6}


# Base directory for run outputs and pre-trained embedding resources.
root_path = os.path.join(basepath, 'entity-aware-relation-classification')

# TensorFlow session options (passed to tf.ConfigProto in the training cell).
allow_soft_placement = True
log_device_placement = False
gpu_allow_growth = True

# Model hyper-parameters (passed to EntityAttentionLSTM in the training cell).
embedding_size = 300
pos_embedding_size = 50
hidden_size = 300
num_heads = 4
attention_size = 50
# Pre-trained embedding choice; the training cell handles "word2vec",
# "glove100" and "glove300", and enables ELMo when set to "elmo".
embeddings = "glove300"
l2_reg_lambda = 1e-5

# Optimisation settings (Adadelta optimizer, checkpointing, batching).
learning_rate = 1.0
decay_rate = 0.9
num_checkpoints = 1
batch_size = 20
num_epochs = 100

# Dropout keep probabilities fed during training (evaluation feeds 1.0).
emb_dropout_keep_prob = 0.7
rnn_dropout_keep_prob = 0.7
dropout_keep_prob = 0.5

# Log training metrics / run an evaluation every this many global steps.
display_every = 10
evaluate_every = 100

In [None]:
def logging_train(step, loss, accuracy):
    """Print one timestamped progress line for a training step.

    Args:
        step: Global training step shown in the message.
        loss: Loss value, formatted with the general ``g`` format.
        accuracy: Accuracy value, formatted with the general ``g`` format.
    """
    stamp = datetime.datetime.now().isoformat()
    message = "{}: step {}, loss {:g}, acc {:g}".format(stamp, step, loss, accuracy)
    print(message)

def logging_eval(step, loss, accuracy, predictions, labels):
    """Print evaluation metrics and return them.

    Two lines are printed: the loss/accuracy values supplied by the caller,
    then accuracy and macro-averaged precision, recall and F1 recomputed with
    scikit-learn from ``predictions`` and ``labels``.

    Args:
        step: Global step shown in the first message.
        loss: Loss value to report.
        accuracy: Accuracy value to report in the first message.
        predictions: Predicted class ids, one per example.
        labels: True class ids, one per example.

    Returns:
        Tuple ``(f1, accuracy, precision, recall)`` of the recomputed metrics.
    """
    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

    # Restrict the macro average to classes that occur in the true labels so
    # that absent classes do not contribute zero scores to the average.
    present_labels = np.unique(labels)

    # scikit-learn metrics take (y_true, y_pred); keep the order uniform.
    overall_accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='macro', labels=present_labels)
    recall = recall_score(labels, predictions, average='macro', labels=present_labels)
    f1 = f1_score(labels, predictions, average='macro', labels=present_labels)

    print("{}: accuracy {:g}, precision {:g}, recall {:g}, f1 {:g}\n".format(
        time_str, overall_accuracy, precision, recall, f1))

    return f1, overall_accuracy, precision, recall

## Data Helpers

In [None]:
def clean_str(text):
    """Lower-case and normalise a sentence with a fixed sequence of regex rules.

    The rules are applied strictly in the order listed below; later rules see
    the output of earlier ones. Finally runs of whitespace are collapsed to a
    single space and the result is stripped.

    Args:
        text: Raw sentence.

    Returns:
        The normalised sentence.
    """
    # (pattern, replacement) pairs; the order is significant.
    rules = (
        # Replace every character outside the allowed set with a space.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        # Expand common contractions.
        (r"what's", "what is "),
        (r"that's", "that is "),
        (r"there's", "there is "),
        (r"it's", "it is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Drop or space out punctuation and operators.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "<digits>k" becomes "<digits>000".
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        # Re-join a few abbreviations split by the rules above.
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Collapse repeated whitespace.
        (r"\s{2,}", " "),
    )

    cleaned = text.lower()
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
def load_data_and_labels_fewrel(path):
    """Load a relation-classification JSON file and prepare model inputs.

    The file must contain a JSON list of samples, each with the keys
    ``'text'`` (sentence), ``'label'`` (a key of the global ``class2label``)
    and ``'ents'`` (two entries whose elements 1 and 2 are used as the start
    and end character offsets of an entity in ``'text'``). ``ents[0]`` is
    marked as entity 1 and ``ents[1]`` as entity 2.

    Each sentence gets entity markers inserted, is normalised with
    ``clean_str`` and tokenised with NLTK.

    Args:
        path: Path of the JSON file.

    Returns:
        Tuple ``(x_text, labels, e1, e2, pos1, pos2)``:
        ``x_text`` -- list of tokenised sentences joined by spaces (markers
        appear as the tokens ``e11``/``e12``/``e21``/``e22``);
        ``labels`` -- ``uint8`` one-hot array of shape
        ``(num_samples, len(class2label))``;
        ``e1``/``e2`` -- token index directly before the ``e12``/``e22``
        marker of each sentence;
        ``pos1``/``pos2`` -- relative-position strings produced by
        ``get_relative_position``.

    Raises:
        KeyError: If a sample's label is not a key of ``class2label``.
        ValueError: If an entity marker is missing after tokenisation.
    """
    with open(path, "r", encoding='utf-8') as f:
        lines = json.loads(f.read())

    data = []
    max_sentence_length = 0

    for sample_id, line in enumerate(lines):
        ents = line['ents']
        # Behaviour kept from the original code: a start offset of 1 is
        # snapped to 0. NOTE(review): the reason is not visible here.
        for ent in ents:
            if ent[1] == 1:
                ent[1] = 0
        sentence = line['text']
        relation = line['label']

        h = ents[0]
        t = ents[1]
        h_name = sentence[h[1]:h[2]]
        t_name = sentence[t[1]:t[2]]
        # Insert the markers around both entities in the order in which the
        # entities occur in the sentence.
        if h[1] < t[1]:
            sentence = (sentence[:h[1]] + " _e11_ " + h_name + " _e12_ "
                        + sentence[h[2]:t[1]] + " _e21_ " + t_name + " _e22_ "
                        + sentence[t[2]:])
        else:
            sentence = (sentence[:t[1]] + " _e21_ " + t_name + " _e22_ "
                        + sentence[t[2]:h[1]] + " _e11_ " + h_name + " _e12_ "
                        + sentence[h[2]:])

        # clean_str replaces the underscores, so the markers become the plain
        # tokens e11/e12/e21/e22.
        sentence = clean_str(sentence)
        tokens = nltk.word_tokenize(sentence)
        max_sentence_length = max(max_sentence_length, len(tokens))
        e1 = tokens.index("e12") - 1
        e2 = tokens.index("e22") - 1
        sentence = " ".join(tokens)

        data.append([sample_id, sentence, e1, e2, relation])

    print(path)
    print("max sentence length = {}\n".format(max_sentence_length))

    df = pd.DataFrame(data=data, columns=["id", "sentence", "e1", "e2", "relation"])

    # Use the global max_sent_length as position offset rather than this
    # file's own maximum: with a per-file offset the same relative distance
    # becomes a different position token in the train, test and val splits.
    pos1, pos2 = get_relative_position(df, max_sent_length)

    df['label'] = [class2label[r] for r in df['relation']]

    # Text Data
    x_text = df['sentence'].tolist()
    e1 = df['e1'].tolist()
    e2 = df['e2'].tolist()

    # Label Data
    labels_flat = df['label'].values.ravel().astype(np.int64)
    # The one-hot width is the total number of classes, not the number of
    # classes that happen to occur in this file; otherwise a split missing a
    # class gets wrongly shaped and wrongly placed labels.
    labels_count = len(class2label)

    # convert class labels from scalars to one-hot vectors
    # 0  => [1 0 0 ... 0]
    # 1  => [0 1 0 ... 0]
    # ...
    def dense_to_one_hot(labels_dense, num_classes):
        num_labels = labels_dense.shape[0]
        index_offset = np.arange(num_labels) * num_classes
        labels_one_hot = np.zeros((num_labels, num_classes))
        labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
        return labels_one_hot

    labels = dense_to_one_hot(labels_flat, labels_count)
    labels = labels.astype(np.uint8)

    return x_text, labels, e1, e2, pos1, pos2

In [None]:
def get_relative_position(df, max_sentence_length):
    """Build relative-position strings for both entities of every sentence.

    For each token index ``i`` of a sentence the value
    ``(max_sentence_length - 1) + i - e`` is emitted, where ``e`` is the
    sentence's ``e1`` (first result) or ``e2`` (second result) entry. The
    values of one sentence are concatenated, each followed by a space.

    Args:
        df: DataFrame with the columns ``'sentence'``, ``'e1'`` and ``'e2'``.
        max_sentence_length: Offset base added to every relative distance.

    Returns:
        Tuple ``(pos1, pos2)`` of lists holding one string per DataFrame row.
    """
    base = max_sentence_length - 1
    pos1, pos2 = [], []

    rows = zip(df['sentence'].tolist(), df['e1'].tolist(), df['e2'].tolist())
    for text, first_entity, second_entity in rows:
        token_count = len(nltk.word_tokenize(text))
        # Every number is followed by a space, including the last one.
        pos1.append("".join(str(base + idx - first_entity) + " "
                            for idx in range(token_count)))
        pos2.append("".join(str(base + idx - second_entity) + " "
                            for idx in range(token_count)))

    return pos1, pos2

In [None]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield consecutive mini-batches of ``data`` for a number of epochs.

    Args:
        data: Sequence of samples; converted to a NumPy array.
        batch_size: Maximum number of samples per batch (the last batch of an
            epoch may be smaller).
        num_epochs: Number of passes over the data.
        shuffle: When true, the samples are re-permuted with ``np.random``
            at the start of every epoch.

    Yields:
        NumPy array slices holding up to ``batch_size`` samples.
    """
    samples = np.array(data)
    total = len(samples)
    batches_per_epoch = int((total - 1) / batch_size) + 1

    for _ in range(num_epochs):
        # Draw a fresh ordering per epoch when shuffling is requested.
        if shuffle:
            order = np.random.permutation(np.arange(total))
            epoch_samples = samples[order]
        else:
            epoch_samples = samples

        for batch_no in range(batches_per_epoch):
            lo = batch_no * batch_size
            hi = min(lo + batch_size, total)
            yield epoch_samples[lo:hi]

## Training

Loading the training, test, and validation datasets

In [None]:
# Load and preprocess the three dataset splits (train, test, validation) in
# that order, all inside a single CPU device scope.
with tf.device('/cpu:0'):
    (train_text, train_y, train_e1,
     train_e2, train_pos1, train_pos2) = load_data_and_labels_fewrel(train_path_fewrel)
    (test_text, test_y, test_e1,
     test_e2, test_pos1, test_pos2) = load_data_and_labels_fewrel(test_path_fewrel)
    (val_text, val_y, val_e1,
     val_e2, val_pos1, val_pos2) = load_data_and_labels_fewrel(val_path_fewrel)

Building vocabulary

In [None]:
# Text vocabulary: every sentence is mapped to a vector of word ids with
# max_sent_length entries. The vocabulary is fitted on all three splits.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sent_length)
vocab_processor.fit(train_text + test_text + val_text)
train_x = np.array(list(vocab_processor.transform(train_text)))
test_x = np.array(list(vocab_processor.transform(test_text)))
val_x = np.array(list(vocab_processor.transform(val_text)))
train_text = np.array(train_text)
test_text = np.array(test_text)
val_text = np.array(val_text)

print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("train_x = {0}".format(train_x.shape))
print("train_y = {0}".format(train_y.shape))
print("test_x = {0}".format(test_x.shape))
print("test_y = {0}".format(test_y.shape))
print("val_x = {0}".format(val_x.shape))
print("val_y = {0}".format(val_y.shape))

# Position vocabulary: the relative-position strings (space-separated
# numbers) are mapped to ids in the same way as the sentences.
# The validation positions are part of the fit as well, consistent with the
# text vocabulary above; otherwise position tokens that occur only in the
# validation split would not get an id of their own.
pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sent_length)
pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2 + val_pos1 + val_pos2)
train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
val_p1 = np.array(list(pos_vocab_processor.transform(val_pos1)))
val_p2 = np.array(list(pos_vocab_processor.transform(val_pos2)))

print("\nPosition Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_)))
print("train_p1 = {0}".format(train_p1.shape))
print("test_p1 = {0}".format(test_p1.shape))
print("val_p1 = {0}".format(val_p1.shape))
print("")

Training process with TensorFlow

In [None]:
# Build the model graph, train it and periodically evaluate and checkpoint it.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=allow_soft_placement,
        log_device_placement=log_device_placement)
    session_conf.gpu_options.allow_growth = gpu_allow_growth
    # NOTE(review): neither the session nor the summary writer is closed in
    # this cell -- confirm whether later cells still rely on them.
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        model = EntityAttentionLSTM(
            sequence_length=train_x.shape[1],
            num_classes=train_y.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=embedding_size,
            pos_vocab_size=len(pos_vocab_processor.vocabulary_),
            pos_embedding_size=pos_embedding_size,
            hidden_size=hidden_size,
            num_heads=num_heads,
            attention_size=attention_size,
            use_elmo=(embeddings == 'elmo'),
            l2_reg_lambda=l2_reg_lambda)

        # Not read anywhere in this cell; kept in case other cells use it.
        cur_best_f1 = 0.0

        # Define Training procedure: Adadelta with every gradient clipped
        # element-wise to [-1, 1].
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdadeltaOptimizer(learning_rate, decay_rate, 1e-6)
        gvs = optimizer.compute_gradients(model.loss)
        capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
        train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

        # Output directory for models and summaries (one per run, named by
        # the start time in seconds)
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(root_path, run_dir, timestamp))
        print("\nWriting to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", model.loss)
        acc_summary = tf.summary.scalar("accuracy", model.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))
        pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # Overwrite the text embedding matrix with pre-trained vectors when a
        # supported embedding type is configured.
        if embeddings == "word2vec":
            pretrain_W = utils.load_word2vec(os.path.join(root_path, 'resource/GoogleNews-vectors-negative300.bin'), embedding_size, vocab_processor)
            sess.run(model.W_text.assign(pretrain_W))
            print("Success to load pre-trained word2vec model!\n")
        elif embeddings == "glove100":
            pretrain_W = utils.load_glove(os.path.join(root_path, 'resource/glove.6B.100d.txt'), embedding_size, vocab_processor)
            sess.run(model.W_text.assign(pretrain_W))
            print("Success to load pre-trained glove100 model!\n")
        elif embeddings == "glove300":
            pretrain_W = utils.load_glove(os.path.join(root_path, 'resource/glove.840B.300d.txt'), embedding_size, vocab_processor)
            sess.run(model.W_text.assign(pretrain_W))
            print("Success to load pre-trained glove300 model!\n")

        # Generate batches
        train_batches = batch_iter(
            list(zip(train_x, train_y, train_text,
                     train_e1, train_e2, train_p1, train_p2)),
            batch_size, num_epochs)
        # Training loop. For each batch...
        best_f1 = 0.0  # For save checkpoint(model)
        for train_batch in train_batches:
            train_bx, train_by, train_btxt, train_be1, train_be2, train_bp1, train_bp2 = zip(*train_batch)
            feed_dict = {
                model.input_x: train_bx,
                model.input_y: train_by,
                model.input_text: train_btxt,
                model.input_e1: train_be1,
                model.input_e2: train_be2,
                model.input_p1: train_bp1,
                model.input_p2: train_bp2,
                model.emb_dropout_keep_prob: emb_dropout_keep_prob,
                model.rnn_dropout_keep_prob: rnn_dropout_keep_prob,
                model.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, model.loss, model.accuracy], feed_dict)
            train_summary_writer.add_summary(summaries, step)

            # Training log display
            if step % display_every == 0:
                logging_train(step, loss, accuracy)

            # Evaluation
            # NOTE(review): evaluation and checkpoint selection use the test
            # split; the validation split is not used in this cell.
            if step % evaluate_every == 0:
                print("\nEvaluation:")
                # Generate batches
                test_batches = batch_iter(
                    list(zip(test_x, test_y, test_text,
                             test_e1, test_e2, test_p1, test_p2)),
                    batch_size, 1, shuffle=False)
                # Evaluation loop. For each batch...
                losses = 0.0
                accuracy = 0.0
                predictions = []
                labels = []
                iter_cnt = 0
                for test_batch in test_batches:
                    test_bx, test_by, test_btxt, test_be1, test_be2, test_bp1, test_bp2 = zip(*test_batch)

                    # One-hot rows back to class ids.
                    for elem in test_by:
                        labels.append(np.argmax(elem))

                    # Dropout is disabled (keep probability 1.0) for evaluation.
                    feed_dict = {
                        model.input_x: test_bx,
                        model.input_y: test_by,
                        model.input_text: test_btxt,
                        model.input_e1: test_be1,
                        model.input_e2: test_be2,
                        model.input_p1: test_bp1,
                        model.input_p2: test_bp2,
                        model.emb_dropout_keep_prob: 1.0,
                        model.rnn_dropout_keep_prob: 1.0,
                        model.dropout_keep_prob: 1.0
                    }
                    loss, acc, pred = sess.run(
                        [model.loss, model.accuracy, model.predictions], feed_dict)
                    losses += loss
                    accuracy += acc
                    predictions += pred.tolist()
                    iter_cnt += 1
                # Unweighted mean over the evaluation batches.
                losses /= iter_cnt
                accuracy /= iter_cnt
                predictions = np.array(predictions, dtype='int')

                # Report the averaged loss, not the loss of the last batch.
                cur_f1, cur_accuracy, cur_precision, cur_recall = logging_eval(step, losses, accuracy, predictions, labels)

                # Model checkpoint: save only when the macro F1 improves.
                if best_f1 < cur_f1:
                    best_f1 = cur_f1
                    path = saver.save(sess, checkpoint_prefix+"-{:.3g}".format(best_f1), global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))