In [1]:
import tensorflow as tf
import keras
import numpy as np
from sklearn.utils import class_weight
from keras import regularizers
from keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM, TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
import nltk
import re
import os
import copy
from data_preprocessing import open_data, tokenize, tag_document, data_to_seq, glove_emb_matrix, tags_to_2D, clean_data
from validation import precision, recall, f1, retrive_phrase_IO

Using TensorFlow backend.


# DATA PREPROCESSING

In [2]:
# Fetch the 'punkt' tokenizer data (presumably what nltk.word_tokenize relies on
# in the later cells — confirm); the download is skipped when already present.
nltk.download('punkt')

# Not referenced by the cells visible in this notebook; kept in case other code reads them.
documents = {}
labels = {}

# Directories
# Other team members' local copies of the training split.
dir_Tu = "/Users/kmirai/Downloads/NLPProject-master/Hulth2003/Training"

# Root folder of the Hulth2003 corpus (contains Training/, Validation/ and Test/).
# Set the HULTH2003_DIR environment variable to run the notebook on another
# machine; the fallback is the original author's checkout.
hulth_root = os.environ.get(
    "HULTH2003_DIR",
    "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003",
)
dir_valeria_train = os.path.join(hulth_root, "Training")
dir_valeria_val = os.path.join(hulth_root, "Validation")
dir_valeria_test = os.path.join(hulth_root, "Test")

dir_anna = "/Users/annasotnikova/Downloads/Hulth2003/Training"

[nltk_data] Downloading package punkt to /home/valeriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Open the training, validation, and test data, then tokenize, tag, and clean each split.

In [3]:
def _load_split(directory):
    """Run the preprocessing pipeline on one corpus split.

    The steps are applied in this order: ``open_data`` -> ``tokenize`` ->
    ``tag_document`` -> ``clean_data`` (which removes documents without
    keyphrases).

    Args:
        directory: Folder of the split, passed to ``open_data``.

    Returns:
        Tuple ``(documents, labels, tokenized_documents, tags, tokenized_labels)``.
        The first two items are the raw ``open_data`` outputs, the remaining
        three are the outputs of ``clean_data``.
    """
    # open data
    documents, labels = open_data(directory)
    # tokenize data
    tokenized_documents, tokenized_labels = tokenize(documents, labels)
    # create sequence of labels (tags) for the documents
    tags, tokenized_labels = tag_document(tokenized_documents, tokenized_labels)
    # remove documents without keyphrases
    tokenized_documents, tags, tokenized_labels = clean_data(
        tokenized_documents, tags, tokenized_labels)
    return documents, labels, tokenized_documents, tags, tokenized_labels


# Each split is processed independently with the same pipeline.
# NOTE(review): this assumes the preprocessing helpers keep no state between
# calls, so handling one split completely before the next gives the same result.
(documents_train, labels_train,
 tokenized_documents_train, tags_train, tokenized_labels_train) = _load_split(dir_valeria_train)
(documents_val, labels_val,
 tokenized_documents_val, tags_val, tokenized_labels_val) = _load_split(dir_valeria_val)
(documents_test, labels_test,
 tokenized_documents_test, tags_test, tokenized_labels_test) = _load_split(dir_valeria_test)

If you want to predict keyphrases for a specific example, now is a good time to define it.

In [4]:
# Free-text abstract used as the custom prediction example; it is lower-cased
# before tokenization.
abstr = 'In standard reinforcement learning (RL), a learning agent seeks to optimize the overall reward. However, many key aspects of a desired behavior are more naturally expressed as constraints. For instance, the designer may want to limit the use of unsafe actions, increase the diversity of trajectories to enable exploration, or approximate expert trajectories when rewards are sparse. In this paper, we propose an algorithmic scheme that can handle a wide class of constraints in RL tasks, specifically, any constraints that require expected values of some vector measurements (such as the use of an action) to lie in a convex set. This captures previously studied constraints (such as safety and proximity to an expert), but also enables new classes of constraints (such as diversity). Our approach comes with rigorous theoretical guarantees and only relies on the ability to approximately solve standard RL tasks. As a result, it can be easily adapted to work with any model-free or model-based RL algorithm. In our experiments, we show that it matches previous algorithms that enforce safety via constraints, but can also enforce new properties that these algorithms cannot incorporate, such as diversity.'.lower()
# Token list of the example; later cells add it to the vocabulary and run the
# model on it.
tok_abstr = nltk.word_tokenize(abstr)

converting BIO tags to IO tags

In [5]:
def BIO_to_IO(labels):
    """Rewrite every tag equal to 2 as 1 in each tag sequence of ``labels``.

    The sequences are modified in place and the same mapping object is
    returned, so the result can be re-assigned to the original name.
    """
    for doc_tags in labels.values():
        for position, tag in enumerate(doc_tags):
            if tag == 2:
                doc_tags[position] = 1
    return labels
            
# Apply the tag rewrite to all three splits (BIO_to_IO edits each mapping in
# place and returns it).
tags_train, tags_val, tags_test = (
    BIO_to_IO(split_tags) for split_tags in (tags_train, tags_val, tags_test)
)

# GLOVE embeddings

Specify the desired embedding size (100 by default).

In [6]:
# Dimensionality of the pre-trained GloVe vectors; it also selects which file is read.
glove_size = 100
glove_path = 'glove.6B/glove.6B.{}d.txt'.format(glove_size)

# Maps each word to its float32 embedding vector.
glove = dict()
# The context manager closes the file even if parsing a line fails. The encoding
# is given explicitly because the platform default codec is not always UTF-8.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line holds the word followed by its vector components.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove[word] = coefs

In [7]:
# Create vocabulary from all data
# Token lists per split, taken from deep copies so the tokenized_documents_*
# dicts are never shared with the lists built here.
X_train_eng = list(copy.deepcopy(tokenized_documents_train).values())
X_val_eng = list(copy.deepcopy(tokenized_documents_val).values())
X_test_eng = list(copy.deepcopy(tokenized_documents_test).values())
X_full = X_train_eng + X_val_eng + X_test_eng + [tok_abstr] # add words for specific examples

# Our vocab: all the words in all abstracts.
# Sorted so every run assigns the same index to the same word; iterating a set
# of strings follows hash order, which differs between interpreter sessions and
# would make saved weights or indices unusable in a new session.
target_vocab = sorted({token for doc in X_full for token in doc})
# Dictionary with all words and their indices
# NOTE(review): index 0 is given to a real word, while 0 is also the padding
# value used by pad_sequences in the data preparation cell — confirm whether
# glove_emb_matrix reserves a row for padding.
vocab_ind_dict = {word: index for index, word in enumerate(target_vocab)}
# Embedding matrix
embed_matrix = glove_emb_matrix(vocab_ind_dict, glove, glove_size)

In [8]:
# Prepare data for network
# Length every sequence is padded/truncated to; same value as the
# MAX_DOCUMENT_LENGTH constant set in the network cell.
MAX_DOCUMENT_LENGTH = 550


def _as_doc_list(per_document):
    """Return the per-document values as a list.

    A list is returned unchanged, which makes this cell safe to re-run after
    ``tags_*`` have already been converted. Any other input is deep-copied and
    reduced to ``list(copy.values())``.
    """
    if isinstance(per_document, list):
        return per_document
    return list(copy.deepcopy(per_document).values())


def _pad(sequences):
    """Pad or truncate ``sequences`` at the end to MAX_DOCUMENT_LENGTH, filling with 0."""
    return pad_sequences(sequences, maxlen=MAX_DOCUMENT_LENGTH, dtype='int32',
                         padding='post', truncating='post', value=0.0)


# Token sequences -> vocabulary index sequences
X_train = data_to_seq(X_train_eng, vocab_ind_dict)
X_val = data_to_seq(X_val_eng, vocab_ind_dict)
X_test = data_to_seq(X_test_eng, vocab_ind_dict)

# Reference keyphrases and tag sequences as plain lists, one entry per document
kp_train = _as_doc_list(tokenized_labels_train)
tags_train = _as_doc_list(tags_train)
kp_val = _as_doc_list(tokenized_labels_val)
tags_val = _as_doc_list(tags_val)
kp_test = _as_doc_list(tokenized_labels_test)
tags_test = _as_doc_list(tags_test)

# Padding
X_train_padded = _pad(X_train)
X_val_padded = _pad(X_val)
X_test_padded = _pad(X_test)

tags_train_padded = _pad(tags_train)
tags_val_padded = _pad(tags_val)
tags_test_padded = _pad(tags_test)

# Convert the padded tags to the per-token class layout used as the training
# target (presumably one-hot with shape (n_docs, MAX_DOCUMENT_LENGTH, 2) —
# TODO confirm against tags_to_2D).
tags_train_2d = tags_to_2D(tags_train_padded)
tags_val_2d = tags_to_2D(tags_val_padded)
tags_test_2d = tags_to_2D(tags_test_padded)

# NETWORK

In [9]:
# Per-token sample weights: 'balanced' weights are computed over the flattened
# padded training tags and then given back the shape of tags_train_padded.
# NOTE(review): padding positions hold tag 0, so they are counted as that class
# when the balance is computed — confirm this is intended.
flat_train_tags = tags_train_padded.flatten()
balanced_weights = class_weight.compute_sample_weight('balanced', flat_train_tags)
weights = np.reshape(balanced_weights, np.shape(tags_train_padded))

In [10]:
# Hyper-parameters
# Vocabulary size and embedding width are read from the matrix itself, so the
# embedding layer always matches the weights it is initialised with.
VOCAB_SIZE, EMBEDDINGS_SIZE = np.shape(embed_matrix)
MAX_DOCUMENT_LENGTH = 550
BATCH_SIZE = 4
# NOTE(review): with EPOCHS = 0 the fit call below runs no training epoch, so
# the layers after the frozen embedding keep their initial weights and the
# metrics computed afterwards describe an untrained network. Raise this value
# to actually train.
EPOCHS = 0


model = Sequential()

# Frozen embedding layer initialised with the pre-computed embedding matrix.
embedding_layer = Embedding(VOCAB_SIZE,
                            EMBEDDINGS_SIZE,
                            weights=[embed_matrix],
                            input_length=MAX_DOCUMENT_LENGTH,
                            trainable=False)

model.add(embedding_layer)
model.add(Bidirectional(LSTM(300, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True)))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.01))))
model.add(Dropout(0.25))
# Two-way softmax for every token position.
model.add(TimeDistributed(Dense(2, activation='softmax')))

# sample_weight_mode="temporal" lets `weights` carry one weight per token.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],
              sample_weight_mode="temporal")
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


history = model.fit(X_train_padded, tags_train_2d,
                    validation_data=(X_val_padded, tags_val_2d),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    sample_weight=weights)




Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 550, 100)          1821500   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 550, 600)          962400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 550, 600)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 550, 150)          90150     
_________________________________________________________________
dropout_2 (Dropout)          (None, 550, 150)          0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 550, 2)            302       
Total params: 2,874,352
Trainable params: 1,052,852
Non-trainable params: 1,821,500
____________________________________

In [11]:
def validate(documents_eng, kp_eng, documents_seq, tags, model):
    """Average keyphrase-level and tag-level scores of ``model`` over a document set.

    Args:
        documents_eng: Tokenized documents, one sequence of tokens per document.
        kp_eng: Reference keyphrases, aligned with ``documents_eng``.
        documents_seq: Padded index sequences passed to ``model.predict_classes``.
        tags: Reference tag sequences, aligned with ``documents_eng``.
        model: Object exposing ``predict_classes``.

    Returns:
        Tuple ``(precision, recall, f_score, accuracy)``; each entry is the mean
        of the per-document value. All four are ``0.0`` when ``documents_eng``
        is empty.
    """
    n_docs = len(documents_eng)
    if n_docs == 0:
        # Nothing to average: skip prediction and avoid dividing by zero.
        return 0.0, 0.0, 0.0, 0.0

    prec = 0
    rec = 0
    f_score = 0
    acc = 0
    predictions = model.predict_classes(documents_seq)
    for idx, document_eng in enumerate(documents_eng):
        # Drop the predictions made on padding positions.
        tags_predicted = predictions[idx][:len(document_eng)]
        # predicted kp
        kp_predicted = retrive_phrase_IO(tags_predicted, document_eng)
        kp_true = kp_eng[idx]
        # compute precision, recall, f_score
        prec += precision(kp_true, kp_predicted)
        rec += recall(kp_true, kp_predicted)
        f_score += f1(kp_true, kp_predicted)
        # Tag accuracy. A document longer than the padded length has fewer
        # predictions than reference tags, so only the overlapping prefix is
        # compared instead of comparing sequences of different lengths.
        n_compared = min(len(tags[idx]), len(tags_predicted))
        if n_compared:
            tags_true = tags[idx][:n_compared]
            acc += np.sum(np.equal(tags_true, tags_predicted[:n_compared])) / n_compared
    return prec / n_docs, rec / n_docs, f_score / n_docs, acc / n_docs

Ideally, validation should run during training to monitor overfitting. The right way to do this is with a Keras callback, which we did not have time to implement.

In [12]:
# Score the model on the validation split and report the four averaged metrics.
pr, r, f, acc = validate(X_val_eng, kp_val, X_val_padded, tags_val, model)
for metric_label, metric_value in (('Validation Accuracy', acc),
                                   ('Validation Precision', pr),
                                   ('Validation Recall', r),
                                   ('Validation F-score', f)):
    print(metric_label, metric_value)

Validation Accuracy 0.5982191489239921
Validation Precision 0.02464447105112574
Validation Recall 0.04062972913969666
Validation F-score 0.0291223935894617


testing statistics

In [13]:
# Score the model on the test split.
pr, r, f, acc = validate(X_test_eng, kp_test, X_test_padded, tags_test, model)
# Labelled "Test": these numbers come from the test split, not the validation split.
print('Test Accuracy', acc)
print('Test Precision', pr)
print('Test Recall', r)
print('Test F-score', f)

Validation Accuracy 0.6038567115190553
Validation Precision 0.020568483718766118
Validation Recall 0.03396724862552323
Validation F-score 0.02403948789915896


time to get predictions for our tok_abstr

In [14]:
# Map the example's tokens to vocabulary indices; tok_abstr was included when
# the vocabulary was built, so every token has an index.
X = [vocab_ind_dict[token] for token in tok_abstr]

# Same padding as the training data.
X = pad_sequences([X], maxlen=MAX_DOCUMENT_LENGTH, dtype='int32', padding='post',
                  truncating='post', value=0.0)
predictions = model.predict_classes(X)

# Keep only the predictions for real tokens, as validate() does, so that
# padding positions can never contribute to an extracted phrase.
kp = retrive_phrase_IO(predictions[0][:len(tok_abstr)], tok_abstr)
print(kp)

[['rl'], ['seeks'], ['expressed'], ['constraints'], ['for', 'instance', ',', 'the', 'designer'], ['want'], ['exploration', ','], ['paper', ',', 'we', 'propose', 'an', 'algorithmic', 'scheme'], ['rl', 'tasks'], ['require'], ['an'], ['convex'], ['an'], ['comes', 'with', 'rigorous', 'theoretical', 'guarantees', 'and', 'only', 'relies', 'on', 'the'], ['solve', 'standard', 'rl', 'tasks'], ['any'], ['model-based', 'rl', 'algorithm', '.'], [',', 'we', 'show', 'that', 'it', 'matches', 'previous', 'algorithms', 'that', 'enforce', 'safety', 'via'], ['can', 'also']]
