In [1]:
import tensorflow as tf
import keras
import numpy as np
from sklearn.utils import class_weight
from keras import regularizers
from keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM, TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
import nltk
import re
import os
import copy
from data_preprocessing import open_data, tokenize, tag_document, data_to_seq, glove_emb_matrix, tags_to_3D, clean_data
from validation import precision, recall, f1, retrive_phrase_BIO
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# DATA PREPROCESSING

In [2]:
nltk.download('punkt')

documents = {}
labels = {}

#directories
dir_Tu= "/Users/dunbanghe/Documents/UMD/723/Hulth2003/Training"

dir_valeria_train = "/Users/dunbanghe/Documents/UMD/723/Hulth2003/Training"
dir_valeria_val = "/Users/dunbanghe/Documents/UMD/723/Hulth2003/Validation"
dir_valeria_test = "/Users/dunbanghe/Documents/UMD/723/Hulth2003/Test"

dir_anna = "/Users/dunbanghe/Documents/UMD/723/Hulth2003/Training"

[nltk_data] Downloading package punkt to /Users/dunbanghe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
#open data
documents_train, labels_train = open_data(dir_valeria_train)
documents_val, labels_val = open_data(dir_valeria_val)
documents_test, labels_test = open_data(dir_valeria_test)

# tokenize data
tokenized_documents_train, tokenized_labels_train = tokenize(documents_train, labels_train)
tokenized_documents_val, tokenized_labels_val = tokenize(documents_val, labels_val)
tokenized_documents_test, tokenized_labels_test = tokenize(documents_test, labels_test)

# create sequence of labels (tags) for the documents
tags_train, tokenized_labels_train = tag_document(tokenized_documents_train, tokenized_labels_train)
tags_val, tokenized_labels_val = tag_document(tokenized_documents_val, tokenized_labels_val)
tags_test, tokenized_labels_test = tag_document(tokenized_documents_test, tokenized_labels_test)

# remove documents without keyphrases 
tokenized_documents_train, tags_train, tokenized_labels_train = clean_data(tokenized_documents_train,
                                                                           tags_train, tokenized_labels_train)
tokenized_documents_val, tags_val, tokenized_labels_val = clean_data(tokenized_documents_val,
                                                                           tags_val, tokenized_labels_val)
tokenized_documents_test, tags_test, tokenized_labels_test = clean_data(tokenized_documents_test,
                                                                           tags_test, tokenized_labels_test)

# Embeddings

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vec = dict.fromkeys(tokenized_documents_train.keys(),[])

In [None]:
for key in tokenized_documents_train.keys():
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_documents_train[key])
    if (len(tokenized_documents_train[key]) > 512):
        continue
    segments_ids = [1] * len(tokenized_documents_train[key])
    #print(segments_ids)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    # Load pre-trained model (weights)
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()
    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    # Concatenate the tensors for all layers. Use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(encoded_layers, dim=0)
    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1,0,2)
    token_vecs_cat = []
    for token in token_embeddings:
        cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
        token_vecs_cat.append(cat_vec)
    token_vecs = encoded_layers[11][0]
    token_vecs = token_vecs.numpy()
    vec[key]=token_vecs

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (671 > 512). Running this sequence through BERT will result in indexing errors


177

In [None]:
tags_train = [doc for doc in copy.deepcopy(tags_train).values()]
tags_val = [doc for doc in copy.deepcopy(tags_val).values()]
tags_test = [doc for doc in copy.deepcopy(tags_test).values()]

# Padding 
tags_train_padded = pad_sequences(tags_train, maxlen=550, dtype='int32', padding='post', truncating='post', value=0.0)
tags_val_padded = pad_sequences(tags_val, maxlen=550, dtype='int32', padding='post', truncating='post', value=0.0)
tags_test_padded = pad_sequences(tags_test, maxlen=550, dtype='int32', padding='post', truncating='post', value=0.0)

# Convert labels to 3D as keras likes
tags_train_3d = tags_to_3D(tags_train_padded)
tags_val_3d = tags_to_3D(tags_val_padded)
tags_test_3d = tags_to_3D(tags_test_padded)

# NETWORK

In [None]:
weights = np.reshape(class_weight.compute_sample_weight('balanced', tags_train_padded.flatten()),
                             np.shape(tags_train_padded))

In [None]:
EMBEDDINGS_SIZE = 100
MAX_DOCUMENT_LENGTH = 550
BATCH_SIZE = 4
EPOCHS = 12



model = Sequential()

embedding_layer = Embedding(np.shape(embed_matrix)[0],
                            EMBEDDINGS_SIZE,
                            weights=[embed_matrix],
                            input_length=MAX_DOCUMENT_LENGTH,
                            trainable=False)

#model.add(embedding_layer)
model.add(Bidirectional(LSTM(300, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True)))
model.add(Dropout(0.25))

model.add(TimeDistributed(Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.01))))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(3, activation='softmax')))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],
              sample_weight_mode="temporal")
print(model.summary())


history = model.fit(X_train_padded, tags_train_3d,
                    validation_data=(X_val_padded, tags_val_3d),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    sample_weight=weights)



In [None]:
history = model.fit(X_train_padded, tags_train_3d,
                    validation_data=(X_val_padded, tags_val_3d),
                    epochs=2,
                    batch_size=4,
                    sample_weight=weights)

In [None]:
def validate(documents_eng, kp_eng, documents_seq, tags, model):
    prec = 0
    rec = 0
    f_score = 0
    acc = 0
    predictions = model.predict_classes(documents_seq)
    for idx, document_eng in enumerate(documents_eng):
        # our document (unpadding)
        doc_len = len(documents_eng[idx])
        document_seq = documents_seq[idx][0:doc_len]
        tags_predicted = predictions[idx][0:doc_len]
        # predicted kp
        kp_predicted = retrive_phrase_BIO(tags_predicted, document_eng)
        kp_true = kp_eng[idx]
        tags_true = tags[idx]
        # compute precision, recall, f_score, accuracy
        prec += precision(kp_true, kp_predicted)
        rec += recall(kp_true, kp_predicted)
        f_score += f1(kp_true, kp_predicted)
        acc += sum(np.equal(tags_true, tags_predicted))/len(tags_true)
        #if idx == 1:
        #    print('document_eng', document_eng)
        #    print('document_seq', document_seq)
        #    print("kp_true",kp_true)
        #    print("tags_true" ,tags_true)
        #    print("tags_predicted", tags_predicted)
        #    print("kp_predicted", kp_predicted)
    return prec/len(documents_eng), rec/len(documents_eng), f_score/len(documents_eng), acc/len(documents_eng)

In [None]:
pr, r, f, acc = validate(X_val_eng, kp_val, X_val_padded, tags_val, model)
print('Validation Accuracy', acc)
print('Validation Precision', pr)
print('Validation Recall', r)
print('Validation F-score', f)

In [None]:
pr, r, f, acc = validate(X_test_eng, kp_test, X_test_padded, tags_test, model)
print('Validation Accuracy', acc)
print('Validation Precision', pr)
print('Validation Recall', r)
print('Validation F-score', f)