In [1]:
import tensorflow as tf
import keras
import numpy as np
from sklearn.utils import class_weight
from keras import regularizers
from keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM, TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
import nltk
import re
import os
import copy
from data_preprocessing import open_data, tokenize, tag_document, data_to_seq, glove_emb_matrix, tags_to_3D
from validation import precision, recall, f1, retrive_phrase

Using TensorFlow backend.


# DATA PREPROCESSING

In [2]:
# Fetch the NLTK 'punkt' tokenizer models (no-op when already downloaded).
# Presumably required by data_preprocessing.tokenize -- TODO confirm.
nltk.download('punkt')

# NOTE(review): these two dicts are not referenced by any cell visible here;
# kept in case a later cell relies on them.
documents = {}
labels = {}

# Dataset locations.
# The Hulth2003 root used for the actual run can be overridden through the
# HULTH2003_DIR environment variable so the notebook is runnable on other
# machines; the default reproduces the original hard-coded location.
hulth_root = os.environ.get(
    "HULTH2003_DIR",
    "/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003",
)
dir_valeria_train = os.path.join(hulth_root, "Training")
dir_valeria_val = os.path.join(hulth_root, "Validation")
dir_valeria_test = os.path.join(hulth_root, "Test")

# Other contributors' local copies of the training split (not used by the
# cells visible in this notebook).
dir_Tu = "/Users/kmirai/Downloads/NLPProject-master/Hulth2003/Training"
dir_anna = "/Users/annasotnikova/Downloads/Hulth2003/Training"

[nltk_data] Downloading package punkt to /home/valeriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Stage 1 -- read every split from disk (train, then validation, then test).
(documents_train, labels_train), (documents_val, labels_val), (documents_test, labels_test) = (
    open_data(split_dir)
    for split_dir in (dir_valeria_train, dir_valeria_val, dir_valeria_test)
)

# Stage 2 -- tokenize the documents together with their key-phrase labels.
(
    (tokenized_documents_train, tokenized_labels_train),
    (tokenized_documents_val, tokenized_labels_val),
    (tokenized_documents_test, tokenized_labels_test),
) = (
    tokenize(split_documents, split_labels)
    for split_documents, split_labels in (
        (documents_train, labels_train),
        (documents_val, labels_val),
        (documents_test, labels_test),
    )
)

# Stage 3 -- derive one tag sequence per document from its tokenized labels.
tags_train, tags_val, tags_test = (
    tag_document(split_tokens, split_label_tokens)
    for split_tokens, split_label_tokens in (
        (tokenized_documents_train, tokenized_labels_train),
        (tokenized_documents_val, tokenized_labels_val),
        (tokenized_documents_test, tokenized_labels_test),
    )
)

# GloVe embeddings

In [30]:
# Map each GloVe token to its embedding vector (float32 numpy array).
glove = dict()
embed_size = 100
# `with` guarantees the handle is closed even if a line fails to parse.
# The encoding is given explicitly because the GloVe vocabulary contains
# non-ASCII tokens and the platform default encoding is not always UTF-8.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ... <vN>", whitespace separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove[word] = coefs

# Dimensionality of the vectors in the file loaded above.
glove_size = 100

In [31]:
# Create vocabulary from all data.
# Each *_eng list holds one tokenized document per entry, copied so that later
# processing cannot modify the tokenized_documents_* dictionaries.
X_train_eng = list(copy.deepcopy(tokenized_documents_train).values())
X_val_eng = list(copy.deepcopy(tokenized_documents_val).values())
X_test_eng = list(copy.deepcopy(tokenized_documents_test).values())
X_full = X_train_eng + X_val_eng + X_test_eng

# Our vocab: every distinct token in all abstracts.
# Sorted on purpose: iteration order of a set of strings changes between
# interpreter sessions (hash randomization), which would give every token a
# different index -- and a different embedding row -- on each kernel restart.
target_vocab = sorted({token for doc in X_full for token in doc})
# Dictionary with all words and their indices (0 .. len(target_vocab) - 1).
# NOTE(review): index 0 belongs to a real token, while pad_sequences below also
# pads with 0 -- confirm data_to_seq / glove_emb_matrix account for that.
vocab_ind_dict = {token: index for index, token in enumerate(target_vocab)}
# Embedding matrix built by the project helper from the vocabulary and GloVe.
embed_matrix = glove_emb_matrix(vocab_ind_dict, glove, glove_size)

In [32]:
# Prepare data for network

# Fixed sequence length fed to the network. The network cell declares
# MAX_DOCUMENT_LENGTH again with the same value; the two must stay equal.
MAX_DOCUMENT_LENGTH = 550


def _per_document_list(per_doc):
    """Return the per-document entries of ``per_doc`` as a list.

    A mapping is deep-copied and reduced to its values. Anything without a
    ``values`` attribute (for instance the list produced by a previous run of
    this cell) is returned unchanged, so re-running the cell does not raise.
    """
    if hasattr(per_doc, "values"):
        return list(copy.deepcopy(per_doc).values())
    return per_doc


def _pad_to_max_length(sequences):
    """Pad or truncate every sequence at its end to MAX_DOCUMENT_LENGTH (int32, pad value 0)."""
    return pad_sequences(sequences, maxlen=MAX_DOCUMENT_LENGTH, dtype='int32',
                         padding='post', truncating='post', value=0.0)


# Token documents -> sequences built by the project helper from vocab_ind_dict.
X_train = data_to_seq(X_train_eng, vocab_ind_dict)
X_val = data_to_seq(X_val_eng, vocab_ind_dict)
X_test = data_to_seq(X_test_eng, vocab_ind_dict)

# Gold key phrases and gold tag sequences, one entry per document.
# tags_* are rebound from dict to list here; _per_document_list keeps that
# rebinding safe when the cell is executed more than once.
kp_train = _per_document_list(tokenized_labels_train)
tags_train = _per_document_list(tags_train)
kp_val = _per_document_list(tokenized_labels_val)
tags_val = _per_document_list(tags_val)
kp_test = _per_document_list(tokenized_labels_test)
tags_test = _per_document_list(tags_test)

# Padding
X_train_padded = _pad_to_max_length(X_train)
X_val_padded = _pad_to_max_length(X_val)
X_test_padded = _pad_to_max_length(X_test)

tags_train_padded = _pad_to_max_length(tags_train)
tags_val_padded = _pad_to_max_length(tags_val)
tags_test_padded = _pad_to_max_length(tags_test)

# Convert labels to 3D as keras likes
tags_train_3d = tags_to_3D(tags_train_padded)
tags_val_3d = tags_to_3D(tags_val_padded)
tags_test_3d = tags_to_3D(tags_test_padded)

# METRICS

In [38]:
def validate(documents_eng, kp_eng, documents_seq, tags, model):
    """Score a tagging model document by document and return the averages.

    Parameters
    ----------
    documents_eng : sequence
        Original (unpadded) tokenized documents; document ``i`` is handed to
        ``retrive_phrase`` together with its predicted tags.
    kp_eng : sequence
        Gold key phrases, ``kp_eng[i]`` belonging to ``documents_eng[i]``.
    documents_seq : array-like
        Model input passed to ``model.predict``; row ``i`` must correspond to
        ``documents_eng[i]``.
    tags : sequence
        Gold tag sequence of every document (unpadded).
    model : object
        Anything exposing ``predict(documents_seq)`` that returns per-token
        class scores with the classes on the last axis.

    Returns
    -------
    tuple
        ``(precision, recall, f_score, accuracy)``, each averaged over the
        documents. All four are ``0.0`` when ``documents_eng`` is empty.
    """
    n_docs = len(documents_eng)
    if n_docs == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0

    # Equivalent to Sequential.predict_classes for a multi-class output, but
    # also available on Keras/TensorFlow versions that removed that method.
    predictions = np.argmax(model.predict(documents_seq), axis=-1)

    prec = 0.0
    rec = 0.0
    f_score = 0.0
    acc = 0.0
    for idx, document_eng in enumerate(documents_eng):
        # Drop the padded positions: keep one prediction per real token.
        tags_predicted = predictions[idx][:len(document_eng)]

        # Phrase-level scores.
        kp_predicted = retrive_phrase(tags_predicted, document_eng)
        kp_true = kp_eng[idx]
        prec += precision(kp_true, kp_predicted)
        rec += recall(kp_true, kp_predicted)
        f_score += f1(kp_true, kp_predicted)

        # Token-level accuracy over the positions that have both a gold tag
        # and a prediction. A document longer than the model input has fewer
        # predictions than gold tags; comparing the full arrays would raise.
        tags_true = np.asarray(tags[idx])
        n_compared = min(len(tags_true), len(tags_predicted))
        if n_compared:
            acc += np.mean(tags_true[:n_compared] == tags_predicted[:n_compared])

    return prec / n_docs, rec / n_docs, f_score / n_docs, acc / n_docs

# NETWORK

In [39]:
# Per-token loss weights for training.
# scikit-learn's 'balanced' mode weights each tag value inversely to how often
# it occurs in the flattened training tags (padding zeros included), and the
# result is folded back to the same shape as tags_train_padded.
flat_tag_weights = class_weight.compute_sample_weight('balanced', tags_train_padded.flatten())
weights = flat_tag_weights.reshape(tags_train_padded.shape)

In [40]:
# Hyper-parameters of the network and of the training run.
EMBEDDINGS_SIZE = 100
MAX_DOCUMENT_LENGTH = 550
BATCH_SIZE = 4
EPOCHS = 10

# Fail early, with a readable message, if the constants above disagree with
# the data prepared in the earlier cells.
assert np.shape(embed_matrix)[1] == EMBEDDINGS_SIZE, \
    "embed_matrix width does not match EMBEDDINGS_SIZE"
assert np.shape(X_train_padded)[1] == MAX_DOCUMENT_LENGTH, \
    "padded inputs do not match MAX_DOCUMENT_LENGTH"

model = Sequential()

# Frozen embedding layer initialised from embed_matrix (one row per vocabulary
# index); trainable=False keeps the pre-trained vectors fixed during training.
embedding_layer = Embedding(np.shape(embed_matrix)[0],
                            EMBEDDINGS_SIZE,
                            weights=[embed_matrix],
                            input_length=MAX_DOCUMENT_LENGTH,
                            trainable=False)

model.add(embedding_layer)
# return_sequences=True: one output per token, needed for per-token tagging.
model.add(Bidirectional(LSTM(300, activation='tanh', recurrent_activation='hard_sigmoid', return_sequences=True)))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.01))))
model.add(Dropout(0.25))
# Three-way softmax per token (one unit per tag class).
model.add(TimeDistributed(Dense(3, activation='softmax')))

# sample_weight_mode="temporal" lets `weights` carry one weight per token.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],
              sample_weight_mode="temporal")
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
model.summary()


history = model.fit(X_train_padded, tags_train_3d,
                    validation_data=(X_val_padded, tags_val_3d),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    sample_weight=weights)



Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 550, 100)          1827500   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 550, 600)          962400    
_________________________________________________________________
dropout_7 (Dropout)          (None, 550, 600)          0         
_________________________________________________________________
time_distributed_7 (TimeDist (None, 550, 150)          90150     
_________________________________________________________________
dropout_8 (Dropout)          (None, 550, 150)          0         
_________________________________________________________________
time_distributed_8 (TimeDist (None, 550, 3)            453       
Total params: 2,880,503
Trainable params: 1,053,003
Non-trainable params: 1,827,500
____________________________________

In [41]:
pr, r, f, acc = validate(X_val_eng, kp_val, X_val_padded, tags_val, model)
print('Validation Accuracy', acc)
print('Validation Precision', pr)
print('Validation Recall', r)
print('Validation F-score', f)

Validation Accuracy 0.7439477995503742
Validation Precision 0.25686407214164225
Validation Recall 0.601197251475809
Validation F-score 0.3479060476256512
