In [1]:
from __future__ import division, print_function
from keras.layers import Input, Merge
from keras.layers.core import Activation, Dense, Dropout, Permute
from keras.layers.embeddings import Embedding
from keras.layers.merge import add, concatenate, dot
from keras.layers.recurrent import LSTM, GRU
from keras.models import Model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
import collections
import itertools
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import h5py
import pickle
import PyArabic

Using TensorFlow backend.


In [2]:
#load word2vec embeddings
dictionary = pickle.load(open('dictionary.pic', 'rb'))
reverse_dictionary = pickle.load(open('vocabulary.pic', 'rb'))
embeddings = pickle.load(open('embeddings.pic', 'rb'))

In [3]:
#load stopwords list
with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = f.readlines()
        
preprocessor = PyArabic.ArabicPreprocessor()

In [4]:
#load dataset from XML file
def get_data(infile):
    tree = ET.parse(infile)
    root = tree.getroot()

    queries, questions, labels = [], [], []

    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text

        for QApair in Question.iter('QApair'): 
            QAID = int(QApair.get('QAID'))
            QArel = QApair.get('QArel')
            QAquestion = QApair.find('QAquestion').text
            QAanswer = QApair.find('QAanswer').text

            queries.append(Qtext)
            questions.append(QAquestion)
            labels.append(QArel)
    return queries, questions, labels

In [51]:
#vectorize the query/question
def vectorize(data):
    queries, questions, labels = [], [], []
    
    q, qq, l = data
    
    for query, question, label in zip(q, qq, l):

        queries.append([embeddings[dictionary[preprocessor.deNoise(w)]] for w in query.split() if w in dictionary and w not in stopwords])
        questions.append([embeddings[dictionary[preprocessor.deNoise(w)]] for w in question.split() if w in dictionary and w not in stopwords])
        labels.append(label)
    
    query_maxlen = max([len(q) for q in queries])
    question_maxlen = max([len(q) for q in questions])
    
    return (pad_sequences(queries, maxlen=query_maxlen),
            pad_sequences(questions, maxlen=question_maxlen),
            np.array(labels))

In [54]:
DATA_DIR = "../DEV"
TRAIN_FILE = os.path.join(DATA_DIR, "SemEval2016-Task3-CQA-MD-dev.xml")

# get the training data
data_train = get_data(TRAIN_FILE)

# vectorize the training data
Xqtrain, Xqqtrain, Ytrain = vectorize(data_train)

print(Xqtrain.shape, Xqqtrain.shape, len(Ytrain))

(7384, 212, 100) (7384, 956, 100) 7384


In [55]:
### define network
EMBEDDING_SIZE = 64
LATENT_SIZE = 32
BATCH_SIZE = 100
NUM_EPOCHS = 5

# placeholders
original_sequence = Input((Xqtrain.shape[1],))
question_sequence = Input((Xqtrain.shape[1],))

# encoders

# embed the original question into a sequence of vectors of size story_maxlen
original_encoder = Sequential()
original_encoder.add(Embedding(input_dim=vocab_size,
                              output_dim=64))
# output: (samples, story_maxlen, query_maxlen)

# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                              input_length=question_maxlen))

original_encoded = original_encoder(original_sequence)
question_encoded = question_encoder(question_sequence)

shared_lstm = LSTM(64)

encoded_a = shared_lstm(original_encoded)
encoded_b = shared_lstm(question_encoded)

merged_vector = concatenate([encoded_a, encoded_b], axis=1)

predictions = Dense(1, activation='sigmoid')(merged_vector)

model = Model(inputs=[original_sequence, question_sequence], outputs=predictions)
model.compile(optimizer="rmsprop", loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()

NameError: name 'question_maxlen' is not defined

In [None]:
# train model
history = model.fit([Xstrain, Xqtrain], Ytrain, batch_size=BATCH_SIZE, 
                    epochs=NUM_EPOCHS,
                    validation_split=0.2)
                    
# plot accuracy and loss plot
plt.subplot(211)
plt.title("Accuracy")
plt.plot(history.history["acc"], color="g", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("Loss")
plt.plot(history.history["loss"], color="g", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()


In [None]:
#save the model 
model.save('SemEval-MemNN-Model.h5')

#save the weights
model.save_weights('SemEval-MemNN-Weights.h5')

#save the architecture
model_json = model.to_json()
with open("SemEval-MemNN-Arch.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
dictionary['UNK']

In [None]:
reverse_dictionary[0]

In [None]:
data_train[0][0]

In [9]:
len(queries)

NameError: name 'queries' is not defined

In [35]:

query = 'text words from query 1 from from'
queries.append([embeddings[dictionary[preprocessor.deNoise(w)]] for w in query.split() if w in dictionary and w not in stopwords])
#query_maxlen = max(map(len, (x for x, _, _ in queries)))

In [36]:
query_maxlen

100

In [37]:
max([len(q) for q in queries])

5

In [29]:
dictionary['from']

20968