In [15]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
import pickle
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Embedding, LSTM, Merge, Flatten, dot, merge
from keras.optimizers import RMSprop, Adadelta, Adam
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
VALIDATION_SPLIT = 0.2
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 128

In [3]:
def readXML(path):
    """
    Read XML file into a Pandas DataFrame
    """
    tree = ET.parse(path)
    root = tree.getroot()
    
    dataset = pd.DataFrame(columns=['QID', 'QAID'], dtype=int)
    
    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text
        
        for QApair in Question.iter('QApair'): 
            QAID = int(QApair.get('QAID'))
            QArel = QApair.get('QArel')
            QAconf = QApair.get('QAconf')
            QAquestion = QApair.find('QAquestion').text
            QAanswer = QApair.find('QAanswer').text
            
            dataset = dataset.append({'QID': QID,
                                    'QAID': QAID,
                                    'Qtext': Qtext,
                                    'QAquestion': QAquestion,
                                    'QAanswer': QAanswer,
                                    'QArel': 0 if QArel == 'I' else 1,
                                    'QAconf': QAconf}, ignore_index=True)
            
    dataset.set_index(['QID', 'QAID'], inplace=True)
    return dataset

In [4]:
train_dataset = readXML('../TRAIN/SemEval2016-Task3-CQA-MD-train.xml')
test_dataset = readXML('../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml')

In [6]:

query_texts_train = train_dataset['Qtext']
question_texts_train = train_dataset['QAquestion']
labels_train = train_dataset['QArel']

query_texts_test = test_dataset['Qtext']
question_texts_test = test_dataset['QAquestion']
labels_test = test_dataset['QArel']

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(query_texts_train.tolist() + question_texts_train.tolist() + query_texts_test.tolist() + question_texts_test.tolist())

query_sequences_train = tokenizer.texts_to_sequences(query_texts_train)
question_sequences_train = tokenizer.texts_to_sequences(question_texts_train)

query_sequences_test = tokenizer.texts_to_sequences(query_texts_test)
question_sequences_test = tokenizer.texts_to_sequences(question_texts_test)

query_maxlen = max(map(len, (x for x in query_sequences_train + query_sequences_test)))
question_maxlen = max(map(len, (x for x in question_sequences_train + question_sequences_test)))

MAX_SEQUENCE_LENGTH = max(query_maxlen, question_maxlen)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

query_data_train = pad_sequences(query_sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
question_data_train = pad_sequences(question_sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
query_data_test = pad_sequences(query_sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
question_data_test = pad_sequences(question_sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

labels_train = labels_train.reshape(-1, 1)
labels_test = labels_test.reshape(-1, 1)

nb_validation_samples = int(VALIDATION_SPLIT * query_data_train.shape[0])

query_x_train = query_data_train[:-nb_validation_samples]
question_x_train = question_data_train[:-nb_validation_samples]
y_train = labels_train[:-nb_validation_samples]
query_x_val = query_data_train[-nb_validation_samples:]
question_x_val = question_data_train[-nb_validation_samples:]
y_val = labels_train[-nb_validation_samples:]
query_x_test = query_data_test
question_x_test = question_data_test
y_test = labels_test



Found 86378 unique tokens.




In [7]:
embeddings_index = {}

embeddings = pickle.load(open('embeddings.pic', 'rb'))
dictionary = pickle.load(open('dictionary.pic', 'rb'))

for word in dictionary.keys():
    embeddings_index[word] = embeddings[dictionary[word]]

print('Found %s word vectors.' % len(embeddings_index))

Found 100000 word vectors.


In [8]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
# network definition

input_query = Input(shape=(query_maxlen,))
input_question = Input(shape=(question_maxlen,))

# encoders
# embed the input sequence into a sequence of vectors
input_encoder_query = Sequential()
input_encoder_query.add(Embedding(input_dim=len(word_index) + 1,
                              output_dim=query_maxlen))
input_encoder_query.add(Dropout(0.3))
# output: (samples, story_maxlen, embedding_dim)

# embed the input into a sequence of vectors of size query_maxlen
input_encoder_question = Sequential()
input_encoder_question.add(Embedding(input_dim=len(word_index) + 1,
                              output_dim=question_maxlen))
input_encoder_question.add(Dropout(0.3))

input_encoded_query = input_encoder_query(input_query)
input_encoded_question = input_encoder_question(input_question)

match = dot([input_encoded_query, input_encoded_question], axes=(2, 2))
match = LSTM(32)(match)
match = Dropout(0.3)(match)
match = Activation('sigmoid')(match)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

  name=name)


ValueError: Only layers of same output shape can be merged using sum mode. Layer shapes: [(None, 282, 282), (None, 670, 670)]

In [None]:
# train
model.fit([query_x_train, question_x_train], y_train,
          batch_size=32,
          epochs=120,
          validation_data=([query_x_val, query_x_val], y_val))

In [240]:
# compute final accuracy on training and test sets
pred = model.predict([query_x_test, question_x_test])

* Accuracy on training set: 23.04%
* Accuracy on test set: 34.07%
