In [242]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
import pickle
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Embedding, LSTM, Merge, Flatten
from keras.optimizers import RMSprop, Adadelta, Adam
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [5]:
VALIDATION_SPLIT = 0.2
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 128

In [2]:
def readXML(path):
    """
    Read XML file into a Pandas DataFrame
    """
    tree = ET.parse(path)
    root = tree.getroot()
    
    dataset = pd.DataFrame(columns=['QID', 'QAID'], dtype=int)
    
    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text
        
        for QApair in Question.iter('QApair'): 
            QAID = int(QApair.get('QAID'))
            QArel = QApair.get('QArel')
            QAconf = QApair.get('QAconf')
            QAquestion = QApair.find('QAquestion').text
            QAanswer = QApair.find('QAanswer').text
            
            dataset = dataset.append({'QID': QID,
                                    'QAID': QAID,
                                    'Qtext': Qtext,
                                    'QAquestion': QAquestion,
                                    'QAanswer': QAanswer,
                                    'QArel': 0 if QArel == 'I' else 1,
                                    'QAconf': QAconf}, ignore_index=True)
            
    dataset.set_index(['QID', 'QAID'], inplace=True)
    return dataset

In [3]:
train_dataset = readXML('../TRAIN/SemEval2016-Task3-CQA-MD-train.xml')
test_dataset = readXML('../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml')

In [176]:

query_texts_train = train_dataset['Qtext']
question_texts_train = train_dataset['QAquestion']
labels_train = train_dataset['QArel']

query_texts_test = test_dataset['Qtext']
question_texts_test = test_dataset['QAquestion']
labels_test = test_dataset['QArel']

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(query_texts_train.tolist() + question_texts_train.tolist() + query_texts_test.tolist() + question_texts_test.tolist())

query_sequences_train = tokenizer.texts_to_sequences(query_texts_train)
question_sequences_train = tokenizer.texts_to_sequences(question_texts_train)

query_sequences_test = tokenizer.texts_to_sequences(query_texts_test)
question_sequences_test = tokenizer.texts_to_sequences(question_texts_test)

query_maxlen = max(map(len, (x for x in query_sequences_train + query_sequences_test)))
question_maxlen = max(map(len, (x for x in question_sequences_train + question_sequences_test)))
MAX_SEQUENCE_LENGTH = max(query_maxlen, question_maxlen)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

query_data_train = pad_sequences(query_sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
question_data_train = pad_sequences(question_sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
query_data_test = pad_sequences(query_sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
question_data_test = pad_sequences(question_sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

labels_train = labels_train.reshape(-1, 1)
labels_test = labels_test.reshape(-1, 1)

nb_validation_samples = int(VALIDATION_SPLIT * data_train.shape[0])

query_x_train = query_data_train[:-nb_validation_samples]
question_x_train = question_data_train[:-nb_validation_samples]
y_train = labels_train[:-nb_validation_samples]
query_x_val = query_data_train[-nb_validation_samples:]
question_x_val = question_data_train[-nb_validation_samples:]
y_val = labels_train[-nb_validation_samples:]
query_x_test = query_data_test
question_x_test = question_data_test
y_test = labels_test



Found 86378 unique tokens.




In [177]:
embeddings_index = {}

embeddings = pickle.load(open('embeddings.pic', 'rb'))
dictionary = pickle.load(open('dictionary.pic', 'rb'))

for word in dictionary.keys():
    embeddings_index[word] = embeddings[dictionary[word]]

print('Found %s word vectors.' % len(embeddings_index))

Found 100000 word vectors.


In [178]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [179]:
def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

In [180]:
def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

In [181]:
def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

In [215]:
def create_base_network(input_shape):
    '''Base network to be shared (eq. to feature extraction).
    '''
    sequence_input = Input(shape=input_shape)
    embedded_sequences = embedding_layer(sequence_input)

    x = Dense(128, activation='relu')(embedded_sequences)
    x = Dropout(0.1)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(128, activation='relu')(x)
    return Model(input, x)


In [216]:
def compute_accuracy(predictions, labels):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return labels[predictions.ravel() < 0.5].mean()

In [222]:
def accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))

In [217]:
def exponent_neg_manhattan_distance(left, right):
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

In [238]:
# network definition

input_query = Input(shape=(MAX_SEQUENCE_LENGTH,))
input_question = Input(shape=(MAX_SEQUENCE_LENGTH,))

embedding_layer = Embedding(len(word_index) + 1,
                           EMBEDDING_DIM,
                           weights=[embedding_matrix],
                           input_length=MAX_SEQUENCE_LENGTH,
                           trainable=True)

encoded_query = embedding_layer(input_query)
encoded_question = embedding_layer(input_question)

shared_lstm = LSTM(10)

processed_query = shared_lstm(encoded_query)
processed_question = shared_lstm(encoded_question)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_query, processed_question])
malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([processed_query, processed_question])

model = Model(inputs=[input_query, input_question], outputs=[malstm_distance])



In [243]:
# train
# Adadelta optimizer, with gradient clipping by norm
optimizer = Adam()

model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
model.fit([query_x_train, question_x_train], y_train,
          batch_size=30,
          epochs=2,
          validation_data=([query_x_val, question_x_val], y_val))

Train on 24329 samples, validate on 6082 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1dff64bed68>

In [240]:
# compute final accuracy on training and test sets
pred = model.predict([query_x_train, question_x_train])
train_accuracy = compute_accuracy(pred, y_train)
pred = model.predict([query_x_test, question_x_test])
test_accuracy = compute_accuracy(pred, y_test)

print('* Accuracy on training set: %0.2f%%' % (100 * train_accuracy))
print('* Accuracy on test set: %0.2f%%' % (100 * test_accuracy))

* Accuracy on training set: 23.04%
* Accuracy on test set: 34.07%
