In [1]:
import os
import sys
import math
import pickle

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

sys.path.insert(0, '../EVAL/scorer_v2.3/MAP_scripts/')
from ev import evaluate

In [None]:
# Fraction of the padded training rows that the preprocessing cell holds out
# (taken from the end of the array) as validation data.
VALIDATION_SPLIT = 0.2
# Passed to the Keras Tokenizer as `num_words`.
MAX_NB_WORDS = 20000
# Placeholder only: the preprocessing cell reassigns this name to the length of
# the longest tokenised sequence before it is used for padding or model input.
MAX_SEQUENCE_LENGTH = 1000
# Number of columns of the embedding matrix and output size of the Embedding
# layers. Presumably must equal the vector length stored in embeddings.pic --
# TODO confirm.
EMBEDDING_DIM = 128

In [None]:
def readXML(path):
    """
    Read a question / QA-pair XML file into a pandas DataFrame.

    Every child element of the document root is treated as one original
    question; each ``QApair`` element found below it yields one row.

    Parameters
    ----------
    path : str or file object
        Source of the XML document, handed unchanged to ``ET.parse``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(QID, QAID)`` (both parsed as ``int``) with the columns
        ``Qtext``, ``QAquestion``, ``QAanswer``, ``QArel`` (0 when the
        ``QArel`` attribute is ``'I'``, otherwise 1) and ``QAconf`` (the raw
        attribute value, ``None`` when the attribute is absent).
    """
    root = ET.parse(path).getroot()

    # Collect plain dicts and build the frame once at the end. Growing a
    # DataFrame with ``append`` inside the loop copies the whole frame on
    # every row (quadratic), and ``DataFrame.append`` was removed in pandas 2.0.
    records = []

    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text

        for QApair in Question.iter('QApair'):
            records.append({'QID': QID,
                            'QAID': int(QApair.get('QAID')),
                            'Qtext': Qtext,
                            'QAquestion': QApair.find('QAquestion').text,
                            'QAanswer': QApair.find('QAanswer').text,
                            'QArel': 0 if QApair.get('QArel') == 'I' else 1,
                            'QAconf': QApair.get('QAconf')})

    # Passing ``columns`` keeps the schema stable even when no rows were found,
    # so ``set_index`` below still works on an empty file.
    dataset = pd.DataFrame(records,
                           columns=['QID', 'QAID', 'Qtext', 'QAquestion',
                                    'QAanswer', 'QArel', 'QAconf'])
    dataset.set_index(['QID', 'QAID'], inplace=True)
    return dataset

In [None]:
# Parse the training XML into a (QID, QAID)-indexed frame.
train_xml_path = '../TRAIN/SemEval2016-Task3-CQA-MD-train.xml'
train_dataset = readXML(train_xml_path)

In [None]:
# Parse the test XML into a (QID, QAID)-indexed frame.
test_xml_path = '../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml'
test_dataset = readXML(test_xml_path)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Model input text: the original question followed by the paired question.
# A space is inserted between the two parts so the last word of the first and
# the first word of the second are not fused into a single token. Missing
# element text (None from the XML reader) is treated as an empty string so the
# tokenizer always receives strings.
texts_train = train_dataset['Qtext'].fillna('') + ' ' + train_dataset['QAquestion'].fillna('')
labels_train = train_dataset['QArel']

texts_test = test_dataset['Qtext'].fillna('') + ' ' + test_dataset['QAquestion'].fillna('')
labels_test = test_dataset['QArel']

# The vocabulary is fitted on train and test text together, as before.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(pd.concat([texts_train, texts_test]))

sequences_train = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

# Pad everything to the longest sequence seen in either split. This
# deliberately overrides the placeholder value from the configuration cell.
MAX_SEQUENCE_LENGTH = max(len(sequence) for sequence in sequences_train + sequences_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

labels_train = np.asarray(labels_train)
labels_test = np.asarray(labels_test)

print('Training Set:')
print('Shape of data tensor:', data_train.shape)
print('Shape of label tensor:', labels_train.shape)

print('Test Set:')
print('Shape of data tensor:', data_test.shape)
print('Shape of label tensor:', labels_test.shape)

# Hold out the trailing VALIDATION_SPLIT fraction of the training rows.
# Slicing at an explicit boundary (instead of [:-n]) keeps the training set
# intact when the validation count rounds down to zero.
nb_validation_samples = int(VALIDATION_SPLIT * data_train.shape[0])
split_at = data_train.shape[0] - nb_validation_samples

x_train = data_train[:split_at]
y_train = labels_train[:split_at].reshape(-1, 1)
x_val = data_train[split_at:]
y_val = labels_train[split_at:].reshape(-1, 1)
x_test = data_test
y_test = labels_test.reshape(-1, 1)

In [None]:
# Load the pre-trained embedding table and the word -> row-position lookup.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
# The context managers close the files promptly instead of leaking the handles.
with open('embeddings.pic', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)
with open('dictionary.pic', 'rb') as dictionary_file:
    dictionary = pickle.load(dictionary_file)

# Map every word in the dictionary directly to its embedding vector.
embeddings_index = {word: embeddings[position]
                    for word, position in dictionary.items()}

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# One row per tokenizer index (plus row 0, which no word maps to here).
# Words without a pre-trained vector keep their all-zero row.
vocabulary_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocabulary_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
from keras.layers import Embedding

# Embedding layer initialised from the pre-built matrix and kept frozen
# (trainable=False) so training does not update the vectors.
vocabulary_size = len(word_index) + 1
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from keras import Model, Input, Sequential

# NOTE(review): these two functional-API tensors are not consumed by the
# Sequential model built below (it creates its own Embedding layer) and no
# later visible cell reads them -- confirm whether they are still needed.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Frozen pre-trained embeddings -> small LSTM -> single sigmoid output.
model = Sequential([
    Embedding(len(word_index) + 1,
              EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=False),
    LSTM(5, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train, reporting loss/accuracy on the held-out validation rows each epoch.
model.fit(x_train,
          y_train,
          epochs=20,
          batch_size=30,
          validation_data=(x_val, y_val))

In [None]:
# Score every padded test sequence with the trained model. The cell that
# builds the 'score' and 'relevance' columns reads element [0] of each row.
scores_test = model.predict(x_test)

In [None]:
# Preview only the first rows; rendering the whole frame bloats the notebook
# output without adding information.
test_dataset.head()

In [None]:
def _clean_score(value):
    """Return 0 for negative or NaN scores, otherwise the score rounded to 4 places."""
    if value < 0 or math.isnan(value):
        return 0
    return round(value, 4)


# Each prediction row carries its score in position 0.
first_scores = [row[0] for row in scores_test]

test_dataset['score'] = [_clean_score(value) for value in first_scores]
# Scores strictly above 0.5 are labelled relevant; NaN compares false.
test_dataset['relevance'] = ['true' if value > 0.5 else 'false' for value in first_scores]
# Rank is written as a constant 0 for every row.
test_dataset['rank'] = 0

In [None]:
# Number of rows (QA pairs) in the test frame.
test_dataset.shape[0]

In [None]:
# Order rows by QID descending, then QAID ascending. `ascending` needs one
# entry per level being sorted, so both index levels are named explicitly;
# the original passed a single level (0) with two sort directions, which
# pandas rejects as a length mismatch.
test_dataset = test_dataset.sort_index(level=['QID', 'QAID'], ascending=[False, True])
# Drop rows that are identical in every field (index values included), then
# restore the (QID, QAID) index.
test_dataset = test_dataset.reset_index().drop_duplicates().set_index(['QID', 'QAID'])

In [None]:
# Write the predictions as a headerless, tab-separated file with the fields
# QID, QAID, rank, score, relevance. QID and QAID live in the index at this
# point, and `columns=` can only select real columns (current pandas raises
# KeyError otherwise), so the index is moved back into columns first and the
# index itself is not written.
test_dataset.reset_index().to_csv(
    '../EVAL/SemEval2017-Task3-CQA-MD-test-cnn.xml.pred',
    sep='\t',
    header=False,
    index=False,
    columns=['QID', 'QAID', 'rank', 'score', 'relevance'],
)

In [2]:
# Score a prediction file against the gold relevancy file.
# NOTE(review): the prediction file evaluated here ('...-lsa-mlp.xml.pred') is
# not the one written by the export cell of this notebook
# ('...-cnn.xml.pred') -- confirm which file the reported metrics should use.
gold_path = '../EVAL/SemEval2017-Task3-CQA-MD-test.xml.subtaskD.relevancy'
pred_path = '../EVAL/SemEval2017-Task3-CQA-MD-test-lsa-mlp.xml.pred'
MAP, Accuracy, P, R, F1 = evaluate(gold_path, pred_path)

859.0191978458053


In [3]:
# Report each metric with four decimal places, one per line.
for template, metric in (("MAP: %5.4f", MAP),
                         ("Accuracy: %5.4f", Accuracy),
                         ("Precision: %5.4f", P),
                         ("Recall: %5.4f", R),
                         ("F1: %5.4f", F1)):
    print(template % metric)

MAP: 0.6136
Accuracy: 0.6215
Precision: 0.8295
Recall: 0.0444
F1: 0.0842


In [None]:
859.0191978458053 /