In [43]:
import os
import sys
import math
import pickle

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score

In [2]:
# Vocabulary cap handed to the Keras Tokenizer (num_words).
MAX_NB_WORDS = 10000
# Initial padding length; the preprocessing cell later overwrites this with the
# longest QAdiff sequence it finds.
MAX_SEQUENCE_LENGTH = 1000
# Number of columns of the embedding matrix / width of the Embedding layer.
EMBEDDING_DIM = 128
# Fraction of the training rows (taken from the tail) held out for validation.
VALIDATION_SPLIT = 0.2

In [3]:
def readXML(path):
    """
    Read a question/QA-pair XML file into a DataFrame indexed by (QID, QAID).

    Every child of the XML root is treated as a question carrying a ``QID``
    attribute and a ``Qtext`` element; every ``QApair`` below it yields one row.

    Columns of the result:
        Qtext       text of the original question
        QAquestion  text of the related question
        QAanswer    text of the related answer
        QArel       0 when the ``QArel`` attribute is 'I', otherwise 1
        QAconf      raw ``QAconf`` attribute (string, or None when absent)
        QAdiff      words of ``QAquestion + ' ' + QAanswer`` (in order) that
                    also occur in ``Qtext``, joined by single spaces

    Missing or empty text elements are read as empty strings instead of
    raising on ``None``.

    :param path: path (or file object) accepted by ``ElementTree.parse``
    :return: pandas DataFrame with a (QID, QAID) MultiIndex
    """
    root = ET.parse(path).getroot()

    # Collect plain dicts and build the frame once: appending to a DataFrame
    # per row is quadratic, and DataFrame.append no longer exists in pandas 2.
    records = []

    for question in root:
        qid = int(question.get('QID'))
        qtext = question.findtext('Qtext') or ''
        # Set membership gives the same result as `w in qtext.split()` without
        # re-splitting the query for every candidate word.
        query_words = set(qtext.split())

        for qa_pair in question.iter('QApair'):
            qa_question = qa_pair.findtext('QAquestion') or ''
            qa_answer = qa_pair.findtext('QAanswer') or ''
            shared_words = [w for w in (qa_question + ' ' + qa_answer).split()
                            if w in query_words]

            records.append({'QID': qid,
                            'QAID': int(qa_pair.get('QAID')),
                            'Qtext': qtext,
                            'QAquestion': qa_question,
                            'QAanswer': qa_answer,
                            'QArel': 0 if qa_pair.get('QArel') == 'I' else 1,
                            'QAconf': qa_pair.get('QAconf'),
                            'QAdiff': ' '.join(shared_words)})

    # Explicit columns keep the schema stable even when the file has no QA pairs.
    columns = ['QID', 'QAID', 'Qtext', 'QAquestion', 'QAanswer',
               'QArel', 'QAconf', 'QAdiff']
    dataset = pd.DataFrame(records, columns=columns)
    return dataset.set_index(['QID', 'QAID'])

In [4]:
# Parse the training XML into a DataFrame indexed by (QID, QAID).
train_dataset = readXML('../TRAIN/SemEval2016-Task3-CQA-MD-train.xml')

In [5]:
# Parse the test XML the same way; this frame is later reused for predictions.
test_dataset = readXML('../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml')

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Text columns fed to the tokenizer and the binary relevance labels.
texts_query_train = train_dataset['Qtext']
texts_question_train = train_dataset['QAquestion']
texts_diff_train = train_dataset['QAdiff']
labels_train = train_dataset['QArel']

texts_query_test = test_dataset['Qtext']
texts_question_test = test_dataset['QAquestion']
texts_diff_test = test_dataset['QAdiff']
labels_test = test_dataset['QArel']

# NOTE(review): the vocabulary is fitted on train AND test texts; restrict it to
# the training texts if test-set vocabulary must stay unseen.
# pd.concat replaces the chained Series.append calls (removed in pandas 2.0)
# and keeps the same order of texts.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(pd.concat([texts_query_train, texts_question_train,
                                  texts_query_test, texts_question_test]))
tokenizer.fit_on_texts(pd.concat([texts_diff_train, texts_diff_test]))

sequences_query_train = tokenizer.texts_to_sequences(texts_query_train)
sequences_question_train = tokenizer.texts_to_sequences(texts_question_train)
sequences_diff_train = tokenizer.texts_to_sequences(texts_diff_train)
sequences_query_test = tokenizer.texts_to_sequences(texts_query_test)
sequences_question_test = tokenizer.texts_to_sequences(texts_question_test)
sequences_diff_test = tokenizer.texts_to_sequences(texts_diff_test)

# The padding length is the longest QAdiff sequence only. Query and question
# sequences are padded with the same maxlen, so longer ones get truncated by
# pad_sequences; include them in this max() if they must be kept in full.
MAX_SEQUENCE_LENGTH = max(len(seq) for seq in sequences_diff_train + sequences_diff_test)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data_query_train = pad_sequences(sequences_query_train, maxlen=MAX_SEQUENCE_LENGTH)
data_question_train = pad_sequences(sequences_question_train, maxlen=MAX_SEQUENCE_LENGTH)
data_diff_train = pad_sequences(sequences_diff_train, maxlen=MAX_SEQUENCE_LENGTH)
data_query_test = pad_sequences(sequences_query_test, maxlen=MAX_SEQUENCE_LENGTH)
data_question_test = pad_sequences(sequences_question_test, maxlen=MAX_SEQUENCE_LENGTH)
data_diff_test = pad_sequences(sequences_diff_test, maxlen=MAX_SEQUENCE_LENGTH)

labels_train = np.asarray(labels_train)
labels_test = np.asarray(labels_test)

print('Training Set:')
print('Shape of query tensor:', data_query_train.shape)
print('Shape of question tensor:', data_question_train.shape)
print('Shape of label tensor:', labels_train.shape)

print('Test Set:')
print('Shape of query tensor:', data_query_test.shape)
print('Shape of question tensor:', data_question_test.shape)
print('Shape of label tensor:', labels_test.shape)

# Hold out the last VALIDATION_SPLIT fraction of the training rows (no shuffle).
# Slicing at an explicit index instead of `[:-n]` / `[-n:]` stays correct when
# n == 0, where `[:-0]` would give an empty training set and `[-0:]` all rows.
nb_validation_samples = int(VALIDATION_SPLIT * data_query_train.shape[0])
split_at = data_query_train.shape[0] - nb_validation_samples

x_query_train = data_query_train[:split_at]
x_question_train = data_question_train[:split_at]
x_diff_train = data_diff_train[:split_at]
y_train = labels_train[:split_at].reshape(-1, 1)
x_query_val = data_query_train[split_at:]
x_question_val = data_question_train[split_at:]
x_diff_val = data_diff_train[split_at:]
y_val = labels_train[split_at:].reshape(-1, 1)
x_query_test = data_query_test
x_question_test = data_question_test
x_diff_test = data_diff_test
y_test = labels_test.reshape(-1, 1)

Using TensorFlow backend.


Found 86378 unique tokens.
Training Set:
Shape of query tensor: (30411, 314)
Shape of question tensor: (30411, 314)
Shape of label tensor: (30411,)
Test Set:
Shape of query tensor: (12600, 314)
Shape of question tensor: (12600, 314)
Shape of label tensor: (12600,)


In [59]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'embeddings.pic' / 'dictionary.pic' when they come from a trusted source.
# The `with` blocks close both files, which the bare open() calls never did.
with open('embeddings.pic', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)
with open('dictionary.pic', 'rb') as dictionary_file:
    dictionary = pickle.load(dictionary_file)

# `dictionary` maps each word to its position in `embeddings`; resolve that
# indirection once so later lookups go straight from word to vector.
embeddings_index = {word: embeddings[position] for word, position in dictionary.items()}

print('Found %s word vectors.' % len(embeddings_index))

Found 100000 word vectors.


In [60]:
# One row per tokenizer index (the +1 makes room for index len(word_index));
# words without a pretrained vector keep their all-zero row.
embedding_matrix = np.zeros(shape=(len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [61]:
# Number of distinct tokens in the tokenizer vocabulary (shown as cell output).
len(word_index)

86378

In [62]:
from keras.layers import Embedding, LSTM, Dense, GRU, Conv1D, MaxPooling1D, GlobalMaxPool1D, concatenate, Input, Bidirectional
from keras.optimizers import RMSprop
from keras.models import Sequential, Model
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Frozen pretrained embeddings -> bidirectional GRU (8 units per direction)
# -> single sigmoid unit producing a relevance score.
model = Sequential()

model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(GRU(8, activation='relu', recurrent_dropout=0.5, dropout=0.5)))
model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

callbacks_list = [
    # Stop on the validation loss, the same quantity the checkpoint tracks.
    # Monitoring the training accuracy ('acc') halts on noise in the training
    # metric and gives no protection against overfitting.
    EarlyStopping(
        monitor='val_loss',
        patience=1,
    ),
    # Keep only the weights of the epoch with the lowest validation loss.
    ModelCheckpoint(
        filepath='data/best_model.h5',
        monitor='val_loss',
        save_best_only=True,
    )
]

# Train on the QAdiff sequences only; the query/question tensors are unused here.
history = model.fit(x_diff_train, y_train, validation_data=(x_diff_val, y_val),
    epochs=20, batch_size=128, callbacks=callbacks_list)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         11056512  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 16)                6576      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 11,063,105
Trainable params: 6,593
Non-trainable params: 11,056,512
_________________________________________________________________
Train on 24329 samples, validate on 6082 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


# NOTE(review): 'BiGRU.h5' is not written by any visible cell (the checkpoint
# callback saves to 'data/best_model.h5') — confirm which weights are intended.
model.load_weights(filepath='BiGRU.h5')

In [63]:
# Score every test row from its padded QAdiff sequence.
scores_test = model.predict(x_diff_test)

In [64]:
# Copy first: a plain assignment only aliases test_dataset, so the prediction
# columns added below would silently be written into the test frame as well.
pred_dataset = test_dataset.copy()
# Negative or NaN scores are reported as 0; all others are rounded to 4 decimals.
pred_dataset['Score'] = [0 if score[0] < 0 or math.isnan(score[0]) else round(score[0], 4)
                         for score in scores_test]
# Relevance is derived from the raw (unrounded) score with a strict > 0.5 cut.
pred_dataset['Relevance'] = ['true' if score[0] > 0.5 else 'false' for score in scores_test]
# Placeholder rank column; every row gets 0.
pred_dataset['Rank'] = 0

In [65]:
# Order by QID descending, then QAID ascending. Both levels are named so that
# `level` and `ascending` have the same length; a single level combined with a
# two-element `ascending` list is rejected by current pandas.
pred_dataset = pred_dataset.sort_index(level=['QID', 'QAID'], ascending=[False, True])
# Drop rows that are exact duplicates (index values included), then restore the index.
pred_dataset = pred_dataset.reset_index().drop_duplicates().set_index(['QID', 'QAID'])

In [66]:
# Gold relevance file: tab-separated, column names supplied here, indexed by
# (QID, QAID) so it can be joined against the predictions.
gold_dataset = pd.read_csv(
    '../EVAL/SemEval2017-Task3-CQA-MD-test.xml.subtaskD.relevancy',
    sep='\t',
    names=['QID', 'QAID', 'Rank', 'Score', 'Relevance'],
    index_col=['QID', 'QAID'],
)

In [67]:
def map_score(gold_dataset, pred_dataset, th=10):
    """
    Mean average precision (in percent) of the predicted ranking.

    Both frames must be indexed by (QID, QAID) and share the column names
    'Score' and 'Relevance', so that the join produces 'Score_pred' and
    'Relevance_gold'. Within each QID the rows are ranked by predicted score
    (highest first; tied scores receive their average rank). For every
    gold-relevant row whose rank is <= ``th`` the precision at that rank is
    taken; these are averaged per QID, summed, and divided by the number of
    QIDs in ``pred_dataset`` (QIDs without a relevant hit contribute 0).

    :param gold_dataset: gold judgements with a boolean 'Relevance' column
    :param pred_dataset: predictions with a numeric 'Score' column
    :param th: only ranks up to this threshold are considered
    :return: MAP as a percentage rounded to 2 decimals; 0.0 when
             ``pred_dataset`` contains no queries
    """
    n_queries = len(pred_dataset.groupby('QID'))
    if n_queries == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    joined = pred_dataset.join(gold_dataset, lsuffix='_pred', rsuffix='_gold')
    ranked = joined[['Score_pred', 'Relevance_gold']].reset_index()
    ranked['Rank_pred'] = ranked.groupby('QID')['Score_pred'].rank(ascending=False)

    # Predictions without a gold row come out of the left join as NaN, which
    # cannot be used as a boolean mask; eq(True) treats them as not relevant.
    is_relevant = ranked['Relevance_gold'].eq(True)
    # .copy() so the columns added below land on an independent frame rather
    # than on a filtered view of `ranked`.
    hits = ranked[is_relevant & (ranked['Rank_pred'] <= th)].copy()

    # Position = how many relevant rows have been seen up to this rank.
    hits['Position'] = hits.groupby('QID')['Rank_pred'].rank(ascending=True)
    hits['Precision'] = hits['Position'] / hits['Rank_pred']
    average_precision = hits.groupby('QID')['Precision'].mean()

    # Scale before rounding: round(x, 4) * 100 can yield 56.800000000000004.
    return round(100 * average_precision.sum() / n_queries, 2)

In [68]:
# Pull the identifier, label and score columns out of the flattened prediction
# frame, one Series per name.
qid, qaid, y, ypred = (
    pred_dataset.reset_index()[column]
    for column in ('QID', 'QAID', 'QArel', 'Score')
)

In [69]:
# Ranking quality first, then the classification metrics; all as percentages.
mAP = map_score(gold_dataset, pred_dataset)
# Labels are binarised as y == 1, predictions as Score >= 0.5.
accuracy, precision, recall, F1 = (
    100 * metric(y == 1, ypred >= 0.5)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [70]:
# Report every metric as a percentage with two decimals.
print('MAP: %.2f\nAccuracy: %.2f\nPrecision: %.2f\nRecall: %.2f\nF1: %.2f' % (mAP, accuracy, precision, recall, F1))

MAP: 56.80
Accuracy: 55.71
Precision: 46.32
Recall: 81.08
F1: 58.96
