In [72]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import sys
import zipfile
import csv
import pickle

import tensorflow as tf
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import average_precision_score

import xml.etree.ElementTree as ET

In [22]:
# Load the pre-computed word2vec artifacts from the working directory.
# `dictionary` is used later as a word -> row-index lookup into `embeddings`.
# NOTE(review): `reverse_dictionary` is read from 'vocabulary.pic'; presumably
# the index -> word mapping — confirm, it is not used in the cells shown here.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load these files if they come from a trusted source.
# Context managers close each file handle as soon as its object is loaded.
with open('dictionary.pic', 'rb') as _fh:
    dictionary = pickle.load(_fh)
with open('vocabulary.pic', 'rb') as _fh:
    reverse_dictionary = pickle.load(_fh)
with open('embeddings.pic', 'rb') as _fh:
    embeddings = pickle.load(_fh)

In [44]:
#Training
# Build one feature vector per (question, candidate question) pair: the mean
# of the word embeddings of the two concatenated texts. The label is 0 when
# the pair's QArel attribute is 'I' and 1 for any other value.

train_dataset_path = '../TRAIN/SemEval2016-Task3-CQA-MD-train.xml'

tree = ET.parse(train_dataset_path)
root = tree.getroot()

data_train = []
target_train = []

# Stand-in vector for out-of-vocabulary words; built once instead of per token.
zero_vector = np.zeros_like(embeddings[0])

for Question in root:
    # An empty XML element has text None; treat it as an empty string so the
    # concatenation below cannot raise TypeError.
    Qtext = Question.find('Qtext').text or ''

    for QApair in Question.iter('QApair'):
        QArel = 0 if QApair.get('QArel') == 'I' else 1
        QAquestion = QApair.find('QAquestion').text or ''

        # NOTE(review): the texts are joined without a separator, so the last
        # word of Qtext and the first word of QAquestion fuse into one token
        # unless one side carries whitespace — confirm this is intended. The
        # evaluation cell builds its features the same way.
        QQ = Qtext + QAquestion

        word_vectors = [embeddings[dictionary[w]] if w in dictionary else zero_vector
                        for w in QQ.split()]

        # np.mean over an empty list yields a NaN scalar, which would make the
        # feature matrix ragged; fall back to the zero vector instead.
        if word_vectors:
            embeddings_QQ = np.mean(np.asarray(word_vectors), axis=0)
        else:
            embeddings_QQ = zero_vector

        data_train.append(embeddings_QQ)
        target_train.append(QArel)

data_train = np.asarray(data_train)
# Convert the labels collected above. The name `data_target` is never defined,
# so referencing it here raised NameError.
target_train = np.asarray(target_train)


In [45]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier

In [46]:
# Candidate models, each paired with the label used to identify it.
# Keeping label and estimator side by side keeps the two lists aligned.
# NOTE(review): later scikit-learn releases renamed `n_iter` to `max_iter`
# for Perceptron / PassiveAggressiveClassifier / SGDClassifier — confirm
# against the installed version before re-running.
_labelled_models = [
    ("Nearest Neighbors", KNeighborsClassifier(10)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(n_estimators=100)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Ridge Classifier", RidgeClassifier(tol=1e-2, solver="lsqr")),
    ("Perceptron", Perceptron(n_iter=50)),
    ("Passive-Aggressive", PassiveAggressiveClassifier(n_iter=50)),
    ("LinearSVC", LinearSVC(penalty="l2", dual=False, tol=1e-3)),
    ("l2", SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")),
    ("elasticnet", SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")),
    ("Rocchio classifier", NearestCentroid()),
    ("MultinomialNB", MultinomialNB(alpha=.01)),
    ("BernoulliNB", BernoulliNB(alpha=.01)),
]

# Split the pairs back into the two parallel lists the later cells iterate over.
names, classifiers = (list(column) for column in zip(*_labelled_models))

In [76]:
def readXML(path):
    """
    Read a question/answer-pair XML file into a nested dictionary.

    Every child of the root element is expected to carry a 'QID' attribute
    and a 'Qtext' child; each 'QApair' descendant is expected to carry 'QAID'
    and 'QArel' attributes plus 'QAquestion' and 'QAanswer' children.

    Parameters
    ----------
    path : str
        Location of the XML file (anything accepted by ``ET.parse``).

    Returns
    -------
    dict
        ``{QID: {'Qtext': str or None,
                 'QApairs': {QAID: {'QAquestion': str or None,
                                    'QAanswer': str or None,
                                    'QArel': str or None,
                                    'QQ': str}}}}``
        where QID and QAID are ints and 'QQ' is Qtext followed directly by
        QAquestion.

    Raises
    ------
    AttributeError
        If a required child element ('Qtext', 'QAquestion', 'QAanswer') is
        missing altogether.
    TypeError, ValueError
        If a 'QID' / 'QAID' attribute is missing or is not an integer.
    """
    root = ET.parse(path).getroot()
    dataset = {}

    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text

        QApairs = {}
        for QApair in Question.iter('QApair'):
            QAID = int(QApair.get('QAID'))
            QAquestion = QApair.find('QAquestion').text

            QApairs[QAID] = {
                'QAquestion': QAquestion,
                'QAanswer': QApair.find('QAanswer').text,
                'QArel': QApair.get('QArel'),
                # An empty element has text None; treat it as '' here so the
                # concatenation cannot raise TypeError. The raw values stored
                # under 'Qtext' / 'QAquestion' are left untouched.
                'QQ': (Qtext or '') + (QAquestion or '')}

        dataset[QID] = {'Qtext': Qtext, 'QApairs': QApairs}

    return dataset
            

In [77]:
def map(gold_path, pred_path):
    """
    Measure MAP ('Mean Average Precision') of a prediction file against the
    gold file, averaging the per-question average precision.

    A pair counts as negative (0) when its 'QArel' is 'I' and positive (1)
    otherwise, in both files.

    NOTE: this function shadows the builtin ``map`` for the rest of the
    notebook; the name is kept because a later cell calls it.
    NOTE(review): the predicted *binary* labels are passed as the ranking
    scores, because ``readXML`` does not expose a confidence value — confirm
    that this is the intended metric.

    Parameters
    ----------
    gold_path : str
        XML file holding the reference labels.
    pred_path : str
        XML file holding the predicted labels for the same questions/pairs.

    Returns
    -------
    float
        Mean of the per-question average precision, ignoring NaN entries;
        NaN when the gold file contains no questions.

    Raises
    ------
    KeyError
        If a gold QID is absent from the prediction file.
    AssertionError
        If a question's QAIDs differ between the two files.
    """
    gold_dataset = readXML(gold_path)
    pred_dataset = readXML(pred_path)

    AP = []

    for QID, gold_question in gold_dataset.items():
        gold_pairs = gold_question['QApairs']
        pred_pairs = pred_dataset[QID]['QApairs']
        # Compare as sets so the check does not depend on key order.
        assert set(gold_pairs) == set(pred_pairs)

        # Index both files by the same QAID sequence: iterating each dict's
        # .values() separately would misalign gold and predicted labels when
        # the two files list the pairs in different orders.
        pair_ids = sorted(gold_pairs)
        gold_labels = [0 if gold_pairs[pair_id]['QArel'] == 'I' else 1
                       for pair_id in pair_ids]
        pred_labels = [0 if pred_pairs[pair_id]['QArel'] == 'I' else 1
                       for pair_id in pair_ids]

        AP.append(average_precision_score(gold_labels, pred_labels))

    if not AP:
        # np.nanmean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')

    # nanmean skips any NaN entries in the per-question scores.
    MAP = np.nanmean(AP)

    return MAP

In [122]:
# Replace the full model list defined earlier with the single model to evaluate.
# NOTE(review): the label says "Perceptron" but the estimator is a
# RandomForestClassifier. The label becomes part of the output file name in
# the evaluation cell, so the saved file is mislabelled — confirm which of
# the two is intended before changing either.
names = ["Perceptron"]

classifiers = [
    RandomForestClassifier(n_estimators=100),
]

In [123]:
#Evaluating
# For every classifier: fit on the training features, score each
# question/candidate pair of the test file, write the predicted label
# ('QArel') and confidence ('QAconf') back into the XML tree, and save one
# output file per classifier.

test_dataset_path = '../TEST/2017/SemEval2017-Task3-CQA-MD-test-input.xml'

tree = ET.parse(test_dataset_path)
root = tree.getroot()

# Stand-in vector for out-of-vocabulary words; built once instead of per token.
zero_vector = np.zeros_like(embeddings[0])

# The features depend only on the test file, not on the classifier, so they
# are computed once here rather than once per classifier.
test_pairs = []
test_features = []

for Question in root:
    # An empty XML element has text None; treat it as an empty string.
    Qtext = Question.find('Qtext').text or ''

    for QApair in Question.iter('QApair'):
        QAquestion = QApair.find('QAquestion').text or ''

        # Same feature construction as the training cell: texts joined without
        # a separator, then the mean of the per-token embeddings.
        QQ = Qtext + QAquestion

        word_vectors = [embeddings[dictionary[w]] if w in dictionary else zero_vector
                        for w in QQ.split()]

        # np.mean over an empty list would yield NaN; use the zero vector.
        if word_vectors:
            embeddings_QQ = np.mean(np.asarray(word_vectors), axis=0)
        else:
            embeddings_QQ = zero_vector

        test_pairs.append(QApair)
        test_features.append(embeddings_QQ)

for name, clf in zip(names, classifiers):
    clf.fit(data_train, target_train)

    if test_pairs:
        # Requires an estimator exposing predict_proba. Take the probability
        # column of class 1 (the "relevant" label) from clf.classes_ rather
        # than assuming it is the second column.
        positive_column = list(clf.classes_).index(1)
        # One batched call instead of two model invocations per pair; the
        # separate clf.predict() result was never used.
        confidences = clf.predict_proba(np.asarray(test_features))[:, positive_column]

        for QApair, QAconf in zip(test_pairs, confidences):
            QAconf = float(QAconf)
            # Keep the written confidence strictly positive.
            QAconf = 0.0001 if QAconf <= 0 else QAconf

            QApair.set('QArel', 'R' if QAconf > 0.5 else 'I')
            QApair.set('QAconf', str(round(QAconf, 4)))

    # The tree is re-annotated in place for each classifier, then saved under
    # a name that embeds the classifier's label.
    tree.write('../TEST/2017/SemEval2017-Task3-CQA-MD-test-input-word2vec-webteb-' + name + '.xml', encoding='utf-8')

In [121]:
# Score a saved prediction file against the gold labels; the returned MAP is
# the cell's displayed value.
gold_xml_path = '../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml'
pred_xml_path = '../TEST/2017/SemEval2017-Task3-CQA-MD-test-input-word2vec-webteb-perceptron.xml'
map(gold_xml_path, pred_xml_path)

  recall = tps / tps[-1]


0.49813330561854369

In [89]:
# Ad-hoc inspection of the class-1 probability for a single feature vector.
# Relies on `clf` and `embeddings_QQ` being left in the kernel namespace by
# the evaluation cell, so it only works after that cell has been run.
clf.predict_proba([embeddings_QQ])[0][1]

0.58666666666666667