In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import sys
import zipfile
import csv
import pickle
import math

import tensorflow as tf
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

import xml.etree.ElementTree as ET

In [3]:
def readXML(path):
    """
    Read XML file into a dictionary
    """
    tree = ET.parse(path)
    root = tree.getroot()
    dataset = {}
    
    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text
        
        dataset[QID] = {}
        dataset[QID]['Qtext'] = Qtext
        dataset[QID]['QApairs'] = {}
        
        for QApair in Question.iter('QApair'): 
            QAID = int(QApair.get('QAID'))
            QArel = QApair.get('QArel')
            QAquestion = QApair.find('QAquestion').text
            QAanswer = QApair.find('QAanswer').text
            
            QQ = Qtext + QAquestion
            
            dataset[QID]['QApairs'][QAID] = {
                'QAquestion': QAquestion,
                'QAanswer': QAanswer,
                'QArel': QArel,
                'QQ': QQ}
    return dataset
            

In [4]:
with open('stopwords-old.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]

In [5]:
import nltk
from nltk.stem.snowball import ArabicStemmer

stemmer = ArabicStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        try:
            stemmed.append(stemmer.stem(item))
        except:
            stemmed.append(item)
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [6]:
def vectorize_tfidf(corpora):
    tfidf_vect = TfidfVectorizer(stop_words=stopwords, max_features=10000)
    tfidf_vect.fit(corpora)
    return tfidf_vect

In [7]:
def fetch_data(dataset):
    queries = [q['Qtext'] for q in dataset.values()]
    questions = [pair['QAquestion'] for qid in dataset.keys() for pair in dataset[qid]['QApairs'].values()]
    answers = [pair['QAanswer'] for qid in dataset.keys() for pair in dataset[qid]['QApairs'].values()]
    relevancies = [0 if pair['QArel'] == 'I' else 1 for qid in dataset.keys() for pair in dataset[qid]['QApairs'].values()]
    return queries, questions, answers, relevancies

In [8]:
trainset = readXML('../TRAIN/SemEval2016-Task3-CQA-MD-train.xml')
testset = readXML('../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml')

In [9]:
queries_train, questions_train, answers_train, relevancies_train = fetch_data(trainset)
queries_test, questions_test, answers_test, relevancies_test = fetch_data(testset)

tfidf_vect_train = vectorize_tfidf(queries_train + questions_train + answers_train)
tfidf_vect_test = vectorize_tfidf(queries_test + questions_test + answers_test)

In [10]:
data_train = tfidf_vect_train.transform(questions_train)
target_train = relevancies_train
data_test = tfidf_vect_test.transform(questions_test)
target_test = relevancies_test

In [11]:
data_train

<30411x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 785034 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier


from xgboost import XGBClassifier

In [14]:
names = ["Nearest Neighbors", "Linear SVM", "Decision Tree",
         "Random Forest", "Neural Net", "AdaBoost",
        "Ridge Classifier", "Perceptron", "Passive-Aggressive",
        "LinearSVC", "l2", "elasticnet", "Rocchio classifier", "MultinomialNB", "BernoulliNB"]

classifiers = [
    KNeighborsClassifier(10),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    RidgeClassifier(tol=1e-2, solver="lsqr"),
    Perceptron(n_iter=50),
    PassiveAggressiveClassifier(n_iter=50),
    LinearSVC(penalty="l2", dual=False, tol=1e-3),
    SGDClassifier(alpha=.0001, n_iter=50, penalty="l2"),
    SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"),
    NearestCentroid(),
    MultinomialNB(alpha=.01),
    BernoulliNB(alpha=.01)
]

In [15]:
for name, clf in zip(names, classifiers):
    clf.fit(data_train, target_train)
    print(name, np.mean(clf.predict(data_test) == target_test))

Nearest Neighbors 0.394801685081


KeyboardInterrupt: 

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, MinMaxScaler

svd = TruncatedSVD(128)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd)

lsa_train = lsa.fit_transform(data_train)
lsa_test = lsa.fit_transform(data_test)

In [None]:
for name, clf in zip(names, classifiers):
    clf.fit(lsa_train, target_train)
    print(name, np.mean(clf.predict(lsa_test) == target_test))

In [22]:
svd = TruncatedSVD(32)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd)

lsa_train = lsa.fit_transform(data_train)
lsa_test = lsa.fit_transform(data_test)

clf = Perceptron(max_iter=50)
clf.fit(lsa_train, target_train)
print('Perceptron', np.mean(clf.predict(lsa_test) == target_test))

Perceptron 0.607662347985


In [17]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=256, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(data_train)


In [None]:
nmf = NMF(n_components=64, random_state=1,
          alpha=.1, l1_ratio=.5).fit(data_train)

In [18]:
nmf_train = nmf.transform(data_train)
nmf_test = nmf.transform(data_test)

clf = Perceptron(max_iter=50)
clf.fit(nmf_train, target_train)
print('Perceptron', np.mean(clf.predict(nmf_test) == target_test))

Perceptron 0.607662347985


In [19]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=128, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(data_train)

In [20]:
lda_train = lda.transform(data_train)
lda_test = lda.transform(data_test)

clf = Perceptron(max_iter=50)
clf.fit(lda_train, target_train)
print('Perceptron', np.mean(clf.predict(lda_test) == target_test))

Perceptron 0.607662347985


In [96]:
(len(target_test) - 4936)/len(target_test)

0.6076623479850568

In [None]:
for name, clf in zip(names, classifiers):
    clf.fit(lda_train, target_train)
    print(name, np.mean(clf.predict(lda_test) == target_test))

array([-1.33628772])

In [30]:
test_dataset_path = '../TEST/2017/SemEval2017-Task3-CQA-MD-test-input.xml'

tree = ET.parse(test_dataset_path)
root = tree.getroot()

    
for Question in root:
    QID = int(Question.get('QID'))
    Qtext = Question.find('Qtext').text
    
    for QApair in Question.iter('QApair'): 
        QAID = int(QApair.get('QAID'))
        QArel = QApair.get('QArel')
        QAquestion = QApair.find('QAquestion').text
        QAanswer = QApair.find('QAanswer').text
        
        query_data_test = tfidf_vect_test.transform([QAquestion])
        query_lda_test =  lda.transform(query_data_test)
        
        query_nmf_test =  nmf.transform(query_data_test)
        
        
        QArel = clf.predict(query_lsa_test)[0]
        QAconf = clf.decision_function(query_lsa_test)[0]
        
        QApair.set('QArel', 'R' if QArel == 1 else 'I')
        QApair.set('QAconf', str(round(QAconf, 4)))

tree.write('../TEST/2017/SemEval2017-Task3-CQA-MD-test-input-tfidf-perceptron.xml', encoding='utf-8')

  if (previous_error - error) / error_at_init < tol:


In [25]:
def map(gold_path, pred_path):
    """
    Measure MAP 'Mean Average Precision' using the gold labels and prediction labels
    for each query
    """
    
    gold_dataset = readXML(gold_path)
    pred_dataset = readXML(pred_path)
    
    AP = []
    
    for QID in gold_dataset.keys():
        assert gold_dataset[QID]['QApairs'].keys() == pred_dataset[QID]['QApairs'].keys()
        gold_labels = [0 if QApair['QArel'] == 'I' else 1 for QApair in gold_dataset[QID]['QApairs'].values()]
        pred_labels = [0 if QApair['QArel'] == 'I' else 1 for QApair in pred_dataset[QID]['QApairs'].values()]
        
        AP.append(average_precision_score(gold_labels, pred_labels))
        
        
    MAP = np.nanmean(AP)
    
    return MAP

In [29]:
map('../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml', '../TEST/2017/SemEval2017-Task3-CQA-MD-test-input-tfidf-perceptron.xml')

  recall = tps / tps[-1]


0.48822222222222222