In [1]:
import sys
import pandas as pd
import numpy as np
import math
from scipy.spatial.distance import cosine
import xml.etree.ElementTree as ET
import pickle
import re
import itertools

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC

In [3]:
def readXML(path):
    """
    Read XML file into a dictionary
    """
    tree = ET.parse(path)
    root = tree.getroot()
    
    dataset = pd.DataFrame(columns=['QID', 'QAID'], dtype=int)
    
    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text
        
        for QApair in Question.iter('QApair'): 
            QAID = int(QApair.get('QAID'))
            QArel = QApair.get('QArel')
            QAconf = QApair.get('QAconf')
            QAquestion = QApair.find('QAquestion').text
            QAanswer = QApair.find('QAanswer').text
            
            dataset = dataset.append({'QID': QID,
                                    'QAID': QAID,
                                    'Qtext': Qtext,
                                    'QAquestion': QAquestion,
                                    'QAanswer': QAanswer,
                                    'QArel': 0 if QArel == 'I' else 1,
                                    'QAconf': QAconf}, ignore_index=True)
            
    dataset.set_index(['QID', 'QAID'], inplace=True)
    return dataset

In [4]:
def transform_pairwise(X, y):
    """Transforms data into pairs with balanced labels for ranking
    Transforms a n-class ranking problem into a two-class classification
    problem. Subclasses implementing particular strategies for choosing
    pairs should override this method.
    In this method, all pairs are choosen, except for those that have the
    same target value. The output is an array of balanced classes, i.e.
    there are the same number of -1 as +1
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data
    y : array, shape (n_samples,) or (n_samples, 2)
        Target labels. If it's a 2D array, the second column represents
        the grouping of samples, i.e., samples with different groups will
        not be considered.
    Returns
    -------
    X_trans : array, shape (k, n_feaures)
        Data as pairs
    y_trans : array, shape (k,)
        Output class labels, where classes have values {-1, +1}
    """
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            # skip if same target or different group
            continue
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i, 0] - y[j, 0]))
        # output balanced classes
        if y_new[-1] != (-1) ** k:
            y_new[-1] = - y_new[-1]
            X_new[-1] = - X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()

In [5]:
train_dataset = readXML('../TRAIN/SemEval2016-Task3-CQA-MD-train.xml')
test_dataset = readXML('../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml')

train_dataset = train_dataset.sort_index(level=0, ascending=[False, True])
train_dataset = train_dataset.reset_index().drop_duplicates().set_index(['QID', 'QAID'])

test_dataset = test_dataset.sort_index(level=0, ascending=[False, True])
test_dataset = test_dataset.reset_index().drop_duplicates().set_index(['QID', 'QAID'])

In [6]:
from nltk import wordpunct_tokenize
from nltk.stem import ISRIStemmer

class StemTokenizer(object):
    def __init__(self):
        self.wnl = ISRIStemmer()
    def __call__(self, doc):
         return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]

In [7]:
train_lsa = Pipeline([('tfidf', TfidfVectorizer(min_df=1, max_df=0.1, tokenizer=StemTokenizer(), stop_words=stopwords.words('arabic'), smooth_idf=False, sublinear_tf=True, norm='l2', max_features=1000)),
                      ('lsa',  TruncatedSVD(n_components=900,n_iter=3)),
                      ('normalizer', Normalizer(copy=False))])
train_lsa.fit(list(set(train_dataset['Qtext'])))

test_lsa = Pipeline([('tfidf', TfidfVectorizer(min_df=1, max_df=0.1, tokenizer=StemTokenizer(), stop_words=stopwords.words('arabic'), smooth_idf=False, sublinear_tf=True, norm='l2', max_features=1000)),
                     ('lsa',  TruncatedSVD(n_components=900,n_iter=3)),
                     ('normalizer', Normalizer(copy=False))])
test_lsa.fit(list(set(test_dataset['Qtext'])))

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.1, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=False,
...0, n_iter=3,
       random_state=None, tol=0.0)), ('normalizer', Normalizer(copy=False, norm='l2'))])

In [9]:
train_query_vec = train_lsa.transform(train_dataset['Qtext'])
train_question_vec = train_lsa.transform(train_dataset['QAquestion'] + train_dataset['QAanswer'])

X_train = train_question_vec
Y_train = np.c_[train_dataset['QAconf'].astype(np.float), train_dataset.reset_index()['QID']]

X_valid = X_train[int(X_train.shape[0] * 0.8):]
Y_valid = Y_train[int(Y_train.shape[0] * 0.8):]

X_train = X_train[:int(X_train.shape[0] * 0.8)]
Y_train = Y_train[:int(Y_train.shape[0] * 0.8)]

In [10]:
X_train_trans, Y_train_tran = transform_pairwise(X_train, Y_train)
X_valid_trans, Y_valid_trans = transform_pairwise(X_valid, Y_valid)

In [None]:
clf = GradientBoostingClassifier()
clf.fit(X_train_trans, Y_train_tran)
clf.score(X_valid_trans, Y_valid_trans)

In [48]:
test_query_vec = test_lsa.transform(test_dataset['Qtext'])
test_question_vec = test_lsa.transform(test_dataset['QAquestion'] + test_dataset['QAanswer'])

X_test = test_question_vec
Y_test = test_dataset['QArel']

predictions = clf.predict(X_test)
predictions_proba = np.dot(X_test, clf.coef_.ravel())

In [52]:
test_dataset['Score'] = predictions_proba
test_dataset['Relevance'] = predictions
test_dataset['Rank'] = 0

In [37]:
def MAP(gold_dataset, pred_dataset, th=10):
    dataset = pred_dataset.join(gold_dataset, lsuffix='_pred', rsuffix='_gold')[['Score_pred', 'Relevance_gold']].reset_index()
    dataset = dataset.sort_values(['QID', 'Score_pred'], ascending=False)
    dataset['Rank_pred'] = dataset.groupby('QID')['Score_pred'].rank(ascending=False)
    dataset = dataset[dataset.Relevance_gold]
    dataset = dataset[dataset.Rank_pred <= th]
    dataset['Position'] = dataset.groupby('QID')['Rank_pred'].rank(ascending=True)
    dataset['Precision'] = dataset.Position / dataset.Rank_pred
    AP = dataset.groupby('QID')['Precision'].mean()
    return round(AP.sum() / len(pred_dataset.groupby('QID')),4) * 100

In [38]:
gold_dataset = pd.read_csv('../EVAL/SemEval2017-Task3-CQA-MD-test.xml.subtaskD.relevancy', sep='\t',  names=['QID', 'QAID', 'Rank', 'Score', 'Relevance'], index_col=['QID', 'QAID'])

In [53]:
MAP(gold_dataset, test_dataset) 

47.770000000000003