In [52]:
import sys
import pandas as pd
import numpy as np
import math
from scipy.spatial.distance import cosine
import xml.etree.ElementTree as ET
import pickle
import re

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

In [3]:
sys.path.insert(0, '../EVAL/scorer_v2.3/MAP_scripts/')
from ev import evaluate

In [145]:
def readXML(path):
    """
    Read XML file into a dictionary
    """
    tree = ET.parse(path)
    root = tree.getroot()
    
    dataset = pd.DataFrame(columns=['QID', 'QAID'], dtype=int)
    
    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text
        
        for QApair in Question.iter('QApair'): 
            QAID = int(QApair.get('QAID'))
            QArel = QApair.get('QArel')
            QAconf = QApair.get('QAconf')
            QAquestion = QApair.find('QAquestion').text
            QAanswer = QApair.find('QAanswer').text
            QAoverlap = ' '.join([w for w in wordpunct_tokenize(Qtext) if w in wordpunct_tokenize(QAquestion)])
            
            dataset = dataset.append({'QID': QID,
                                    'QAID': QAID,
                                    'Qtext': Qtext,
                                    'QAquestion': QAquestion,
                                    'QAanswer': QAanswer,
                                    'QArel': 0 if QArel == 'I' else 1,
                                    'QAconf': QAconf,
                                    'QAoverlao': QAoverlap}, ignore_index=True)

            
    dataset.set_index(['QID', 'QAID'], inplace=True)
    return dataset

In [146]:
test_dataset = readXML('../TEST/2017/SemEval2017-Task3-CQA-MD-test.xml')
train_dataset = readXML('../TRAIN/SemEval2016-Task3-CQA-MD-train.xml')

In [147]:
train_dataset = train_dataset.sort_index(level=0, ascending=[False, True])
train_dataset = train_dataset.reset_index().drop_duplicates().set_index(['QID', 'QAID'])

test_dataset = test_dataset.sort_index(level=0, ascending=[False, True])
test_dataset = test_dataset.reset_index().drop_duplicates().set_index(['QID', 'QAID'])

In [148]:
from nltk import wordpunct_tokenize
from nltk.stem import ISRIStemmer

class StemTokenizer(object):
    def __init__(self):
        self.wnl = ISRIStemmer()
    def __call__(self, doc):
         return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]

In [244]:
lsa = Pipeline([('tfidf', TfidfVectorizer())])

In [245]:
lsa.fit(list(set(train_dataset['QAoverlao'].append(test_dataset['QAoverlao']))))
train_overlap = lsa.transform(train_dataset['QAoverlao'])
train_labels = train_dataset['QArel']

In [246]:
train_overlap.shape, train_labels.shape

((30411, 4388), (30411,))

In [247]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(train_overlap, train_labels):
    X_train, y_train = train_overlap[train_index], train_labels[train_index]
    X_test, y_test = train_overlap[test_index], train_labels[test_index]

In [248]:
clf = xgb.XGBClassifier(n_estimators=1000)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='error')

[0]	validation_0-error:0.397337
[1]	validation_0-error:0.39635
[2]	validation_0-error:0.396679
[3]	validation_0-error:0.397008
[4]	validation_0-error:0.396515
[5]	validation_0-error:0.396844
[6]	validation_0-error:0.396515
[7]	validation_0-error:0.396186
[8]	validation_0-error:0.396844
[9]	validation_0-error:0.397008
[10]	validation_0-error:0.39635
[11]	validation_0-error:0.396022
[12]	validation_0-error:0.396022
[13]	validation_0-error:0.396186
[14]	validation_0-error:0.396186
[15]	validation_0-error:0.396186
[16]	validation_0-error:0.396022
[17]	validation_0-error:0.395693
[18]	validation_0-error:0.395364
[19]	validation_0-error:0.395364
[20]	validation_0-error:0.3952
[21]	validation_0-error:0.3952
[22]	validation_0-error:0.395529
[23]	validation_0-error:0.395364
[24]	validation_0-error:0.395364
[25]	validation_0-error:0.395364
[26]	validation_0-error:0.395364
[27]	validation_0-error:0.396186
[28]	validation_0-error:0.39635
[29]	validation_0-error:0.39635
[30]	validation_0-error:0.39

[245]	validation_0-error:0.385665
[246]	validation_0-error:0.385336
[247]	validation_0-error:0.385994
[248]	validation_0-error:0.385665
[249]	validation_0-error:0.385665
[250]	validation_0-error:0.385665
[251]	validation_0-error:0.385665
[252]	validation_0-error:0.385501
[253]	validation_0-error:0.385665
[254]	validation_0-error:0.385007
[255]	validation_0-error:0.385172
[256]	validation_0-error:0.384843
[257]	validation_0-error:0.385007
[258]	validation_0-error:0.385007
[259]	validation_0-error:0.385007
[260]	validation_0-error:0.385007
[261]	validation_0-error:0.385007
[262]	validation_0-error:0.385501
[263]	validation_0-error:0.385501
[264]	validation_0-error:0.385336
[265]	validation_0-error:0.385172
[266]	validation_0-error:0.385007
[267]	validation_0-error:0.385007
[268]	validation_0-error:0.385007
[269]	validation_0-error:0.385007
[270]	validation_0-error:0.385336
[271]	validation_0-error:0.385336
[272]	validation_0-error:0.385336
[273]	validation_0-error:0.385336
[274]	validati

[487]	validation_0-error:0.383692
[488]	validation_0-error:0.383692
[489]	validation_0-error:0.383692
[490]	validation_0-error:0.383692
[491]	validation_0-error:0.383363
[492]	validation_0-error:0.383363
[493]	validation_0-error:0.383528
[494]	validation_0-error:0.383363
[495]	validation_0-error:0.38287
[496]	validation_0-error:0.38287
[497]	validation_0-error:0.383035
[498]	validation_0-error:0.38287
[499]	validation_0-error:0.382706
[500]	validation_0-error:0.382706
[501]	validation_0-error:0.382706
[502]	validation_0-error:0.382706
[503]	validation_0-error:0.382706
[504]	validation_0-error:0.382706
[505]	validation_0-error:0.382706
[506]	validation_0-error:0.382706
[507]	validation_0-error:0.382377
[508]	validation_0-error:0.382213
[509]	validation_0-error:0.382377
[510]	validation_0-error:0.382377
[511]	validation_0-error:0.382377
[512]	validation_0-error:0.382542
[513]	validation_0-error:0.382542
[514]	validation_0-error:0.382542
[515]	validation_0-error:0.382542
[516]	validation_

[729]	validation_0-error:0.379089
[730]	validation_0-error:0.379089
[731]	validation_0-error:0.379089
[732]	validation_0-error:0.378103
[733]	validation_0-error:0.378103
[734]	validation_0-error:0.378103
[735]	validation_0-error:0.377939
[736]	validation_0-error:0.377939
[737]	validation_0-error:0.377939
[738]	validation_0-error:0.377939
[739]	validation_0-error:0.378103
[740]	validation_0-error:0.378267
[741]	validation_0-error:0.378267
[742]	validation_0-error:0.378267
[743]	validation_0-error:0.378267
[744]	validation_0-error:0.378267
[745]	validation_0-error:0.378267
[746]	validation_0-error:0.378103
[747]	validation_0-error:0.378103
[748]	validation_0-error:0.378103
[749]	validation_0-error:0.378267
[750]	validation_0-error:0.378267
[751]	validation_0-error:0.378267
[752]	validation_0-error:0.378267
[753]	validation_0-error:0.378267
[754]	validation_0-error:0.378267
[755]	validation_0-error:0.378103
[756]	validation_0-error:0.378103
[757]	validation_0-error:0.378267
[758]	validati

[971]	validation_0-error:0.376295
[972]	validation_0-error:0.376295
[973]	validation_0-error:0.37613
[974]	validation_0-error:0.37613
[975]	validation_0-error:0.37613
[976]	validation_0-error:0.37613
[977]	validation_0-error:0.37613
[978]	validation_0-error:0.375966
[979]	validation_0-error:0.375308
[980]	validation_0-error:0.375473
[981]	validation_0-error:0.375473
[982]	validation_0-error:0.375637
[983]	validation_0-error:0.375637
[984]	validation_0-error:0.375637
[985]	validation_0-error:0.375637
[986]	validation_0-error:0.375637
[987]	validation_0-error:0.375637
[988]	validation_0-error:0.375801
[989]	validation_0-error:0.375637
[990]	validation_0-error:0.375637
[991]	validation_0-error:0.375473
[992]	validation_0-error:0.375473
[993]	validation_0-error:0.375473
[994]	validation_0-error:0.375473
[995]	validation_0-error:0.374651
[996]	validation_0-error:0.375144
[997]	validation_0-error:0.375144
[998]	validation_0-error:0.374651
[999]	validation_0-error:0.374815


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [249]:
test_overlap = lsa.transform(test_dataset['QAoverlao'])
test_labels = test_dataset['QArel']

In [250]:
test_overlap.shape, test_labels.shape

((12581, 4388), (12581,))

In [251]:
clf.predict_proba(test_overlap)

array([[ 0.44275808,  0.55724192],
       [ 0.44275808,  0.55724192],
       [ 0.44275808,  0.55724192],
       ..., 
       [ 0.45959127,  0.54040873],
       [ 0.41049111,  0.58950889],
       [ 0.50280893,  0.49719104]], dtype=float32)

In [252]:
test_dataset['score'] = [score[1] for score in clf.predict_proba(test_overlap)]

In [253]:
test_dataset['relevance'] = ['true' if rel == 1.0 else 'false' for rel in clf.predict(test_overlap)]

In [254]:
test_dataset['rank'] = 0

In [255]:
test_dataset.to_csv('../EVAL/SemEval2017-Task3-CQA-MD-test-lsa-mlp.xml.pred', sep='\t', header=None, columns=['QID', 'QAID', 'rank', 'score', 'relevance' ])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_nested_tuple(tup)


In [256]:
test_dataset = test_dataset.sort_index(level=0, ascending=[False, True])
test_dataset = test_dataset.reset_index().drop_duplicates().set_index(['QID', 'QAID'])
test_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,QAanswer,QAconf,QAoverlao,QAquestion,QArel,Qtext,score,relevance,rank
QID,QAID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
69579,43523,يعود تطبل البطن أو ما يعرف بغازات البطن الى ال...,,القلب,اعاني من سرعة نبضات القلب وضيق في التنفس وألم ...,1.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.557242,true,0
69579,51753,قبل أن تبدأ ببرنامج المشي، يتطلب منك أن تعرف ف...,,,هل المشي الغير سريع والذي لا يحدث خفقان سريع و...,1.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.557242,true,0
69579,61846,التهاب الدم أو ما يعرف طبيا بتعفن الدم هو أحد ...,,,ماهم التهاب الدم وماهي اعراضه وكيفية علاجه؟,0.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.557242,true,0
69579,63972,التهاب الكلى هو أمر شائع لدى مرضى الذئبة و يحد...,,,أثار مرض الذئبة الحمراء على الكلى والعلاج الفعال,0.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.557242,true,0
69579,117912,الأول هو اخْتِبار أَضْدادِ الحالَّةِ العُقْدِي...,,,السلام عليكم قمت مؤخرا باجراء تحاليل aslo و cr...,0.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.557242,true,0
69579,229172,العصب بنقل الايعازات الحسية والحركية من الجهاز...,,عدم انتظام ضربات القلب,ما الاضرار الناتجة عن زيادة نشاط العصب الحائر ...,1.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.645821,true,0
69579,289894,لا ينصح باستخدام الكورتيزون بهدف زيادة الوزن ل...,,,اريد ان اعرف ما هى جرعات دواء ال dexamethasone...,0.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.557242,true,0
69579,684584,غالبا هذا سببه الرهاب الاجتماعي و رغم ذلك بجب ...,,ضربات القلب,لدى زياد في ضربات القلب و رعشة بمجرد حدوث أمر ...,1.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.443390,false,0
69579,880238,نبدأ بعلاج دوائي اذا لم ينفع بنعمل الكي,,القلب,جوزي عنده نبضات القلب سريعه بتوصل 145 وتضخم في...,1.0,عدم انتظام ضربات القلب بعد الاصابة بالتهاب الل...,0.557242,true,0
69578,47192,ممكن ان يكون شئ طبيعي لكن للاطمئنان انصحك بمرا...,,متزوجة في تاتي يوم في,مرحبا انا عمري 25 وغير متزوجة دورتي كانت تاتي ...,0.0,متزوجة منذ شهرين في الشهر الاول جاتني دورة يوم...,0.625914,true,0


In [257]:
MAP, Accuracy, P, R, F1  = evaluate('../EVAL/SemEval2017-Task3-CQA-MD-test.xml.subtaskD.relevancy', '../EVAL/SemEval2017-Task3-CQA-MD-test-lsa-mlp.xml.pred')

In [258]:
MAP

0.5369180221493354