In [None]:
from gensim.corpora.wikicorpus import WikiCorpus

# Open the compressed arwiki (Arabic Wikipedia) articles dump as a gensim
# WikiCorpus; `wiki.get_texts()` is consumed later to build the training corpus.
# NOTE(review): depending on the gensim version the constructor may scan the
# whole dump to build a dictionary, which makes this cell slow — confirm.
wiki = WikiCorpus('../corpus/arwiki-latest-pages-articles.xml.bz2')

In [6]:
from gensim import utils

# Materialize the Wikipedia texts as a list of {'id', 'tokens'} documents,
# numbering them doc_0, doc_1, ... in the order get_texts() produces them.
corpus = []
for num, text in enumerate(wiki.get_texts()):
    corpus.append({'id': 'doc_%i' % num, 'tokens': text})


In [1]:
from PyArabic import ArabicPreprocessor

# Shared preprocessor instance. The Labeled* iterator classes defined in the
# following cells read this module-level name when they are iterated, so this
# cell has to run before any of them is consumed.
preprocessor = ArabicPreprocessor()

In [2]:
class LabeledQID(object):
    """Iterable over the one original question whose ``QID`` equals ``qid``.

    Iterating parses ``filename`` with ``etree.parse``, takes the text of the
    first ``Question[@QID=qid]/Qtext`` element, passes it through
    ``preprocessor.removeStopwords``, ``preprocessor.tokenize`` and
    ``preprocessor.deNoise`` (per token), and yields a single
    ``LabeledSentence`` whose only tag is ``'0'``.

    The names ``etree``, ``LabeledSentence`` and ``preprocessor`` are looked up
    in the notebook namespace at iteration time, so the cells defining them
    must have been executed before this object is iterated.

    Parameters
    ----------
    filename : path of the XML file to parse.
    qid : value of the ``QID`` attribute to look for.

    Raises
    ------
    IndexError
        If no ``Question`` with that ``QID`` has a ``Qtext`` child.
    """

    def __init__(self, filename, qid):
        self.filename = filename
        self.qid = qid

    def __iter__(self):
        tree = etree.parse(self.filename)

        # Bind the id as an XPath variable instead of splicing it into the
        # expression text: a spliced non-numeric id is parsed as an element
        # name and silently matches nothing.
        matches = tree.xpath('Question[@QID=$qid]/Qtext', qid=self.qid)
        sentence = preprocessor.removeStopwords(matches[0].text)
        tokens = [preprocessor.deNoise(token)
                  for token in preprocessor.tokenize(sentence)]

        # NOTE(review): earlier revisions also ran removeDiacritics -> deNoise
        # -> normalizeAlef -> normalizeAggressive -> lemmatize over the tokens
        # but threw the results away and yielded the de-noised tokens only.
        # That discarded work is gone; if the fully normalized tokens were the
        # intent, chain those steps here and yield their result instead.
        yield LabeledSentence(words=tokens, tags=['0'])

In [3]:
class LabeledQAPair(object):
    """Iterable over the ``QApair`` elements of one original question.

    Iterating parses ``filename`` with ``etree.parse`` and, for every
    ``Question[@QID=qid]/QApair`` element, passes the text of its first
    ``QAquestion`` child through ``preprocessor.removeStopwords``,
    ``preprocessor.tokenize`` and ``preprocessor.deNoise`` (per token), then
    yields a ``LabeledSentence`` tagged with the pair's ``QAID`` attribute.
    Only the question side of each pair is used; ``QAanswer`` is not read.

    The names ``etree``, ``LabeledSentence`` and ``preprocessor`` are looked up
    in the notebook namespace at iteration time, so the cells defining them
    must have been executed before this object is iterated.

    Parameters
    ----------
    filename : path of the XML file to parse.
    qid : value of the ``QID`` attribute of the enclosing ``Question``.

    Raises
    ------
    IndexError
        If a matching ``QApair`` has no ``QAquestion`` child.
    """

    def __init__(self, filename, qid):
        self.filename = filename
        self.qid = qid

    def __iter__(self):
        tree = etree.parse(self.filename)

        # Bind the id as an XPath variable instead of splicing it into the
        # expression text: a spliced non-numeric id is parsed as an element
        # name and silently matches nothing.
        for qapair in tree.xpath('Question[@QID=$qid]/QApair', qid=self.qid):
            qaid = qapair.get('QAID')
            qaquestion = preprocessor.removeStopwords(
                qapair.xpath('QAquestion')[0].text)
            tokens = [preprocessor.deNoise(token)
                      for token in preprocessor.tokenize(qaquestion)]

            # NOTE(review): earlier revisions also ran removeDiacritics ->
            # deNoise -> normalizeAlef -> normalizeAggressive -> lemmatize but
            # discarded the results and yielded the de-noised tokens only. If
            # the fully normalized tokens were the intent, chain those steps
            # here and yield their result instead.
            yield LabeledSentence(words=tokens, tags=['%s' % qaid])
            

In [4]:
from gensim.models.doc2vec import LabeledSentence
from lxml import etree
from collections import OrderedDict

class LabeledQuestion(object):
    """Iterable over every original question in an XML file.

    Iterating parses ``filename`` with ``etree.parse`` and, for each
    ``Question`` element, passes the text of its first ``Qtext`` child through
    ``preprocessor.removeStopwords``, ``preprocessor.tokenize`` and
    ``preprocessor.deNoise`` (per token), then yields a ``LabeledSentence``
    tagged with the question's ``QID`` attribute.

    The name ``preprocessor`` is looked up in the notebook namespace at
    iteration time, so the cell defining it must have been executed first.

    Parameters
    ----------
    filename : path of the XML file to parse.

    Raises
    ------
    IndexError
        If a ``Question`` element has no ``Qtext`` child.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        tree = etree.parse(self.filename)

        for question in tree.xpath('Question'):
            qid = question.get('QID')
            qtext = preprocessor.removeStopwords(
                question.xpath('Qtext')[0].text)
            tokens = [preprocessor.deNoise(token)
                      for token in preprocessor.tokenize(qtext)]

            # NOTE(review): earlier revisions also ran removeDiacritics ->
            # deNoise -> normalizeAlef -> normalizeAggressive -> lemmatize but
            # discarded the results and yielded the de-noised tokens only. If
            # the fully normalized tokens were the intent, chain those steps
            # here and yield their result instead.
            yield LabeledSentence(words=tokens, tags=['%s' % qid])
            



In [7]:
from simserver import SessionServer

# Similarity server rooted at ../tmp/ (presumably where it keeps its model and
# index files — confirm against the simserver documentation).
service = SessionServer('../tmp/')

# Train the server's LSI model on `corpus`, the list of {'id', 'tokens'}
# documents built from the Wikipedia dump; that cell must have run first.
service.train(corpus, method='lsi')

In [8]:
import sys

class QuestionPairSimilarity(object):
    """Iterable of ``(qid, qaid, score)`` similarity triples.

    For every original question in ``filename`` the iterator:

    1. calls ``service.drop_index()``,
    2. indexes the question itself under its ``qid``,
    3. indexes each of its QA pairs under the pair's ``QAID``,
    4. calls ``service.find_similar(qid)`` and yields one triple per result.

    The ``qid`` is printed as each question is started, as a progress trace.

    Uses the notebook-level ``service``, ``LabeledQuestion``, ``LabeledQID``
    and ``LabeledQAPair``; all must be defined (and ``service`` trained)
    before iteration starts.

    Parameters
    ----------
    filename : path of the XML file holding the questions and QA pairs.
        Defaults to the test-set path this notebook has always used.

    Raises
    ------
    ValueError
        If ``LabeledQID`` yields nothing for a question id.
    """

    def __init__(self, filename='../input/SemEval2016-Task3-CQA-MD-test.xml'):
        self.filename = filename

    def __iter__(self):
        for q in LabeledQuestion(self.filename):
            # Each question gets its own index: presumably drop_index()
            # discards whatever was indexed for the previous question —
            # confirm against the simserver documentation.
            service.drop_index()
            qid = q.tags[0]
            print(qid)

            # Fetch the question explicitly rather than relying on a loop
            # variable leaking out of a `for ...: pass`, which would reuse the
            # previous question's tokens if nothing were yielded.
            question = next(iter(LabeledQID(self.filename, qid)), None)
            if question is None:
                raise ValueError('no question found for QID %s' % qid)
            service.index([{'id': qid, 'tokens': list(question.words)}])

            for pair in LabeledQAPair(self.filename, qid):
                service.index([{'id': pair.tags[0],
                                'tokens': list(pair.words)}])

            # NOTE(review): the query document is itself in the index, so the
            # results may include a (qid, qid, ...) self-match — confirm and
            # decide whether downstream statistics should exclude it.
            for qaid, score, _ in service.find_similar(qid):
                yield qid, qaid, score
                   

In [9]:
# Lazy iterable: the class only defines a generator `__iter__`, so no indexing
# or scoring happens here — the work runs when this object is iterated.
scored_questions = QuestionPairSimilarity()


In [10]:
import numpy as np
from collections import defaultdict

In [11]:
# Group the (qid, qaid, score) triples by original question id:
# scores[qid] is a list of {'qaid': ..., 'score': ...} records.
scores = defaultdict(list)
for qid, qaid, score in scored_questions:
    entry = {'qaid': qaid, 'score': score}
    scores[qid].append(entry)

201399
200902
200172
200875
200030
200066
201135
201425
201499
200360
201430
200241
200477
200369
201475
200783
200211
200988
200323
201158
201214
201272
200975
201583
200484
201329
200721
200400
200763
201308
200553
201167
201193
201564
201317
200817
200209
201521
201179
201571
200797
200754
200852
201556
200620
200859
200403
200522
201057
201294
200581
200789
201171
200904
201008
201455
200494
200521
200280
201244
201516
200085
200805
201177
201405
200726
201377
201285
200523
201572
200724
200938
200374
200355
200515
201446
200509
201436
200202
201270
200719
201554
201254
200775
200750
201274
200058
201229
200827
201321
200188
201288
201116
200885
201265
201333
200767
201451
201342
201258
201492
200174
201157
201216
200572
200841
201543
201458
200462
201130
201287
201074
200420
200480
201203
201303
201391
201104
200912
200723
200979
200993
200144
201175
200152
201551
200969
201535
201566
201324
200786
201152
200185
200184
200728
200926
200079
201476
200407
200607
201246
200048
200486

In [12]:
# percentiles[qid] == [75th percentile, 99th percentile] of that question's
# own similarity scores.
percentiles = defaultdict(list)
# myscores[qid][qaid] == score; pairs that were never scored read back as 0
# through the inner defaultdict(int).
myscores = defaultdict(lambda: defaultdict(int))
for qid, entries in scores.items():
    # Start a fresh score list for every question. A single list shared
    # across questions made each threshold depend on the scores of every
    # question visited before it, and therefore on dict iteration order.
    data = []
    for dic in entries:
        qaid = dic['qaid']
        score = dic['score']
        data.append(score)
        myscores[qid][qaid] = score
    percentiles[qid].append(np.percentile(data, 75))
    percentiles[qid].append(np.percentile(data, 99))

In [13]:
# Write one tab-separated line per (question, QA pair) to the prediction file:
# qid, qaid, a literal 0 column, the similarity score and a 'true'/'false'
# relevance label. Pairs whose QAID equals the question's QID are skipped.
input_path = '../input/SemEval2016-Task3-CQA-MD-test.xml'

with open('../output/SemEval2016-Task3-CQA-MD-qa-subtaskD.xml.pred', 'w') as f:
    questions = LabeledQuestion(input_path)
    for question in questions:
        qid = question.tags[0]
        pairs = LabeledQAPair(input_path, qid)
        for pair in pairs:
            qaid = pair.tags[0]
            # `!=` replaces the Python-2-only `<>` operator.
            if qid != qaid:
                relevance = 'false'
                # Direct membership test; `.keys()` built a list per lookup.
                if qid in myscores:
                    score = myscores[qid][qaid]
                    # percentiles[qid][0] is the question's 75th-percentile
                    # score; only pairs strictly above it are marked relevant.
                    if score > percentiles[qid][0]:
                        relevance = 'true'
                else:
                    # Question was never scored: emit a zero score, 'false'.
                    score = 0
                f.write('%s\t%s\t0\t%f\t%s\n' % (qid, qaid, score, relevance))
