In [1]:
class MapHelper:
    """Helpers for dictionaries whose values are collections.

    Both helpers are static methods, so they can be called either on the
    class (``MapHelper.append_to_list(...)``) or on an instance.
    """

    @staticmethod
    def append_to_list(map_with_list_as_value, map_key, new_list_element):
        """Add ``new_list_element`` to the list stored under ``map_key``.

        A missing key is treated as an empty list. The value stored in the
        map is replaced by a *new* list, so a list object previously stored
        under ``map_key`` is never mutated.
        """
        existing_list = map_with_list_as_value.get(map_key, [])
        map_with_list_as_value[map_key] = existing_list + [new_list_element]

    @staticmethod
    def append_to_set(map_with_set_as_value, map_key, new_set_element):
        """Add ``new_set_element`` to the set stored under ``map_key``.

        A missing key is initialised with an empty set first. An existing
        set is updated in place.
        """
        map_with_set_as_value.setdefault(map_key, set()).add(new_set_element)

In [2]:
class QuestionRelevance:
    """Conversions between string relevance labels and binary integers.

    Makes no distinction between PerfectMatch and Relevant questions:
    every label other than "Irrelevant" maps to ``INT_RELEVANT``.
    """

    INT_RELEVANT = 1
    RELEVANT = "Relevant"

    PERFECT_MATCH = "PerfectMatch"
    # Misspelled name kept so existing references keep working.
    PERFECT_MACH = PERFECT_MATCH

    INT_IRRELEVANT = 0
    IRRELEVANT = "Irrelevant"

    @staticmethod
    def to_number(string_relevance):
        """Return ``INT_IRRELEVANT`` for "Irrelevant", else ``INT_RELEVANT``."""
        if string_relevance == QuestionRelevance.IRRELEVANT:
            return QuestionRelevance.INT_IRRELEVANT
        return QuestionRelevance.INT_RELEVANT

    @staticmethod
    def from_number(int_relevance):
        """Return "Irrelevant" for ``INT_IRRELEVANT``, else "Relevant"."""
        if int_relevance == QuestionRelevance.INT_IRRELEVANT:
            return QuestionRelevance.IRRELEVANT
        return QuestionRelevance.RELEVANT

In [3]:
class Thread:
    """A question bundled with its list of related questions.

    Both string conversions are delegated to the wrapped question.
    """

    def __init__(self, question, rel_questions):
        self.question, self.rel_questions = question, rel_questions

    def __repr__(self):
        # Debug representation comes straight from the question object.
        return repr(self.question)

    def __str__(self):
        # Human-readable form comes straight from the question object.
        return str(self.question)

In [4]:
from collections import namedtuple


class QuestionData:
    """Subject and body of a question; rendered as its body text."""

    def __init__(self, subject, body):
        self.subject, self.body = subject, body

    def __repr__(self):
        # The debug representation deliberately mirrors the string form.
        return str(self)

    def __str__(self):
        body_text = str(self.body)
        return body_text
        
        
class RelQuestionData:
    """Metadata, subject and body of a related question.

    The class constants name the node attributes from which the metadata
    values are read; instances are rendered as their body text.
    """

    STRING_RELEVANCE_KEY = "RELQ_RELEVANCE2ORGQ"
    STRING_USERNAME_KEY = "RELQ_USERNAME"
    STRING_DATE_KEY = "RELQ_DATE"
    STRING_ID_KEY = "RELQ_ID"

    def __init__(self, metadata, subject, body):
        self.metadata, self.subject, self.body = metadata, subject, body

    def __repr__(self):
        # The debug representation deliberately mirrors the string form.
        return str(self)

    def __str__(self):
        body_text = str(self.body)
        return body_text

    
class Metadata:
    """Plain holder for a related question's user, date and relevance."""

    def __init__(self, user, date, relevance):
        self.user, self.date, self.relevance = user, date, relevance
    
    
# Immutable record with the fields `question`, `rel_question` and `data`.
QuestionQuestionPair = namedtuple('QuestionQuestionPair', ['question', 'rel_question', 'data'])


# Defaults bind to the right-most parameters, so this one-element tuple makes
# only `data` optional (defaulting to None); the other two stay required.
QuestionQuestionPair.__new__.__defaults__ = (None,)

In [5]:
def to_question_data(node_question):
    """Build a ``QuestionData`` from a question node.

    The node's first child supplies the subject text and its second child
    supplies the body text.
    """
    subject_node, body_node = node_question[0], node_question[1]
    return QuestionData(subject_node.text, body_node.text)


def to_rel_question_data(node_rel_questions):
    """Convert related-question nodes into ``RelQuestionData`` objects.

    Nodes whose second child has no text (``None``) are skipped. For every
    other node the user name, date and relevance are read from the node's
    attributes, the relevance label is converted to a number, and the first
    and second child texts become the subject and body.
    """

    def build(node):
        attributes = node.attrib
        metadata = Metadata(
            attributes[RelQuestionData.STRING_USERNAME_KEY],
            attributes[RelQuestionData.STRING_DATE_KEY],
            QuestionRelevance.to_number(attributes[RelQuestionData.STRING_RELEVANCE_KEY]),
        )
        return RelQuestionData(metadata, node[0].text, node[1].text)

    rel_questions = []
    for node in node_rel_questions:
        if node[1].text is None:
            # No body text: leave this related question out.
            continue
        rel_questions.append(build(node))
    return rel_questions

In [26]:
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView

def load_data(string_basedir, string_filenames_in_basedir, rel_questions_per_thread=10):
    """Lazily yield ``(topic, Thread)`` pairs from XML corpus files.

    Each file is read with ``XMLCorpusReader`` and the children of its root
    element are consumed in order. From every child, the first child of its
    third child (``child[2][0]``) is collected as a related-question node.
    Once ``rel_questions_per_thread`` nodes have been collected, a ``Thread``
    is built from the most recent child (as the question) and the collected
    related-question nodes.

    Parameters:
        string_basedir: directory containing the corpus files.
        string_filenames_in_basedir: iterable of file names inside that
            directory; the files are processed in the given order.
        rel_questions_per_thread: number of consecutive root children that
            form one thread (default 10, the previously hard-coded value).

    Yields:
        ``(string_topic, thread)`` tuples; the topic is always ``""``.

    Raises:
        ValueError: if ``rel_questions_per_thread`` is smaller than 1
            (raised when iteration starts, since this is a generator).

    Note:
        A trailing group with fewer than ``rel_questions_per_thread`` nodes
        is yielded as a shorter thread instead of being silently dropped.
        Groups never span two files.
    """
    if rel_questions_per_thread < 1:
        raise ValueError("rel_questions_per_thread must be at least 1")

    def build_item(node_question, node_rel_questions):
        thread = Thread(to_question_data(node_question),
                        to_rel_question_data(node_rel_questions))
        string_topic = ""
        return string_topic, thread

    for string_filename in string_filenames_in_basedir:
        corpus_reader = XMLCorpusReader(string_basedir, string_filename)
        node_question = None
        node_rel_questions = []

        for root_node in corpus_reader.xml():
            # The latest node of the group is the one used as the question.
            node_question = root_node
            node_rel_questions.append(root_node[2][0])

            if len(node_rel_questions) == rel_questions_per_thread:
                item = build_item(node_question, node_rel_questions)
                node_rel_questions = []
                yield item

        if node_rel_questions:
            # Incomplete final group of this file: emit it rather than lose it.
            yield build_item(node_question, node_rel_questions)

In [27]:
from itertools import *


def get_limited_generator(string_basedir,
                          list_string_filenames_in_basedir,
                          start_index_inclusive,
                          end_index_exclusive=None,
                          step=None):
    """Return a slice of the ``load_data`` stream as a lazy iterator.

    The items produced by ``load_data`` for the given directory and files
    are restricted to positions ``start_index_inclusive`` up to (but not
    including) ``end_index_exclusive``, taking every ``step``-th item.
    ``None`` for the end means "until exhausted"; ``None`` for the step
    means every item.
    """
    return islice(
        load_data(string_basedir, list_string_filenames_in_basedir),
        start_index_inclusive,
        end_index_exclusive,
        step,
    )

In [28]:
def get_data(data_generator):
    """Flatten ``(topic, thread)`` pairs into parallel lists.

    For every related question of every thread, one entry is added to each
    of three lists: the body of the thread's question, the body of the
    related question, and the related question's relevance.

    Returns:
        ``(X, y)`` where ``X`` is ``[question_bodies, rel_question_bodies]``
        and ``y`` is the list of relevance values, all index-aligned.
    """
    labels = []
    question_bodies = []
    rel_question_bodies = []

    for _topic, thread in data_generator:
        original = thread.question

        for related in thread.rel_questions:
            rel_question_bodies.append(related.body)
            question_bodies.append(original.body)
            labels.append(related.metadata.relevance)

    return [question_bodies, rel_question_bodies], labels


In [37]:

# Stream every thread from the listed training files (no start offset, no end
# limit, default step) and flatten the stream into feature lists and labels.
# NOTE(review): the same file name is listed twice, so its threads are loaded
# twice; presumably the second entry was meant to be a different part of the
# training set -- confirm against the files available in 'train'.
data_generator = get_limited_generator(
    string_basedir='train',
    list_string_filenames_in_basedir=[
        'SemEval2016-Task3-CQA-QL-train-part1.xml',
        'SemEval2016-Task3-CQA-QL-train-part1.xml',
    ],
    start_index_inclusive=0,
    end_index_exclusive=None,
    step=None,
)

X, y = get_data(data_generator)

In [38]:
# First list in X: the body of the thread's question, repeated once per
# related question so that it lines up index-by-index with X[1] and y.
X[0]

['Where I can buy good oil for massage?',
 "Dear Members; I have my wife in qatar on family visit visa; now it's gonna over 6 months; can i get more 2/3 months visa?? what is the procedure?? pls inform.",
 'want to extend the visit visa. What are the procedures and how long i get the extension. thanks for the answers...',
 'How can my boyfriend who currently works for a government office there get me and our baby a visit visa? How long can we stay there in doha and can we also get married there as well? Can we stay together in his house there? Can you give us advice and the steps to take for getting visa and requirements for marriage there in Qatar? It will be greatly appreciated.Thanks in advance',
 'I begin employment in Qatar Aug 09 and have just been quoted 1000 to have them flown over. Is it worth the hassle of customs or should i buy new? How do prices compare UK Next with Qatar Next?',
 'My employer have recruited me as a Business Development Executive in a Labor Visa.I am put o

In [39]:
# Second list in X: the bodies of the related questions, index-aligned with
# X[0] and y.
X[1]

['is there any place i can find scented massage oils in qatar?',
 'Hi;Can any one tell me a place where i can have a good massage drom philipinies????? yesterday i had a massage in Bio-Bil they charged me 300qr for 01 hour bt it is totally waste... pls advice me if theres any philipinos....',
 "Tell me; where is the best place to go for a massage? Mind you; I don't want to spend 1000QR for it... (Guys; please don't come up with answers that you\xa0would gladly do it yourself; plz...)\xa0",
 'hi there; i can see a lot of massage center here; but i dont which one is better. can someone help me which massage center is good...and how much will it cost me? thanks',
 'What attracts you more ?',
 '[img_assist|nid=54388|title=Placenta cream ????|desc=|link=none|align=left|width=440|height=388] What the hell do you do with Placenta cream ??????',
 'Any suggestions on how to get rid of them??',
 'Can someone please advise me my husband wants to get Tea Tree Oil pure in Doha? thank you',
 'plz he