In [2]:
class MapHelper:
    """Helpers for mappings whose values are collections (lists or sets).

    Both helpers are static: they can be called on the class
    (``MapHelper.append_to_list(...)``) or on an instance.
    """

    @staticmethod
    def append_to_list(map_with_list_as_value, map_key, new_list_element):
        """Add ``new_list_element`` to the list stored under ``map_key``.

        A missing key is treated as an empty list. The mapping is updated
        in place; nothing is returned.
        """
        existing_list = map_with_list_as_value.get(map_key, [])
        # A new list is built and re-bound rather than appended to in place,
        # so a list object previously read from the mapping is not modified.
        map_with_list_as_value[map_key] = existing_list + [new_list_element]

    @staticmethod
    def append_to_set(map_with_set_as_value, map_key, new_set_element):
        """Add ``new_set_element`` to the set stored under ``map_key``.

        A missing key is treated as an empty set. The stored set itself is
        mutated and (re-)bound to the key; nothing is returned.
        """
        existing_set = map_with_set_as_value.get(map_key, set())
        existing_set.add(new_set_element)
        map_with_set_as_value[map_key] = existing_set

In [3]:
class AnswerRelevance:
    """
        Binary relevance labels for answers and conversions between their
        string and integer forms.

        Makes no distinction between
        PotentiallyUseful and Bad comments: every label other than
        "Good" is mapped to INT_BAD.
    """

    INT_GOOD = 1
    GOOD = "Good"

    POTENTIALLY_USEFUL = "PotentiallyUseful"

    INT_BAD = 0
    BAD = "Bad"

    @staticmethod
    def to_number(string_relevance):
        """Return INT_GOOD if ``string_relevance`` equals GOOD, else INT_BAD."""
        return AnswerRelevance.INT_GOOD \
            if string_relevance == AnswerRelevance.GOOD \
            else AnswerRelevance.INT_BAD

    @staticmethod
    def from_number(int_relevance):
        """Return GOOD if ``int_relevance`` equals INT_GOOD, else BAD."""
        # The comparison must use the parameter; the previous code referenced
        # an undefined name and raised NameError on every call.
        return AnswerRelevance.GOOD \
            if int_relevance == AnswerRelevance.INT_GOOD \
            else AnswerRelevance.BAD

In [4]:
class Thread:
    """A question together with the answers given to it.

    The string and repr forms of a thread are those of its question.
    """

    def __init__(self, question, answers):
        self.question, self.answers = question, answers

    def __str__(self):
        # Delegate to the question's own string form
        return str(self.question)

    def __repr__(self):
        # Delegate to the question's own repr
        return repr(self.question)

In [5]:
class ThreadCollection:
    """Threads grouped by topic string."""

    def __init__(self):
        # topic string -> list of threads added under that topic
        self.by_topic = dict()

    def add(self, string_topic, thread):
        """Register ``thread`` under ``string_topic``."""
        MapHelper.append_to_list(self.by_topic, string_topic, thread)

    def get(self, string_topic):
        """Return the threads stored for ``string_topic`` ([] if unknown)."""
        threads_by_topic = self.by_topic
        return threads_by_topic.get(string_topic, [])

    def topics(self):
        """Return a new list with all topics that have been added."""
        return list(self.by_topic)

    def topic_thread_pairs(self):
        """Return the (topic, thread list) items view of the collection."""
        threads_by_topic = self.by_topic
        return threads_by_topic.items()

In [6]:
from collections import namedtuple


class QuestionData:
    """Subject and body text of a question.

    Both the string form and the repr are the body converted with ``str``.
    """

    # Attribute name under which a question node stores its category
    STRING_CATEGORY_KEY = "RELQ_CATEGORY"

    def __init__(self, subject, body):
        self.subject, self.body = subject, body

    def __str__(self):
        body_text = str(self.body)
        return body_text

    def __repr__(self):
        # repr intentionally mirrors the string form
        return str(self)
        
        
class AnswerData:
    """Body text of an answer plus its metadata object.

    Both the string form and the repr are the body converted with ``str``.
    """

    # Attribute names used to look up values on an answer node
    STRING_RELEVANCE_KEY = "RELC_RELEVANCE2RELQ"
    STRING_USERNAME_KEY = "RELC_USERNAME"
    STRING_DATE_KEY = "RELC_DATE"
    STRING_ID_KEY = "RELC_ID"

    def __init__(self, metadata, body):
        self.metadata, self.body = metadata, body

    def __str__(self):
        body_text = str(self.body)
        return body_text

    def __repr__(self):
        # repr intentionally mirrors the string form
        return str(self)

    
class Metadata:
    """Plain holder for an answer's user, date and relevance values."""

    def __init__(self, user, date, relevance):
        self.user, self.date, self.relevance = user, date, relevance
    
    
# Immutable (question, answer, data) record.
QuestionAnswerPair = namedtuple('QuestionAnswerPair', 'question answer data')

# Defaults apply to the right-most fields, so only `data` becomes optional
# and falls back to None when omitted.
QuestionAnswerPair.__new__.__defaults__ = (None,)

In [7]:
def to_question_data(node_question):
    """Build a QuestionData from a question node.

    The text of the node's first child is used as the subject and the text
    of its second child as the body.
    """
    node_subject, node_body = node_question[0], node_question[1]
    return QuestionData(node_subject.text, node_body.text)


def to_answer_data(node_thread_answers):
    """Convert answer nodes into a list of AnswerData objects.

    For each node, the username, date and relevance attributes are read from
    ``node.attrib`` (relevance is converted with AnswerRelevance.to_number)
    and the text of the node's first child becomes the answer body.
    """
    def build_answer(node):
        # Metadata is assembled before the body text is read
        attributes = node.attrib
        metadata = Metadata(
            attributes[AnswerData.STRING_USERNAME_KEY],
            attributes[AnswerData.STRING_DATE_KEY],
            AnswerRelevance.to_number(attributes[AnswerData.STRING_RELEVANCE_KEY]))
        return AnswerData(metadata, node[0].text)

    answers = []
    for node in node_thread_answers:
        answers.append(build_answer(node))
    return answers

In [8]:
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView

def load_data(string_basedir, string_filenames_in_basedir):
    """Lazily yield ``(topic, thread)`` pairs from XML files.

    One XMLCorpusReader is created per file name. For every element obtained
    by iterating over the reader's ``xml()`` result, the element at index 2
    is taken as the thread node: its first child is the question and the
    remaining children are the answers. The topic is the question node's
    category attribute.
    """
    for string_filename in string_filenames_in_basedir:
        reader = XMLCorpusReader(string_basedir, string_filename)

        for entry in reader.xml():
            # Index 2 of each entry holds the thread node
            node_thread = entry[2]
            node_question, node_answers = node_thread[0], node_thread[1:]

            thread = Thread(to_question_data(node_question),
                            to_answer_data(node_answers))

            yield node_question.attrib[QuestionData.STRING_CATEGORY_KEY], thread

In [9]:
from itertools import *


def get_limited_generator(string_basedir,
                          list_string_filenames_in_basedir,
                          start_index_inclusive,
                          end_index_exclusive = None,
                          step = None):
    """Return a slice of the ``(topic, thread)`` stream from ``load_data``.

    The start, end and step arguments are handed to ``islice`` unchanged, so
    ``None`` for the end means "until exhausted" and ``None`` for the step
    means every item.
    """
    return islice(
        load_data(string_basedir, list_string_filenames_in_basedir),
        start_index_inclusive,
        end_index_exclusive,
        step,
    )

In [12]:
def get_data(data_generator):
    """Flatten ``(topic, thread)`` pairs into parallel question/answer lists.

    Every answer of every thread contributes one entry to each output list:
    the question body, the answer body, and the answer's relevance.

    Returns ``(X, y)`` where ``X`` is ``[question_bodies, answer_bodies]``
    and ``y`` is the list of relevance values, all in iteration order.
    """
    question_bodies, answer_bodies, relevances = [], [], []

    for _topic, thread in data_generator:
        thread_question = thread.question

        for thread_answer in thread.answers:
            answer_bodies.append(thread_answer.body)
            # The question body is repeated once per answer
            question_bodies.append(thread_question.body)
            relevances.append(thread_answer.metadata.relevance)

    return [question_bodies, answer_bodies], relevances

In [13]:

# Load all threads from both parts of the training set.
# The file list used to name part1 twice, which loaded every thread of that
# file two times and never read the second training file.
# NOTE(review): assumes 'SemEval2016-Task3-CQA-QL-train-part2.xml' sits next
# to part1 in the 'train' directory -- confirm before running.
data_generator = get_limited_generator(
    'train',
    ['SemEval2016-Task3-CQA-QL-train-part1.xml',
     'SemEval2016-Task3-CQA-QL-train-part2.xml'],
    0, None, None)

X, y = get_data(data_generator)

In [21]:
X[0]

['is there any place i can find scented massage oils in qatar?',
 'Hi;Can any one tell me a place where i can have a good massage drom philipinies????? yesterday i had a massage in Bio-Bil they charged me 300qr for 01 hour bt it is totally waste... pls advice me if theres any philipinos....',
 "Tell me; where is the best place to go for a massage? Mind you; I don't want to spend 1000QR for it... (Guys; please don't come up with answers that you\xa0would gladly do it yourself; plz...)\xa0",
 'hi there; i can see a lot of massage center here; but i dont which one is better. can someone help me which massage center is good...and how much will it cost me? thanks',
 'What attracts you more ?',
 '[img_assist|nid=54388|title=Placenta cream ????|desc=|link=none|align=left|width=440|height=388] What the hell do you do with Placenta cream ??????',
 'Any suggestions on how to get rid of them??',
 'Can someone please advise me my husband wants to get Tea Tree Oil pure in Doha? thank you',
 'plz he

In [22]:
X[1]

['Yes. It is right behind Kahrama in the National area.',
 'whats the name of the shop?',
 "It's called Naseem Al-Nadir. Right next to the Smartlink shop. You'll find the chinese salesgirls at affordable prices there.",
 'dont want girls;want oil',
 "Try Both ;) I'am just trying to be helpful. On a serious note - Please go there. you'll find what you are looking for.",
 'you mean oil and filter both',
 "Yes Lawa...you couldn't be more right LOL",
 'What they offer?',
 'FU did u try with that salesgirl ?',
 "Swine - No I don't try with salesgirls. My taste is classy ;)",
 'Most massages in Qatar are a waste of money. All they do is just rub some oil. No body does deep tissue massage here.',
 'my masseuse is very good. calling her from to time for home service. currently in the philippines for a month vacation; i guess. =( she is the best in aromatherapy massage.',
 'there is a massage center near mall roundabout in hilal opp. to woqood petrol station',
 "Try Magic Touch in Abu Hamour (b