In [24]:
class MapHelper:
    """Namespace of helpers for dictionaries whose values are collections.

    The helpers are static: they can be called either on the class
    (``MapHelper.append_to_list(...)``) or on an instance.
    """

    @staticmethod
    def append_to_list(map_with_list_as_value, map_key, new_list_element):
        """Add ``new_list_element`` to the list stored under ``map_key``.

        A missing key is treated as an empty list. The entry is replaced by
        a *new* list, so the list object previously stored under the key is
        never mutated.
        """
        existing_list = map_with_list_as_value.get(map_key, [])
        # Concatenate instead of calling .append() so that other holders of
        # the old list object do not observe the change.
        map_with_list_as_value[map_key] = existing_list + [new_list_element]

    @staticmethod
    def append_to_set(map_with_set_as_value, map_key, new_set_element):
        """Add ``new_set_element`` to the set stored under ``map_key``.

        A missing key is treated as an empty set. Unlike ``append_to_list``,
        an already stored set is updated in place.
        """
        existing_set = map_with_set_as_value.get(map_key, set())
        existing_set.add(new_set_element)
        map_with_set_as_value[map_key] = existing_set


In [25]:
class QuestionRelevance:
    """
        Conversion between string relevance labels and binary integers.

        Makes no distinction between
        PerfectMatch and Relevant questions: every label other than
        "Irrelevant" maps to INT_RELEVANT.
    """

    INT_RELEVANT = 1
    RELEVANT = "Relevant"

    PERFECT_MATCH = "PerfectMatch"
    # Misspelled historical name, kept so existing references keep working.
    PERFECT_MACH = PERFECT_MATCH

    INT_IRRELEVANT = 0
    IRRELEVANT = "Irrelevant"

    @staticmethod
    def to_number(string_relevance):
        """Return INT_IRRELEVANT for the "Irrelevant" label, else INT_RELEVANT."""
        if string_relevance == QuestionRelevance.IRRELEVANT:
            return QuestionRelevance.INT_IRRELEVANT
        return QuestionRelevance.INT_RELEVANT

    @staticmethod
    def from_number(int_relevance):
        """Return the "Irrelevant" label for INT_IRRELEVANT, else "Relevant"."""
        if int_relevance == QuestionRelevance.INT_IRRELEVANT:
            return QuestionRelevance.IRRELEVANT
        return QuestionRelevance.RELEVANT
            

In [26]:
class Thread:
    """Container holding a question together with its related questions.

    Both textual representations are delegated to the wrapped question.
    """

    def __init__(self, question, rel_questions):
        self.question, self.rel_questions = question, rel_questions

    def __repr__(self):
        """Return the repr of the wrapped question."""
        return repr(self.question)

    def __str__(self):
        """Return the string form of the wrapped question."""
        return str(self.question)
    

In [27]:
from collections import namedtuple


class QuestionData:
    """Subject and body of a question.

    Both ``str()`` and ``repr()`` show the body only.
    """

    def __init__(self, subject, body):
        self.subject, self.body = subject, body

    def __repr__(self):
        """Same text as ``__str__``."""
        return str(self)

    def __str__(self):
        """Return the body converted to a string."""
        return str(self.body)
        
        
class RelQuestionData:
    """Metadata, subject and body of a related question.

    The ``STRING_*_KEY`` constants are the attribute names used to look up
    values on a related-question node.
    Both ``str()`` and ``repr()`` show the body only.
    """

    STRING_RELEVANCE_KEY = "RELQ_RELEVANCE2ORGQ"
    STRING_USERNAME_KEY = "RELQ_USERNAME"
    STRING_DATE_KEY = "RELQ_DATE"
    STRING_ID_KEY = "RELQ_ID"

    def __init__(self, metadata, subject, body):
        self.metadata, self.subject, self.body = metadata, subject, body

    def __repr__(self):
        """Same text as ``__str__``."""
        return str(self)

    def __str__(self):
        """Return the body converted to a string."""
        return str(self.body)

    
class Metadata:
    """Plain record of the user, date and relevance attached to a question."""

    def __init__(self, user, date, relevance):
        self.user, self.date, self.relevance = user, date, relevance
    
    
# Immutable record pairing a question with one related question, plus an
# optional ``data`` slot.
QuestionQuestionPair = namedtuple(
    'QuestionQuestionPair',
    'question rel_question data',
)

# A one-element defaults tuple applies to the last field only, so ``data``
# may be omitted and is then None.
QuestionQuestionPair.__new__.__defaults__ = (None,)


In [28]:
def to_question_data(node_question):
    """Build a QuestionData from the first two children of ``node_question``.

    The ``.text`` of child 0 becomes the subject and the ``.text`` of
    child 1 becomes the body.
    """
    subject_node, body_node = node_question[0], node_question[1]
    return QuestionData(subject_node.text, body_node.text)


def to_rel_question_data(node_rel_questions):
    """Convert related-question nodes into RelQuestionData objects.

    Nodes whose second child has no text (``node[1].text is None``) are
    skipped. For the others, user name, date and relevance are read from
    ``node.attrib``; the relevance label is converted to its integer form.
    """

    def build(node):
        # Child 0 carries the subject text, child 1 the body text.
        attributes = node.attrib
        metadata = Metadata(
            attributes[RelQuestionData.STRING_USERNAME_KEY],
            attributes[RelQuestionData.STRING_DATE_KEY],
            QuestionRelevance.to_number(
                attributes[RelQuestionData.STRING_RELEVANCE_KEY]),
        )
        return RelQuestionData(metadata, node[0].text, node[1].text)

    rel_questions = []
    for node in node_rel_questions:
        if node[1].text is None:
            continue
        rel_questions.append(build(node))
    return rel_questions


In [29]:
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView

def load_data(string_basedir, string_filenames_in_basedir,
              int_rel_questions_per_thread=10):
    """Lazily yield ``(topic, Thread)`` tuples read from XML corpus files.

    The elements obtained by iterating ``corpus_reader.xml()`` are consumed
    in consecutive groups of ``int_rel_questions_per_thread``. For each
    complete group one Thread is produced: its question is built from the
    last element of the group and its related questions from
    ``element[2][0]`` of every element in the group.
    (Presumably each element is an original-question entry repeated once
    per related question -- TODO confirm against the corpus format.)

    Parameters:
        string_basedir: directory passed to XMLCorpusReader.
        string_filenames_in_basedir: iterable of file names inside it.
        int_rel_questions_per_thread: number of consecutive elements that
            form one thread. Defaults to 10, the previously hard-coded
            value. Should be a positive integer; with any other value no
            group ever completes and nothing is yielded.

    Yields:
        ``(string_topic, thread)`` where ``string_topic`` is always the
        empty string.

    Elements left over at the end of a file that do not fill a whole group
    are dropped.
    """
    for string_filename in string_filenames_in_basedir:
        corpus_reader = XMLCorpusReader(string_basedir, string_filename)
        node_rel_questions = []
        for root_node in corpus_reader.xml():
            node_rel_questions.append(root_node[2][0])
            if len(node_rel_questions) != int_rel_questions_per_thread:
                continue

            # The group is complete: the current element supplies the
            # question data for the whole thread.
            thread_question = to_question_data(root_node)
            thread_rel_questions = to_rel_question_data(node_rel_questions)
            thread = Thread(thread_question, thread_rel_questions)
            string_topic = ""

            # Start collecting the next group before handing control back.
            node_rel_questions = []

            yield string_topic, thread
                

In [30]:
from itertools import *


def get_limited_generator(string_basedir,
                          list_string_filenames_in_basedir,
                          start_index_inclusive,
                          end_index_exclusive=None,
                          step=None):
    """Return a slice of the stream produced by ``load_data``.

    The start, end and step arguments are forwarded unchanged to
    ``islice``, so ``None`` for the end or the step means "no limit" and
    "every item" respectively.
    """
    return islice(
        load_data(string_basedir, list_string_filenames_in_basedir),
        start_index_inclusive,
        end_index_exclusive,
        step,
    )


In [31]:
def repeat_relevant(question, list_relevant, int_how_many):
    """Yield ``int_how_many`` pairs, cycling through ``list_relevant``.

    Every yielded pair receives its own freshly built QuestionData whose
    subject and body are the ``str()`` of the previous one's fields.
    """
    for position in range(int_how_many):
        question = QuestionData(str(question.subject), str(question.body))
        # Wrap around once the relevant questions are exhausted.
        partner = list_relevant[position % len(list_relevant)]
        yield QuestionQuestionPair(question, partner)
    

def get_data(data_generator, do_repeat_relevant=False):
    """Flatten threads into question pairs and their relevance labels.

    Parameters:
        data_generator: iterable of ``(topic, thread)`` tuples; the topic
            is ignored.
        do_repeat_relevant: when true, threads with at least one relevant
            question but more irrelevant than relevant ones get extra
            pairs (produced by ``repeat_relevant``) until both counts match.

    Returns:
        ``(X, y)``: a list of QuestionQuestionPair and the parallel list of
        integer relevance labels.
    """
    pairs, labels = [], []

    for _topic, thread in data_generator:
        question = thread.question
        relevant_only = []

        for rel_question in thread.rel_questions:
            # A fresh QuestionData per pair; fields are passed through
            # str(), so a None subject/body becomes the text "None".
            question = QuestionData(str(question.subject), str(question.body))
            pairs.append(QuestionQuestionPair(question, rel_question))

            relevance = rel_question.metadata.relevance
            labels.append(relevance)
            if relevance == QuestionRelevance.INT_RELEVANT:
                relevant_only.append(rel_question)

        num_relevant = len(relevant_only)
        num_irrelevant = len(thread.rel_questions) - num_relevant
        shortfall = num_irrelevant - num_relevant

        if shortfall > 0 and do_repeat_relevant and num_relevant:
            pairs.extend(repeat_relevant(question, relevant_only, shortfall))
            labels.extend([QuestionRelevance.INT_RELEVANT] * shortfall)

    return pairs, labels


In [34]:
class GenericTransformer:
    """Transformer that applies one callable to every element of its input.

    ``fit`` does nothing; ``transform`` does the element-wise work.
    """

    def __init__(self, generic_transformation):
        self.generic_transformation = generic_transformation

    def fit(self, *args):
        """Accept and ignore any positional arguments; return ``self``."""
        return self

    def transform(self, generic_input_list, y=None):
        """Return a list with the transformation applied to each element.

        ``y`` is accepted but unused.
        """
        transformed = []
        for element in generic_input_list:
            transformed.append(self.generic_transformation(element))
        return transformed

In [46]:
def string_none_guard(s):
    """Return ``s`` unchanged, or the empty string when ``s`` is None."""
    if s is None:
        return ""
    return s



def apply_to_content(qq_pair, function):
    """Return a new pair with ``function`` applied to its text content.

    ``function`` is applied, in this order, to the question subject, the
    question body and the related-question body; a None value is replaced
    by "" first. The related question's metadata and subject are carried
    over untouched. The result is built without a ``data`` argument, so its
    ``data`` field takes the default.
    """
    source_question = qq_pair.question
    transformed_question = QuestionData(
        function(string_none_guard(source_question.subject)),
        function(string_none_guard(source_question.body)),
    )

    source_rel = qq_pair.rel_question
    transformed_rel_body = function(string_none_guard(source_rel.body))
    transformed_rel = RelQuestionData(
        source_rel.metadata, source_rel.subject, transformed_rel_body)

    return QuestionQuestionPair(transformed_question, transformed_rel)


def f_apply_to_content(function):
    """Return a one-argument callable running ``apply_to_content`` with ``function``."""

    def transform_pair(qq_pair):
        return apply_to_content(qq_pair, function)

    return transform_pair


In [47]:
from nltk import word_tokenize

def get_qq_pair_tokenizer_transformer():
    """Return a GenericTransformer that runs ``word_tokenize`` over pair content."""
    tokenize_pair = f_apply_to_content(word_tokenize)
    return GenericTransformer(tokenize_pair)


In [48]:
from nltk.corpus import stopwords

def get_stopword_remover_transformer():
    """Return a GenericTransformer that drops English stop words.

    The transformer expects pair content that is already an iterable of
    word strings. A word is removed when its lower-cased form is in
    NLTK's English stop-word list; the remaining words keep their
    original case and order.
    """
    # A frozenset gives constant-time membership tests; the list returned
    # by NLTK would be scanned linearly for every single token.
    english_stopwords = frozenset(stopwords.words('english'))

    def remove_stopwords(content):
        return [word for word in content
                if word.lower() not in english_stopwords]

    return GenericTransformer(f_apply_to_content(remove_stopwords))

In [66]:
import gensim

def get_word2vec_transformer(string_model_path='GoogleNews-vectors-negative300.bin'):
    """Return a GenericTransformer mapping words to word2vec vectors.

    Parameters:
        string_model_path: path of a binary word2vec-format file. Defaults
            to the previously hard-coded GoogleNews model file name.

    The transformer expects pair content that is an iterable of words.
    Each word present in the model is replaced by its vector; words the
    model does not know are silently dropped.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(string_model_path, binary=True)

    def vectorize(content):
        # load_word2vec_format already returns the KeyedVectors object, so
        # it is indexed directly: the ``.wv`` alias on KeyedVectors was
        # deprecated and no longer exists in gensim 4.
        return [model[word] for word in content if word in model]

    return GenericTransformer(f_apply_to_content(vectorize))

In [61]:

# Stream every thread (start 0, no end, no step) from the training files.
# NOTE(review): the same file name is listed twice, so its threads are read
# twice -- possibly the second entry was meant to be another part of the
# training set; confirm.
data_generator = get_limited_generator('train', ['SemEval2016-Task3-CQA-QL-train-part1.xml', 'SemEval2016-Task3-CQA-QL-train-part1.xml'], 
                                       0, None, None)

# Build the question pairs and labels; relevant pairs are repeated for
# threads that have more irrelevant than relevant related questions.
X, y = get_data(data_generator, do_repeat_relevant=True)


# Tokenize the text content of every pair.
tokenizer = get_qq_pair_tokenizer_transformer()
X_tokenized = tokenizer.transform(X)

# Remove English stop words from the tokenized content.
stop_words = get_stopword_remover_transformer()
X_tokenized_stopwords = stop_words.transform(X_tokenized)

In [67]:
# Load the word2vec model and replace every remaining token with its
# vector; tokens the model does not contain are dropped.
word2vec = get_word2vec_transformer()
x_transformed = word2vec.transform(X_tokenized_stopwords)

  


In [68]:
# Display the first transformed pair (prints the full vectors of the pair).
x_transformed[0]


QuestionQuestionPair(question=[array([ 0.06030273, -0.17871094, -0.09716797,  0.27539062, -0.12451172,
       -0.02868652,  0.21386719, -0.03320312,  0.16308594,  0.23632812,
       -0.1484375 , -0.00245667, -0.19824219,  0.11328125, -0.34765625,
        0.25976562,  0.04125977,  0.13183594, -0.04589844,  0.06542969,
        0.08300781,  0.1640625 ,  0.12207031,  0.11230469,  0.17089844,
        0.06738281,  0.04077148,  0.17382812, -0.16503906, -0.13867188,
        0.03491211,  0.34570312, -0.390625  , -0.07128906,  0.07128906,
       -0.04223633,  0.0014801 , -0.08007812, -0.11865234, -0.01867676,
        0.04638672,  0.1796875 ,  0.07714844, -0.19433594, -0.20410156,
       -0.09521484,  0.03588867,  0.0703125 , -0.171875  ,  0.05786133,
        0.21777344, -0.01306152,  0.07421875, -0.20507812, -0.10839844,
       -0.21875   , -0.07910156, -0.09667969, -0.02478027, -0.09033203,
        0.07910156,  0.06689453, -0.01397705,  0.2890625 ,  0.06445312,
       -0.13183594, -0.20117188, 