In [1]:
class MapHelper:
    
    def append_to_list(map_with_list_as_value, map_key, new_list_element):
        existing_list = map_with_list_as_value.get(map_key, [])
        new_list = existing_list + [new_list_element]
        map_with_list_as_value[map_key] = new_list
    
    def append_to_set(map_with_set_as_value, map_key, new_set_element):
        existing_set = map_with_set_as_value.get(map_key, set())
        existing_set.add(new_set_element)
        map_with_set_as_value[map_key] = existing_set


In [2]:
class QuestionRelevance:
    """
        Makes no distinction between 
        PerfectMatch and Relevant questions
    """
    
    INT_RELEVANT = 1
    RELEVANT = "Relevant"
    
    PERFECT_MACH = "PerfectMatch"
    
    INT_IRRELEVANT = 0
    IRRELEVANT = "Irrelevant"
    
    
    def to_number(string_relevance):
        return QuestionRelevance.INT_IRRELEVANT\
            if string_relevance == QuestionRelevance.IRRELEVANT \
            else QuestionRelevance.INT_RELEVANT
    
    
    def from_number(int_relevance):
        return QuestionRelevance.IRRIELEVANT \
            if string_relevance == QuestionRelevance.INT_IRRELEVANT \
            else QuestionRelevance.RELEVANT
            

In [3]:
class Thread:
    
    def __init__(self, question, rel_questions):
        self.question = question
        self.rel_questions = rel_questions

    def __str__(self):
        return self.question.__str__()
    
    def __repr__(self):
        return self.question.__repr__()
    

In [4]:
from collections import namedtuple


class QuestionData:
    
    def __init__(self, subject, body):
        self.subject = subject
        self.body = body
    
    def __str__(self):
        return str(self.body)
    
    def __repr__(self):
        return self.__str__()
        
        
class RelQuestionData:
    
    STRING_RELEVANCE_KEY = "RELQ_RELEVANCE2ORGQ"
    STRING_USERNAME_KEY = "RELQ_USERNAME"
    STRING_DATE_KEY = "RELQ_DATE"
    STRING_ID_KEY = "RELQ_ID"
    
    def __init__(self, metadata, subject, body):
        self.metadata = metadata
        self.subject = subject
        self.body = body
        
    def __str__(self):
        return str(self.body)
    
    def __repr__(self):
        return self.__str__()

    
class Metadata:
    
    def __init__(self, user, date, relevance):
        self.user = user
        self.date = date
        self.relevance = relevance
    
    
QuestionQuestionPair = namedtuple('QuestionQuestionPair', ['question', 'rel_question', 'data'])

# set data to None by default
QuestionQuestionPair.__new__.__defaults__ = (None,)


In [5]:
def to_question_data(node_question):
    subject = node_question[0]
    body = node_question[1]
    
    return QuestionData(subject.text, body.text)


def to_rel_question_data(node_rel_questions):
    f = lambda node: RelQuestionData(Metadata(node.attrib[RelQuestionData.STRING_USERNAME_KEY],
                                         node.attrib[RelQuestionData.STRING_DATE_KEY],
                                         QuestionRelevance.to_number(node.attrib[RelQuestionData.STRING_RELEVANCE_KEY])), 
                                node[0].text, node[1].text)
    
    return [f(node) for node in node_rel_questions if node[1].text is not None]


In [6]:
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView

def load_data(string_basedir, string_filenames_in_basedir):
    
    for string_filename in string_filenames_in_basedir:
        corpus_reader = XMLCorpusReader(string_basedir, string_filename)
        i = 0
        node_rel_questions = []
        for root_node in corpus_reader.xml():
            node_question = root_node
            node_rel_questions.append(root_node[2][0])
            i += 1
            if i == 10:
                thread_question = to_question_data(node_question)
                thread_rel_questions = to_rel_question_data(node_rel_questions)

                thread = Thread(thread_question, thread_rel_questions)        
                string_topic = ""
                
                i = 0
                node_rel_questions = []
            
                yield string_topic, thread
                

In [7]:
from itertools import *


def get_limited_generator(string_basedir,
                          list_string_filenames_in_basedir,
                          start_index_inclusive, 
                          end_index_exclusive = None, 
                          step = None):
    data_generator = load_data(string_basedir, list_string_filenames_in_basedir)
    return islice(data_generator, start_index_inclusive, end_index_exclusive, step)


In [8]:
def repeat_relevant(question, list_relevant, int_how_many):
    for i in range(int_how_many):
        question = QuestionData(str(question.subject), str(question.body))
        yield QuestionQuestionPair(question, list_relevant[i % len(list_relevant)] )
    

def get_data(data_generator, do_repeat_relevant = False):
    X = []
    y = []
    
    for topic, thread in data_generator:
        question = thread.question
        list_relevant = []
        
        for rel_question in thread.rel_questions:
            question = QuestionData(str(question.subject), str(question.body))
            question_question_pair = QuestionQuestionPair(question, rel_question)
            
            X += [ question_question_pair ]
            y += [ rel_question.metadata.relevance ]
    
            if rel_question.metadata.relevance == QuestionRelevance.INT_RELEVANT:
                list_relevant += [rel_question]
        
        num_relevant = len(list_relevant)
        num_irrelevant = len(thread.rel_questions) - num_relevant
        
        if num_relevant < num_irrelevant \
            and do_repeat_relevant \
            and num_relevant != 0:
                
            delta = num_irrelevant - num_relevant
            
            X += repeat_relevant(question, list_relevant, delta)
            y += [ QuestionRelevance.INT_RELEVANT for i in range(delta) ]
        
    return X, y


In [9]:
class GenericItemTransformer:
    
    def __init__(self, generic_transformation):
        self.generic_transformation = generic_transformation
    
    def transform(self, generic_input, y=None):        
        return self.generic_transformation(generic_input)
    
    def fit(self, *args):
        return self

In [10]:
from scipy.spatial.distance import cosine


def cosine_similarity(vector1, vector2):
    return 1 - cosine(vector1, vector2)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import math


def to_content_generator(non_tokenized_qq_pairs):
    for qq_pair in non_tokenized_qq_pairs:
        yield qq_pair.question.body
        yield qq_pair.rel_question.body
        
        
def tfidf_calc(non_tokenized_qq_pairs, 
               tfidf_vectorizer):
    generator = to_content_generator(non_tokenized_qq_pairs)
    sparse_matrix = tfidf_vectorizer.fit_transform(generator)
    sparse_matrix_nrows = sparse_matrix.shape[0]
    
    similarities = []
    for i in range(0, sparse_matrix_nrows, 2):
        question_sparse_vector = sparse_matrix.getrow(i)
        rel_question_sparse_vector = sparse_matrix.getrow(i + 1)
        
        similarity = cosine_similarity(question_sparse_vector.todense(), 
                                       rel_question_sparse_vector.todense())
        
        similarities += [[similarity]]
        
    return similarities
    

def get_tfidf_transformer():
    tfidf_vectorizer= TfidfVectorizer(stop_words='english')
    return GenericItemTransformer(
        lambda non_tokenized_qq_pairs: tfidf_calc(non_tokenized_qq_pairs, tfidf_vectorizer))



In [12]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer


def get_pipeline():   
    non_tokenized_features = FeatureUnion([
        ('tfidf', get_tfidf_transformer())
    ])
    
    return Pipeline([
        ('features', make_union(non_tokenized_features)),
        ('nan_remover', Imputer(missing_values='NaN', strategy='mean', axis=0)),
        ('scaler', StandardScaler())
    ])

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import svm
import collections


data_generator = get_limited_generator('train', ['SemEval2016-Task3-CQA-QL-train-part1.xml', 'SemEval2016-Task3-CQA-QL-train-part1.xml'], 
                                       0, None, None)

X, y = get_data(data_generator, do_repeat_relevant=True)


dict_y_value_counts = collections.Counter(y)
n_negative = dict_y_value_counts[0]
n_positive = dict_y_value_counts[1]

print("Number of positive / negative examples: ", n_positive, "/", n_negative )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
classifier = GridSearchCV(svm.SVC(), tuned_parameters, cv=5)
pipeline = get_pipeline()
make_pipeline(pipeline, classifier).fit(X_train, y_train)


Number of positive / negative examples:  2552 / 2368


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf', <__main__.GenericItemTransformer object at 0x7f97368df7b8>)],
       transformer_weights=None))],
       trans...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0))])

In [14]:
from sklearn.metrics import average_precision_score

def mean_average_precision(list_y_true, list_y_score):
    ap = 0.0
    
    for (y_true, y_score) in zip(list_y_true, list_y_score):
        ap += average_precision_score(y_true, y_score)
        
    return ap / len(list_y_true)

In [15]:
from sklearn.metrics import classification_report, accuracy_score

X_test = pipeline.fit_transform(X_test)

classifier_output = classifier.decision_function(X_test)
output_mean_average_precision = mean_average_precision([y_test], [classifier_output])
y_predict = classifier.predict(X_test)

print('MAP: ', output_mean_average_precision)
print('acc', accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))
print(classifier.best_params_)

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


MAP:  0.72377456914
acc 0.646341463415
             precision    recall  f1-score   support

          0       0.59      0.82      0.69       461
          1       0.76      0.49      0.60       523

avg / total       0.68      0.65      0.64       984

{'gamma': 0.001, 'C': 1000, 'kernel': 'rbf'}
