In [2]:
class MapHelper:
    
    def append_to_list(map_with_list_as_value, map_key, new_list_element):
        existing_list = map_with_list_as_value.get(map_key, [])
        new_list = existing_list + [new_list_element]
        map_with_list_as_value[map_key] = new_list
    
    def append_to_set(map_with_set_as_value, map_key, new_set_element):
        existing_set = map_with_set_as_value.get(map_key, set())
        existing_set.add(new_set_element)
        map_with_set_as_value[map_key] = existing_set


In [3]:
class AnswerRelevance:
    """
        Makes no distinction between 
        PotentiallyUseful and Bad comments
    """
    
    INT_GOOD = 1
    GOOD = "Good"
    
    POTENTIALLY_USEFUL = "PotentiallyUseful"
    
    INT_BAD = 0
    BAD = "Bad"
    
    
    def to_number(string_relevance):
        return AnswerRelevance.INT_GOOD \
            if string_relevance == AnswerRelevance.GOOD \
            else AnswerRelevance.INT_BAD
    
    
    def from_number(int_relevance):
        return AnswerRelevance.GOOD \
            if string_relevance == AnswerRelevance.INT_GOOD \
            else AnswerRelevance.BAD
            

In [4]:
class Thread:
    
    def __init__(self, question, answers):
        self.question = question
        self.answers = answers

    def __str__(self):
        return self.question.__str__()
    
    def __repr__(self):
        return self.question.__repr__()
    

In [5]:
class ThreadCollection:
    
    def __init__(self):
        self.by_topic = {}
    
    def add(self, string_topic, thread):
        MapHelper.append_to_list(self.by_topic, string_topic, thread)
    
    def get(self, string_topic):
        return self.by_topic.get(string_topic, [])
    
    def topics(self):
        return list(self.by_topic.keys())

    def topic_thread_pairs(self):
        return self.by_topic.items()
    

In [6]:
from collections import namedtuple


class QuestionData:
    
    STRING_CATEGORY_KEY = "RELQ_CATEGORY"
    
    def __init__(self, subject, body):
        self.subject = subject
        self.body = body
    
    def __str__(self):
        return str(self.body)
    
    def __repr__(self):
        return self.__str__()
        
        
class AnswerData:
    
    STRING_RELEVANCE_KEY = "RELC_RELEVANCE2RELQ"
    STRING_USERNAME_KEY = "RELC_USERNAME"
    STRING_DATE_KEY = "RELC_DATE"
    STRING_ID_KEY = "RELC_ID"
    
    def __init__(self, metadata, body):
        self.metadata = metadata
        self.body = body
        
    def __str__(self):
        return str(self.body)
    
    def __repr__(self):
        return self.__str__()

    
class Metadata:
    
    def __init__(self, user, date, relevance):
        self.user = user
        self.date = date
        self.relevance = relevance
    
    
QuestionAnswerPair = namedtuple('QuestionAnswerPair', ['question', 'answer', 'data'])

# set data to None by default
QuestionAnswerPair.__new__.__defaults__ = (None,)


In [7]:
def to_question_data(node_question):
    subject = node_question[0]
    body = node_question[1]
    
    return QuestionData(subject.text, body.text)


def to_answer_data(node_thread_answers):
    f = lambda node: AnswerData(Metadata(node.attrib[AnswerData.STRING_USERNAME_KEY],
                                         node.attrib[AnswerData.STRING_DATE_KEY],
                                         AnswerRelevance.to_number(node.attrib[AnswerData.STRING_RELEVANCE_KEY])), 
                                node[0].text)
    
    return [f(node) for node in node_thread_answers]


In [29]:
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView

def load_data(string_basedir, string_filenames_in_basedir, subtask_A=False):
    for string_filename in string_filenames_in_basedir:
        corpus_reader = XMLCorpusReader(string_basedir, string_filename)
        
        for root_node in corpus_reader.xml():
            if not subtask_A:
                node_thread = root_node[0]
            else:
                node_thread = root_node
            node_thread_question = node_thread[0]
            node_thread_answers = node_thread[1:]

            thread_question = to_question_data(node_thread_question)
            thread_answers = to_answer_data(node_thread_answers)

            thread = Thread(thread_question, thread_answers)        
            string_topic = node_thread_question.attrib[QuestionData.STRING_CATEGORY_KEY]
            
            yield string_topic, thread


In [30]:
from itertools import *


def get_limited_generator(string_basedir,
                          list_string_filenames_in_basedir,
                          start_index_inclusive, 
                          end_index_exclusive = None, 
                          step = None,
                         subtask_A = False):
    data_generator = load_data(string_basedir, list_string_filenames_in_basedir, subtask_A)
    return islice(data_generator, start_index_inclusive, end_index_exclusive, step)


In [10]:
def repeat_relevant(question, list_relevant, int_how_many):
    for i in range(int_how_many):
        question = QuestionData(str(question.subject), str(question.body))
        yield QuestionAnswerPair(question, list_relevant[i % len(list_relevant)] )
    

def get_data(data_generator, do_repeat_relevant = True):
    X = []
    y = []
    
    for topic, thread in data_generator:
        question = thread.question
        list_relevant = []
        
        for answer in thread.answers:
            question = QuestionData(str(question.subject), str(question.body))
            question_answer_pair = QuestionAnswerPair(question, answer)
            
            X += [ question_answer_pair ]
            y += [ answer.metadata.relevance ]
            
            if answer.metadata.relevance == AnswerRelevance.INT_GOOD:
                list_relevant += [answer]
        
        num_relevant = len(list_relevant)
        num_irrelevant = len(thread.answers) - num_relevant
        
        if num_relevant < num_irrelevant \
            and do_repeat_relevant \
            and num_relevant != 0:
                
            delta = num_irrelevant - num_relevant
            
            X += repeat_relevant(question, list_relevant, delta)
            y += [ AnswerRelevance.INT_GOOD for i in range(delta) ]
        
    return X, y


In [10]:
class GenericItemTransformer:
    
    def __init__(self, generic_transformation):
        self.generic_transformation = generic_transformation
    
    def transform(self, generic_input, y=None):        
        return self.generic_transformation(generic_input)
    
    def fit(self, *args):
        return self

In [11]:
from scipy.spatial.distance import cosine


def cosine_similarity(vector1, vector2):
    return 1 - cosine(vector1, vector2)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import math


def to_content_generator(non_tokenized_qa_pairs):
    for qa_pair in non_tokenized_qa_pairs:
        yield qa_pair.question.body
        yield qa_pair.answer.body
        
        
def tfidf_calc(non_tokenized_qa_pairs, 
               tfidf_vectorizer):
    generator = to_content_generator(non_tokenized_qa_pairs)
    sparse_matrix = tfidf_vectorizer.fit_transform(generator)
    sparse_matrix_nrows = sparse_matrix.shape[0]
    
    similarities = []
    for i in range(0, sparse_matrix_nrows, 2):
        question_sparse_vector = sparse_matrix.getrow(i)
        answer_sparse_vector = sparse_matrix.getrow(i + 1)
        
        similarity = cosine_similarity(question_sparse_vector.todense(), 
                                       answer_sparse_vector.todense())
        
        similarities += [[similarity]]
        
    return similarities
    

def get_tfidf_transformer():
    tfidf_vectorizer= TfidfVectorizer(stop_words='english')
    return GenericItemTransformer(
        lambda non_tokenized_qa_pairs: tfidf_calc(non_tokenized_qa_pairs, tfidf_vectorizer))



In [13]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer


def get_pipeline():   
    non_tokenized_features = FeatureUnion([
        ('tfidf', get_tfidf_transformer())
    ])
    
    return Pipeline([
        ('features', make_union(non_tokenized_features)),
        ('nan_remover', Imputer(missing_values='NaN', strategy='mean', axis=0)),
        ('scaler', StandardScaler())
    ])

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import svm
import collections


data_generator = get_limited_generator('train', ['SemEval2016-Task3-CQA-QL-train-part1.xml', 'SemEval2016-Task3-CQA-QL-train-part2.xml'], 
                                       0, None, None)

X, y = get_data(data_generator, do_repeat_relevant=True)

dict_y_value_counts = collections.Counter(y)
n_negative = dict_y_value_counts[0]
n_positive = dict_y_value_counts[1]

print("Number of positive / negative examples: ", n_positive, "/", n_negative )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
classifier = GridSearchCV(svm.SVC(), tuned_parameters, cv=5)
pipeline = get_pipeline()
make_pipeline(pipeline, classifier).fit(X_train, y_train)


Number of positive / negative examples:  17711 / 16713


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf', <__main__.GenericItemTransformer object at 0x7f91e9a84b38>)],
       transformer_weights=None))],
       trans...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0))])

In [15]:
from sklearn.metrics import average_precision_score

def mean_average_precision(list_y_true, list_y_score):
    ap = 0.0
    
    for (y_true, y_score) in zip(list_y_true, list_y_score):
        ap += average_precision_score(y_true, y_score)
        
    return ap / len(list_y_true)

In [16]:
from sklearn.metrics import classification_report, accuracy_score

X_test = pipeline.fit_transform(X_test)

classifier_output = classifier.decision_function(X_test)
output_mean_average_precision = mean_average_precision([y_test], [classifier_output])
y_predict = classifier.predict(X_test)

print('MAP: ', output_mean_average_precision)
print('acc', accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))
print(classifier.best_params_)

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


MAP:  0.600652552083
acc 0.594625998548
             precision    recall  f1-score   support

          0       0.58      0.60      0.59      3364
          1       0.61      0.59      0.60      3521

avg / total       0.59      0.59      0.59      6885

{'C': 1000, 'kernel': 'rbf', 'gamma': 0.001}


In [31]:
data_generator = get_limited_generator('train', ['SemEval2016-Task3-CQA-QL-train-part1-subtaskA.xml'], 
                                       0, None, None, True)

X, y = get_data(data_generator, do_repeat_relevant=False)


In [32]:
X[0]

QuestionAnswerPair(question=is there any place i can find scented massage oils in qatar?, answer=Yes. It is right behind Kahrama in the National area., data=None)