In [1]:
class MapHelper:
    
    def append_to_list(map_with_list_as_value, map_key, new_list_element):
        existing_list = map_with_list_as_value.get(map_key, [])
        new_list = existing_list + [new_list_element]
        map_with_list_as_value[map_key] = new_list
    
    def append_to_set(map_with_set_as_value, map_key, new_set_element):
        existing_set = map_with_set_as_value.get(map_key, set())
        existing_set.add(new_set_element)
        map_with_set_as_value[map_key] = existing_set


In [2]:
class QuestionRelevance:
    """
        Makes no distinction between 
        PerfectMatch and Relevant questions
    """
    
    INT_RELEVANT = 1
    RELEVANT = "Relevant"
    
    PERFECT_MACH = "PerfectMatch"
    
    INT_IRRELEVANT = 0
    IRRELEVANT = "Irrelevant"
    
    
    def to_number(string_relevance):
        return QuestionRelevance.INT_IRRELEVANT\
            if string_relevance == QuestionRelevance.IRRELEVANT \
            else QuestionRelevance.INT_RELEVANT
    
    
    def from_number(int_relevance):
        return QuestionRelevance.IRRIELEVANT \
            if string_relevance == QuestionRelevance.INT_IRRELEVANT \
            else QuestionRelevance.RELEVANT

In [3]:
class Thread:
    
    def __init__(self, question, rel_questions):
        self.question = question
        self.rel_questions = rel_questions

    def __str__(self):
        return self.question.__str__()
    
    def __repr__(self):
        return self.question.__repr__()
    

In [4]:
from collections import namedtuple


class QuestionData:
  
    STRING_ID_KEY = "ORGQ_ID"
    
    def __init__(self, id, subject, body):
        self.id = id
        self.subject = subject
        self.body = body
    
    def __str__(self):
        return str(self.body)
    
    def __repr__(self):
        return self.__str__()
        
        
class RelQuestionData:
    
    STRING_RELEVANCE_KEY = "RELQ_RELEVANCE2ORGQ"
    STRING_USERNAME_KEY = "RELQ_USERNAME"
    STRING_DATE_KEY = "RELQ_DATE"
    STRING_ID_KEY = "RELQ_ID"
    
    def __init__(self, metadata, subject, body):
        self.metadata = metadata
        self.subject = subject
        self.body = body
        
    def __str__(self):
        return str(self.body)
    
    def __repr__(self):
        return self.__str__()

    
class Metadata:
    
    def __init__(self, id, user, date, relevance):
        self.id = id
        self.user = user
        self.date = date
        self.relevance = relevance
    
    
QuestionQuestionPair = namedtuple('QuestionQuestionPair', ['question', 'rel_question', 'data'])

# set data to None by default
QuestionQuestionPair.__new__.__defaults__ = (None,)

In [5]:
def to_question_data(node_question):
    subject = node_question[0]
    body = node_question[1]
    
    return QuestionData(node_question.attrib[QuestionData.STRING_ID_KEY], subject.text, body.text)


def to_rel_question_data(node_rel_questions):
    f = lambda node: RelQuestionData(Metadata(node.attrib[RelQuestionData.STRING_ID_KEY],
                                         node.attrib[RelQuestionData.STRING_USERNAME_KEY],
                                         node.attrib[RelQuestionData.STRING_DATE_KEY],
                                         QuestionRelevance.to_number(node.attrib[RelQuestionData.STRING_RELEVANCE_KEY])), 
                                node[0].text, node[1].text)
    
    return [f(node) for node in node_rel_questions if node[1].text is not None]

In [6]:
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView

def load_data(string_basedir, string_filenames_in_basedir):
    
    for string_filename in string_filenames_in_basedir:
        corpus_reader = XMLCorpusReader(string_basedir, string_filename)
        i = 0
        node_rel_questions = []
        for root_node in corpus_reader.xml():
            node_question = root_node
            node_rel_questions.append(root_node[2][0])
            i += 1
            if i == 10:
                thread_question = to_question_data(node_question)
                thread_rel_questions = to_rel_question_data(node_rel_questions)

                thread = Thread(thread_question, thread_rel_questions)        
                string_topic = ""
                
                i = 0
                node_rel_questions = []
            
                yield string_topic, thread

In [7]:
from itertools import *


def get_limited_generator(string_basedir,
                          list_string_filenames_in_basedir,
                          start_index_inclusive, 
                          end_index_exclusive = None, 
                          step = None):
    data_generator = load_data(string_basedir, list_string_filenames_in_basedir)
    return islice(data_generator, start_index_inclusive, end_index_exclusive, step)


In [8]:
def repeat_relevant(question, list_relevant, int_how_many):
    for i in range(int_how_many):
        question = QuestionData(str(question.subject), str(question.body))
        yield QuestionQuestionPair(question, list_relevant[i % len(list_relevant)] )
    

def get_data(data_generator, do_repeat_relevant = False):
    X = []
    y = []
    
    for topic, thread in data_generator:
        question = thread.question
        list_relevant = []
        
        for rel_question in thread.rel_questions:
            question = QuestionData(str(question.id), str(question.subject), str(question.body))
            question_question_pair = QuestionQuestionPair(question, rel_question)
            
            X += [ question_question_pair ]
            y += [ rel_question.metadata.relevance ]
    
            if rel_question.metadata.relevance == QuestionRelevance.INT_RELEVANT:
                list_relevant += [rel_question]
        
        num_relevant = len(list_relevant)
        num_irrelevant = len(thread.rel_questions) - num_relevant
        
        if num_relevant < num_irrelevant \
            and do_repeat_relevant \
            and num_relevant != 0:
                
            delta = num_irrelevant - num_relevant
            
            X += repeat_relevant(question, list_relevant, delta)
            y += [ QuestionRelevance.INT_RELEVANT for i in range(delta) ]
        
    return X, y

In [9]:
class GenericItemTransformer:
    
    def __init__(self, generic_transformation):
        self.generic_transformation = generic_transformation
    
    def transform(self, generic_input, y=None):        
        return self.generic_transformation(generic_input)
    
    def fit(self, *args):
        return self

In [10]:
from scipy.spatial.distance import cosine


def cosine_similarity(vector1, vector2):
    return 1 - cosine(vector1, vector2)

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
import math


def to_content_generator(non_tokenized_qq_pairs):
    for qq_pair in non_tokenized_qq_pairs:
        yield qq_pair.question.body
        yield qq_pair.rel_question.body
        
        
def tfidf_calc(non_tokenized_qq_pairs, 
               tfidf_vectorizer):
    generator = to_content_generator(non_tokenized_qq_pairs)
    sparse_matrix = tfidf_vectorizer.fit_transform(generator)
    sparse_matrix_nrows = sparse_matrix.shape[0]
    
    similarities = []
    for i in range(0, sparse_matrix_nrows, 2):
        question_sparse_vector = sparse_matrix.getrow(i)
        rel_question_sparse_vector = sparse_matrix.getrow(i + 1)
        
        similarity = cosine_similarity(question_sparse_vector.todense(), 
                                       rel_question_sparse_vector.todense())
        
        similarities += [[similarity]]
        
    return similarities
    #return [[sparse_matrix_nrows]]
    

def get_tfidf_transformer():
    tfidf_vectorizer= TfidfVectorizer(stop_words='english')
    return GenericItemTransformer(
        lambda non_tokenized_qq_pairs: tfidf_calc(non_tokenized_qq_pairs, tfidf_vectorizer))



In [48]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer


def get_pipeline():   
    non_tokenized_features = FeatureUnion([
        ('tfidf', get_tfidf_transformer())
    ])
    
    return Pipeline([
        ('features', make_union(non_tokenized_features)),
        ('nan_remover', Imputer(missing_values='NaN', strategy='mean', axis=0)),
        ('scaler', StandardScaler())
    ])

In [49]:
data_generator = get_limited_generator('train', ['SemEval2016-Task3-CQA-QL-train-part2.xml'], 
                                       0, None, None)

X_train, y_train = get_data(data_generator, do_repeat_relevant=False)

In [50]:
data_generator = get_limited_generator('dev', ['SemEval2016-Task3-CQA-QL-dev.xml'], 
                                       0, None, None)

X_dev, y_dev = get_data(data_generator, do_repeat_relevant=False)

In [59]:
data_generator = get_limited_generator('test', ['SemEval2017-task3-English-test-input.xml'], 
                                       0, None, None)

X_test, y_test = get_data(data_generator, do_repeat_relevant=False)

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import svm
import collections

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
classifier = GridSearchCV(svm.SVC(probability=True), tuned_parameters, cv=5)
pipeline = get_pipeline()
make_pipeline(pipeline, classifier).fit(X_train, y_train)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  dist = 1.0 - uv / np.sqrt(uu * vv)


Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf', <__main__.GenericItemTransformer object at 0x7f14f46ac6d8>)],
       transformer_weights=None))],
       trans...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0))])

In [60]:
from sklearn.metrics import average_precision_score

def mean_average_precision(list_y_true, list_y_score):
    ap = 0.0
    
    for (y_true, y_score) in zip(list_y_true, list_y_score):
        ap += average_precision_score(y_true, y_score)
        
    return ap / len(list_y_true)

In [61]:
from sklearn.metrics import classification_report, accuracy_score

X_t = pipeline.fit_transform(X_test)

classifier_output = classifier.decision_function(X_t)
output_mean_average_precision = mean_average_precision([y_test], [classifier_output])
y_predict = classifier.predict(X_t)

print('MAP: ', output_mean_average_precision)
print('acc', accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))
print(classifier.best_params_)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


MAP:  1.0
acc 0.4011363636363636
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       1.00      0.40      0.57       880

avg / total       1.00      0.40      0.57       880

{'C': 1000, 'kernel': 'rbf', 'gamma': 0.001}


  'recall', 'true', average, warn_for)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

X_test = pipeline.fit_transform(X_test)

classifier_output = classifier.decision_function(X_test)
output_mean_average_precision = mean_average_precision([y_test], [classifier_output])
y_predict = classifier.predict(X_test)

print('MAP: ', output_mean_average_precision)
print('acc', accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))
print(classifier.best_params_)

In [None]:
X_dev[0]

In [None]:
from sklearn.metrics import classification_report, accuracy_score

X_train = pipeline.fit_transform(X_train)

classifier_output = classifier.decision_function(X_train)
output_mean_average_precision = mean_average_precision([y_train], [classifier_output])
y_predict = classifier.predict(X_train)

print('MAP: ', output_mean_average_precision)
print('acc', accuracy_score(y_train, y_predict))
print(classification_report(y_train, y_predict))
print(classifier.best_params_)

In [62]:
X_t = pipeline.fit_transform(X_test)

classifier_output = classifier.predict_proba(X_t)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [63]:
probs = [p[1] for p in classifier_output]
pred = ['true' if p >= 0.5 else 'false' for p in probs]

In [64]:
def save(pred_filepath, X, probs, preds):
    with open(pred_filepath, 'w') as pred_file:
        for i in range(len(X)):
            x = X[i]
            print("{}\t{}\t{}\t{}\t{}".format(x[0].id, x[1].metadata.id, 0, probs[i], preds[i]), 
                  )


In [65]:
save('result.pred', X_test, probs, pred)

Q388	Q388_R14	0	0.1792852112783056	false
Q388	Q388_R21	0	0.1792852112783056	false
Q388	Q388_R23	0	0.1792852112783056	false
Q388	Q388_R24	0	0.19803924505563858	false
Q388	Q388_R28	0	0.19928049279376317	false
Q388	Q388_R31	0	0.1792852112783056	false
Q388	Q388_R42	0	0.19595554080315053	false
Q388	Q388_R43	0	0.1792852112783056	false
Q388	Q388_R45	0	0.5324641553931928	true
Q388	Q388_R48	0	0.27967882754044765	false
Q389	Q389_R2	0	0.5393337209628917	true
Q389	Q389_R7	0	0.2532636738402719	false
Q389	Q389_R12	0	0.24398914619302473	false
Q389	Q389_R20	0	0.5785023966441118	true
Q389	Q389_R25	0	0.2908104142305373	false
Q389	Q389_R28	0	0.6401865282673076	true
Q389	Q389_R35	0	0.6414652266912352	true
Q389	Q389_R38	0	0.6834329629465428	true
Q389	Q389_R39	0	0.34817657551305425	false
Q389	Q389_R40	0	0.36866129717567825	false
Q390	Q390_R12	0	0.1792852112783056	false
Q390	Q390_R16	0	0.5276124801978402	true
Q390	Q390_R19	0	0.8924256545361223	true
Q390	Q390_R33	0	0.1792852112783056	false
Q390	Q390_R39	0	0.1

In [40]:
X_test[0]

QuestionQuestionPair(question=""how hard is it for you to get a decent paying job in qatar? I had applied in bayt;monstergulf and gulftalent almost religiously every day and yet I am getting nothing more than having my CV viewed. I have 4 years + experience in Linux and Unix environment and a handful of certifications to boot also. well; while its back to updating my CV; I'd love to hear about your experiences on getting a job here.""", rel_question=Can somebody tell me how i get a good job in doha? i have an American passport and holding a husband visas; with a bachelor degree; speaking 3 languages; having a hard time to find a good job; i post my CV on many website including bayt.com and many more; but didn't get any respond. Thanks., data=None)