In [48]:
import os
from xml.etree import cElementTree as ET
import numpy as np
import gensim
import re
import scipy
import string

In [49]:
# Name of the Stack Exchange site being processed. The same value is reused
# further down to name the output directories.
data_source = 'superuser.com'
# Derive the input directory from `data_source` so that switching sites is a
# one-line change instead of two literals that must be kept in sync.
directory = os.path.join('data', data_source)
# Parse Posts.xml and keep a handle on its top-level element, whose children
# are iterated as individual posts later on.
x = ET.parse(os.path.join(directory, 'Posts.xml'))
root = x.getroot()

In [50]:
def is_question(child):
    """Tell whether a post element is a question.

    A post counts as a question when its 'PostTypeId' attribute equals the
    string '1'. A KeyError is raised if the attribute is absent.
    """
    post_type = child.attrib['PostTypeId']
    return post_type == '1'

def get_question_for_answer(child):
    """Return the raw 'ParentId' attribute of a post element.

    The value is returned exactly as stored in the element's attribute
    mapping; a KeyError is raised when the attribute is missing.
    """
    attributes = child.attrib
    parent_id = attributes['ParentId']
    return parent_id

def may_get_attrib(child, attribute):
    """Look up an optional attribute on a post element.

    Returns the attribute's value when present and None when the element
    does not carry that attribute.
    """
    return child.attrib.get(attribute)

In [51]:
def clean_html(raw_html):
    """Remove every '<...>' span from the given text.

    The match is non-greedy, so each tag is removed individually and the
    text between tags is preserved. Nothing is inserted in place of a tag.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)

def clean_punctuation(s):
    """Delete every character listed in string.punctuation from the text.

    Characters are removed outright (not replaced by spaces), so words
    joined by punctuation are fused together.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(mark): None for mark in string.punctuation}
    return s.translate(deletion_table)

def tokenize(s):
    """Turn raw post text into a list of lowercase tokens.

    HTML tags are removed first (clean_html), then punctuation
    (clean_punctuation), and the remaining text is split on whitespace.

    Args:
        s: Raw text, possibly containing HTML markup. May be None or empty,
           e.g. when the source attribute was missing from the post.

    Returns:
        A list of lowercase string tokens; an empty list for None or empty
        input.
    """
    # may_get_attrib() returns None for absent attributes; treat that the
    # same as empty text instead of failing on a method call on None.
    if not s:
        return []
    s = clean_html(s)
    s = clean_punctuation(s)
    # split() with no separator already ignores leading/trailing whitespace,
    # so no explicit strip() is needed.
    return [token.lower() for token in s.split()]

In [52]:
# Post id -> record for questions, and post id -> record for all other posts.
query_dict = {}
answer_dict = {}
# Upper bound on the number of posts to read; infinity processes everything.
# Set a finite number for quick experiments.
num_retrieval = float('Inf')

for idx, child in enumerate(root):
    if idx >= num_retrieval:
        break

    # Body is read with may_get_attrib(), so it can be None when missing.
    body = may_get_attrib(child, 'Body')

    if is_question(child):
        query_id = int(child.attrib['Id'])
        title = may_get_attrib(child, 'Title')
        query_dict[query_id] = {
            'Body': body,
            'Title': title,
            'AcceptedAnswerId': may_get_attrib(child, 'AcceptedAnswerId'),
            # `or ''` keeps a missing Body/Title from crashing tokenization;
            # such posts simply get an empty token list.
            'Tokens': tokenize(body or ''),
            'Tokens_title': tokenize(title or ''),
        }
    else:
        # Every post that is not a question is stored here, whether or not
        # it carries a ParentId.
        answer_id = int(child.attrib['Id'])
        answer_dict[answer_id] = {
            'Body': body,
            'ParentId': may_get_attrib(child, 'ParentId'),
            # Score is kept exactly as read from the attribute (no conversion).
            'Score': may_get_attrib(child, 'Score'),
            'Tokens': tokenize(body or ''),
        }

In [55]:
# (question title, answer body) pairs, one per answer whose question is known.
parallel = []
for answer in answer_dict.values():
    # Posts without a ParentId cannot be linked to a question.
    if not answer['ParentId']:
        continue
    question = query_dict.get(int(answer['ParentId']))
    if question is None:
        # The parent question was not loaded (for example when num_retrieval
        # truncated the read, or the question is absent from the dump).
        # Skip the answer rather than abort the whole cell with a KeyError.
        continue
    parallel.append((" ".join(question['Tokens_title']), " ".join(answer['Tokens'])))

# (question title, question body) pairs, one per question.
parallel_query = [
    (" ".join(question['Tokens_title']), " ".join(question['Tokens']))
    for question in query_dict.values()
]
print("{} training samples".format(len(parallel_query)))

390593 training samples


In [57]:
import subprocess as sp  # unused here; kept for any later cell that still expects `sp` to be defined

# Export the (question title -> question body) corpus.
parallel_corpus = parallel_query
# Write into the "_qd"-suffixed directory that this cell creates, so the files
# land in a directory that is guaranteed to exist and is not shared with the
# question/answer export.
data_dir = os.path.join("seq2seq_data", "{}_qd".format(data_source))
# Portable replacement for shelling out to `mkdir -p`.
os.makedirs(data_dir, exist_ok=True)

# Hold out the last `num_dev` pairs for development; everything before them
# is training data. With fewer than `num_dev` pairs, all of them go to dev.
num_dev = 5000
split_at = max(len(parallel_corpus) - num_dev, 0)
train_pairs = parallel_corpus[:split_at]
dev_pairs = parallel_corpus[split_at:]

# One pair per line. Use "\n" rather than os.linesep: files opened in text
# mode already translate "\n" to the platform line ending on write.
query = "\n".join(pair[0] for pair in parallel_corpus)
doc = "\n".join(pair[1] for pair in parallel_corpus)
query_train = "\n".join(pair[0] for pair in train_pairs)
doc_train = "\n".join(pair[1] for pair in train_pairs)
query_dev = "\n".join(pair[0] for pair in dev_pairs)
doc_dev = "\n".join(pair[1] for pair in dev_pairs)

outputs = {
    "text.query": query,
    "text.doc": doc,
    "train.text.query": query_train,
    "train.text.doc": doc_train,
    "dev.text.query": query_dev,
    "dev.text.doc": doc_dev,
}
for filename, text in outputs.items():
    # Explicit UTF-8 so non-ASCII tokens do not depend on the locale encoding.
    with open(os.path.join(data_dir, filename), "w", encoding="utf-8") as f:
        f.write(text)

In [None]:
# Export the (question title -> answer body) corpus.
parallel_corpus = parallel
# Write into the "_qa"-suffixed directory that this cell creates, so the files
# land in a directory that is guaranteed to exist and do not overwrite the
# output of any other export that uses the same file names.
data_dir = os.path.join("seq2seq_data", "{}_qa".format(data_source))
# Portable replacement for shelling out to `mkdir -p`.
os.makedirs(data_dir, exist_ok=True)

# Hold out the last `num_dev` pairs for development; everything before them
# is training data. With fewer than `num_dev` pairs, all of them go to dev.
num_dev = 5000
split_at = max(len(parallel_corpus) - num_dev, 0)
train_pairs = parallel_corpus[:split_at]
dev_pairs = parallel_corpus[split_at:]

# One pair per line. Use "\n" rather than os.linesep: files opened in text
# mode already translate "\n" to the platform line ending on write.
query = "\n".join(pair[0] for pair in parallel_corpus)
doc = "\n".join(pair[1] for pair in parallel_corpus)
query_train = "\n".join(pair[0] for pair in train_pairs)
doc_train = "\n".join(pair[1] for pair in train_pairs)
query_dev = "\n".join(pair[0] for pair in dev_pairs)
doc_dev = "\n".join(pair[1] for pair in dev_pairs)

outputs = {
    "text.query": query,
    "text.doc": doc,
    "train.text.query": query_train,
    "train.text.doc": doc_train,
    "dev.text.query": query_dev,
    "dev.text.doc": doc_dev,
}
for filename, text in outputs.items():
    # Explicit UTF-8 so non-ASCII tokens do not depend on the locale encoding.
    with open(os.path.join(data_dir, filename), "w", encoding="utf-8") as f:
        f.write(text)