In [62]:
# access chat noir api
# snippet from https://www.uni-weimar.de/medien/webis/events/pan-12/pan12-code/pan12-source-retrieval-baseline.py
import requests
import unicodedata
import simplejson
import sys
from local_settings import TOKEN

# ChatNoir search endpoint; pose_query() POSTs its JSON request to this URL.
CHATNOIR = 'https://www.chatnoir.eu/api/v1/_search'

def pose_query(query, token=TOKEN):
    """Pose a query to the ChatNoir search engine and return the parsed reply.

    Args:
        query: Search string. It is NFKD-normalized and any remaining
            non-ASCII characters are dropped before it is sent.
        token: ChatNoir API key; defaults to TOKEN from local_settings.

    Returns:
        The decoded JSON response body (the result of simplejson.loads),
        for the "cw12" index with at most 20 hits.

    On an HTTP error status (4xx/5xx) the server's error text is written to
    stderr and the process exits with status 1 via sys.exit(1).
    """
    def _to_ascii(text):
        # Decompose accented characters and drop whatever is still non-ASCII.
        normalized = unicodedata.normalize("NFKD", str(text))
        return normalized.encode("ascii", "ignore").decode("ascii")

    # Serialize with a JSON encoder rather than string formatting so that
    # quotes or backslashes inside the query cannot break the request body.
    json_query = simplejson.dumps({
        "apikey": _to_ascii(token),
        "query": _to_ascii(query),
        "index": ["cw12"],
        "size": 20,
        "pretty": True,
    }).encode("ascii")

    try:
        response = requests.post(CHATNOIR, data=json_query)
        try:
            print(response)
            # requests does not raise on 4xx/5xx by itself; without this call
            # the HTTPError handler below could never run.
            response.raise_for_status()
            return simplejson.loads(response.text)
        finally:
            # Release the connection even when parsing or the status check fails.
            response.close()
    except requests.HTTPError as e:
        if e.response is not None:
            error_message = e.response.text
        else:
            error_message = str(e)
        print(error_message, file=sys.stderr)
        sys.exit(1)

In [63]:
# collect topics from an external collection
# sample relevant documents
# Run one example query through pose_query(); `results` holds the parsed
# response that the following cells read.
q = 'acid stain concrete'
results = pose_query(q)

<Response [200]>


In [64]:
# Show the raw response: a dict with 'meta' and 'results' keys (see the output below).
print(results)

{'meta': {'query_time': 11391, 'total_results': 9523, 'indices': ['cw12']}, 'results': [{'score': 3110.478, 'uuid': 'faa7cfe6-bda7-5654-bc36-f30cf9a3bcbf', 'index': 'cw12', 'trec_id': 'clueweb12-1507wb-01-08446', 'target_hostname': 'www.elitecreteaustralia.com.au', 'target_uri': 'http://www.elitecreteaustralia.com.au/concrete-acid-stain', 'page_rank': 1.7153556e-09, 'spam_rank': 69, 'title': '<em>Stain</em> <em>Concrete</em> <em>Acid</em> <em>Stain</em> <em>Concrete</em> Staining <em>Concrete</em> Colour', 'snippet': 'Home » Exterior Coatings » <em>Stain</em> <em>Concrete</em> With <em>Acid</em> <em>Stain</em> An <em>acid</em> <em>stain</em> is used to <em>stain</em> <em>concrete</em> and create decorative <em>concrete</em> art form that transforms plain grey <em>concrete</em> into a multitude of variegated, mottled, translucent and authentic appearing <em>concrete</em> patterns and designs.', 'explanation': None}, {'score': 2898.672, 'uuid': '40be4deb-70e1-590a-bfc1-cc5e14159153', 'in

In [78]:
# clean pages
# For every search hit, fetch the cached page from ChatNoir, extract its main
# text with newspaper, and keep the long paragraphs keyed by the hit's uuid.
from newspaper import Article

# Only paragraphs longer than this many characters are kept.
MIN_PARAGRAPH_CHARS = 250

cluster = {}

for result in results['results']:

    uuid = result['uuid']
    print(uuid)

    url = 'https://www.chatnoir.eu/cache?uuid=%s&index=cw12&raw&plain' % uuid
    a = Article(url, language='en')
    try:
        a.download()
        a.parse()
        title = a.title
        text = a.text
        # collect paragraphs
        paragraphs = [p for p in text.split('\n') if len(p) > MIN_PARAGRAPH_CHARS]
        if paragraphs:
            cluster[uuid] = paragraphs
    except Exception as exc:
        # Best-effort: a page that fails to download or parse is skipped, but
        # the failure is reported instead of being silently swallowed.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the loop.
        print('  skipped %s: %r' % (uuid, exc))

print(len(cluster), 'pages')

faa7cfe6-bda7-5654-bc36-f30cf9a3bcbf
40be4deb-70e1-590a-bfc1-cc5e14159153
63e74d1b-d98d-5fdc-84ea-919cfacd1ef6
08c49060-4cfa-500f-8609-8dcba80b0fd2
514911c8-0dbd-58f0-b0a6-2bcf816dd4be
c0c8f258-db7a-5a46-bbe5-e1c534565b33
c5287b11-5862-5ffb-b66c-2b9216a6404a
f8576743-b77e-55c0-9614-98ae666edd28
ca17178d-dcc0-5cd1-b9ee-77ca91965c5d
bf0e5953-e098-5705-93bd-7c97139fc961
f9876d87-659b-544b-90a2-f5be95fba45b
8a74eb68-b67a-52cb-ab77-47257629a12a
216cb15c-5f9d-5ff5-af2e-73c9ca3360ec
e9706e05-4ced-5fc0-b82b-3ba0e4b3c513
04f9053c-6a24-53c7-9b57-e64dccb632df
288be281-7ec8-56a1-88d6-80c4e3e9a046
92b0f1d3-c908-5e9d-865a-5cb21d6dd99a
9277ec9d-a8ac-5a25-8cca-3276025b1cbd
86456281-93f9-5213-aa13-d7c96bf0b212
56370499-e2eb-517f-ad2a-a88a9e3d26ee
14 pages


In [1]:
# Flatten the per-page paragraph lists in `cluster` into one list, `paras`,
# printing every paragraph for visual inspection along the way.
paras = []
for uuid, paragraphs in cluster.items():
    print(uuid)
    for p in paragraphs:
        print(p, '\n')
        # The passages are later joined with "\n" as the separator, so no
        # passage may itself contain a newline. (The previous check looked for
        # the two characters "/n", which can occur in ordinary text.)
        assert '\n' not in p
        paras.append(p)
    print('\n')
print(len(paras))

NameError: name 'cluster' is not defined

In [None]:
# dump passages
# Write the collected passages plus the generation settings to
# <squash_path>/squash/temp/<key>/metadata.json.
import datetime
import json
import os

squash_path = '/home/svakule/squash-generation'

# `key` must be defined before it is used to build the output directory.
key = 'clueweb12'
top_p = 0.9
gen_frac = 0.5
spec_frac = 0.8

# makedirs(exist_ok=True) creates missing parent directories and does not
# fail when the cell is re-run and the directory already exists.
temp_dir = "%s/squash/temp/%s" % (squash_path, key)
os.makedirs(temp_dir, exist_ok=True)

metadata = {
        # Passages are separated by single newlines.
        "input_text": "\n".join(paras),
        "key": key,
        "timestamp": str(datetime.datetime.now()),
        "settings": {
            "top_p": top_p,
            "gen_frac": gen_frac,
            "spec_frac": spec_frac
        }
    }


with open('%s/metadata.json' % temp_dir, 'w') as f:
    json.dump(metadata, f)

# python squash/extract_answers.py --key clueweb12
# python question-generation/interact.py --model_checkpoint question-generation/gpt2_corefs_question_generation --model_type gpt2 --key clueweb12 --filename squash/temp/clueweb12/input.pkl
# vim squash/temp/clueweb12/generated_questions.json