-
-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
181 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import logging | ||
import requests | ||
|
||
from time import sleep | ||
from opentapioca.wditem import WikidataItemDocument | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
class APIReaderBase(object):
    """
    Base class for a reader that relies on the MediaWiki API to fetch
    item contents.

    A failed request is attempted up to ``self.retries`` times in total,
    sleeping ``(attempt + 1) * self.delay`` seconds between attempts.
    """

    def __init__(self, mediawiki_api):
        """
        :param mediawiki_api: URL of the MediaWiki API endpoint that
            `fetch_items` sends its requests to.
        """
        self.mediawiki_api = mediawiki_api
        # Total number of attempts made by one `fetch_items` call.
        self.retries = 5
        # Base delay in seconds; multiplied by the attempt number.
        self.delay = 5

    def fetch_items(self, qids):
        """
        Given a list of qids, fetch the corresponding documents via the Wikidata API.

        :param qids: list of entity id strings; they are joined with "|"
            into a single `wbgetentities` request.
        :returns: a list of `WikidataItemDocument`, one per returned
            entity that is not flagged as "missing". An empty `qids`
            yields an empty list without issuing any request.
        :raises: the exception of the last attempt (a `requests`
            exception, `ValueError`, `TypeError` or `AttributeError`)
            once all attempts have failed.
        """
        if not qids:
            return []
        for attempt in range(self.retries):
            # Reset on every attempt: if `requests.get` itself raises, no
            # response exists, and the failure logging below must neither
            # hit an unbound name nor report a previous attempt's response.
            req = None
            try:
                req = requests.get(self.mediawiki_api, {
                    'format': 'json',
                    'action': 'wbgetentities',
                    'ids': '|'.join(qids)})
                req.raise_for_status()
                result = req.json().get('entities').values()
                return [WikidataItemDocument(payload) for payload in result if 'missing' not in payload]
            except (requests.exceptions.RequestException, ValueError, TypeError, AttributeError) as e:
                logger.warning(e)
                if attempt < self.retries - 1:
                    # Linear back-off: wait longer after each failure.
                    sleep_time = (1 + attempt) * self.delay
                    logger.info('Retrying wbgetentities in {}'.format(sleep_time))
                    sleep(sleep_time)
                else:
                    logger.error('Failed to fetch entities')
                    # Fall back to the endpoint URL when no response was
                    # obtained, so the original error is what propagates.
                    logger.error(req.url if req is not None else self.mediawiki_api)
                    raise
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import json | ||
import re | ||
import logging | ||
|
||
from .apireaderbase import APIReaderBase | ||
from opentapioca.sparqlwikidata import sparql_wikidata | ||
from opentapioca.utils import to_q | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
class SparqlReader(APIReaderBase):
    """
    Reader that yields the `WikidataItemDocument` of every item returned
    by a SPARQL query. The query must bind an "item" variable.

    Intended to be used as a context manager: entering it runs the
    query, and iterating over it afterwards fetches the item contents
    in batches of `batch_size`.
    """

    def __init__(self,
                 query,
                 endpoint='https://query.wikidata.org/sparql',
                 mediawiki_api='https://www.wikidata.org/w/api.php'):
        """
        :param query: the SPARQL query to run
        :param endpoint: URL of the SPARQL endpoint the query is sent to
        :param mediawiki_api: URL of the MediaWiki API used to fetch
            the contents of the items
        """
        super(SparqlReader, self).__init__(mediawiki_api)
        self.query = query
        self.endpoint = endpoint
        # Stays None until the reader has been entered.
        self.query_results = None
        # Number of query results resolved per `fetch_items` call.
        self.batch_size = 50

    def __enter__(self):
        """
        Runs the query and stores its bindings for later iteration.
        """
        response = sparql_wikidata(self.query, endpoint=self.endpoint)
        self.query_results = response['bindings']
        return self

    def __exit__(self, *args, **kwargs):
        """
        Nothing to release. Returning None lets any exception raised
        inside the `with` block propagate.
        """
        return None

    def __iter__(self):
        """
        Yields the fetched items batch by batch. The stored query
        results are consumed as iteration advances.

        Raises `ValueError` if the reader has not been entered first.
        """
        if self.query_results is None:
            raise ValueError('Query results have not been fetched.')
        while self.query_results:
            size = self.batch_size
            # Split off the next batch and keep only the remainder.
            batch, self.query_results = self.query_results[:size], self.query_results[size:]

            # Collect the usable ids: skip bindings without an "item"
            # and values that `to_q` turns into a falsy result.
            qids_without_none = []
            for binding in batch:
                if 'item' not in binding:
                    continue
                qid = to_q(binding['item']['value'])
                if qid:
                    qids_without_none.append(qid)

            # Fetch item contents
            for item in self.fetch_items(qids_without_none):
                yield item
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
import requests | ||
|
||
def sparql_wikidata(query_string): | ||
results = requests.get('https://query.wikidata.org/sparql', {'query': query_string, 'format': 'json'}).json() | ||
def sparql_wikidata(query_string, endpoint='https://query.wikidata.org/sparql'):
    """
    Sends `query_string` to the SPARQL `endpoint`, asking for JSON, and
    returns the "results" member of the decoded response.
    """
    params = {'query': query_string, 'format': 'json'}
    response = requests.get(endpoint, params)
    return response.json()['results']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{"results":{"bindings":[ | ||
{"item":{"value":"http://www.wikidata.org/entity/Q123"}}, | ||
{"item":{"value":"http://www.wikidata.org/entity/Q456"}}, | ||
{"item":{"value":null}}, | ||
{"item":{"value":"http://www.wikidata.org/entity/Q789"}} | ||
]}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
|
||
import pytest | ||
import requests_mock | ||
import os | ||
|
||
from opentapioca.readers.sparqlreader import SparqlReader | ||
from .test_fixtures import wbgetentities_response | ||
from .test_fixtures import testdir | ||
|
||
@pytest.fixture
def dummy_sparql_query_response(testdir):
    """
    Text of the canned SPARQL response stored as
    `data/dummy_sparql_query_response.json` under `testdir`.
    """
    response_path = os.path.join(testdir, 'data', 'dummy_sparql_query_response.json')
    with open(response_path, 'r') as handle:
        contents = handle.read()
    return contents
|
||
def test_iterate(wbgetentities_response, dummy_sparql_query_response):
    """
    Entering the reader and iterating over it, with both HTTP endpoints
    mocked, yields the items with ids Q123 and Q456.
    """
    sparql_reader = SparqlReader("mysparqlquery")
    with requests_mock.mock() as mocker:
        # Canned answers for the entity lookup and for the SPARQL query.
        mocker.get('https://www.wikidata.org/w/api.php?format=json&action=wbgetentities&ids=Q123%7CQ456%7CQ789', text=wbgetentities_response)
        mocker.get('https://query.wikidata.org/sparql?format=json&query=mysparqlquery', text=dummy_sparql_query_response)

        with sparql_reader as entered_reader:
            items = [item for item in entered_reader]

        fetched_ids = [item.get('id') for item in items]
        assert fetched_ids == ['Q123', 'Q456']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters