In [1]:
!pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

Collecting sparqlwrapper
  Downloading https://files.pythonhosted.org/packages/00/9b/443fbe06996c080ee9c1f01b04e2f683b2b07e149905f33a2397ee3b80a2/SPARQLWrapper-1.8.5-py3-none-any.whl
Collecting rdflib>=4.0 (from sparqlwrapper)
  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
Collecting isodate (from rdflib>=4.0->sparqlwrapper)
  Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)
Installing collected packages: isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.0 rdflib-4.2.2 sparqlwrapper-1.8.5


You are using pip version 9.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [8]:
import pandas as pd
import time 
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?human ?cause ?date_of_birth ?date_of_death ?country ?occupation WHERE {
  ?h wdt:P31 wd:Q5.
  ?h wdt:P1196 ?cid.
  ?h wdt:P570 ?date_of_death.
  ?h wdt:P106 ?occup.
  ?h wdt:P27 ?count
  Filter ((str(?cause)) = "suicide" || (str(?cause)) ="homicide" || (str(?cause)) = "natural causes" || str(?cause) = "accident")
  OPTIONAL {
    ?h rdfs:label ?human.
    FILTER((LANG(?human)) = "en")
  }
  OPTIONAL {
    ?cid rdfs:label ?cause.
    FILTER((LANG(?cause)) = "en")
  }
  OPTIONAL {
    ?count rdfs:label ?country.
    FILTER((LANG(?country)) = "en")
  }
    OPTIONAL {
    ?occup rdfs:label ?occupation.
    FILTER((LANG(?occupation)) = "en")
  }
}
limit  %s
offset %s
"""

def get_results(endpoint_url, query, limit=100000, offset=0):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query % (limit, offset))
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

max_items = 100000
batch_size = 50000
to_skip = 0

results = []


for i in tqdm(range(max_items//batch_size)):
  results.extend(get_results(endpoint_url, query, limit=batch_size, offset=to_skip)['results']['bindings'])
  to_skip += batch_size
  time.sleep(10)


data = pd.DataFrame([{k: v['value'] for k, v in d.items()} for d in results])
print(f'Got {len(data)} items from wikidata')
data.head()

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:01<00:00, 30.66s/it]


Got 100000 items from wikidata


Unnamed: 0,cause,country,date_of_death,human,occupation
0,suicide,Germany,1898-03-31T00:00:00Z,Eleanor Marx,politician
1,suicide,Germany,1898-03-31T00:00:00Z,Eleanor Marx,translator
2,suicide,United States of America,1895-07-28T00:00:00Z,Henry Shimer,zoologist
3,suicide,Germany,2009-01-05T00:00:00Z,Adolf Merckle,entrepreneur
4,suicide,Germany,2009-01-05T00:00:00Z,Adolf Merckle,jurist
