In [143]:
%config IPCompleter.greedy=True
from SPARQLWrapper import SPARQLWrapper, TURTLE, POST, SPARQLWrapper2
from rdflib import Graph
from qwikidata.sparql import return_sparql_query_results 
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from urllib.request import urlopen
import requests
import json
import re
import pandas as pd
import os
from time import sleep

In [2]:
def json_read(filename):
    with open(filename, 'r') as inf:
        res = json.load(inf)
    return res

def json_dump(obj, filename, ea=False, indent=4):
    with open(filename, 'w') as ouf:
        json.dump(obj, ouf, ensure_ascii=ea, indent=indent)

In [3]:
sparql = SPARQLWrapper('https://query.wikidata.org/sparql')

In [4]:
sparql.setQuery('''
CONSTRUCT {
  wd:Q42 ?p1 ?o.
  ?s ?p2 wd:Q42.
}
WHERE {
  {wd:Q42 ?p1 ?o.}
  UNION
  {?s ?p2 wd:Q42.}
}''')

In [124]:
sparql.setQuery('''
CONSTRUCT {
  ?e ?p1 ?o .
  ?s ?p2 ?e .
}
WHERE {
  VALUES ?e { wd:Q42 wd:Q1 } .
  { ?e ?p1 ?o . }
  UNION
  { ?s ?p2 ?e . }
}''')

In [123]:
sparql.method = POST

In [125]:
result = sparql.queryAndConvert()

In [24]:
with open('graph.ttl', 'bw') as ouf:
    ouf.write(result.serialize(format='turtle'))

In [126]:
qres = result.query('''
SELECT ?x ?date
WHERE {
    ?x wdt:P569 ?date .
}
''')

In [127]:
for row in qres:
    print(row[0])

http://www.wikidata.org/entity/Q42


In [37]:
g = Graph()

In [38]:
g.parse('graph.ttl', format='turtle')

<Graph identifier=N953c2c1f37e84fba8984dff7b9594ba3 (<class 'rdflib.graph.Graph'>)>

In [260]:
qres = g.query('''
SELECT ?x ?p ?y
WHERE {
    VALUES ?p { wdt:P569 wdt:P19 }
    ?x ?p ?y .
}
''')

In [261]:
for row in qres:
    print(row)

(rdflib.term.URIRef('http://www.wikidata.org/entity/Q42'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P569'), rdflib.term.Literal('1952-03-11T00:00:00+00:00', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#dateTime')))
(rdflib.term.URIRef('http://www.wikidata.org/entity/Q42'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P19'), rdflib.term.URIRef('http://www.wikidata.org/entity/Q350'))


### Entities selection

In [4]:
q_to_labels = defaultdict(list)
with open('labels_token.txt') as inf:
    for line in inf:
        line = line.strip('\n')
        qid, label = line.split(':', 1)
        q_to_labels[qid].append(label)

In [5]:
len(q_to_labels)

4114595

In [6]:
cnt = 0
for qid in q_to_labels:
    if int(qid[1:]) > 1000000:
        cnt += 1
cnt

3681522

In [7]:
russian_letters = 'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя'

In [8]:
def validate_entity(qid, labels):
    if int(qid[1:]) <= 1000000:
        return True
    for label in labels:
        for letter in label:
            if letter in russian_letters:
                return True
    return False

In [9]:
good_entities = []
bad_entities = []
for qid, labels in tqdm(q_to_labels.items()):
    if validate_entity(qid, labels):
        good_entities.append(qid)
    else:
        bad_entities.append(qid)
good_entities.sort(key=lambda qid: int(qid[1:]))
bad_entities.sort(key=lambda qid: int(qid[1:]))

HBox(children=(IntProgress(value=0, max=4114595), HTML(value='')))




In [10]:
len(good_entities)

1679872

In [11]:
len(bad_entities)

2434723

### Predicates selection

In [342]:
rgx = re.compile(r'\b\w*:\w*\b')

In [353]:
pred_candidates = []
with open('kbqa_russian_dataset.json', 'r') as inf:
    for line in inf:
        pred_candidates.extend(re.findall(rgx, line))

In [361]:
predicates = set()
for cand in pred_candidates:
    if not cand.startswith('wd:'):
        predicates.add(cand)

In [365]:
predicates

{'23T00:00',
 '28T00:00',
 ':00Z',
 'p:P1346',
 'p:P1448',
 'p:P161',
 'p:P166',
 'p:P175',
 'p:P180',
 'p:P2046',
 'p:P2048',
 'p:P2225',
 'p:P2234',
 'p:P26',
 'p:P279',
 'p:P36',
 'p:P371',
 'p:P39',
 'p:P457',
 'p:P527',
 'p:P54',
 'p:P6',
 'p:P69',
 'p:P734',
 'p:P735',
 'pq:P1545',
 'pq:P1686',
 'pq:P2079',
 'pq:P453',
 'pq:P459',
 'pq:P512',
 'pq:P580',
 'pq:P582',
 'pq:P585',
 'pq:P642',
 'ps:P1346',
 'ps:P1448',
 'ps:P161',
 'ps:P166',
 'ps:P175',
 'ps:P26',
 'ps:P36',
 'ps:P371',
 'ps:P39',
 'ps:P457',
 'ps:P527',
 'ps:P54',
 'ps:P6',
 'ps:P734',
 'ps:P735',
 'psn:P2046',
 'psn:P2048',
 'psn:P2225',
 'psn:P2234',
 'rdfs:label',
 'skos:altLabel',
 'wdt:P101',
 'wdt:P102',
 'wdt:P1027',
 'wdt:P1029',
 'wdt:P1050',
 'wdt:P1056',
 'wdt:P106',
 'wdt:P1066',
 'wdt:P1071',
 'wdt:P108',
 'wdt:P1101',
 'wdt:P111',
 'wdt:P1113',
 'wdt:P1114',
 'wdt:P112',
 'wdt:P1132',
 'wdt:P115',
 'wdt:P1165',
 'wdt:P1181',
 'wdt:P1196',
 'wdt:P122',
 'wdt:P123',
 'wdt:P1269',
 'wdt:P127',
 'wdt:P130

In [363]:
pred_sufs = set()
for pred in predicates:
    suf = pred.split(':')[0]
    pred_sufs.add(suf)

In [364]:
pred_sufs

{'',
 '23T00',
 '28T00',
 'p',
 'pq',
 'ps',
 'psn',
 'rdfs',
 'skos',
 'wdt',
 'wikibase',
 'xml',
 'xsd'}

In [461]:
special_predicates = [
    'wikibase:quantityAmount',
    'rdfs:label',
    'skos:altLabel'
]
predicate_suffixes = [
    'wdt',
    'p',
    'pq',
    'ps',
    'psn',
]

In [444]:
all_props = pd.read_csv('clean_properties_data.csv', header=0)
properties = all_props.values[:, 0].tolist()
freqs = all_props.values[:, -1].tolist()

In [447]:
freqs = [int(s.replace(',', '')) for s in freqs]

In [452]:
cnt = 0
for f in freqs:
    if f > 1000:
        cnt += 1
cnt

1013

In [399]:
for pred in predicates:
    prop = pred.split(':')[1]
    if prop not in properties:
        print(prop)

00
00Z
lang
00
altLabel
dateTime
quantityAmount
label


In [484]:
all_predicates = []
for prop in properties:
    for suf in predicate_suffixes[:1]:
        all_predicates.append(f'{suf}:{prop}')
all_predicates.extend(special_predicates)

In [485]:
len(all_predicates)

2274

In [486]:
all_predicates_string = ' '.join(all_predicates)

In [496]:
query = f'''
CONSTRUCT {{
  ?e0 ?p1 ?e1 .
}}
WHERE {{
  VALUES ?e0 {{ {uris} }} .
  ?e0 ?p1 ?e1 .
  FILTER strstarts(str(?p1), str(wdt:)) .
}}
'''

In [474]:
query = '''
CONSTRUCT {
  ?x ps:P6 ?y
}
WHERE {
  ?x ps:P6 ?y
}
LIMIT 1
'''

In [478]:
qr = requests.get('https://query.wikidata.org/sparql', params={'format': 'json', 'query': query})

In [479]:
qr

<Response [502]>

In [433]:
qr.json()

{'head': {'vars': ['x', 'y']},
 'results': {'bindings': [{'x': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/statement/Q30-18C3B0DA-1F3E-4D3F-B3D0-807697C1C023'},
    'y': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q23'}}]}}

In [443]:
existing_predicates = []

In [None]:
for pred in tqdm(all_predicates):
    query = f'''
    SELECT ?x ?y
    WHERE {{
      ?x {pred} ?y
    }}
    LIMIT 1
    '''
    res = requests.get('https://query.wikidata.org/sparql', params={'format': 'json', 'query': query}).json()
    if res['results']['bindings']:
        existing_predicates.append(pred)

In [497]:
print(query)


CONSTRUCT {
  ?e0 ?p1 ?e1 .
}
WHERE {
  VALUES ?e0 { wd:Q60041228 wd:Q60041406 wd:Q60041588 wd:Q60041613 wd:Q60041627 wd:Q60041669 wd:Q60041686 wd:Q60041693 wd:Q60042814 wd:Q60042828 wd:Q60042839 wd:Q60043576 wd:Q60043703 wd:Q60043850 wd:Q60043905 wd:Q60044496 wd:Q60044880 wd:Q60045011 wd:Q60045116 wd:Q60046381 wd:Q60046406 wd:Q60046634 wd:Q60046664 wd:Q60046734 wd:Q60046752 wd:Q60046894 wd:Q60047030 wd:Q60047357 wd:Q60047826 wd:Q60048020 wd:Q60048704 wd:Q60048762 wd:Q60048974 wd:Q60049322 wd:Q60049925 wd:Q60049972 wd:Q60050056 wd:Q60050372 wd:Q60050375 wd:Q60050411 wd:Q60050475 wd:Q60050578 wd:Q60050610 wd:Q60050663 wd:Q60050689 wd:Q60050753 wd:Q60050891 wd:Q60050905 wd:Q60051022 wd:Q60052987 wd:Q60053752 wd:Q60053844 wd:Q60053934 wd:Q60053987 wd:Q60053996 wd:Q60054046 wd:Q60054061 wd:Q60054071 wd:Q60054392 wd:Q60054429 wd:Q60054545 wd:Q60055016 wd:Q60055530 wd:Q60056218 wd:Q60056675 wd:Q60061634 wd:Q60061768 wd:Q60062595 wd:Q60063848 wd:Q60066005 wd:Q60066006 wd:Q60066008 wd:Q600660

In [498]:
resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})

In [500]:
with open('graph.ttl', 'w') as ouf:
    ouf.write(resp.text)

### Send construct

#### Only wdt:

In [505]:
len(good_entities)

1679872

In [507]:
for i in tqdm(range(1680)):
    begin = i * 1000
    end = (i + 1) * 1000
    uris = ' '.join(['wd:' + qid for qid in good_entities[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?e0 ?p1 ?e1 .
    }}
    WHERE {{
      VALUES ?e0 {{ {uris} }} .
      ?e0 ?p1 ?e1 .
      FILTER strstarts(str(?p1), str(wdt:)) .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/wdt/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=1680), HTML(value='')))




In [12]:
len(bad_entities)

2434723

In [14]:
for i in tqdm(range(2435)):
    begin = i * 1000
    end = (i + 1) * 1000
    uris = ' '.join(['wd:' + qid for qid in bad_entities[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?e0 ?p1 ?e1 .
    }}
    WHERE {{
      VALUES ?e0 {{ {uris} }} .
      ?e0 ?p1 ?e1 .
      FILTER strstarts(str(?p1), str(wdt:)) .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/wdt_ext/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=2435), HTML(value='')))




In [27]:
print(missed)

[15, 23, 153, 155, 156, 157, 160, 192, 193, 256, 426, 435, 437, 703, 708, 711, 712, 983, 984, 988, 995, 999, 1005, 1007, 1008, 1010, 1012, 1015, 1018, 1020, 1024, 1028, 1029, 1536, 1538, 1539]


In [74]:
for i in tqdm(missed):
    begin = i * 1000
    end = (i + 1) * 1000
    uris = ' '.join(['wd:' + qid for qid in good_entities[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?e0 ?p1 ?e1 .
    }}
    WHERE {{
      VALUES ?e0 {{ {uris} }} .
      ?e0 ?p1 ?e1 .
      FILTER strstarts(str(?p1), str(wdt:)) .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/wdt/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

In [169]:
patch_entities = []
with open('/Volumes/Seagate/wikidata_dump/add_ent.txt') as inf:
    for line in inf:
        patch_entities.append(line.strip())

In [170]:
len(patch_entities)

43

In [171]:
uris = ' '.join(['wd:' + qid for qid in patch_entities])
query = f'''
CONSTRUCT {{
  ?e0 ?p1 ?e1 .
}}
WHERE {{
  VALUES ?e0 {{ {uris} }} .
  ?e0 ?p1 ?e1 .
  FILTER strstarts(str(?p1), str(wdt:)) .
}}
'''
resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
with open(f'/Volumes/Seagate/wikidata_dump/wdt/1680.ttl', 'w') as ouf:
    ouf.write(resp.text)

In [172]:
uris = ' '.join(['wd:' + qid for qid in patch_entities])
query = f'''
CONSTRUCT {{
      ?x ?p ?y .
      ?y ?p2 ?z .
      ?z wikibase:quantityAmount ?t .
    }}
    WHERE {{
      VALUES ?x {{ {uris} }}
      ?x ?p ?y .
      FILTER strstarts(str(?p), "http://www.wikidata.org/prop/P") .
      ?y ?p2 ?z .
      FILTER strstarts(str(?p2), str(p:)) .
      OPTIONAL {{ ?z wikibase:quantityAmount ?t }} .
    }}
    '''
resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
with open(f'/Volumes/Seagate/wikidata_dump/pch/1680.ttl', 'w') as ouf:
    ouf.write(resp.text)

#### Chains with p:

In [518]:
for i in tqdm(range(1680)):
    begin = i * 1000
    end = (i + 1) * 1000
    uris = ' '.join(['wd:' + qid for qid in good_entities[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?x ?p ?y .
      ?y ?p2 ?z .
      ?z wikibase:quantityAmount ?t .
    }}
    WHERE {{
      VALUES ?x {{ {uris} }}
      ?x ?p ?y .
      FILTER strstarts(str(?p), "http://www.wikidata.org/prop/P") .
      ?y ?p2 ?z .
      FILTER strstarts(str(?p2), str(p:)) .
      OPTIONAL {{ ?z wikibase:quantityAmount ?t }} .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/pch/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=680), HTML(value='')))

In [15]:
for i in tqdm(range(2435)):
    begin = i * 1000
    end = (i + 1) * 1000
    uris = ' '.join(['wd:' + qid for qid in bad_entities[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?x ?p ?y .
      ?y ?p2 ?z .
      ?z wikibase:quantityAmount ?t .
    }}
    WHERE {{
      VALUES ?x {{ {uris} }}
      ?x ?p ?y .
      FILTER strstarts(str(?p), "http://www.wikidata.org/prop/P") .
      ?y ?p2 ?z .
      FILTER strstarts(str(?p2), str(p:)) .
      OPTIONAL {{ ?z wikibase:quantityAmount ?t }} .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/pch_ext/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=2435), HTML(value='')))




In [67]:
print(missed)

[55, 56, 57, 59, 73, 76, 120, 137, 167, 171, 196, 199, 205, 206, 318, 355, 402, 404, 626, 632, 663]


In [68]:
for i in tqdm(missed):
    begin = i * 1000
    end = (i + 1) * 1000
    uris = ' '.join(['wd:' + qid for qid in good_entities[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?x ?p ?y .
      ?y ?p2 ?z .
      ?z wikibase:quantityAmount ?t .
    }}
    WHERE {{
      VALUES ?x {{ {uris} }}
      ?x ?p ?y .
      FILTER strstarts(str(?p), "http://www.wikidata.org/prop/P") .
      ?y ?p2 ?z .
      FILTER strstarts(str(?p2), str(p:)) .
      OPTIONAL {{ ?z wikibase:quantityAmount ?t }} .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/pch/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

#### P131

In [49]:
dataset = json_read('kbqa_russian_dataset.json')

In [127]:
entities_set = set()
for entry in dataset[:1200]:
    qids = [uri.split('/')[-1] for uri in entry['question_uris']]
    for ans in entry['answers']:
        if ans['type'] == 'uri':
            qids.append(ans['value'].split('/')[-1])
    for qid in qids:
        entities_set.add(qid)

In [129]:
entities_set = list(entities_set)

In [128]:
len(entities_set)

2356

In [62]:
geo_entities = ['Q60874', 'Q83497', 'Q3116386', 'Q868666', 'Q57965', 'Q12800833', 'Q344569', 'Q2228393']

In [64]:
uris = ' '.join(['wd:' + qid for qid in geo_entities])
query = f'''
CONSTRUCT {{
  ?x wdt:P131 ?e1 .
  ?e1 wdt:P131 ?e2 .
  ?e2 wdt:P131 ?e3 .
}}
WHERE {{
  VALUES ?x {{ {uris} }}
  ?x wdt:P131 ?e1 .
  OPTIONAL {{ ?e1 wdt:P131 ?e2 }} .
  OPTIONAL {{ ?e2 wdt:P131 ?e3 }} .
}}
'''
resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
with open(f'/Volumes/Seagate/wikidata_dump/geo.ttl', 'w') as ouf:
    ouf.write(resp.text)

In [61]:
for i in tqdm(range(123)):
    begin = i * 10
    end = (i + 1) * 10
    uris = ' '.join(['wd:' + qid for qid in entities_set[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?x wdt:P131 ?e1 .
      ?e1 wdt:P131 ?e2 .
      ?e2 wdt:P131 ?e3 .
    }}
    WHERE {{
      VALUES ?x {{ {uris} }}
      ?x wdt:P131 ?e1 .
      OPTIONAL {{ ?e1 wdt:P131 ?e2 }} .
      OPTIONAL {{ ?e2 wdt:P131 ?e3 }} .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/geo/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

KeyboardInterrupt: 

#### Labels and aliases

In [158]:
for i in tqdm(range(4115)):
    if os.stat(f'/Volumes/Seagate/wikidata_dump/als/{i}.ttl').st_size == 0:
        sleep(1)
        begin = i * 1000
        end = (i + 1) * 1000
        uris = ' '.join(['wd:' + qid for qid in all_entities[begin:end]])
        query = f'''
        CONSTRUCT {{
          ?e0 skos:altLabel ?label .
        }}
        WHERE {{
          VALUES ?e0 {{ {uris} }} .
          ?e0 skos:altLabel ?label .
          FILTER (lang(?label) = "ru") .
        }}
        '''
        resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
        with open(f'/Volumes/Seagate/wikidata_dump/als/{i}.ttl', 'w') as ouf:
            ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=4115), HTML(value='')))

In [160]:
cnt = 0
for i in tqdm(range(4115)):
    if os.stat(f'/Volumes/Seagate/wikidata_dump/als/{i}.ttl').st_size < 2000:
#         print(i, os.stat(f'/Volumes/Seagate/wikidata_dump/als/{i}.ttl').st_size)
        cnt += 1
cnt

HBox(children=(IntProgress(value=0, max=2435), HTML(value='')))

0

In [141]:
os.stat(f'/Volumes/Seagate/wikidata_dump/lbl/1011.ttl').st_size

61815

In [511]:
uris = ' '.join(['wd:' + qid for qid in good_entities[:1000]])

In [512]:
query = f'''
    CONSTRUCT {{
      ?x ?p ?y .
      ?y ?p2 ?z .
      ?z wikibase:quantityAmount ?t .
    }}
    WHERE {{
      VALUES ?x {{ {uris} }}
      ?x ?p ?y .
      FILTER strstarts(str(?p), "http://www.wikidata.org/prop/P") .
      ?y ?p2 ?z .
      FILTER strstarts(str(?p2), str(p:)) .
      OPTIONAL {{ ?z wikibase:quantityAmount ?t }} .
    }}
'''

In [510]:
print(query)


CONSTRUCT {
  ?e0 ?p1 ?e1 .
}
WHERE {
  VALUES ?e0 { wd:Q1 wd:Q2 wd:Q3 wd:Q4 wd:Q5 wd:Q8 wd:Q13 wd:Q15 wd:Q16 wd:Q17 wd:Q18 wd:Q19 wd:Q20 wd:Q21 wd:Q22 wd:Q23 wd:Q24 wd:Q25 wd:Q26 wd:Q27 wd:Q28 wd:Q29 wd:Q30 wd:Q31 wd:Q32 wd:Q33 wd:Q34 wd:Q35 wd:Q36 wd:Q37 wd:Q38 wd:Q39 wd:Q40 wd:Q41 wd:Q42 wd:Q43 wd:Q44 wd:Q45 wd:Q46 wd:Q48 wd:Q49 wd:Q51 wd:Q52 wd:Q53 wd:Q54 wd:Q55 wd:Q57 wd:Q58 wd:Q59 wd:Q60 wd:Q61 wd:Q62 wd:Q64 wd:Q65 wd:Q66 wd:Q67 wd:Q68 wd:Q69 wd:Q70 wd:Q71 wd:Q72 wd:Q73 wd:Q74 wd:Q75 wd:Q76 wd:Q77 wd:Q78 wd:Q79 wd:Q80 wd:Q81 wd:Q82 wd:Q83 wd:Q84 wd:Q85 wd:Q86 wd:Q87 wd:Q88 wd:Q89 wd:Q90 wd:Q91 wd:Q94 wd:Q95 wd:Q96 wd:Q97 wd:Q98 wd:Q99 wd:Q100 wd:Q101 wd:Q103 wd:Q105 wd:Q107 wd:Q108 wd:Q109 wd:Q110 wd:Q111 wd:Q112 wd:Q113 wd:Q114 wd:Q115 wd:Q116 } .
  ?e0 ?p1 ?e1 .
  FILTER strstarts(str(?p1), str(wdt:)) .
}



In [513]:
resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})

In [514]:
with open('graph.ttl', 'w') as ouf:
    ouf.write(resp.text)

In [278]:
sparql = SPARQLWrapper('https://query.wikidata.org/sparql')
sparql.method = POST

In [493]:
uris = ' '.join(['wd:' + qid for qid in good_entities[-1000:]])

In [328]:
# query = f'''
# CONSTRUCT {{
#   ?e ?p1 ?o .
#   ?s ?p2 ?e .
# }}
# WHERE {{
#   VALUES ?e {{ {uris} }} .
#   {{ ?e ?p1 ?o . }}
#   UNION
#   {{ ?s ?p2 ?e . }}
# }}
# '''

query = f'''
CONSTRUCT {{
  ?e0 ?p1 ?e1 .
  ?e1 ?p2 ?e2 .
  ?e2 wikibase:quantityAmount ?e3 .
}}
WHERE {{
  VALUES ?e0 {{ {uris} }} .
  ?e0 ?p1 ?e1 .
  ?e1 ?p2 ?e2 .
  OPTIONAL {{ ?e2 wikibase:quantityAmount ?e3 }} .
}}
'''

In [329]:
print(query)


CONSTRUCT {
  ?e0 ?p1 ?e1 .
  ?e1 ?p2 ?e2 .
  ?e2 wikibase:quantityAmount ?e3 .
}
WHERE {
  VALUES ?e0 { wd:Q1 wd:Q2 wd:Q3 wd:Q4 wd:Q5 wd:Q8 wd:Q13 wd:Q15 wd:Q16 wd:Q17 wd:Q18 wd:Q19 wd:Q20 wd:Q21 wd:Q22 wd:Q23 wd:Q24 wd:Q25 wd:Q26 wd:Q27 wd:Q28 wd:Q29 wd:Q30 wd:Q31 wd:Q32 wd:Q33 wd:Q34 wd:Q35 wd:Q36 wd:Q37 wd:Q38 wd:Q39 wd:Q40 wd:Q41 wd:Q42 wd:Q43 wd:Q44 wd:Q45 wd:Q46 wd:Q48 wd:Q49 wd:Q51 wd:Q52 wd:Q53 wd:Q54 wd:Q55 wd:Q57 wd:Q58 wd:Q59 wd:Q60 wd:Q61 wd:Q62 wd:Q64 wd:Q65 wd:Q66 wd:Q67 wd:Q68 wd:Q69 wd:Q70 wd:Q71 wd:Q72 wd:Q73 wd:Q74 wd:Q75 wd:Q76 wd:Q77 wd:Q78 wd:Q79 wd:Q80 wd:Q81 wd:Q82 wd:Q83 wd:Q84 wd:Q85 wd:Q86 wd:Q87 wd:Q88 wd:Q89 wd:Q90 wd:Q91 wd:Q94 wd:Q95 wd:Q96 wd:Q97 wd:Q98 wd:Q99 wd:Q100 wd:Q101 wd:Q103 wd:Q105 wd:Q107 wd:Q108 wd:Q109 wd:Q110 wd:Q111 wd:Q112 wd:Q113 wd:Q114 wd:Q115 wd:Q116 } .
  ?e0 ?p1 ?e1 .
  ?e1 ?p2 ?e2 .
  OPTIONAL { ?e2 wikibase:quantityAmount ?e3 } .
}



In [330]:
sparql.setQuery(query)

In [292]:
result_graph = sparql.query()

In [284]:
with open('xml_graph.xml', 'wb') as ouf:
    for sm in result_graph:
        ouf.write(sm)

In [281]:
type(result_graph)

SPARQLWrapper.Wrapper.QueryResult

In [293]:
graph = result_graph.convert()

KeyboardInterrupt: 

In [142]:
with open('graph.ttl', 'bw') as ouf:
    ouf.write(result_graph.serialize(format='turtle'))

In [104]:
g = Graph()

In [105]:
with open('simple_graph_sub.ttl', 'r') as inf:
    g.parse(inf, format='turtle')

In [317]:
g.serialize('simple_graph_sub.nt', format='nt')

In [318]:
with open('simple_graph.nt', 'r') as inf:
    sg = [line.strip() for line in inf]
with open('simple_graph_sub.nt', 'r') as inf:
    sgs = [line.strip() for line in inf]

In [321]:
sg.sort()
sgs.sort()

In [322]:
for i in range(len(sgs)):
    if sg[i] != sgs[i]:
        print(i)
        print(sg[i])
        print(sgs[i])

15231
<http://www.wikidata.org/entity/statement/Q2-6657d0b5-4aa4-b465-12ed-d1b8a04ef658> <http://www.wikidata.org/prop/statement/P576> _:fd86c57b67bd34ace991911307ac97aabb8 .
<http://www.wikidata.org/entity/statement/Q2-6657d0b5-4aa4-b465-12ed-d1b8a04ef658> <http://www.wikidata.org/prop/statement/P576> _:ff7bfe3933b72466a8785b14d83208d65b8 .
15232
<http://www.wikidata.org/entity/statement/Q2-66aa9cb5-4f00-64d4-8d20-7f7d1c89dbf1> <http://wikiba.se/ontology#rank> <http://wikiba.se/ontology#PreferredRank> .
<http://www.wikidata.org/entity/statement/Q2-6657d0b5-4aa4-b465-12ed-d1b8a04ef658> <http://www.wikidata.org/prop/statement/P576> _:ffecc0fa21e8c412babe38e9c87a502ceb8 .
15233
<http://www.wikidata.org/entity/statement/Q2-66aa9cb5-4f00-64d4-8d20-7f7d1c89dbf1> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://wikiba.se/ontology#BestRank> .
<http://www.wikidata.org/entity/statement/Q2-66aa9cb5-4f00-64d4-8d20-7f7d1c89dbf1> <http://wikiba.se/ontology#rank> <http://wikiba.se/ontology#

<http://www.wikidata.org/entity/statement/Q2-ab29c85c-4a5d-c860-92af-0e873fae5415> <http://www.wikidata.org/prop/statement/value-normalized/P2547> <http://www.wikidata.org/value/b6cb7cf5bfc4b4aab5b107da559ab1e5> .
<http://www.wikidata.org/entity/statement/Q2-ab29c85c-4a5d-c860-92af-0e873fae5415> <http://www.wikidata.org/prop/statement/P2547> "40075.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
15749
<http://www.wikidata.org/entity/statement/Q2-ab29c85c-4a5d-c860-92af-0e873fae5415> <http://www.wikidata.org/prop/statement/value/P2547> <http://www.wikidata.org/value/120f5644590b876f229f66ca0a42be6f> .
<http://www.wikidata.org/entity/statement/Q2-ab29c85c-4a5d-c860-92af-0e873fae5415> <http://www.wikidata.org/prop/statement/value-normalized/P2547> <http://www.wikidata.org/value/b6cb7cf5bfc4b4aab5b107da559ab1e5> .
15750
<http://www.wikidata.org/entity/statement/Q2-ad156c0b-4db2-71d8-1314-b7f09ffea658> <http://wikiba.se/ontology#rank> <http://wikiba.se/ontology#NormalRank> .
<http://www.wi

16141
<http://www.wikidata.org/value/e5da3c8368f4e2201d5c84bf1fea18b1> <http://wikiba.se/ontology#quantityAmount> "281000000.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
<http://www.wikidata.org/value/d0766181abe69b89c587c1c13588f99c> <http://wikiba.se/ontology#quantityAmount> "4000000000.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
16142
<http://www.wikidata.org/value/e7f73abba19ce89da4889e4fdcbb6035> <http://wikiba.se/ontology#quantityAmount> "50000000.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
<http://www.wikidata.org/value/d3953c93bc7d504a5a7ae5ed8103a2d9> <http://wikiba.se/ontology#quantityAmount> "291000000.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
16143
<http://www.wikidata.org/value/ee2ed2546412fab940f2aa5df079d544> <http://wikiba.se/ontology#quantityAmount> "336000000.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
<http://www.wikidata.org/value/db057e757cc2bf0aedbe726a339af315> <http://wikiba.se/ontology#quantityAmount> "1.00000261"^^<http://www.w3.org/2

IndexError: list index out of range

In [326]:
qr = g.query('''
SELECT ?p ?x
WHERE {
    wd:Q513 skos:altLabel ?x
}''')
for row in qr:
    print(row)

(None, rdflib.term.Literal('額菲爾士峰', lang='zh'))
(None, rdflib.term.Literal('चोमोलोङ्मा', lang='sa'))
(None, rdflib.term.Literal('Gunung Everest', lang='id'))
(None, rdflib.term.Literal('Τσομολάνγκμα (Φενγκ)', lang='el'))
(None, rdflib.term.Literal('Chomolungma', lang='fr'))
(None, rdflib.term.Literal('إفرست', lang='ar'))
(None, rdflib.term.Literal('사가르마타', lang='ko'))
(None, rdflib.term.Literal('额菲尔士峰', lang='zh-hans'))
(None, rdflib.term.Literal('聖母峰', lang='zh'))
(None, rdflib.term.Literal('Sagarmatha', lang='de'))
(None, rdflib.term.Literal('Սագարմաթա', lang='hy'))
(None, rdflib.term.Literal('ఎవరెస్టు', lang='te'))
(None, rdflib.term.Literal('Monte Everest', lang='it'))
(None, rdflib.term.Literal('Сагарматха', lang='uk'))
(None, rdflib.term.Literal('Sagarmatha', lang='id'))
(None, rdflib.term.Literal('Mt. Everest', lang='de'))
(None, rdflib.term.Literal('देवगिरि शिखरम', lang='sa'))
(None, rdflib.term.Literal('monte Everest', lang='gl'))
(None, rdflib.term.Literal('초모랑마', lang='ko'))

In [294]:
query = '''
CONSTRUCT {
  ?e0 ?p1 ?e1 .
  ?e1 ?p2 ?e2 .
  ?e2 wikibase:quantityAmount ?e3 .
}
WHERE {
  VALUES ?e0 { wd:Q2  } .
  ?e0 ?p1 ?e1 .
  ?e1 ?p2 ?e2 .
  OPTIONAL { ?e2 wikibase:quantityAmount ?e3 } .
}'''

In [331]:
resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})

200

In [334]:
with open('graph_100.ttl', 'w') as ouf:
    ouf.write(resp.text)

In [100]:
qq = 'Q52284968'

In [101]:
qq in good_entities

False

In [102]:
qq in bad_entities

False

In [None]:
Q1459443
Q428808

In [218]:
for i in tqdm(range(1681)):
    begin = i * 1000
    end = (i + 1) * 1000
    if 'Q428808' in good_entities[begin:end]:
        print(i)
        break

HBox(children=(IntProgress(value=0, max=1681), HTML(value='')))

In [201]:
interest_uris = json_read('interest_uris.json')

In [207]:
interest_pchs = []
for uri in interest_uris:
    qid = uri.split('/')[-1]
    good = False
    for i in range(1681):
        begin = i * 1000
        end = (i + 1) * 1000
        if qid in good_entities[begin:end]:
            interest_pchs.append(i)
            good = True
            break
    if not good:
        print(f'not good: {uri}')
interest_pchs = list(set(interest_pchs))

not good: http://www.wikidata.org/entity/Q15642621


In [212]:
for uri in ["http://www.wikidata.org/entity/Q9960",
    "http://www.wikidata.org/entity/Q170576", 
    "http://www.wikidata.org/entity/Q201985", 
    "http://www.wikidata.org/entity/Q565692",
    "http://www.wikidata.org/entity/Q79037",
    "http://www.wikidata.org/entity/Q197736",
    "http://www.wikidata.org/entity/Q25322",
    "http://www.wikidata.org/entity/Q1490",
    "http://www.wikidata.org/entity/Q57965"]:
    qid = uri.split('/')[-1]
    good = False
    for i in range(1681):
        begin = i * 1000
        end = (i + 1) * 1000
        if qid in good_entities[begin:end]:
            if i not in interest_pchs:
                print(i, ',', end=' ')
            good = True
            break
    if not good:
        print(f'not good: {uri}')

8 , 104 , 120 , 269 , 55 , 118 , 19 , 41 , 

In [209]:
json_dump(interest_pchs, 'interest_pchs.json')

In [None]:
for i in tqdm(range(168)):
    begin = i * 1000
    end = (i + 1) * 1000
    if 'Q1027891' in good_entities[begin:end]:
        print(i)
        break

In [41]:
query = f'''
    CONSTRUCT {{
      ?x wdt:P279 ?y .
    }}
    WHERE {{
      ?x wdt:P279 ?y .
    }}
'''
resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})

In [42]:
with open(f'/Volumes/Seagate/wikidata_dump/wdt/_.ttl', 'w') as ouf:
    ouf.write(resp.text)

In [72]:
missed = []
for i in range(2435):
    if os.stat(f'/Volumes/Seagate/wikidata_dump/wdt_ext/{i}.ttl').st_size == 0:
        missed.append(i)

In [73]:
print(missed)

[1448, 1450, 1452]


In [69]:
missed_upd = []
for i in range(1680):
    if os.stat(f'/Volumes/Seagate/wikidata_dump/wdt/{i}.ttl').st_size == 0:
        missed_upd.append(i)
print(missed_upd)

[]


In [76]:
for i in tqdm(range(2410, 2411)):
    begin = i * 1000
    end = (i + 1) * 1000
    uris = ' '.join(['wd:' + qid for qid in bad_entities[begin:end]])
    query = f'''
    CONSTRUCT {{
      ?e0 ?p1 ?e1 .
    }}
    WHERE {{
      VALUES ?e0 {{ {uris} }} .
      ?e0 ?p1 ?e1 .
      FILTER strstarts(str(?p1), str(wdt:)) .
    }}
    '''
    resp = requests.post('https://query.wikidata.org/sparql', data={'query': query}, headers={'Accept': 'text/turtle'})
    with open(f'/Volumes/Seagate/wikidata_dump/wdt_ext/{i}.ttl', 'w') as ouf:
        ouf.write(resp.text)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

In [136]:
os.stat(f'/Volumes/Seagate/wikidata_dump/lbl/747.ttl').st_size

1944