In [None]:
%config IPCompleter.greedy=True
from tqdm import tqdm_notebook as tqdm
import requests
import json
import os
from time import sleep

In [None]:
def json_read(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is decoded explicitly as UTF-8 (the encoding JSON interchange
    uses) rather than with the platform's locale encoding, so non-ASCII
    content is read the same way on every machine.

    Args:
        filename: path of the JSON file to read.

    Returns:
        The deserialized Python object (as produced by ``json.load``).
    """
    with open(filename, 'r', encoding='utf-8') as inf:
        return json.load(inf)

def json_dump(obj, filename, ea=False, indent=4):
    """Serialize ``obj`` as JSON into ``filename`` (overwriting it).

    The file is always written as UTF-8. With the default ``ea=False`` the
    JSON text contains raw non-ASCII characters, which the platform's locale
    encoding may be unable to represent.

    Args:
        obj: JSON-serializable object.
        filename: destination path.
        ea: forwarded to ``json.dump`` as ``ensure_ascii``.
        indent: forwarded to ``json.dump`` as ``indent``.
    """
    with open(filename, 'w', encoding='utf-8') as ouf:
        json.dump(obj, ouf, ensure_ascii=ea, indent=indent)

### Assign dump path

Choose the directory where the dump should be stored.

In [None]:
# Directory that receives the per-batch query results and the merged dumps.
# Set the WIKIDATA_DUMP_PATH environment variable to relocate it without
# editing the notebook; otherwise the original default location is used.
dump_path = os.environ.get('WIKIDATA_DUMP_PATH', '/home/user/wikidata_dump')

### Load all entities' QIDs

In [None]:
# List of entity QIDs to dump; it is sliced into batches by the query helpers.
all_entities = json_read(filename='all_entities.json')

In [None]:
# Number of 1000-entity batches; the merge cells iterate over this many files.
n_batches = 1 + len(all_entities) // 1000

### Batched CONSTRUCT queries

In [None]:
def save_construct_query(idx, query_builder, path, batch_size=1000):
    """Run one batched CONSTRUCT query and store the Turtle response.

    Args:
        idx: zero-based batch number. Selects
            ``all_entities[idx * batch_size:(idx + 1) * batch_size]`` and
            names the output file ``{path}/{idx}.ttl``.
        query_builder: callable that receives the space-separated ``wd:``
            URIs of the batch and returns the SPARQL query text.
        path: directory receiving the ``.ttl`` file; created if missing.
        batch_size: number of entities per batch. Defaults to 1000, the
            default used by the batch drivers in this notebook.

    The response body is written as-is, without checking the HTTP status;
    ``check_empties`` / ``repeat_construct_for_batches`` detect bad batches
    by their file size afterwards.
    """
    # Make sure the target directory exists before spending a request on it.
    os.makedirs(path, exist_ok=True)

    sleep(1)  # pause before every request (presumably to spare the endpoint)

    # The slice is derived from this call's own batch number and size, not
    # from names that happen to exist in the notebook's global namespace.
    begin = idx * batch_size
    end = begin + batch_size
    uris = ' '.join('wd:' + qid for qid in all_entities[begin:end])

    query = query_builder(uris)
    resp = requests.post(
        'https://query.wikidata.org/sparql',
        data={'query': query},
        headers={'Accept': 'text/turtle'},
    )

    # NOTE(review): the file is opened with the platform default encoding,
    # like the merge cells that read it back; switch all of them to
    # encoding='utf-8' together if this must run on a non-UTF-8 locale.
    with open(f'{path}/{idx}.ttl', 'w') as ouf:
        ouf.write(resp.text)

In [None]:
def get_construct_for_batches(query_builder, path, batch_size=1000):
    """Request and save every batch of ``all_entities``.

    Args:
        query_builder: callable turning a batch's URI string into a query.
        path: directory where the per-batch ``.ttl`` files are written.
        batch_size: entities per batch, used here to compute the batch count.

    NOTE(review): ``batch_size`` is not forwarded to ``save_construct_query``;
    only the number of batches depends on it.
    """
    total = len(all_entities) // batch_size + 1
    progress = tqdm(range(total))
    for batch_no in progress:
        save_construct_query(batch_no, query_builder, path)

In [None]:
def check_empties(path, batch_size=1000):
    """Print how many batch files under ``path`` are smaller than 2000 bytes.

    Such small files are treated as 'empty' endpoint responses that need to
    be requested again.

    Args:
        path: directory holding the ``{batch}.ttl`` files.
        batch_size: entities per batch, used to compute the batch count.
    """
    total = len(all_entities) // batch_size + 1
    suspicious = sum(
        1
        for batch_no in tqdm(range(total))
        if os.stat(f'{path}/{batch_no}.ttl').st_size < 2000
    )
    print(suspicious)

In [None]:
def repeat_construct_for_batches(query_builder, path, batch_size=1000):
    """Re-request every batch whose saved file is smaller than 2000 bytes.

    Args:
        query_builder: callable turning a batch's URI string into a query.
        path: directory holding the ``{batch}.ttl`` files.
        batch_size: entities per batch, used to compute the batch count.

    NOTE(review): ``batch_size`` is not forwarded to ``save_construct_query``.
    """
    total = len(all_entities) // batch_size + 1
    for batch_no in tqdm(range(total)):
        looks_empty = os.stat(f'{path}/{batch_no}.ttl').st_size < 2000
        if not looks_empty:
            continue
        save_construct_query(batch_no, query_builder, path)

### Get labels for all entities

In [None]:
def query_builder_label(uris):
    """Build the CONSTRUCT query fetching ``rdfs:label`` triples.

    Args:
        uris: space-separated entity URIs placed in the ``VALUES`` clause.

    Returns:
        SPARQL text selecting the Russian and English labels of the entities.
    """
    template = '''
    CONSTRUCT {{
      ?e0 rdfs:label ?label .
    }}
    WHERE {{
      VALUES ?e0 {{ {uris} }} .
      ?e0 rdfs:label ?label .
      FILTER (lang(?label) = "ru" || lang(?label) = "en") .
    }}
    '''
    return template.format(uris=uris)

In [None]:
# Download the label batches into the ``lbl`` sub-directory of the dump.
get_construct_for_batches(query_builder=query_builder_label, path=f'{dump_path}/lbl')

The Wikidata endpoint occasionally returns an 'empty' response, so we should check that all the batches are valid.

In [None]:
# Count the label batches that came back suspiciously small.
check_empties(path=f'{dump_path}/lbl')

If not, request those batches once again.

In [None]:
# Re-request only the label batches whose files are too small.
repeat_construct_for_batches(query_builder=query_builder_label, path=f'{dump_path}/lbl')

### Get aliases for all entities

In [None]:
def query_builder_alias(uris):
    """Build the CONSTRUCT query fetching ``skos:altLabel`` triples.

    Args:
        uris: space-separated entity URIs placed in the ``VALUES`` clause.

    Returns:
        SPARQL text selecting the Russian and English aliases of the entities.
    """
    template = '''
    CONSTRUCT {{
      ?e0 skos:altLabel ?label .
    }}
    WHERE {{
      VALUES ?e0 {{ {uris} }} .
      ?e0 skos:altLabel ?label .
      FILTER (lang(?label) = "ru" || lang(?label) = "en") .
    }}
    '''
    return template.format(uris=uris)

In [None]:
# Download the alias batches into the ``als`` sub-directory of the dump.
get_construct_for_batches(query_builder=query_builder_alias, path=f'{dump_path}/als')

In [None]:
# Count the alias batches that came back suspiciously small.
check_empties(path=f'{dump_path}/als')

In [None]:
# Re-request only the alias batches whose files are too small.
repeat_construct_for_batches(query_builder=query_builder_alias, path=f'{dump_path}/als')

### Get truthy statements

In [None]:
def query_builder_truthy(uris):
    """Build the CONSTRUCT query fetching the entities' ``wdt:`` triples.

    Args:
        uris: space-separated entity URIs placed in the ``VALUES`` clause.

    Returns:
        SPARQL text keeping every triple of the entities whose predicate IRI
        starts with the ``wdt:`` namespace.
    """
    query_lines = (
        '',
        '    CONSTRUCT {',
        '      ?e0 ?p1 ?e1 .',
        '    }',
        '    WHERE {',
        f'      VALUES ?e0 {{ {uris} }} .',
        '      ?e0 ?p1 ?e1 .',
        '      FILTER strstarts(str(?p1), str(wdt:)) .',
        '    }',
        '    ',
    )
    return '\n'.join(query_lines)

In [None]:
# Download the truthy-statement batches into the ``wdt`` sub-directory.
get_construct_for_batches(query_builder=query_builder_truthy, path=f'{dump_path}/wdt')

In [None]:
# Count the truthy-statement batches that came back suspiciously small.
check_empties(path=f'{dump_path}/wdt')

In [None]:
# Re-request only the truthy-statement batches whose files are too small.
repeat_construct_for_batches(query_builder=query_builder_truthy, path=f'{dump_path}/wdt')

### Get full statements

In [None]:
def query_builder_full(uris):
    """Build the CONSTRUCT query fetching the entities' full statements.

    The query follows ``?x ?p ?y`` for predicates starting with
    ``http://www.wikidata.org/prop/P``, then ``?y ?p2 ?z`` for predicates in
    the ``p:`` namespace, and optionally the ``wikibase:quantityAmount`` of
    ``?z``.

    Args:
        uris: space-separated entity URIs placed in the ``VALUES`` clause.

    Returns:
        The SPARQL query text.
    """
    query_lines = (
        '',
        '    CONSTRUCT {',
        '      ?x ?p ?y .',
        '      ?y ?p2 ?z .',
        '      ?z wikibase:quantityAmount ?t .',
        '    }',
        '    WHERE {',
        f'      VALUES ?x {{ {uris} }}',
        '      ?x ?p ?y .',
        '      FILTER strstarts(str(?p), "http://www.wikidata.org/prop/P") .',
        '      ?y ?p2 ?z .',
        '      FILTER strstarts(str(?p2), str(p:)) .',
        '      OPTIONAL { ?z wikibase:quantityAmount ?t } .',
        '    }',
        '    ',
    )
    return '\n'.join(query_lines)

In [None]:
# Download the full-statement batches into the ``pch`` sub-directory.
get_construct_for_batches(query_builder=query_builder_full, path=f'{dump_path}/pch')

In [None]:
# Count the full-statement batches that came back suspiciously small.
check_empties(path=f'{dump_path}/pch')

In [None]:
# Re-request only the full-statement batches whose files are too small.
repeat_construct_for_batches(query_builder=query_builder_full, path=f'{dump_path}/pch')

### Merge labels and aliases

In [None]:
# Create (or empty) both output files of the label/alias merge.
for fresh_file in (f'{dump_path}/lbl_als_wop.ttl', f'{dump_path}/names.ttl'):
    open(fresh_file, 'w').close()

# Collects the distinct ``@prefix`` lines met while merging the batches.
prefixes = set()

In [None]:
# Concatenate the label and alias batches into one prefix-free Turtle body.
# Every ``@prefix`` line is moved into ``prefixes`` so that each declaration
# is emitted only once in the final file.
#
# The batches are read from ``dump_path`` (where the download cells wrote
# them); the name ``rdf_path`` used previously is not defined anywhere in the
# notebook. The body file is opened in 'w' mode so re-running this cell does
# not append a second copy of every triple.
with open(f'{dump_path}/lbl_als_wop.ttl', 'w') as ouf:
    for i in tqdm(range(n_batches)):
        # Labels first, then aliases, for each batch.
        for kind in ('lbl', 'als'):
            with open(f'{dump_path}/{kind}/{i}.ttl', 'r') as inf:
                for line in inf:
                    if line.startswith('@prefix'):
                        prefixes.add(line)
                    else:
                        ouf.write(line)

In [None]:
# Assemble the final names dump: prefix declarations, a blank line, then the
# prefix-free body. The prefixes are written in sorted order because a set of
# strings may iterate in a different order on every interpreter run, which
# would make the output file differ between otherwise identical runs.
with open(f'{dump_path}/names.ttl', 'w') as ouf, open(f'{dump_path}/lbl_als_wop.ttl', 'r') as inf:
    ouf.writelines(sorted(prefixes))
    ouf.write('\n')
    for line in inf:
        ouf.write(line)

### Merge truthy

In [None]:
# Create (or empty) both output files of the truthy-statement merge.
for fresh_file in (f'{dump_path}/wdt_wop.ttl', f'{dump_path}/wdt_all.ttl'):
    open(fresh_file, 'w').close()

# Collects the distinct ``@prefix`` lines met while merging the batches.
prefixes = set()

In [None]:
# Concatenate the truthy-statement batches into one prefix-free Turtle body,
# moving every ``@prefix`` line into ``prefixes``.
# The body file is opened in 'w' mode (instead of append) so that re-running
# this cell rebuilds the file rather than adding a second copy of each triple.
with open(f'{dump_path}/wdt_wop.ttl', 'w') as ouf:
    for i in tqdm(range(n_batches)):
        with open(f'{dump_path}/wdt/{i}.ttl', 'r') as inf:
            for line in inf:
                if line.startswith('@prefix'):
                    prefixes.add(line)
                else:
                    ouf.write(line)

In [None]:
# Assemble the final truthy dump: prefix declarations, a blank line, then the
# prefix-free body. The prefixes are written in sorted order because a set of
# strings may iterate in a different order on every interpreter run, which
# would make the output file differ between otherwise identical runs.
with open(f'{dump_path}/wdt_all.ttl', 'w') as ouf, open(f'{dump_path}/wdt_wop.ttl', 'r') as inf:
    ouf.writelines(sorted(prefixes))
    ouf.write('\n')
    for line in inf:
        ouf.write(line)

### Merge full

In [None]:
# Batch-index boundaries of the output parts: part k covers the batches from
# _part_bounds[k] (inclusive) to _part_bounds[k + 1] (exclusive).
_part_bounds = [0, 200, 500, 1000, 1700, 2400, 3200, 4115]

# (part_id, begin, end) triples consumed by the merge loop.
intervals = [
    (part_id, begin, end)
    for part_id, (begin, end) in enumerate(zip(_part_bounds, _part_bounds[1:]))
]

In [None]:
# Merge the full-statement batches into several Turtle files, one per entry of
# ``intervals``: batches ``begin`` .. ``end - 1`` end up in
# ``pch_{part_id}.ttl``.
for part_id, begin, end in intervals:
    print(part_id)

    # Distinct ``@prefix`` lines of this part only.
    prefixes = set()

    # Pass 1: write the prefix-free body of the part. Opening with 'w'
    # creates or truncates the file, so no separate truncation step is needed.
    with open(f'{dump_path}/pch_{part_id}_wop.ttl', 'w') as ouf:
        for i in tqdm(range(begin, end)):
            with open(f'{dump_path}/pch/{i}.ttl', 'r') as inf:
                for line in inf:
                    if line.startswith('@prefix'):
                        prefixes.add(line)
                    else:
                        ouf.write(line)

    # Pass 2: prefix declarations, a blank line, then the body. The prefixes
    # are sorted because a set of strings may iterate in a different order on
    # every interpreter run, which would make the output irreproducible.
    with open(f'{dump_path}/pch_{part_id}.ttl', 'w') as ouf, \
            open(f'{dump_path}/pch_{part_id}_wop.ttl', 'r') as inf:
        ouf.writelines(sorted(prefixes))
        ouf.write('\n')
        for line in inf:
            ouf.write(line)