In [1]:
import bz2
import json
import os
import sys
import traceback
from pymongo import MongoClient
from tqdm import tqdm
from datetime import datetime
from requests import get


In [9]:

def create_indexes(db):
    # Specify the collections and their respective fields to be indexed
    index_specs = {
        'cache': ['cell', 'lastAccessed'],  # Example: Indexing 'cell' and 'type' fields in 'cache' collection
        'items': ['id_entity', 'entity', 'category', 'popularity'],
        'literals': ['id_entity', 'entity'],
        'mappings': ['curid', 'wikipedia_id', 'wikidata_id', 'dbpedia_id'],
        'objects': ['id_entity', 'entity'],
        'types': ['id_entity', 'entity']
    }

    for collection, fields in index_specs.items():
        if collection == "cache":
            db[collection].create_index([('cell', 1), ('fuzzy', 1), ('type', 1), ('kg', 1), ('limit', 1)], unique=True)
        elif collection == "items":
            db[collection].create_index([('entity', 1), ('category', 1)], unique=True)    
        for field in fields:
            db[collection].create_index([(field, 1)])  # 1 for ascending order


# Initial Estimation
initial_estimated_average_size = 800  # Initial average size in bytes, can be adjusted
BATCH_SIZE = 100 # Number of entities to insert in a single batch

if len(sys.argv) < 2:
    print("Usage: python script_name.py <path_to_wikidata_dump>")
    sys.exit(1)

file_path = './my-data/latest-all.json.bz2'  # Get the file path from command line argument
compressed_file_size = os.path.getsize(file_path)
initial_total_lines_estimate = compressed_file_size / initial_estimated_average_size

file = bz2.BZ2File(file_path, "r")

# MongoDB connection setup
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].split(":")
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
MONGO_ENDPOINT_USERNAME = os.environ["MONGO_INITDB_ROOT_USERNAME"]
MONGO_ENDPOINT_PASSWORD = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
current_date = datetime.now()
formatted_date = current_date.strftime("%d%m%Y")
DB_NAME = f"wikidata{formatted_date}"

client = MongoClient(MONGO_ENDPOINT, MONGO_ENDPOINT_PORT, username=MONGO_ENDPOINT_USERNAME, password=MONGO_ENDPOINT_PASSWORD)
log_c = client.wikidata.log
items_c = client[DB_NAME].items
objects_c = client[DB_NAME].objects
literals_c = client[DB_NAME].literals
types_c = client[DB_NAME].types

c_ref = {
    "items": items_c,
    "objects":objects_c, 
    "literals":literals_c, 
    "types":types_c
}

create_indexes(client[DB_NAME])

buffer = {
    "items": [],
    "objects": [], 
    "literals": [], 
    "types": []
}

DATATYPES_MAPPINGS = {
    'external-id':'STRING',
    'quantity': 'NUMBER',
    'globe-coordinate': 'STRING',
    'string': 'STRING',
    'monolingualtext': 'STRING',
    'commonsMedia': 'STRING',
    'time': 'DATETIME',
    'url': 'STRING',
    'geo-shape': 'GEOSHAPE',
    'math': 'MATH',
    'musical-notation': 'MUSICAL_NOTATION',
    'tabular-data': 'TABULAR_DATA'
}
DATATYPES = list(set(DATATYPES_MAPPINGS.values()))
total_size_processed = 0
num_entities_processed = 0



def update_average_size(new_size):
    global total_size_processed, num_entities_processed
    total_size_processed += new_size
    num_entities_processed += 1
    return total_size_processed / num_entities_processed


def check_skip(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if "datavalue" not in temp:
        return True

    skip = {
        "wikibase-lexeme",
        "wikibase-form",
        "wikibase-sense"
    }
    
    return datatype in skip


def get_value(obj, datatype):
    temp = obj.get("mainsnak", obj)
    if datatype == "globe-coordinate":
        latitude = temp["datavalue"]["value"]["latitude"]
        longitude = temp["datavalue"]["value"]["longitude"]
        value = f"{latitude},{longitude}"
    else:
        keys = {
            "quantity": "amount",
            "monolingualtext": "text",
            "time": "time",
        }
        if datatype in keys:
            key = keys[datatype]
            value = temp["datavalue"]["value"][key]
        else:
            value = temp["datavalue"]["value"]
    return value


def flush_buffer(buffer):
    for key in buffer:
        if len(buffer[key]) > 0:
            c_ref[key].insert_many(buffer[key])
            buffer[key] = []
            
def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids
            
def parse_data(item, i, geolocation_subclass, organization_subclass):
    entity = item["id"]
    labels = item.get("labels", {})
    aliases = item.get("aliases", {})
    description = item.get('descriptions', {}).get('en', {})
    category = "entity"
    sitelinks = item.get("sitelinks", {})
    popularity = len(sitelinks) if len(sitelinks) > 0 else 1
    
    all_labels = {}
    for lang in labels:
        all_labels[lang] = labels[lang]["value"]

    all_aliases = {}
    for lang in aliases:
        all_aliases[lang] = []
        for alias in aliases[lang]:
            all_aliases[lang].append(alias["value"])
        all_aliases[lang] = list(set(all_aliases[lang]))

    found = False
    for predicate in item["claims"]:
        if predicate == "P279":
            found = True

    if found:
        category = "type"
    if entity[0] == "P":
        category = "predicate"

    ###############################################################
    # ORGANIZATION EXTRACTION
    # All items with the root class Organization (Q43229) excluding country (Q6256), city (Q515), capitals (Q5119), 
    # administrative territorial entity of a single country (Q15916867), venue (Q17350442), sports league (Q623109) 
    # and family (Q8436)
    
    # LOCATION EXTRACTION
    # All items with the root class Geographic Location (Q2221906) excluding: food (Q2095), educational institution (Q2385804), 
    # government agency (Q327333), international organization (Q484652) and time zone (Q12143)
    
    # PERSON EXTRACTION
    # All items with the statement is instance of (P31) human (Q5) are classiﬁed as person.

    NERtype = None

    if item.get("type") == "item" and "claims" in item:
        p31_claims = item["claims"].get("P31", [])
        
        if len(p31_claims) != 0:           
            for claim in p31_claims:
                mainsnak = claim.get("mainsnak", {})
                datavalue = mainsnak.get("datavalue", {})
                numeric_id = datavalue.get("value", {}).get("numeric-id")
                
                if numeric_id in organization_subclass:
                    NERtype = "ORG"    
                elif numeric_id == 5:
                    NERtype = "PERS"                    
                elif numeric_id in geolocation_subclass:
                    NERtype = "LOC"                    
                else:
                    NERtype = "OTHERS"
            if NERtype == "ORG":
                print(f"Item: {labels.get('en', {}).get('value', '')}, NERtype: {NERtype}, id: {item['id']}")
        else:
            NERtype = "OTHERS"
    
    ################################################################

    ################################################################   
    # URL EXTRACTION

    try:
        lang = labels.get("en", {}).get("language", "")
        tmp={}
        tmp["WD_id"] = item['id']
        tmp["WP_id"] = labels.get("en", {}).get("value", "")

        url_dict={}
        url_dict["WD_id_URL"] = "http://www.wikidata.org/wiki/"+tmp["WD_id"]
        url_dict["WP_id_URL"] = "http://"+lang+".wikipedia.org/wiki/"+tmp["WP_id"].replace(" ","_")
        url_dict["dbpedia_URL"] = "http://dbpedia.org/resource/"+tmp["WP_id"].capitalize().replace(" ","_")
        

    except json.decoder.JSONDecodeError:
       pass
    
    ################################################################    

    

    objects = {}
    literals = {datatype: {} for datatype in DATATYPES}
    types = {"P31": []}
    join = {
        "items": {
            "id_entity": i,
            "entity": entity,
            "description": description,
            "labels": all_labels,
            "aliases": all_aliases,
            "types": types,
            "popularity": popularity,
            "category": category,   # kind (entity, type or predicate)
            ######################
            # new updates
            "NERtype": NERtype, # (ORG, LOC, PER or OTHERS)
            "URLs" : url_dict
            ######################
        },
        "objects": { 
            "id_entity": i,
            "entity": entity,
            "objects":objects
        },
        "literals": { 
            "id_entity": i,
            "entity": entity,
            "literals": literals
        },
        "types": { 
            "id_entity": i,
            "entity": entity,
            "types": types
        },
    }

    predicates = item["claims"]
    for predicate in predicates:
        for obj in predicates[predicate]:
            datatype = obj["mainsnak"]["datatype"]

            if check_skip(obj, datatype):
                continue

            if datatype == "wikibase-item" or datatype == "wikibase-property":
                value = obj["mainsnak"]["datavalue"]["value"]["id"]

                if predicate == "P31" or predicate == "P106":
                    types["P31"].append(value)

                if value not in objects:
                    objects[value] = []
                objects[value].append(predicate)    
            else:
                value = get_value(obj, datatype)                
                lit = literals[DATATYPES_MAPPINGS[datatype]]

                if predicate not in lit:
                    lit[predicate] = []
                lit[predicate].append(value)   

     

    for key in buffer:
        buffer[key].append(join[key])            

    if len(buffer["items"]) == BATCH_SIZE:
        flush_buffer(buffer)


def parse_wikidata_dump():            
    global initial_total_lines_estimate

    try:
        geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
        food_subclass =  get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
        edInst_subclass =  get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
        govAgency_subclass =  get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
        intOrg_subclass =  get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
        timeZone_subclass =  get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])    
        geolocation_subclass = list(set(geolocation_subclass)-set(food_subclass)-set(edInst_subclass)-set(govAgency_subclass)-
                                set(intOrg_subclass)-set(timeZone_subclass))
        
        organization_subclass=get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])    
        country_subclass =  get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])    
        city_subclass =  get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])    
        capitals_subclass =  get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
        admTerr_subclass =  get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
        family_subclass =  get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
        sportLeague_subclass =  get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
        venue_subclass =  get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
        organization_subclass = list(set(organization_subclass)-set(country_subclass)-set(city_subclass)-
                                set(capitals_subclass)-set(admTerr_subclass)-set(family_subclass) -
                                set(sportLeague_subclass)-set(venue_subclass))
        
    except json.decoder.JSONDecodeError:
        pass

    pbar = tqdm(total=initial_total_lines_estimate)
    for i, line in enumerate(file):
        try:
            item = json.loads(line[:-2])  # Remove the trailing characters
            line_size = len(line)
            current_average_size = update_average_size(line_size)

            # Dynamically update the total based on the current average size
            pbar.total = round(compressed_file_size / current_average_size)
            pbar.update(1)

            parse_data(item, i, geolocation_subclass, organization_subclass)
        except json.decoder.JSONDecodeError:
            continue
        except Exception as e:
            traceback_str = traceback.format_exc()
            log_c.insert_one({"entity": item["id"], "error": str(e), "traceback_str": traceback_str})

    if len(buffer["items"]) > 0:
        flush_buffer(buffer)

    pbar.close()

parse_wikidata_dump()
final_average_size = total_size_processed / num_entities_processed
print(f"Final average size of an entity: {final_average_size} bytes")
# Optionally store this value for future use



  0%|          | 344/839275 [02:48<113:52:15,  2.05it/s]

  0%|          | 0/109787984.95125 [00:00<?, ?it/s][A
  0%|          | 5/332984 [00:00<1:53:28, 48.91it/s][A
  0%|          | 15/484062 [00:00<1:51:14, 72.52it/s][A

Item: Belgium, NERtype: ORG, id: Q31
Item: Portugal, NERtype: ORG, id: Q45



  0%|          | 23/424773 [00:00<2:11:36, 53.79it/s][A

Item: People's Republic of China, NERtype: ORG, id: Q148
Item: Brazil, NERtype: ORG, id: Q155
Item: Germany, NERtype: ORG, id: Q183



  0%|          | 29/346270 [00:00<1:58:39, 48.63it/s][A
  0%|          | 38/371036 [00:00<1:43:28, 59.75it/s][A

Item: Talisker distillery, NERtype: ORG, id: Q278
Item: Chile, NERtype: ORG, id: Q298
Item: Gmina Kurów, NERtype: ORG, id: Q433
Item: Rhône-Alpes, NERtype: ORG, id: Q463



  0%|          | 47/430199 [00:00<1:49:32, 65.45it/s][A
  0%|          | 58/472425 [00:00<1:41:49, 77.32it/s][A


Item: Museum of Fine Arts of Lyon, NERtype: ORG, id: Q511
Item: Bonn, NERtype: ORG, id: Q586


  0%|          | 69/497598 [00:00<1:36:12, 86.19it/s][A
  0%|          | 79/528406 [00:01<1:40:06, 87.96it/s][A

Item: Vanuatu, NERtype: ORG, id: Q686
Item: South Holland, NERtype: ORG, id: Q694
Item: Massachusetts, NERtype: ORG, id: Q771
Item: Israel, NERtype: ORG, id: Q801



  0%|          | 89/493314 [00:01<2:05:44, 65.38it/s][A
  0%|          | 99/498367 [00:01<1:57:45, 70.53it/s][A
  0%|          | 114/536376 [00:01<1:41:14, 88.28it/s][A

Item: Nizhny Novgorod, NERtype: ORG, id: Q891
Item: Reggiolo, NERtype: ORG, id: Q952
Item: Warburg, NERtype: ORG, id: Q968



  0%|          | 124/536486 [00:01<1:43:40, 86.22it/s][A

Item: Sudan, NERtype: ORG, id: Q1049
Item: Limburg, NERtype: ORG, id: Q1093
Item: Groß Borstel, NERtype: ORG, id: Q1172
Item: Hoheluft-Ost, NERtype: ORG, id: Q1181



  0%|          | 134/548197 [00:01<1:59:12, 76.63it/s][A
  0%|          | 145/562566 [00:01<1:51:41, 83.93it/s][A

Item: Illinois, NERtype: ORG, id: Q1204
Item: Uetersen, NERtype: ORG, id: Q1404
Item: Eschwege, NERtype: ORG, id: Q1468



  0%|          | 159/586214 [00:02<1:40:10, 97.50it/s][A
  0%|          | 179/617785 [00:02<1:27:47, 117.24it/s][A

Item: Mississippi, NERtype: ORG, id: Q1494
Item: Cublize, NERtype: ORG, id: Q1504
Item: New Mexico, NERtype: ORG, id: Q1522
Item: Finkenwerder, NERtype: ORG, id: Q1562
Item: Billstedt, NERtype: ORG, id: Q1565
Item: Blankenese, NERtype: ORG, id: Q1607
Item: Eimsbüttel, NERtype: ORG, id: Q1611
Item: Bergstedt, NERtype: ORG, id: Q1641
Item: Wiesbaden, NERtype: ORG, id: Q1721



  0%|          | 195/634762 [00:02<1:22:59, 127.43it/s][A
  0%|          | 215/668333 [00:02<1:16:02, 146.43it/s][A
  0%|          | 238/703945 [00:02<1:10:49, 165.59it/s][A

Item: Eilbek, NERtype: ORG, id: Q1807
Item: Poppenbüttel, NERtype: ORG, id: Q1818
Item: Rahlstedt, NERtype: ORG, id: Q1820
Item: Wohldorf-Ohlstedt, NERtype: ORG, id: Q1837
Item: Givors, NERtype: ORG, id: Q1976
Item: Rivolet, NERtype: ORG, id: Q1977
Item: Newfoundland and Labrador, NERtype: ORG, id: Q2003
Item: Chambost-Allières, NERtype: ORG, id: Q2026
Item: ACF Fiorentina, NERtype: ORG, id: Q2052
Item: Duisburg, NERtype: ORG, id: Q2100



  0%|          | 255/726281 [00:02<1:31:58, 131.56it/s][A

Item: Mannheim, NERtype: ORG, id: Q2119
Item: Government of the Soviet Union, NERtype: ORG, id: Q2148
Item: Alto Hospicio, NERtype: ORG, id: Q2217
Item: Nissedal, NERtype: ORG, id: Q2229
Item: Langenzenn, NERtype: ORG, id: Q2230
Item: Seljord, NERtype: ORG, id: Q2236
Item: Tokke Municipality, NERtype: ORG, id: Q2241



  0%|          | 276/752581 [00:02<1:23:42, 149.80it/s][A
  0%|          | 294/772357 [00:02<1:22:58, 155.09it/s][A

Item: Skien, NERtype: ORG, id: Q2272
Item: Altengamme, NERtype: ORG, id: Q2380
Item: Reitbrook, NERtype: ORG, id: Q2418
Item: Palermo F.C., NERtype: ORG, id: Q2674



  0%|          | 317/802308 [00:02<1:16:59, 173.62it/s][A
  0%|          | 339/832901 [00:03<1:14:43, 185.70it/s][A

Item: ACR Siena 1904, NERtype: ORG, id: Q2756
Item: Telgte, NERtype: ORG, id: Q2789
Item: Everswinkel, NERtype: ORG, id: Q2809
Item: Nordwestmecklenburg District, NERtype: ORG, id: Q2876
Item: Mecklenburgische Seenplatte District, NERtype: ORG, id: Q2902
Item: Dithmarschen, NERtype: ORG, id: Q2947
Item: Plön District, NERtype: ORG, id: Q2970
Item: Steinburg, NERtype: ORG, id: Q3011
Item: Aisne, NERtype: ORG, id: Q3093



  0%|          | 359/859282 [00:03<1:16:11, 187.87it/s][A
  0%|          | 387/899137 [00:03<1:13:43, 203.20it/s][A

Item: Covões, NERtype: ORG, id: Q3213
Item: Bouches-du-Rhône, NERtype: ORG, id: Q3240
Item: Lourdes, NERtype: ORG, id: Q3327
Item: 2nd arrondissement of Lyon, NERtype: ORG, id: Q3344
Item: Eure-et-Loir, NERtype: ORG, id: Q3377
Item: Sinstorf, NERtype: ORG, id: Q3534
Item: Potenza, NERtype: ORG, id: Q3543
Item: Šibenik, NERtype: ORG, id: Q3549
Item: Monteferrante, NERtype: ORG, id: Q3563
Item: Velezzo Lomellina, NERtype: ORG, id: Q3592
Item: Chazelles, NERtype: ORG, id: Q3595
Item: Calvignano, NERtype: ORG, id: Q3601
Item: Anceriz, NERtype: ORG, id: Q3679
Item: Cauquenes, NERtype: ORG, id: Q3749
Item: Gera, NERtype: ORG, id: Q3750



  0%|          | 414/933601 [00:03<1:10:10, 221.64it/s][A
  0%|          | 437/950022 [00:03<1:14:38, 212.05it/s][A

Item: French Guiana, NERtype: ORG, id: Q3769
Item: Hanau, NERtype: ORG, id: Q3802
Item: Neustadt-Nord, NERtype: ORG, id: Q3823
Item: Conchalí, NERtype: ORG, id: Q3851
Item: Giessen, NERtype: ORG, id: Q3874
Item: Corral, NERtype: ORG, id: Q3875
Item: Dorsten, NERtype: ORG, id: Q3886
Item: Aschaffenburg, NERtype: ORG, id: Q3942
Item: Unna, NERtype: ORG, id: Q3949
Item: Weimar, NERtype: ORG, id: Q3955
Item: Mahajana College, NERtype: ORG, id: Q4000
Item: Rozpor, NERtype: ORG, id: Q4002
Item: Euskirchen, NERtype: ORG, id: Q4074
Item: Lyon OU, NERtype: ORG, id: Q4075
Item: Hilden, NERtype: ORG, id: Q4094
Item: Schweinfurt, NERtype: ORG, id: Q4126



  0%|          | 459/971812 [00:03<1:15:50, 213.48it/s][A
  0%|          | 494/1013161 [00:03<1:07:49, 248.84it/s][A

Item: Neuschwanstein Castle, NERtype: ORG, id: Q4152
Item: Passau, NERtype: ORG, id: Q4190
Item: Union of South American Nations, NERtype: ORG, id: Q4230
Item: Democratic Party, NERtype: ORG, id: Q4280
Item: Coheed and Cambria, NERtype: ORG, id: Q4299



  0%|          | 520/1038602 [00:03<1:25:13, 203.03it/s][A
  0%|          | 556/1085037 [00:04<1:16:18, 236.85it/s][A

Item: Berlin Recycling Volleys, NERtype: ORG, id: Q4568
Item: University of Southern California, NERtype: ORG, id: Q4614
Item: Macedonian mafia, NERtype: ORG, id: Q4940
Item: Arleuf, NERtype: ORG, id: Q4955
Item: SABMiller, NERtype: ORG, id: Q4995
Item: Ranchuelo, NERtype: ORG, id: Q5003
Item: Colón, NERtype: ORG, id: Q5029
Item: Lajas, NERtype: ORG, id: Q5045



  0%|          | 585/1114860 [00:04<1:14:17, 249.98it/s][A
  0%|          | 612/1130414 [00:04<1:16:08, 247.28it/s][A

Item: Nachrodt-Wiblingwerde, NERtype: ORG, id: Q5279



  0%|          | 638/1135644 [00:04<1:26:30, 218.68it/s][A
  0%|          | 661/1136758 [00:04<1:33:24, 202.73it/s][A

Item: Meschede, NERtype: ORG, id: Q5632
Item: Bjelovar, NERtype: ORG, id: Q5707
Item: Toledo, NERtype: ORG, id: Q5836



  0%|          | 686/1155176 [00:04<1:30:09, 213.43it/s][A
  0%|          | 711/1173820 [00:04<1:27:50, 222.60it/s][A

Item: Osnabrück, NERtype: ORG, id: Q5940
Item: Wesermarsch, NERtype: ORG, id: Q5948
Item: Holzminden, NERtype: ORG, id: Q5973
Item: Siegen-Wittgenstein, NERtype: ORG, id: Q5982
Item: Sundern, NERtype: ORG, id: Q6015
Item: Oberhavel District, NERtype: ORG, id: Q6119
Item: Soest, NERtype: ORG, id: Q6149
Item: Elbe-Elster District, NERtype: ORG, id: Q6152
Item: Steinfurt, NERtype: ORG, id: Q6187
Item: Lippe, NERtype: ORG, id: Q6230
Item: Rhein-Kreis Neuss, NERtype: ORG, id: Q6253
Item: Pavia, NERtype: ORG, id: Q6259
Item: Brive-la-Gaillarde, NERtype: ORG, id: Q6393
Item: Bahretal, NERtype: ORG, id: Q6438



  0%|          | 742/1201400 [00:04<1:21:20, 245.99it/s][A
  0%|          | 772/1220524 [00:04<1:18:16, 259.69it/s][A

Item: Pirna, NERtype: ORG, id: Q6477
Item: Maçanet de Cabrenys, NERtype: ORG, id: Q6615
Item: Dorfhain, NERtype: ORG, id: Q6685
Item: Hartmannsdorf-Reichenau, NERtype: ORG, id: Q6710
Item: Grosseto, NERtype: ORG, id: Q6716
Item: Berliet, NERtype: ORG, id: Q6750
Item: Banyuls-sur-Mer, NERtype: ORG, id: Q6753
Item: U.S. Livorno 1915, NERtype: ORG, id: Q6767
Item: Höckendorf, NERtype: ORG, id: Q6775
Item: Neunkirchen, NERtype: ORG, id: Q6799
Item: Merzig-Wadern, NERtype: ORG, id: Q6802
Item: International Astronomical Union, NERtype: ORG, id: Q6867
Item: Neunkirchen, NERtype: ORG, id: Q6880



  0%|          | 801/1243226 [00:05<1:17:44, 266.35it/s][A
  0%|          | 829/1253164 [00:05<1:25:20, 244.56it/s][A

Item: Franxault, NERtype: ORG, id: Q6980
Item: Alsdorf, NERtype: ORG, id: Q6992
Item: Bitterfeld-Wolfen, NERtype: ORG, id: Q7007
Item: Brühl, NERtype: ORG, id: Q7036
Item: Oberursel (Taunus), NERtype: ORG, id: Q7044
Item: Kaarst, NERtype: ORG, id: Q7088
Item: Eschenbach, NERtype: ORG, id: Q7092
Item: Hochdorf, NERtype: ORG, id: Q7102
Item: Altwis, NERtype: ORG, id: Q7116



  0%|          | 855/1258493 [00:05<1:52:49, 185.79it/s][A
  0%|          | 880/1268607 [00:05<1:46:44, 197.94it/s][A

Item: Nazi Germany, NERtype: ORG, id: Q7318
Item: Nazi Party, NERtype: ORG, id: Q7320
Item: Tenderloin, NERtype: ORG, id: Q7464
Item: Wermelskirchen, NERtype: ORG, id: Q7507
Item: Ashkenaz, NERtype: ORG, id: Q7529



  0%|          | 903/1284012 [00:05<1:44:21, 204.92it/s][A
  0%|          | 926/1291477 [00:05<1:42:11, 210.48it/s][A
  0%|          | 949/1292857 [00:05<1:42:58, 209.11it/s][A

Item: International Court of Justice, NERtype: ORG, id: Q7801
Item: Nordhausen district, NERtype: ORG, id: Q7858
Item: Wartburgkreis, NERtype: ORG, id: Q7866
Item: Gotha, NERtype: ORG, id: Q7869
Item: Landkreis Saalfeld-Rudolstadt, NERtype: ORG, id: Q7882
Item: Greiz, NERtype: ORG, id: Q7887
Item: Fulda, NERtype: ORG, id: Q7899
Item: Main-Kinzig-Kreis, NERtype: ORG, id: Q7911
Item: Landkreis Bergstraße, NERtype: ORG, id: Q7917
Item: Düsseldorf Government Region, NERtype: ORG, id: Q7926
Item: Nintendo, NERtype: ORG, id: Q8093
Item: Freiburg Government Region, NERtype: ORG, id: Q8167
Item: Tübingen Government Region, NERtype: ORG, id: Q8170
Item: Enz, NERtype: ORG, id: Q8184
Item: Ortenau, NERtype: ORG, id: Q8191



  0%|          | 973/1300700 [00:05<1:44:19, 207.66it/s][A
  0%|          | 1002/1314719 [00:06<1:35:54, 228.30it/s][A

Item: Iława, NERtype: ORG, id: Q8194
Item: Les Cordeliers, NERtype: ORG, id: Q8207
Item: Rosenthal-Bielatal, NERtype: ORG, id: Q8234
Item: Struppen, NERtype: ORG, id: Q8244
Item: Verwaltungsgemeinschaft Tharandt, NERtype: ORG, id: Q8283
Item: Wikimedia Deutschland, NERtype: ORG, id: Q8288
Item: Hockenheim, NERtype: ORG, id: Q8336
Item: Matte World Digital, NERtype: ORG, id: Q8340
Item: Dardilly, NERtype: ORG, id: Q8365
Item: Main-Tauber-Kreis, NERtype: ORG, id: Q8517
Item: Landkreis Schwäbisch Hall, NERtype: ORG, id: Q8520
Item: Ostalbkreis, NERtype: ORG, id: Q8522
Item: Ludwigsburg District, NERtype: ORG, id: Q8541
Item: Bad Dürkheim (district), NERtype: ORG, id: Q8557



  0%|          | 1030/1327319 [00:06<1:31:43, 240.97it/s][A
  0%|          | 1055/1331928 [00:06<1:34:51, 233.83it/s][A

Item: Cochem-Zell, NERtype: ORG, id: Q8590
Item: Neuwied, NERtype: ORG, id: Q8606
Item: La Spezia, NERtype: ORG, id: Q8611
Item: Saint-Jean-Pla-de-Corts, NERtype: ORG, id: Q8657
Item: Diera-Zehren, NERtype: ORG, id: Q8697
Item: Ebersbach, NERtype: ORG, id: Q8702
Item: Deportivo de La Coruña, NERtype: ORG, id: Q8760
Item: RCD Espanyol de Barcelona, NERtype: ORG, id: Q8780
Item: Schönfeld, NERtype: ORG, id: Q8788
Item: 2002 Venezuelan coup d'état attempt, NERtype: ORG, id: Q8807
Item: Weinböhla, NERtype: ORG, id: Q8831
Item: Verwaltungsgemeinschaft Gröditz, NERtype: ORG, id: Q8841
Item: Verwaltungsgemeinschaft Röderaue-Wülknitz, NERtype: ORG, id: Q8852
Item: 1992 Venezuelan coup d'état attempts, NERtype: ORG, id: Q8868
Item: Altstadt, NERtype: ORG, id: Q8885
Item: European Central Bank, NERtype: ORG, id: Q8901
Item: Council of Europe, NERtype: ORG, id: Q8908
Item: Friedrichstadt, NERtype: ORG, id: Q8909
Item: European Economic Area, NERtype: ORG, id: Q8932
Item: Laubegast, NERtype: ORG, 


  0%|          | 1086/1344742 [00:06<1:28:26, 253.20it/s][A
  0%|          | 1112/1343181 [00:06<1:34:47, 235.99it/s][A

Item: Bruzolo, NERtype: ORG, id: Q9101
Item: Candiolo, NERtype: ORG, id: Q9148
Item: Carema, NERtype: ORG, id: Q9173
Item: Castelnuovo Nigra, NERtype: ORG, id: Q9231
Item: Ceres, NERtype: ORG, id: Q9244
Item: Chivasso, NERtype: ORG, id: Q9275
Item: Bandar Seri Begawan, NERtype: ORG, id: Q9279
Item: Cintano, NERtype: ORG, id: Q9280
Item: Claviere, NERtype: ORG, id: Q9287
Item: Druento, NERtype: ORG, id: Q9336
Item: Favria, NERtype: ORG, id: Q9343
Item: Deutsche Telekom, NERtype: ORG, id: Q9396
Item: Isolabella, NERtype: ORG, id: Q9405
Item: Moncalieri, NERtype: ORG, id: Q9474
Item: Westfield La Part-Dieu, NERtype: ORG, id: Q9478
Item: Montalenghe, NERtype: ORG, id: Q9483
Item: Nole, NERtype: ORG, id: Q9505
Item: Nomaglio, NERtype: ORG, id: Q9508



  0%|          | 1137/1342491 [00:06<2:08:14, 174.33it/s][A
  0%|          | 1157/1336287 [00:06<2:05:08, 177.81it/s][A

Item: Perosa Canavese, NERtype: ORG, id: Q9552
Item: Tame Impala, NERtype: ORG, id: Q9619
Item: Amsterdam Airport Schiphol, NERtype: ORG, id: Q9694
Item: Boulouparis, NERtype: ORG, id: Q9707
Item: Yaté, NERtype: ORG, id: Q9758
Item: Simpelveld, NERtype: ORG, id: Q9769
Item: Sittard-Geleen, NERtype: ORG, id: Q9781
Item: Gennep, NERtype: ORG, id: Q9800
Item: Alphen-Chaam, NERtype: ORG, id: Q9809
Item: Bergen op Zoom, NERtype: ORG, id: Q9814
Item: Boxtel, NERtype: ORG, id: Q9821



  0%|          | 1183/1342295 [00:06<1:56:01, 192.63it/s][A

Item: Cranendonck, NERtype: ORG, id: Q9823
Item: Dongen, NERtype: ORG, id: Q9829
Item: Halderberge, NERtype: ORG, id: Q9841
Item: Landerd, NERtype: ORG, id: Q9849
Item: Maasdonk, NERtype: ORG, id: Q9851
Item: Mill en Sint Hubert, NERtype: ORG, id: Q9853
Item: Rucphen, NERtype: ORG, id: Q9862
Item: Amsterdam, NERtype: ORG, id: Q9899
Item: Diemen, NERtype: ORG, id: Q9913
Item: Hollands Kroon, NERtype: ORG, id: Q9936
Item: Schermer, NERtype: ORG, id: Q9959
Item: Bunschoten, NERtype: ORG, id: Q10034
Item: Renswoude, NERtype: ORG, id: Q10039
Item: Stichtse Vecht, NERtype: ORG, id: Q10042
Item: Montfoort, NERtype: ORG, id: Q10048
Item: Zeist, NERtype: ORG, id: Q10056
Item: Reimerswaal, NERtype: ORG, id: Q10078
Item: Frankfurter Allgemeine Zeitung, NERtype: ORG, id: Q10184
Item: Daïtro, NERtype: ORG, id: Q10228
Item: Romano Canavese, NERtype: ORG, id: Q10237
Item: Rosta, NERtype: ORG, id: Q10243
Item: Salza di Pinerolo, NERtype: ORG, id: Q10254
Item: San Giorgio Canavese, NERtype: ORG, id: Q1


  0%|          | 1220/1355895 [00:07<1:35:25, 236.62it/s][A
  0%|          | 1246/1365102 [00:07<1:34:07, 241.49it/s][A
  0%|          | 1286/1384779 [00:07<1:21:19, 283.53it/s][A

Item: Oberallgäu, NERtype: ORG, id: Q10402
Item: Cham, NERtype: ORG, id: Q10424
Item: Erlangen-Höchstadt, NERtype: ORG, id: Q10447
Item: Roth, NERtype: ORG, id: Q10451
Item: FC Barcelona Atlètic, NERtype: ORG, id: Q10467
Item: Main-Spessart, NERtype: ORG, id: Q10469
Item: Eichstätt, NERtype: ORG, id: Q10491
Item: Ebersberg, NERtype: ORG, id: Q10541
Item: Lower Bavaria, NERtype: ORG, id: Q10559
Item: Lessenich, NERtype: ORG, id: Q10719
Item: Böhlen, NERtype: ORG, id: Q10743
Item: Machern, NERtype: ORG, id: Q10752
Item: Narsdorf, NERtype: ORG, id: Q10753
Item: Parthenstein, NERtype: ORG, id: Q10757
Item: Frohburg, NERtype: ORG, id: Q10763
Item: Trebsen, NERtype: ORG, id: Q10777
Item: Montecastrilli, NERtype: ORG, id: Q10830
Item: San Venanzo, NERtype: ORG, id: Q10901
Item: Finnentrop, NERtype: ORG, id: Q10903
Item: Wenden, NERtype: ORG, id: Q10912
Item: Medebach, NERtype: ORG, id: Q10919
Item: Netphen, NERtype: ORG, id: Q10950
Item: France 3 Régions, NERtype: ORG, id: Q10963



  0%|          | 1318/1396740 [00:07<1:23:27, 278.67it/s][A

Item: Olympique Lyonnais, NERtype: ORG, id: Q11037
Item: Engelskirchen, NERtype: ORG, id: Q11046
Item: Nümbrecht, NERtype: ORG, id: Q11052
Item: The Independent, NERtype: ORG, id: Q11149
Item: Bàscara, NERtype: ORG, id: Q11232
Item: Biure, NERtype: ORG, id: Q11236
Item: Lower Manhattan, NERtype: ORG, id: Q11253
Item: Adobe, NERtype: ORG, id: Q11463



  0%|          | 1347/1403029 [00:07<1:27:58, 265.53it/s][A
  0%|          | 1375/1410463 [00:07<1:32:01, 255.18it/s][A
  0%|          | 1402/1410820 [00:07<1:31:13, 257.51it/s][A

Item: Pont de Molins, NERtype: ORG, id: Q11804
Item: Tocotronic, NERtype: ORG, id: Q11890
Item: PSV Eindhoven, NERtype: ORG, id: Q11938
Item: Stein am Rhein, NERtype: ORG, id: Q11939
Item: Saint-Maurice-de-Beynost, NERtype: ORG, id: Q11981
Item: CD Mirandés, NERtype: ORG, id: Q11997
Item: Ell & Nikki, NERtype: ORG, id: Q12009
Item: Tiana, NERtype: ORG, id: Q12026
Item: Delitzsch, NERtype: ORG, id: Q12052
Item: Sant Andreu de Llavaneres, NERtype: ORG, id: Q12093
Item: Vilabertran, NERtype: ORG, id: Q12148
Item: Sociedad Deportiva Ponferradina, NERtype: ORG, id: Q12168
Item: Starobrno Brewery, NERtype: ORG, id: Q12221



  0%|          | 1429/1414104 [00:07<1:34:04, 250.29it/s][A

Item: Pardines, NERtype: ORG, id: Q12294
Item: Queralbs, NERtype: ORG, id: Q12327
Item: Vallfogona de Ripollès, NERtype: ORG, id: Q12356
Item: Le Monde, NERtype: ORG, id: Q12461
Item: Meckenheim, NERtype: ORG, id: Q12464
Item: Niederkassel, NERtype: ORG, id: Q12472
Item: Gers, NERtype: ORG, id: Q12517
Item: Calella, NERtype: ORG, id: Q12540



  0%|          | 1455/1414490 [00:08<2:04:59, 188.43it/s][A
  0%|          | 1477/1407084 [00:08<2:15:26, 172.97it/s][A

Item: Pineda de Mar, NERtype: ORG, id: Q12552
Item: Ottoman Empire, NERtype: ORG, id: Q12560
Item: Loire, NERtype: ORG, id: Q12569
Item: Maine-et-Loire, NERtype: ORG, id: Q12584
Item: Castellfollit de la Roca, NERtype: ORG, id: Q12598
Item: Mieres, NERtype: ORG, id: Q12604
Item: Montagut i Oix, NERtype: ORG, id: Q12609
Item: Bossòst, NERtype: ORG, id: Q12615
Item: Mayenne, NERtype: ORG, id: Q12620
Item: Canejan, NERtype: ORG, id: Q12627
Item: Riudaura, NERtype: ORG, id: Q12634
Item: Nord, NERtype: ORG, id: Q12661
Item: Sant Ferriol, NERtype: ORG, id: Q12671
Item: Alins, NERtype: ORG, id: Q12672
Item: Farrera, NERtype: ORG, id: Q12676
Item: Hautes-Pyrénées, NERtype: ORG, id: Q12700
Item: La Vall de Bianya, NERtype: ORG, id: Q12701
Item: Ilocos Sur, NERtype: ORG, id: Q12741
Item: Haute-Savoie, NERtype: ORG, id: Q12751
Item: Somme, NERtype: ORG, id: Q12770



  0%|          | 1498/1404923 [00:08<2:12:34, 176.42it/s][A
  0%|          | 1520/1402965 [00:08<2:05:07, 186.66it/s][A

Item: Tarn-et-Garonne, NERtype: ORG, id: Q12779
Item: Vienne, NERtype: ORG, id: Q12804
Item: Banayoyo, NERtype: ORG, id: Q12818
Item: Cabugao, NERtype: ORG, id: Q12832
Item: Magsingal, NERtype: ORG, id: Q12845
Item: Quirino, NERtype: ORG, id: Q12853
Item: San Ildefonso, NERtype: ORG, id: Q12864
Item: San Vicente, NERtype: ORG, id: Q12868
Item: Santa, NERtype: ORG, id: Q12873
Item: Sinait, NERtype: ORG, id: Q12891
Item: Tagudin, NERtype: ORG, id: Q12895
Item: Assenede, NERtype: ORG, id: Q12901
Item: Lloret de Mar, NERtype: ORG, id: Q12977
Item: Oudenaarde, NERtype: ORG, id: Q12992
Item: Brunyola, NERtype: ORG, id: Q13056



  0%|          | 1542/1405452 [00:08<2:04:26, 188.03it/s][A
  0%|          | 1562/1405733 [00:08<2:05:51, 185.96it/s][A

Item: Fano, NERtype: ORG, id: Q13142
Item: University of Orléans, NERtype: ORG, id: Q13334
Item: Pistoia, NERtype: ORG, id: Q13376
Item: Ligue 1, NERtype: ORG, id: Q13394
Item: former archives of Ukraine, NERtype: ORG, id: Q13401
Item: Cervià de Ter, NERtype: ORG, id: Q13428



  0%|          | 1588/1408742 [00:08<1:58:02, 198.69it/s][A
  0%|          | 1614/1409319 [00:08<1:50:07, 213.03it/s][A

Item: Viladasens, NERtype: ORG, id: Q13455
Item: Foixà, NERtype: ORG, id: Q13465
Item: Santa Cristina d'Aro, NERtype: ORG, id: Q13569
Item: La Tallada d'Empordà, NERtype: ORG, id: Q13571
Item: Torroella de Montgrí, NERtype: ORG, id: Q13573
Item: Verges, NERtype: ORG, id: Q13586
Item: Ultramort, NERtype: ORG, id: Q13588
Item: Yopal, NERtype: ORG, id: Q13591
Item: Vaulx-en-Velin, NERtype: ORG, id: Q13596
Item: Palol de Revardit, NERtype: ORG, id: Q13609
Item: Crespià, NERtype: ORG, id: Q13610
Item: Matera, NERtype: ORG, id: Q13616
Item: Les Abymes, NERtype: ORG, id: Q13636
Item: Oristano, NERtype: ORG, id: Q13652
Item: Caltanissetta, NERtype: ORG, id: Q13680
Item: Le Canard enchaîné, NERtype: ORG, id: Q13709
Item: Batangas, NERtype: ORG, id: Q13744
Item: Llívia, NERtype: ORG, id: Q13745
Item: Bohol, NERtype: ORG, id: Q13752
Item: Centelles, NERtype: ORG, id: Q13773
Item: Folgueroles, NERtype: ORG, id: Q13782
Item: Davao del Sur, NERtype: ORG, id: Q13794
Item: Oristà, NERtype: ORG, id: Q1


  0%|          | 1636/1401184 [00:09<2:01:30, 191.96it/s][A

Item: Torelló, NERtype: ORG, id: Q13854
Item: Vic, NERtype: ORG, id: Q13855
Item: Quirino, NERtype: ORG, id: Q13873
Item: Samar, NERtype: ORG, id: Q13876
Item: Zamboanga del Sur, NERtype: ORG, id: Q13900
Item: Île-de-France, NERtype: ORG, id: Q13917
Item: Polinyà, NERtype: ORG, id: Q13925
Item: Sant Cugat del Vallès, NERtype: ORG, id: Q13936
Item: Ullastrell, NERtype: ORG, id: Q13944
Item: Bellaterra, NERtype: ORG, id: Q13953
Item: NBC, NERtype: ORG, id: Q13974
Item: Bloomberg Television, NERtype: ORG, id: Q13975
Item: Bloomberg L.P., NERtype: ORG, id: Q13977



  0%|          | 1656/1402234 [00:09<2:11:01, 178.15it/s][A
  0%|          | 1681/1408922 [00:09<2:00:06, 195.29it/s][A
  0%|          | 1702/1414669 [00:09<2:06:25, 186.26it/s][A

Item: Vitoria-Gasteiz, NERtype: ORG, id: Q14318
Item: Teruel, NERtype: ORG, id: Q14336
Item: Capcom, NERtype: ORG, id: Q14428
Item: La Pintana, NERtype: ORG, id: Q14464
Item: La Reina, NERtype: ORG, id: Q14466
Item: Lo Espejo, NERtype: ORG, id: Q14503
Item: Vitznau, NERtype: ORG, id: Q14578
Item: Egolzwil, NERtype: ORG, id: Q14589
Item: Grossdietwil, NERtype: ORG, id: Q14600
Item: Roggliswil, NERtype: ORG, id: Q14616
Item: Wauwil, NERtype: ORG, id: Q14623
Item: Wikon, NERtype: ORG, id: Q14625
Item: Sun Microsystems, NERtype: ORG, id: Q14647



  0%|          | 1734/1430526 [00:09<1:47:51, 220.79it/s][A
  0%|          | 1759/1439795 [00:09<1:45:02, 228.15it/s][A

Item: Oranienburg, NERtype: ORG, id: Q14808
Item: Ansbach, NERtype: ORG, id: Q14859
Item: Neumarkt in der Oberpfalz, NERtype: ORG, id: Q14887
Item: Voerde, NERtype: ORG, id: Q14903
Item: Astro-rivelatore Gamma a Immagini Leggero, NERtype: ORG, id: Q14951
Item: Porta Westfalica, NERtype: ORG, id: Q14954
Item: Papenburg, NERtype: ORG, id: Q14957
Item: Saint-Julien-sur-Veyle, NERtype: ORG, id: Q14964
Item: Lleida, NERtype: ORG, id: Q15090



  0%|          | 1783/1443136 [00:09<2:17:45, 174.37it/s][A
  0%|          | 1817/1454363 [00:09<1:54:26, 211.54it/s][A

Item: Ciudad Real, NERtype: ORG, id: Q15093
Item: Shenzhen, NERtype: ORG, id: Q15174
Item: Waymo, NERtype: ORG, id: Q15330
Item: Bigues i Riells del Fai, NERtype: ORG, id: Q15398
Item: Figaró-Montmany, NERtype: ORG, id: Q15409
Item: Mollet del Vallès, NERtype: ORG, id: Q15430
Item: Montmeló, NERtype: ORG, id: Q15431
Item: Montornès del Vallès, NERtype: ORG, id: Q15433
Item: Sant Feliu de Codines, NERtype: ORG, id: Q15444
Item: Sant Quirze Safaja, NERtype: ORG, id: Q15452
Item: Tagamanent, NERtype: ORG, id: Q15457



  0%|          | 1841/1454063 [00:09<1:51:23, 217.27it/s][A

Item: Casserres, NERtype: ORG, id: Q15491
Item: Montmajor, NERtype: ORG, id: Q15522
Item: Puig-reig, NERtype: ORG, id: Q15534
Item: University of Hong Kong, NERtype: ORG, id: Q15568
Item: Cervelló, NERtype: ORG, id: Q15599
Item: Sant Vicenç dels Horts, NERtype: ORG, id: Q15649
Item: Torrelles de Llobregat, NERtype: ORG, id: Q15650


KeyboardInterrupt: 


  0%|          | 1852/1455476 [00:22<1:51:30, 217.27it/s][A

In [15]:
def check_leaf_existence(root_items, forward_properties=None, backward_properties=None):
    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''

    if forward_properties:
        query += '''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }''' % (','.join(map(str, forward_properties)), ','.join(map(str, root_items)))
    elif backward_properties:
        query += '''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }''' % (','.join(map(str, backward_properties)), ','.join(map(str, root_items)))

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()

    leaf_ids = []

    for item in data['results']['bindings']:
        this_id = item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        try:
            this_id = int(this_id)
            leaf_ids.append(this_id)
        except ValueError:
            continue

    return leaf_ids

# Example usage:
root_items = [2221906]  # Replace with the root Wikidata item IDs
leaf_nodes = check_leaf_existence(root_items, forward_properties=[279])  # Replace P1, P2, P3 with forward properties
if 31 in leaf_nodes:
    print("Q31 is present in the leaf nodes.")
else:
    print("Q31 is not present in the leaf nodes.")


Q31 is not present in the leaf nodes.


In [16]:
leaf_nodes

[2221906]