In [1]:
import bz2
import json
from tqdm import tqdm
import traceback
import os
from pymongo import MongoClient
from pymongo import *
from pymongo import errors
import configparser
from json.decoder import JSONDecodeError
from requests import get

In [2]:
# MongoDB connection setup
MONGO_ENDPOINT, MONGO_ENDPOINT_PORT = os.environ["MONGO_ENDPOINT"].split(":")
MONGO_ENDPOINT_PORT = int(MONGO_ENDPOINT_PORT)
MONGO_ENDPOINT_USERNAME = os.environ["MONGO_INITDB_ROOT_USERNAME"]
MONGO_ENDPOINT_PASSWORD = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
DB_NAME = f"wikidata"

client = MongoClient(MONGO_ENDPOINT, MONGO_ENDPOINT_PORT, username=MONGO_ENDPOINT_USERNAME, password=MONGO_ENDPOINT_PASSWORD)
print(client)

log_c = client.wikidata.log
items_c = client[DB_NAME].items
objects_c = client[DB_NAME].objects
literals_c = client[DB_NAME].literals
types_c = client[DB_NAME].types

c_ref = {
    "items": items_c,
    "objects":objects_c, 
    "literals":literals_c, 
    "types":types_c
}

def flush_buffer(buffer):
    for key in buffer:
        if len(buffer[key]) > 0:
            c_ref[key].insert_many(buffer[key])
            buffer[key] = []


MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True)


In [3]:
def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.
    --------------------------------------------
    For example, if you have an item with types A, B, and C, and you specify a forward property that applies to type B, the item will 
    be included in the result because it has type B, even if it also has types A and C
    --------------------------------------------  
    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids

In [9]:
# example with "capital city"

list = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
for el in list:
    data = {
        "json": [
            "Q"+(str(el))
        ]
    }
    response = requests.post(url, headers=headers, json=data)

    try:
        result = response.json()
        label = result["Q"+(str(el))]['labels']['en']
        print(label)  # Print the label or ID with indentation
    except:
        pass

capital city
temporary capital
Amsterdam as a gay capital
summer capital
capital of regency
capital of Korea
Capital of Brazil
Capital in Africa
state capital
commercial capital
capital city wall
capital of region
municipality seat
capital of Indonesia
capital of county in Romania
prefectural capital of Japan
provincial capital
Winter capital
legislative capital
executive capital
judicial capital
national capital
state capital in Germany
provincial or territorial capital city in Canada
state or insular area capital of the United States
cabecera municipal
seat of the local council
urban municipality in Germany
barrio-pueblo of Puerto Rico
municipality capital (Spain)
zona urbana in Puerto Rico
federal capital
capital of Japan
capital of Russia
Capital of Republic of China (Taiwan)
planned capital city
Capital of Sri Lanka
New Administrative Capital
independent city of Germany
Greater district town
large district town
large independent city of Lower Saxony
city with special status
medium

In [4]:
# Function to fetch the necessary subclass sets with individual try-except blocks
def fetch_wikidata_subclasses():
    try:
        organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        organization_subclass = []
    
    try:
        country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        country_subclass = []
    
    try:
        city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        city_subclass = []
    
    try:
        capitals_subclass = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        capitals_subclass = []
    
    try:
        admTerr_subclass = get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        admTerr_subclass = []
    
    try:
        family_subclass = get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        family_subclass = []
    
    try:
        sportLeague_subclass = get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        sportLeague_subclass = []
    
    try:
        venue_subclass = get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        venue_subclass = []
    
    # Removing overlaps for organization_subclass
    organization_subclass = list(set(organization_subclass) - set(country_subclass) - set(city_subclass) - set(capitals_subclass) - set(admTerr_subclass) - set(family_subclass) - set(sportLeague_subclass) - set(venue_subclass))
    
    try:
        geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        geolocation_subclass = []
    
    try:
        food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        food_subclass = []
    
    try:
        edInst_subclass = get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        edInst_subclass = []
    
    try:
        govAgency_subclass = get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        govAgency_subclass = []
    
    try:
        intOrg_subclass = get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        intOrg_subclass = []
    
    try:
        timeZone_subclass = get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])
    except json.decoder.JSONDecodeError:
        timeZone_subclass = []

    # Removing overlaps for geolocation_subclass
    geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass) - set(edInst_subclass) - set(govAgency_subclass) - set(intOrg_subclass) - set(timeZone_subclass))
    
    return organization_subclass, geolocation_subclass


In [21]:

with open("./organization_subclass.txt", "w") as file:
    for item in organization_subclass:
        file.write(f"{item}\n")  # Write each item on a new line

In [18]:
from collections import Counter


wikidata_dump_path = './data/latest-all.json.bz2'
SIZE_PROC = 1000
chunk_size = 1000  # Number of rows per chunk

organization_subclass, geolocation_subclass = fetch_wikidata_subclasses()

def process_entity(item):
    try:
        entity = item['id']
        labels = item.get("labels", {})
        english_label = labels.get("en", {}).get("value", "")
        description = item.get('descriptions', {}).get('en', {})
        NERtype = None

        if item.get("type") == "item" and "claims" in item:
            p31_claims = item["claims"].get("P31", [])

            # Initialize a counter to track occurrences of NER types
            ner_counter = Counter()

            for claim in p31_claims:
                mainsnak = claim.get("mainsnak", {})
                datavalue = mainsnak.get("datavalue", {})
                numeric_id = datavalue.get("value", {}).get("numeric-id")

                # Classify NER types
                if numeric_id == 5:
                    ner_counter['PERS'] += 1
                elif numeric_id in geolocation_subclass or any(k.lower() in description.get('value', '').lower().split() for k in ["district", "city", "country", "capital", "state"]):
                    ner_counter['LOC'] += 1
                elif numeric_id in organization_subclass:
                    ner_counter['ORG'] += 1
                else:
                    ner_counter['OTHERS'] += 1
                    
            # Get the most common NER type
            if ner_counter:
                NERtype, _ = ner_counter.most_common(1)[0]  # Get the most common type

            # Print label, ID, type, and NER classification
            print(f"{english_label} - {entity}: (NER type: {NERtype})")

    except json.decoder.JSONDecodeError:
        pass

# Initial setup for data processing
counter = 0

# Process data and print relevant details
try:
    with bz2.open(wikidata_dump_path, 'rt', encoding='utf-8') as f:
        pbar = tqdm(total=SIZE_PROC)
        
        for line in f:
            try:
                item = json.loads(line[:-2])
                process_entity(item)
            except json.decoder.JSONDecodeError:
                continue

            if counter == SIZE_PROC:
                break
            counter += 1
        pbar.close()

except Exception as e:
    print(f"An error occurred: {e}")

SyntaxError: 'break' outside loop (4038966612.py, line 9)

# Test query chiusura transitiva

In [17]:
! pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-7.0.0


In [32]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_subclasses(Qid):
    # Initialize the SPARQL endpoint (Wikidata in this case)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    
    # Define the SPARQL query with the provided QID
    query = f"""
    SELECT DISTINCT ?item ?desc WHERE {{
      wd:{Qid} wdt:P279* ?item.
      ?item rdfs:label ?desc FILTER (lang(?desc) = "en").
    }}
    """
    
    # Set the query and the return format (JSON)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    
    # Execute the query and get results
    results = sparql.query().convert()
    
    # Parse the results
    subclasses = []
    for result in results["results"]["bindings"]:
        item = result["item"]["value"]
        desc = result["desc"]["value"]
        subclasses.append({"item": item, "description": desc})
    
    return subclasses

# Example usage
Qid = "Q64027599" 
subclasses = get_subclasses(Qid)

for subclass in subclasses:
    print(f"Item: {subclass['item']}, Description: {subclass['description']}")


Item: http://www.wikidata.org/entity/Q35120, Description: entity
Item: http://www.wikidata.org/entity/Q43229, Description: organization
Item: http://www.wikidata.org/entity/Q58778, Description: system
Item: http://www.wikidata.org/entity/Q167037, Description: corporation
Item: http://www.wikidata.org/entity/Q155076, Description: juridical person
Item: http://www.wikidata.org/entity/Q488383, Description: object
Item: http://www.wikidata.org/entity/Q783794, Description: company
Item: http://www.wikidata.org/entity/Q726870, Description: brick and mortar
Item: http://www.wikidata.org/entity/Q507619, Description: retail chain
Item: http://www.wikidata.org/entity/Q1639378, Description: social system
Item: http://www.wikidata.org/entity/Q854457, Description: complex system
Item: http://www.wikidata.org/entity/Q4830453, Description: business
Item: http://www.wikidata.org/entity/Q1762621, Description: vendor
Item: http://www.wikidata.org/entity/Q3778211, Description: legal person
Item: http://w

In [None]:
query_data = {
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "name": {
                            "query": "Belgium",
                            "boost": 2.0
                        }
                    }
                }
            ],
            "should": [
                {"term": {"type": "realm"}},
                {"term": {"type": "soverign state"}},
                {"term": {"type": "country"}}
            ],
            "minimum_should_match": 1
        }
    }
}

In [33]:
import requests
import json

# Define the URL
url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'

# Define the query data (decoded for readability)
query_data = {
  "query": {
    "bool": {
      "must": [
        {
          "query_string": {
            "default_field":"types",  "query": "Q6881511"
          }
        },
        {
          "match": {
            "name": {"query":"sinopec","boost":2.0}
          }
        }
      ]
    }
  }
}

# Define the parameters and token
params = {
    'name': 'sinopec',
    #'query': json.dumps(query_data),  # JSON encoded query data
    'token': 'lamapi_demo_2023'
}

# Send the GET request
response = requests.get(url, params=params, headers={'accept': 'application/json'})

# Print the response
if response.status_code == 200:
    res = response.json()
    for el in res:
        print(f"{el['name']} ({el['id']}) with type:")
        for type in el['types']:
            print(f"                {type['name']}")  # Assuming the response is JSON formatted
else:
    print(f"Request failed with status code {response.status_code}")


Sinopec (Q2634317) with type:
                business
Sinopec (Q831445) with type:
                business
                enterprise
                public company
Sinopec SSC (Q20117829) with type:
                business
                public company
Sinopec Shanghai Petrochemical (Q820770) with type:
                business
                public company
Sinopec Tower (Q10875936) with type:
Sinopec Yanshan Petrochemical Company (Q15942644) with type:
                business
Soronko:RecentChanges (Q105429923) with type:
                MediaWiki special page
Yanbu Aramco Sinopec Refining Company (YASREF) Ltd (Q22689998) with type:
                oil refinery
2008 Formula 1 Sinopec Chinese Grand Prix (Q179363) with type:
                Chinese Grand Prix
II Sinopec Chinese Grand Prix (Q220824) with type:
                Chinese Grand Prix
2007-06-23: Prezes chińskiego Sinopec Corp nagle zrezygnował z funkcji (Q17923684) with type:
                Wikinews article
I Sinopec Ch