In [2]:
! pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.2-py3-none-any.whl.metadata (11 kB)
Collecting pyparsing<4,>=3.2.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading pyparsing-3.2.1-py3-none-any.whl.metadata (5.0 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.2-py3-none-any.whl (567 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.0/567.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pyparsing-3.2.1-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyparsing, rdflib, SPARQLWrapper
  Attempting uninstall: pyparsing
    Found existing installation: pyparsing 3.1.1
    Uninstalling pyparsing-3.1.1:
      Successfully uninstalled pyparsing-3.1.1
Success

In [83]:
import os
import pandas as pd
from tqdm import tqdm
import logging
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import re
from collections import Counter
import json
from requests import get

In [86]:
import json
from requests import get

def get_wikidata_item_tree_item_idsSPARQL(root_items, forward_properties=None, backward_properties=None):
    """Return ids of WikiData items, which are in the tree spanned by the given root items and claims relating them
        to other items.

    :param root_items: iterable[int] One or multiple item entities that are the root elements of the tree
    :param forward_properties: iterable[int] | None property-claims to follow forward; that is, if root item R has
        a claim P:I, and P is in the list, the search will branch recursively to item I as well.
    :param backward_properties: iterable[int] | None property-claims to follow in reverse; that is, if (for a root
        item R) an item I has a claim P:R, and P is in the list, the search will branch recursively to item I as well.
    :return: iterable[int]: List with ids of WikiData items in the tree
    """

    query = '''PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>'''
    if forward_properties:
        query +='''SELECT ?WD_id WHERE {
                  ?tree0 (wdt:P%s)* ?WD_id .
                  BIND (wd:%s AS ?tree0)
                  }'''%( ','.join(map(str, forward_properties)),','.join(map(str, root_items)))
    elif backward_properties:
        query+='''SELECT ?WD_id WHERE {
                    ?WD_id (wdt:P%s)* wd:Q%s .
                    }'''%(','.join(map(str, backward_properties)), ','.join(map(str, root_items)))
    #print(query)

    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    data = get(url, params={'query': query, 'format': 'json'}).json()
    
    ids = []
    for item in data['results']['bindings']:
        this_id=item["WD_id"]["value"].split("/")[-1].lstrip("Q")
        #print(item)
        try:
            this_id = int(this_id)
            ids.append(this_id)
            #print(this_id)
        except ValueError:
            #print("exception")
            continue
    return ids


try:
    organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
except json.decoder.JSONDecodeError:
    organization_subclass = []

try:
    country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])
except json.decoder.JSONDecodeError:
    country_subclass = []

try:
    city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])
except json.decoder.JSONDecodeError:
    city_subclass = []

try:
    capitals_subclass = get_wikidata_item_tree_item_idsSPARQL([5119], backward_properties=[279])
except json.decoder.JSONDecodeError:
    capitals_subclass = []

try:
    admTerr_subclass = get_wikidata_item_tree_item_idsSPARQL([15916867], backward_properties=[279])
except json.decoder.JSONDecodeError:
    admTerr_subclass = []

try:
    family_subclass = get_wikidata_item_tree_item_idsSPARQL([17350442], backward_properties=[279])
except json.decoder.JSONDecodeError:
    family_subclass = []

try:
    sportLeague_subclass = get_wikidata_item_tree_item_idsSPARQL([623109], backward_properties=[279])
except json.decoder.JSONDecodeError:
    sportLeague_subclass = []

try:
    venue_subclass = get_wikidata_item_tree_item_idsSPARQL([8436], backward_properties=[279])
except json.decoder.JSONDecodeError:
    venue_subclass = []

# Removing overlaps for organization_subclass
organization_subclass = list(set(organization_subclass) - set(country_subclass) - set(city_subclass) - set(capitals_subclass) - set(admTerr_subclass) - set(family_subclass) - set(sportLeague_subclass) - set(venue_subclass))

try:
    geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
except json.decoder.JSONDecodeError:
    geolocation_subclass = []

try:
    food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
except json.decoder.JSONDecodeError:
    food_subclass = []

try:
    edInst_subclass = get_wikidata_item_tree_item_idsSPARQL([2385804], backward_properties=[279])
except json.decoder.JSONDecodeError:
    edInst_subclass = []

try:
    govAgency_subclass = get_wikidata_item_tree_item_idsSPARQL([327333], backward_properties=[279])
except json.decoder.JSONDecodeError:
    govAgency_subclass = []

try:
    intOrg_subclass = get_wikidata_item_tree_item_idsSPARQL([484652], backward_properties=[279])
except json.decoder.JSONDecodeError:
    intOrg_subclass = []

try:
    timeZone_subclass = get_wikidata_item_tree_item_idsSPARQL([12143], backward_properties=[279])
except json.decoder.JSONDecodeError:
    timeZone_subclass = []

# Removing overlaps for geolocation_subclass
geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass) - set(edInst_subclass) - set(govAgency_subclass) - set(intOrg_subclass) - set(timeZone_subclass))



In [87]:
def split_entity_name(entity_name):
    """
    Splits the entity name into separate words if it has more than 2 uppercase letters.
    
    Args:
        entity_name (str): The input string to check and potentially split (e.g., 'VideoGame').
    
    Returns:
        str: The modified string with words split by spaces if necessary.
    """
    # Count uppercase letters
    uppercase_count = sum(1 for char in entity_name if char.isupper())
    
    # If more than 2 uppercase letters, split before every uppercase letter
    if uppercase_count >= 2:
        return re.sub(r'(?<!^)(?=[A-Z])', ' ', entity_name)
    
    return entity_name

def get_type_id(entity_name):
    """
    Queries the API to retrieve the QID of an entity based on its name.

    Args:
        entity_name (str): The name of the entity (e.g., 'Video Game').

    Returns:
        str: The QID associated with the given entity name, or None if an error occurs or no QID is found.
    """
    
    entity_name_split = split_entity_name(entity_name)

    url = f"https://lamapi.hel.sintef.cloud/lookup/entity-retrieval"
    params = {
        "name": entity_name_split,
        "kind": "type",
        "token": "lamapi_demo_2023"
    }
    headers = {
        "accept": "application/json"
    }

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()  # Raise an error for HTTP issues
        data = response.json()
        # Assuming the API returns the QID directly or within a key, adjust if structure differs
        #return max(data, key=lambda x: x.get('pos_score', 0))
        return data[0]['id']
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


In [88]:
def retrieve_superclasses(entity_id):
    """
    Retrieve all superclasses of a given Wikidata entity ID.

    Args:
        entity_id (str): The ID of the entity (e.g., "Q207784").

    Returns:
        dict: A dictionary where keys are superclass IDs, and values are their labels.
    """
    # Define the SPARQL endpoint and query
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?superclass ?superclassLabel WHERE {{
      wd:{entity_id} (wdt:P279)* ?superclass.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

    # Function to query the SPARQL endpoint with retries
    def query_wikidata(sparql_client, query, retries=3, delay=5):
        for attempt in range(retries):
            try:
                sparql_client.setQuery(query)
                sparql_client.setReturnFormat(JSON)
                results = sparql_client.query().convert()
                return results
            except Exception as e:
                if "429" in str(e):  # Handle Too Many Requests error
                    print(f"Rate limit hit. Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(delay)
                else:
                    print(f"An error occurred: {e}")
                    break
        return None

    # Set up the SPARQL client
    sparql = SPARQLWrapper(endpoint_url)

    # Execute the query with retries
    results = query_wikidata(sparql, query)

    # Process results and return as a dictionary
    if results:
        superclass_dict = {}
        for result in results["results"]["bindings"]:
            superclass_id = result["superclass"]["value"].split("/")[-1]  # Extract entity ID from the URI
            label = result["superclassLabel"]["value"]
            superclass_dict[superclass_id] = label
        return superclass_dict
    else:
        print("Failed to retrieve data after multiple attempts.")
        return {}


In [89]:
def WD_types(entity_id):
    """
    Queries the API to retrieve the list of type IDs for a given entity ID.

    Args:
        entity_id (str): The Wikidata ID of the entity (e.g., 'Q30').

    Returns:
        list: A list of type IDs associated with the given entity, or an empty list if an error occurs.
    """

    url = "https://lamapi.hel.sintef.cloud/entity/types?token=lamapi_demo_2023"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    payload = {"json": [entity_id]}
    
    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()  # Raise an error for HTTP issues
        ids = response.json()
        return ids[entity_id]['types']['P31']  # Parse and return the JSON response
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []
    except KeyError as e:
        # Handle missing keys in the JSON response
        return []

In [90]:

def NER_types(entity_id):

    """
    Given an entity it retrieves the list of the associated NER types making
    the NER type extension of all the WD_types associated to it.
    """
    if not entity_id:
        return None  

    match = re.search(r'Q(\d+)', entity_id)
    numeric_id = int(match.group(1))

    NERtype = []
    # Classify NER types
    if numeric_id == 5:
        return 'PERS'
    elif numeric_id in geolocation_subclass:
        return 'LOC'
    elif numeric_id in organization_subclass:
        return 'ORG'
    else:
        return 'OTHERS'
    

In [91]:
def extended_WD_types(entity_id):

    """
    Given an entity it retrieves the list of the associated extended WD types making
    the type extension of all the WD_types associated to it.
    """
        
    if not entity_id:
        return None

    return list(retrieve_superclasses(entity_id).keys())
    

In [74]:
type(retrieve_superclasses("Q31").keys())

dict_keys

In [93]:
type(extended_WD_types("Q31"))

set

## Dataset Reading

In [208]:

tables_path = "./data/Dataset/Dataset/Round4_2020/tables/"
cea_file = './data/Dataset/Dataset/Round4_2020/gt/cea.csv'
cta_file = './data/Dataset/Dataset/Round4_2020/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))


df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.keys()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values
combined_dict = {key: [cea_values_dict[key], cta_values_dict[key]] for key in common_keys}


path = "./data/Round4_sorted_mentions.json"
 
with open(path, 'r') as json_file:
    mentions_R4 = json.load(json_file)

################ PER 2T #################

key_to_cell = {}

for el in mentions_R4:
    label = el[0]
    id_url = el[1]['id'].split('/')[-1]
    key_to_cell[label] = id_url

In [219]:

tables_path = "./data/Dataset/Dataset/2T_Round4/tables/"
cea_file = './data/Dataset/Dataset/2T_Round4/gt/cea.csv'
cta_file = './data/Dataset/Dataset/2T_Round4/gt/cta.csv'


os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the cea_file and create a key-value dictionary
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)

tmp = []
for el in df_cea[3]:
    if el.split(' ')[0] not in tmp:
        tmp.append(el.split(' ')[0])


cea_values_dict = dict(zip(tmp, df_cea["key_col"].values))


df_cta = pd.read_csv(cta_file, header=None)
df_cta["key"] = df_cta[0] + " " + df_cta[1].astype(str)
cta_values_dict = dict(zip(df_cta["key"].values, df_cta[2].values))


# Find the intersection of keys between the two dictionaries
common_keys = set(cea_values_dict.values()).intersection(cta_values_dict.keys())

# Create a new dictionary with combined values
combined_dict = {}
for k,v in cea_values_dict.items():
    combined_dict[k] = cta_values_dict[v]

cea_keys_set = set(df_cea["key"].values)
cea_values_dict_cell = dict(zip(df_cea["key"].values, df_cea[3].values))

# Function to process a single table file
def process_table_file(table_file):
    try:
        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        qid_to_value = {}

        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                key = f"{table_name} {row+1} {col}"
                if key in cea_keys_set:
                    cell_value = df.iloc[row, col]
                    qid = cea_values_dict_cell[key].split('/')[-1]  # Extract the QID from the URL
                    qid_to_value[qid] = cell_value
                    break  # Exit inner loop early as only one match per row/col is needed

        return qid_to_value
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# List of table files
table_files = [os.path.join(tables_path, table) for table in os.listdir(tables_path)]

# Process tables sequentially
key_to_cell = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    local_key_to_cell = process_table_file(table_file)
    key_to_cell.update(local_key_to_cell)

len(key_to_cell)

Processing tables: 100%|██████████| 180/180 [00:06<00:00, 28.24it/s]


8796

In [221]:
from tqdm import tqdm

WD_query_type = {}

# Wrap the values of the dictionary in tqdm for a progress bar
for key, value in tqdm(combined_dict.items(), desc="Processing items"):
    key = key.split('/')[-1]
    result = None
    
    for item in value.split(' '):  # Split the string into items
        item_id = item.split('/')[-1]
        types_list = WD_types(item_id)  # Call WD_types on the item ID
        if types_list:  # Check if the result is non-empty
            result = item_id
            break  # Use the first non-empty result and exit the loop
            
    WD_query_type[key] = result  # Store the result (or None if no non-empty list found)


Processing items:   0%|          | 60/15998 [01:09<4:57:58,  1.12s/it]

An error occurred: 502 Server Error: Bad Gateway for url: https://lamapi.hel.sintef.cloud/entity/types?token=lamapi_demo_2023


Processing items: 100%|██████████| 15998/15998 [6:01:50<00:00,  1.36s/it]  


In [None]:
WD_query_type

In [222]:
with open('2T_WD_query_type.json', 'w') as json_file:
    json.dump(WD_query_type, json_file, indent=4)


In [82]:
#print(f"{type_str}: {get_type_id(type_str)}")

entity_name = key_to_cell[list(WD_query_type.keys())[0]]
ext_query_types = list(set(retrieve_superclasses("Q31")))

# query a lamapi dove specifico nel filtro il tipo

#WD_candidate_types = WD_types(entity_id)  # WD_types() interroga il servizio types() ma forse è sbagliato (da implementare lato server non client)
print(ext_query_types)

['Q31']


In [None]:
# Initialize an empty dictionary for NER_query_type
NER_query_type = {}

# Iterate over WD_query_type.keys() with a tqdm progress bar
for entity_name, entity_type in tqdm(WD_query_type.items(), desc="Processing Entities", unit="entity"):
    if entity_type is None:
        NER_query_type[entity_name] = None
    else:
        NER_query_type[entity_name] = NER_types(entity_type)

with open('2T_NER_query_type.json', 'w') as json_file:
    json.dump(NER_query_type, json_file, indent=4)


# Display the resulting dictionary
NER_query_type

In [95]:
len(WD_query_type)

159

In [225]:
# Initialize an empty dictionary for ext_WD_query_type
ext_WD_query_type = {}

# Iterate over WD_query_type.keys() with a tqdm progress bar
for el in tqdm(WD_query_type.keys(), desc="Processing Superclasses", unit="entity"):
    #print(f"{el}: #superclasses {len(extended_WD_types(el))}")
    ext_WD_query_type[el] = extended_WD_types(el)

with open('2T_ext_WD_query_type.json', 'w') as json_file:
    json.dump(ext_WD_query_type, json_file, indent=4)


# Display the resulting dictionary
ext_WD_query_type

Processing Superclasses:   2%|▏         | 343/15998 [03:17<2:12:15,  1.97entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   2%|▏         | 346/15998 [03:23<5:09:13,  1.19s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   2%|▏         | 348/15998 [03:30<8:45:15,  2.01s/entity] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   2%|▏         | 352/15998 [03:39<8:00:51,  1.84s/entity] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   2%|▏         | 354/15998 [03:47<11:40:49,  2.69s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   2%|▏         | 359/15998 [03:54<5:33:09,  1.28s/entity] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   2%|▏         | 360/15998 [04:01<12:26:13,  2.86s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:   3%|▎         | 422/15998 [04:52<2:14:19,  1.93entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   3%|▎         | 423/15998 [04:58<8:49:25,  2.04s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   3%|▎         | 426/15998 [05:04<7:38:04,  1.76s/entity] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   3%|▎         | 427/15998 [05:10<12:25:37,  2.87s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   3%|▎         | 435/15998 [05:18<2:56:30,  1.47entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   8%|▊         | 1213/15998 [12:53<70:55:20, 17.27s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:   8%|▊         | 1214/15998 [12:59<56:33:26, 13.77s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  21%|██        | 3342/15998 [32:50<2:01:55,  1.73entity/s] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  21%|██        | 3349/15998 [32:57<1:49:20,  1.93entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  27%|██▋       | 4358/15998 [42:44<1:48:01,  1.80entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  27%|██▋       | 4379/15998 [43:05<1:47:05,  1.81entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  34%|███▍      | 5413/15998 [53:01<1:42:23,  1.72entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  34%|███▍      | 5415/15998 [53:07<4:28:24,  1.52s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  34%|███▍      | 5416/15998 [53:19<13:29:22,  4.59s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  44%|████▍     | 7103/15998 [1:02:24<1:01:13,  2.42entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  44%|████▍     | 7104/15998 [1:02:30<4:46:49,  1.93s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7128/15998 [1:02:43<56:57,  2.60entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7131/15998 [1:02:50<2:50:06,  1.15s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7135/15998 [1:02:56<2:35:29,  1.05s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7142/15998 [1:03:04<1:16:48,  1.92entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7144/15998 [1:03:09<3:35:29,  1.46s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  45%|████▍     | 7148/15998 [1:03:21<4:02:57,  1.65s/entity] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  45%|████▍     | 7154/15998 [1:03:33<2:30:35,  1.02s/entity] 

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7155/15998 [1:03:39<5:54:36,  2.41s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7190/15998 [1:03:54<41:28,  3.54entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▍     | 7195/15998 [1:04:01<1:45:43,  1.39entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  45%|████▌     | 7203/15998 [1:04:08<1:01:13,  2.39entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7303/15998 [1:04:39<35:46,  4.05entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7319/15998 [1:04:49<40:02,  3.61entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7346/15998 [1:05:03<51:17,  2.81entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7350/15998 [1:05:09<1:53:27,  1.27entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7353/15998 [1:05:15<2:53:50,  1.21s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7357/15998 [1:05:22<2:32:16,  1.06s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7370/15998 [1:05:31<40:33,  3.55entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7386/15998 [1:05:40<35:50,  4.01entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7388/15998 [1:05:45<3:09:17,  1.32s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7391/15998 [1:05:51<3:23:29,  1.42s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▌     | 7395/15998 [1:05:58<2:32:45,  1.07s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  46%|████▋     | 7402/15998 [1:06:04<1:06:53,  2.14entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  55%|█████▌    | 8858/15998 [1:12:43<31:57,  3.72entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  55%|█████▌    | 8862/15998 [1:12:49<1:35:38,  1.24entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  55%|█████▌    | 8867/15998 [1:12:56<1:26:50,  1.37entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  55%|█████▌    | 8876/15998 [1:13:03<45:49,  2.59entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  56%|█████▌    | 8885/15998 [1:13:11<41:27,  2.86entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  56%|█████▌    | 8889/15998 [1:13:17<1:40:11,  1.18entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  56%|█████▌    | 8895/15998 [1:13:24<1:13:07,  1.62entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  56%|█████▌    | 8905/15998 [1:13:32<39:00,  3.03entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  57%|█████▋    | 9145/15998 [1:14:48<41:01,  2.78entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  57%|█████▋    | 9150/15998 [1:14:55<1:18:03,  1.46entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  57%|█████▋    | 9157/15998 [1:15:01<51:34,  2.21entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)
Rate limit hit. Retrying in 5 seconds... (Attempt 2/3)


Processing Superclasses:  57%|█████▋    | 9165/15998 [1:15:14<1:05:00,  1.75entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  58%|█████▊    | 9199/15998 [1:15:29<34:47,  3.26entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  58%|█████▊    | 9204/15998 [1:15:36<1:11:23,  1.59entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  58%|█████▊    | 9211/15998 [1:15:43<55:19,  2.04entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  58%|█████▊    | 9221/15998 [1:15:50<36:47,  3.07entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  58%|█████▊    | 9222/15998 [1:15:56<3:29:40,  1.86s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  58%|█████▊    | 9225/15998 [1:16:02<3:00:08,  1.60s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  58%|█████▊    | 9235/15998 [1:16:10<42:46,  2.63entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  66%|██████▋   | 10635/15998 [1:22:45<22:27,  3.98entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  67%|██████▋   | 10644/15998 [1:22:53<36:49,  2.42entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  67%|██████▋   | 10651/15998 [1:23:00<38:33,  2.31entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  67%|██████▋   | 10656/15998 [1:23:07<1:00:25,  1.47entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  67%|██████▋   | 10657/15998 [1:23:12<3:10:04,  2.14s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  80%|███████▉  | 12772/15998 [1:33:20<18:05,  2.97entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  80%|███████▉  | 12798/15998 [1:33:32<13:12,  4.04entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  80%|████████  | 12799/15998 [1:33:38<1:36:58,  1.82s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  80%|████████  | 12804/15998 [1:33:44<47:30,  1.12entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  80%|████████  | 12805/15998 [1:33:50<1:59:45,  2.25s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  80%|████████  | 12807/15998 [1:33:56<2:07:37,  2.40s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13839/15998 [1:38:57<10:17,  3.50entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13841/15998 [1:39:03<49:41,  1.38s/entity]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13869/15998 [1:39:15<08:58,  3.95entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13963/15998 [1:39:48<09:08,  3.71entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13964/15998 [1:39:54<1:04:02,  1.89s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13980/15998 [1:40:03<08:08,  4.13entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13987/15998 [1:40:10<15:37,  2.15entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  87%|████████▋ | 13991/15998 [1:40:16<27:32,  1.21entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  91%|█████████▏| 14604/15998 [1:43:17<05:35,  4.15entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  93%|█████████▎| 14905/15998 [1:44:42<04:38,  3.92entity/s]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  93%|█████████▎| 14907/15998 [1:44:48<24:44,  1.36s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  93%|█████████▎| 14908/15998 [1:44:54<47:10,  2.60s/entity]

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses:  93%|█████████▎| 14915/15998 [1:45:01<11:22,  1.59entity/s]  

Rate limit hit. Retrying in 5 seconds... (Attempt 1/3)


Processing Superclasses: 100%|██████████| 15998/15998 [1:50:33<00:00,  2.41entity/s]


{'Q5484': ['Q5484'],
 'Q1066': ['Q1066'],
 'Q5505': ['Q5505'],
 'Q1383': ['Q1383'],
 'Q1169': ['Q1169'],
 'Q5511': ['Q5511'],
 'Q5513': ['Q5513'],
 'Q5525': ['Q5525'],
 'Q5532': ['Q5532'],
 'Q5539': ['Q5539'],
 'Q5492': ['Q5492'],
 'Q3272': ['Q3272'],
 'Q1062': ['Q1062'],
 'Q15288': ['Q15288'],
 'Q134485': ['Q134485'],
 'Q152044': ['Q152044'],
 'Q166162': ['Q166162'],
 'Q35342': ['Q35342'],
 'Q173862': ['Q173862'],
 'Q272463': ['Q272463'],
 'Q845868': ['Q845868'],
 'Q182719': ['Q182719'],
 'Q1134446': ['Q1134446'],
 'Q42191': ['Q42191'],
 'Q199551': ['Q199551'],
 'Q173596': ['Q173596'],
 'Q934638': ['Q934638'],
 'Q125888': ['Q125888'],
 'Q504433': ['Q504433'],
 'Q1132211': ['Q1132211'],
 'Q122184': ['Q122184'],
 'Q645901': ['Q645901'],
 'Q178741': ['Q178741'],
 'Q201294': ['Q201294'],
 'Q192770': ['Q192770'],
 'Q845876': ['Q845876'],
 'Q201843': ['Q201843'],
 'Q257416': ['Q257416'],
 'Q1262967': ['Q1262967'],
 'Q126307': ['Q126307'],
 'Q19253': ['Q19253'],
 'Q81150': ['Q81150'],
 'Q207

In [153]:

def make_json_serializable(obj):
    """
    Recursively converts non-serializable objects in the dictionary
    to JSON-serializable formats.
    """
    if isinstance(obj, set):
        return list(obj)  # Convert sets to lists
    elif isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}  # Recurse for nested dictionaries
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]  # Recurse for lists
    else:
        return obj  # Return the object if it's already serializable

# Convert subsample to a JSON-serializable format
serializable_subsample = make_json_serializable(ext_WD_query_type)

In [154]:
with open('ext_WD_query_type.json', 'w') as json_file:
    json.dump(serializable_subsample, json_file, indent=4)

## Extended WD type vs WD type

In [14]:
# for each WD type inserted from the user (WD_query_type taken from CEA) now we retrieve the extended WD types until the root
# given the list of the ext_types we do the overlap with the ext_types taken from lamAPI with HARD and SOFT query

ext_query_types = []

for entity_id, type_str in WD_query_type.items():
    #print(f"{type_str}: {get_type_id(type_str)}")
    
    entity_name = key_to_cell[list(WD_query_type.keys())[0]]
    ext_query_types += list(set(retrieve_superclasses(get_type_id(type_str))))
    
    # query a lamapi dove specifico nel filtro il tipo

    # entity_id è il ground truth
    WD_candidate_types = WD_types(entity_id)  # WD_types() interroga il servizio types() ma forse è sbagliato (da implementare lato server non client)
print(ext_query_types)


KeyboardInterrupt: 

In [15]:
ext_query_types

['Q53617489',
 'Q98119401',
 'Q16334295',
 'Q20937557',
 'Q2897903',
 'Q2217301',
 'Q7725310',
 'Q1002697',
 'Q117208263',
 'Q732577',
 'Q24229398',
 'Q16887380',
 'Q15621286',
 'Q11032',
 'Q1554231',
 'Q106668099',
 'Q99527517',
 'Q43229',
 'Q3523102',
 'Q286583',
 'Q1193236',
 'Q121182',
 'Q61961344',
 'Q7048977',
 'Q58415929',
 'Q1639378',
 'Q49848',
 'Q28877',
 'Q2424752',
 'Q131085629',
 'Q488383',
 'Q58778',
 'Q117208269',
 'Q12774177',
 'Q35825432',
 'Q115668308',
 'Q854457',
 'Q26907166',
 'Q17538423',
 'Q31464082',
 'Q16686448',
 'Q107435521',
 'Q47461344',
 'Q37866906',
 'Q17172633',
 'Q106559804',
 'Q28314507',
 'Q386724',
 'Q17537576',
 'Q119648442',
 'Q234460',
 'Q5127848',
 'Q35120',
 'Q17489659',
 'Q340169',
 'Q16334298',
 'Q1261026',
 'Q16889133',
 'Q103940464',
 'Q11033',
 'Q11474',
 'Q53617489',
 'Q53617407',
 'Q193395',
 'Q28555911',
 'Q2897903',
 'Q9158768',
 'Q6671777',
 'Q10683158',
 'Q251473',
 'Q99527517',
 'Q96791170',
 'Q337060',
 'Q8205328',
 'Q104450446',
 '

## WD type vs NER type

In [None]:
# for each WD type inserted from the user (WD_WD_query_type taken from CEA) now we retrieve the extended WD types until the root
# given the list of the ext_types we do the overlap with the ext_types taken from lamAPI with HARD and SOFT query

False

In [25]:
retrieve_superclasses("Q12299841")

{'Q12299841': 'cricketer',
 'Q2066131': 'athlete',
 'Q18536342': 'competitive player',
 'Q50995749': 'sportsperson',
 'Q4197743': 'player',
 'Q215627': 'person',
 'Q830077': 'subject',
 'Q795052': 'individual',
 'Q3778211': 'legal person',
 'Q53617489': 'independent continuant',
 'Q7239': 'organism',
 'Q24229398': 'agent',
 'Q106559804': 'person or organization',
 'Q103940464': 'continuant',
 'Q223557': 'physical object',
 'Q66394244': 'physical anatomical entity',
 'Q4406616': 'concrete object',
 'Q53617407': 'material entity',
 'Q27043950': 'anatomical entity',
 'Q488383': 'object',
 'Q35120': 'entity'}

In [8]:
cta_values_dict

{'58891288_0_1117541047012405958 1': 'http://dbpedia.org/ontology/Film',
 '8468806_0_4382447409703007384 1': 'http://dbpedia.org/ontology/Lake',
 '50245608_0_871275842592178099 0': 'http://dbpedia.org/ontology/Film',
 '14067031_0_559833072073397908 1': 'http://dbpedia.org/ontology/Language',
 '8286121_0_8471791395229161598 0': 'http://dbpedia.org/ontology/Country',
 '39759273_0_1427898308030295194 1': 'http://dbpedia.org/ontology/Film',
 '14380604_4_3329235705746762392 1': 'http://dbpedia.org/ontology/Company',
 '20135078_0_7570343137119682530 3': 'http://dbpedia.org/ontology/Person',
 '29414811_6_8221428333921653560 1': 'http://dbpedia.org/ontology/VideoGame',
 '34041816_1_4749054164534706977 2': 'http://dbpedia.org/ontology/City',
 '14067031_0_559833072073397908 7': 'http://dbpedia.org/ontology/Currency',
 '71137051_0_8039724067857124984 0': 'http://dbpedia.org/ontology/Bird',
 '29414811_2_4773219892816395776 1': 'http://dbpedia.org/ontology/VideoGame',
 '99070098_0_20748727413026969

In [11]:
cea_values_dict

{'50245608_0_871275842592178099 0': 'https://www.wikidata.org/entity/Q46551',
 '22864497_0_8632623712684511496 0': 'https://www.wikidata.org/entity/Q6738126',
 '66009064_0_9148652238372261251 0': 'https://www.wikidata.org/entity/Q1094988',
 '21362676_0_6854186738074119688 1': 'https://www.wikidata.org/entity/Q463832',
 '40534006_0_4617468856744635526 1': 'https://www.wikidata.org/entity/Q452590',
 '36102169_0_7739454799295072814 2': 'https://www.wikidata.org/entity/Q1978200',
 '53822652_0_5767892317858575530 1': 'https://www.wikidata.org/entity/Q200396',
 '60319454_0_3938426910282115527 0': 'https://www.wikidata.org/entity/Q69581',
 '8468806_0_4382447409703007384 1': 'https://www.wikidata.org/entity/Q1546823',
 '33401079_0_9127583903019856402 0': 'https://www.wikidata.org/entity/Q154538',
 '99070098_0_2074872741302696997 1': 'https://www.wikidata.org/entity/Q4920755',
 '50270082_0_444360818941411589 1': 'https://www.wikidata.org/entity/Q114468',
 '29414811_12_251152470253168163 1': 'ht